<a href="https://colab.research.google.com/github/hshuai97/Colab20210803/blob/main/BERT.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Using BERT or RoBERTa model to classification several datasets (20ng, r8, r52, oh, mr)

Reference:

1. [Huggingface](https://github.com/huggingface/transformers/blob/main/README_zh-hans.md)

2. [Colab1: Huggingface pytorch transformer](https://colab.research.google.com/github/pytorch/pytorch.github.io/blob/master/assets/hub/huggingface_pytorch-transformers.ipynb)

3. [Colab2: Sentiment analysis using roberta](https://colab.research.google.com/github/DhavalTaunk08/NLP_scripts/blob/master/sentiment_analysis_using_roberta.ipynb#scrollTo=HMqQTafXEaei)

4. [Text classification with BERT in PyTorch:](https://github.com/nlptown/nlp-notebooks/blob/master/Text%20classification%20with%20BERT%20in%20PyTorch.ipynb)

5. [Distilbert for multilabel text classification](https://github.comDhavalTaunk08/NLP_scripts/blob/master/Transformers_multilabel_distilbert.ipynb)

# Install libraries

In [1]:
try:
  import transformers
except ModuleNotFoundError:
  !pip install transformers

try:
  import sentencepiece
except ModuleNotFoundError:
  !pip install sentencepiece

Collecting transformers
  Downloading transformers-4.18.0-py3-none-any.whl (4.0 MB)
[K     |████████████████████████████████| 4.0 MB 4.6 MB/s 
[?25hCollecting tokenizers!=0.11.3,<0.13,>=0.11.1
  Downloading tokenizers-0.12.1-cp37-cp37m-manylinux_2_12_x86_64.manylinux2010_x86_64.whl (6.6 MB)
[K     |████████████████████████████████| 6.6 MB 35.9 MB/s 
Collecting sacremoses
  Downloading sacremoses-0.0.53.tar.gz (880 kB)
[K     |████████████████████████████████| 880 kB 43.4 MB/s 
Collecting pyyaml>=5.1
  Downloading PyYAML-6.0-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (596 kB)
[K     |████████████████████████████████| 596 kB 24.4 MB/s 
Collecting huggingface-hub<1.0,>=0.1.0
  Downloading huggingface_hub-0.5.1-py3-none-any.whl (77 kB)
[K     |████████████████████████████████| 77 kB 7.2 MB/s 
Building wheels for collected packages: sacremoses
  Building wheel for sacremoses (setup.py) ... [?25l[?25hdone
  Created wheel for sacre

# Parsing

In [2]:
%%writefile parsing.py
import os
import torch
import argparse
import numpy as np
from transformers import AutoTokenizer, AutoModel
from torch.utils.data import TensorDataset, DataLoader
from sklearn.model_selection import train_test_split

def data(dataset_name):
    NAME = dataset_name
    if NAME not in ['20ng', 'r8', 'r52', 'oh', 'mr']:
      raise ValueError('The dataset is not support')

    PATH = '/content/drive/MyDrive/Colab_Notebooks/CODE/TextLevelGNN/data/'+NAME

    train_texts = []
    train_labels = []
    with open(os.path.join(PATH, NAME+'-train-stemmed.txt'), 'r') as f:
        data = f.readlines()

        for line in data:
          t = line.split('\t')
          train_texts.append(t[1])
          train_labels.append(t[0])

    dev_texts = []
    dev_labels = []
    with open(os.path.join(PATH, NAME+'-dev-stemmed.txt'), 'r') as f:
        data = f.readlines()

        for line in data:
          t = line.split('\t')
          dev_texts.append(t[1])
          dev_labels.append(t[0])

    test_texts = []
    test_labels = []
    with open(os.path.join(PATH, NAME+'-test-stemmed.txt'), 'r') as f:
        data = f.readlines()

        for line in data:
          t = line.split('\t')
          test_texts.append(t[1])
          test_labels.append(t[0])

    target_names = list(set(train_labels))
    label2idx = {label: idx for idx, label in enumerate(target_names)}

    print("Train size:", len(train_texts))
    print("Dev size:", len(dev_texts))
    print("Test size:", len(test_texts))
    print(f'labels: {label2idx}')

    return train_texts,  train_labels, dev_texts, dev_labels, test_texts, test_labels, label2idx

def batch(inputs, y, batch_size, shuffle=True):
  input_ids = inputs.input_ids
  attention_mask = inputs.attention_mask

  data = TensorDataset(input_ids, attention_mask, y)

  dataloader = DataLoader(data, shuffle=shuffle, batch_size=batch_size)
  
  return dataloader

class BERTClass(torch.nn.Module):
    def __init__(self, num_class, MODEL):
        super(BERTClass, self).__init__()
        self.l1 = AutoModel.from_pretrained(MODEL)
        self.pre_classifier = torch.nn.Linear(768, 768)
        self.dropout = torch.nn.Dropout(0.4)
        self.classifier = torch.nn.Linear(768, num_class)

    def forward(self, input_ids, attention_mask):
        output_1 = self.l1(input_ids=input_ids, attention_mask=attention_mask)
        hidden_state = output_1[0]
        pooler = hidden_state[:, 0]
        pooler = self.pre_classifier(pooler)
        pooler = torch.nn.Tanh()(pooler)
        pooler = self.dropout(pooler)
        output = self.classifier(pooler)
        return output

def train(model, tr_inputs, dev_inputs, epoch):
    loss_func = torch.nn.CrossEntropyLoss()
    optimizer = torch.optim.Adam(model.parameters(), weight_decay=1e-4, lr=LEARNING_RATE)

    best_acc = 0.0
    no_improv = 0  # No improvement on dev set
    PATIENCE = 8  # Patience on dev set to finish training
    for e in range(epoch):
      improved = ''
      model.train()

      for s, ba in enumerate(tr_inputs):
        b = tuple(t.to(device) for t in ba)
        input_ids, attention_mask, y = b

        outputs = model(input_ids, attention_mask)
        loss = loss_func(outputs, y)

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

      if e % 2 == 0:
        val_acc = dev(model, dev_inputs)
        if val_acc > best_acc:
          best_acc = val_acc
          no_improve = 0
          improved = '*'
          torch.save(model, f'/content/drive/MyDrive/Colab_Notebooks/CODE/TextLevelGNN/model/bert_model.pkl')
        else:
          no_improve += 2
        print(f'Epoch:{e}, train loss:{loss.item():.6f}, val acc: {val_acc:.4f}, {improved}')
        if no_improve >=PATIENCE:
          print('No improvement on development set. Early stop training.')
          break

    return model

def dev(model, dev_inputs):
  model.eval()

  total_pred = 0.0
  correct = 0.0

  for s, ba in enumerate(dev_inputs):
      b = tuple(t.to(device) for t in ba)
      input_ids, attention_mask, y = b

      with torch.no_grad():
        logits = model(input_ids, attention_mask)
        pred = torch.argmax(logits, dim=1)
        
        correct_pred = torch.sum(pred==y)
        correct += correct_pred
        total_pred += len(y)
  
  return torch.div(correct, total_pred)  # Acc on dev set

def test(model, te_inputs):
  model.eval()

  total_pred = 0.0
  correct = 0.0

  for s, ba in enumerate(te_inputs):
    b = tuple(t.to(device) for t in ba)
    input_ids, attention_mask, y = b

    with torch.no_grad():
      logits = model(input_ids, attention_mask)
      pred = torch.argmax(logits, dim=1)

      correct_pred = torch.sum(pred==y)
      correct += correct_pred
      total_pred += len(y)

  return torch.div(correct, total_pred)  # Test set acc.

parser = argparse.ArgumentParser()
parser.add_argument('--dataset', required=True, type=str, default='mr',help='dataset name')
parser.add_argument('--max_len', required=True, type=int)
parser.add_argument('--epoch', required=True, type=int, default=50)

args = parser.parse_args()

SEED = 42
torch.manual_seed(SEED)
torch.cuda.manual_seed(SEED)
np.random.seed(SEED)


MAX_LEN = args.max_len
EPOCH = args.epoch
BATCH_SIZE = 64
LEARNING_RATE = 1e-04

device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')

MODEL = 'bert-base-uncased'  # 'roberta-base',  'bert-base-uncased', 
tokenizer = AutoTokenizer.from_pretrained(MODEL)

t_texts, t_labels, d_texts, d_labels, te_texts, te_labels, label2idx = data(args.dataset)

tr_y = torch.tensor([label2idx[t] for t in t_labels])
d_y = torch.tensor([label2idx[t] for t in d_labels])
te_y = torch.tensor([label2idx[t] for t in te_labels])

train_inputs = tokenizer(t_texts, max_length=MAX_LEN, truncation=True, padding=True, return_tensors='pt')
dev_inputs= tokenizer(d_texts, max_length=MAX_LEN, truncation=True, padding=True, return_tensors='pt')
test_inputs = tokenizer(te_texts, max_length=MAX_LEN, truncation=True, padding=True, return_tensors='pt')
print(train_inputs.keys())

inputs_tr= batch(train_inputs, tr_y, BATCH_SIZE, shuffle=True)  # Batching for training
inputs_dev = batch(dev_inputs, d_y, BATCH_SIZE, shuffle=False)
inputs_te = batch(test_inputs, te_y, BATCH_SIZE, shuffle=False)

model = BERTClass(len(label2idx), MODEL)
model.to(device)

final_model= train(model, inputs_tr, inputs_dev, epoch=EPOCH)

res = test(final_model, inputs_te)
print(f'Test accuracy: {res.cpu().numpy():.4f}')

Writing parsing.py


# Run

In [None]:
!python parsing.py --dataset='oh' --max_len=135 --epoch=50

Downloading: 100% 28.0/28.0 [00:00<00:00, 14.4kB/s]
Downloading: 100% 570/570 [00:00<00:00, 460kB/s]
Downloading: 100% 226k/226k [00:00<00:00, 427kB/s]
Downloading: 100% 455k/455k [00:00<00:00, 516kB/s]
Train size: 3021
Dev size: 336
Test size: 4043
labels: {'C04': 0, 'C15': 1, 'C14': 2, 'C22': 3, 'C18': 4, 'C23': 5, 'C06': 6, 'C13': 7, 'C03': 8, 'C16': 9, 'C07': 10, 'C17': 11, 'C12': 12, 'C05': 13, 'C08': 14, 'C10': 15, 'C20': 16, 'C02': 17, 'C09': 18, 'C11': 19, 'C01': 20, 'C19': 21, 'C21': 22}
dict_keys(['input_ids', 'token_type_ids', 'attention_mask'])
Downloading: 100% 420M/420M [00:09<00:00, 46.0MB/s]
Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.transform.dense.bias', 'cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.decoder.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.