<a href="https://colab.research.google.com/github/hshuai97/Colab20210803/blob/main/BERT.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Reference

Fine-tune a BERT or RoBERTa model for text classification on several datasets (20ng, r8, r52, oh, mr).

Related links:

1. [Huggingface](https://github.com/huggingface/transformers/blob/main/README_zh-hans.md)

2. [Colab1: Huggingface pytorch transformer](https://colab.research.google.com/github/pytorch/pytorch.github.io/blob/master/assets/hub/huggingface_pytorch-transformers.ipynb)

3. [Colab2: Sentiment analysis using roberta](https://colab.research.google.com/github/DhavalTaunk08/NLP_scripts/blob/master/sentiment_analysis_using_roberta.ipynb#scrollTo=HMqQTafXEaei)

4. [Text classification with BERT in PyTorch](https://github.com/nlptown/nlp-notebooks/blob/master/Text%20classification%20with%20BERT%20in%20PyTorch.ipynb)

5. [Distilbert for multilabel text classification](https://github.com/DhavalTaunk08/NLP_scripts/blob/master/Transformers_multilabel_distilbert.ipynb)

6. [transformers.get_linear_schedule_with_warmup](https://huggingface.co/docs/transformers/main_classes/optimizer_schedules?highlight=get_linear_schedule_with_warmup#transformers.get_linear_schedule_with_warmup)

# Install libraries

In [None]:
try:
  import transformers
except ModuleNotFoundError:
  !pip install transformers

try:
  import sentencepiece
except ModuleNotFoundError:
  !pip install sentencepiece

Collecting transformers
  Downloading transformers-4.18.0-py3-none-any.whl (4.0 MB)
[K     |████████████████████████████████| 4.0 MB 22.0 MB/s 
[?25hCollecting huggingface-hub<1.0,>=0.1.0
  Downloading huggingface_hub-0.5.1-py3-none-any.whl (77 kB)
[K     |████████████████████████████████| 77 kB 8.0 MB/s 
Collecting sacremoses
  Downloading sacremoses-0.0.53.tar.gz (880 kB)
[K     |████████████████████████████████| 880 kB 48.8 MB/s 
Collecting pyyaml>=5.1
  Downloading PyYAML-6.0-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (596 kB)
[K     |████████████████████████████████| 596 kB 52.8 MB/s 
Collecting tokenizers!=0.11.3,<0.13,>=0.11.1
  Downloading tokenizers-0.12.1-cp37-cp37m-manylinux_2_12_x86_64.manylinux2010_x86_64.whl (6.6 MB)
[K     |████████████████████████████████| 6.6 MB 6.9 MB/s 
Building wheels for collected packages: sacremoses
  Building wheel for sacremoses (setup.py) ... [?25l[?25hdone
  Created wheel for sacre

# Parsing

In [None]:
%%writefile parsing.py
import os
import torch
import argparse
import numpy as np
from transformers import AutoTokenizer, AutoModel
from torch.utils.data import TensorDataset, DataLoader
from sklearn.model_selection import train_test_split
from transformers import get_linear_schedule_with_warmup

def _read_split(path):
    """Read one ``label<TAB>text`` file and return ``(texts, labels)``.

    Blank lines are skipped. The text field is kept exactly as stored
    (including a trailing newline when it is the last field of the line).

    Raises:
        ValueError: if a non-blank line has no tab separator.
    """
    texts, labels = [], []
    with open(path, 'r') as f:
        for line_no, line in enumerate(f, start=1):
            if not line.strip():
                continue  # tolerate empty / trailing blank lines
            fields = line.split('\t')
            if len(fields) < 2:
                raise ValueError(f'{path}:{line_no}: expected "label<TAB>text"')
            labels.append(fields[0])
            texts.append(fields[1])
    return texts, labels


def data(dataset_name, data_dir=None):
    """Load the train/dev/test splits of a supported dataset.

    Args:
        dataset_name: one of '20ng', 'r8', 'r52', 'oh', 'mr'.
        data_dir: directory that contains one sub-directory per dataset.
            Defaults to the Google Drive location used by this notebook.

    Returns:
        ``(train_texts, train_labels, dev_texts, dev_labels, test_texts,
        test_labels, label2idx)`` where ``label2idx`` maps every label seen in
        the training split to an integer index.

    Raises:
        ValueError: if ``dataset_name`` is not supported or a line is malformed.
    """
    NAME = dataset_name
    if NAME not in ['20ng', 'r8', 'r52', 'oh', 'mr']:
        raise ValueError('The dataset is not support')

    if data_dir is None:
        data_dir = '/content/drive/MyDrive/Colab_Notebooks/CODE/TextLevelGNN/data/'
    PATH = os.path.join(data_dir, NAME)

    train_texts, train_labels = _read_split(os.path.join(PATH, NAME+'-train-stemmed.txt'))
    dev_texts, dev_labels = _read_split(os.path.join(PATH, NAME+'-dev-stemmed.txt'))
    test_texts, test_labels = _read_split(os.path.join(PATH, NAME+'-test-stemmed.txt'))

    # Sort the labels: iterating a set of strings yields a different order in
    # every Python process, which would make the label -> index mapping (and
    # therefore any saved checkpoint) differ from run to run.
    target_names = sorted(set(train_labels))
    label2idx = {label: idx for idx, label in enumerate(target_names)}

    print("Train size:", len(train_texts))
    print("Dev size:", len(dev_texts))
    print("Test size:", len(test_texts))
    print(f'labels: {label2idx}')

    return train_texts,  train_labels, dev_texts, dev_labels, test_texts, test_labels, label2idx

def batch(inputs, y, batch_size, shuffle=True):
  """Wrap tokenized inputs and their labels in a ``DataLoader``.

  Args:
      inputs: object exposing ``input_ids`` and ``attention_mask`` tensors.
      y: label tensor aligned with ``inputs`` along the first dimension.
      batch_size: number of examples per batch.
      shuffle: whether the loader reshuffles the examples on each pass.

  Returns:
      A ``DataLoader`` yielding ``(input_ids, attention_mask, y)`` batches.
  """
  dataset = TensorDataset(inputs.input_ids, inputs.attention_mask, y)
  return DataLoader(dataset, batch_size=batch_size, shuffle=shuffle)

class BERTClass(torch.nn.Module):
    """Pretrained encoder followed by a small classification head.

    The head is ``Linear -> tanh -> Dropout(0.4) -> Linear`` applied to the
    encoder output at sequence position 0.

    Args:
        num_class: number of output classes (size of the logits).
        MODEL: model name or path passed to ``AutoModel.from_pretrained``.
    """

    def __init__(self, num_class, MODEL):
        super().__init__()
        self.l1 = AutoModel.from_pretrained(MODEL)
        # Size the head from the encoder instead of assuming a "base" model;
        # fall back to 768 when the config does not expose `hidden_size`.
        hidden_size = getattr(self.l1.config, 'hidden_size', 768)
        self.pre_classifier = torch.nn.Linear(hidden_size, hidden_size)
        self.dropout = torch.nn.Dropout(0.4)
        self.classifier = torch.nn.Linear(hidden_size, num_class)

    def forward(self, input_ids, attention_mask):
        """Return the classification logits for a batch.

        Args:
            input_ids: token id tensor forwarded to the encoder.
            attention_mask: attention mask tensor forwarded to the encoder.

        Returns:
            The output of the final linear layer (one row per example).
        """
        encoder_out = self.l1(input_ids=input_ids, attention_mask=attention_mask)
        # First element of the encoder output; presumably the last hidden
        # state of shape (batch, seq, hidden) for BERT/RoBERTa encoders.
        hidden_state = encoder_out[0]
        # Keep only sequence position 0 as the sentence representation.
        pooled = hidden_state[:, 0]
        pooled = torch.tanh(self.pre_classifier(pooled))
        pooled = self.dropout(pooled)
        return self.classifier(pooled)

def train(model, tr_inputs, dev_inputs, epoch, batch_size, learning_rate, data_name, model_name):
    """Fine-tune ``model`` with Adam, linear warmup/decay and early stopping.

    The model is evaluated on ``dev_inputs`` every second epoch. Whenever the
    dev accuracy improves, the whole model is saved to Google Drive. Training
    stops early once ``PATIENCE`` epochs pass without improvement.

    Args:
        model: the network to train; called as ``model(input_ids, attention_mask)``.
        tr_inputs: ``DataLoader`` yielding ``(input_ids, attention_mask, y)``.
        dev_inputs: ``DataLoader`` passed to ``dev`` for validation.
        epoch: maximum number of epochs.
        batch_size: kept for interface compatibility; the number of steps is
            taken from ``len(tr_inputs)``.
        learning_rate: peak learning rate for Adam.
        data_name: dataset name, used in the checkpoint file name.
        model_name: model name, used in the checkpoint file name.

    Returns:
        The model in its state after the last executed epoch. NOTE: this is
        not necessarily the best checkpoint; the best one is the file saved
        on disk.

    Raises:
        ValueError: if ``tr_inputs`` yields no batches.
    """
    if len(tr_inputs) == 0:
        raise ValueError('training DataLoader is empty')

    # One scheduler step is taken per batch, so the schedule length is the
    # number of batches (the last partial batch included) times the epochs.
    num_train_steps = len(tr_inputs) * epoch
    num_warmup_steps = int(0.15 * num_train_steps)

    # Move batches to wherever the model lives instead of relying on a
    # module-level `device` variable.
    target_device = next(model.parameters()).device

    loss_func = torch.nn.CrossEntropyLoss()
    optimizer = torch.optim.Adam(model.parameters(), weight_decay=1e-3, lr=learning_rate)
    scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=num_warmup_steps, num_training_steps = num_train_steps)

    # Hub ids such as "org/name" would otherwise point into a missing folder.
    safe_model_name = model_name.replace('/', '_')
    checkpoint_path = f'/content/drive/MyDrive/Colab_Notebooks/CODE/TextLevelGNN/model/{safe_model_name}_{data_name}.pkl'

    best_acc = 0.0
    no_improve = 0  # Epochs elapsed without improvement on the dev set
    PATIENCE = 8  # Patience on dev set to finish training
    for e in range(epoch):
        improved = ''
        model.train()

        for ba in tr_inputs:
            input_ids, attention_mask, y = (t.to(target_device) for t in ba)

            outputs = model(input_ids, attention_mask)
            loss = loss_func(outputs, y)

            loss.backward()

            optimizer.step()
            scheduler.step()
            optimizer.zero_grad()

        # Validate every second epoch only.
        if e % 2 == 0:
            val_acc = float(dev(model, dev_inputs))
            if val_acc > best_acc:
                best_acc = val_acc
                no_improve = 0
                improved = '*'
                torch.save(model, checkpoint_path)
            else:
                no_improve += 2  # two epochs pass between evaluations
            if no_improve >= PATIENCE:
                print('No improvement on development set. Early stop training.')
                break

            # `loss` is the loss of the last training batch of this epoch.
            print(f'Epoch:{e}, train loss:{loss.item():.6f}, val acc: {val_acc:.4f}, {improved}')

    return model

def dev(model, dev_inputs):
  """Compute classification accuracy of ``model`` on the development set.

  Puts the model in eval mode and runs it without gradient tracking.

  Args:
      model: network called as ``model(input_ids, attention_mask)``; its
          output is arg-maxed over dimension 1 to obtain predictions.
      dev_inputs: iterable yielding ``(input_ids, attention_mask, y)`` batches.

  Returns:
      A 0-dim float tensor with the fraction of correct predictions
      (``0.0`` when ``dev_inputs`` yields no examples).
  """
  model.eval()

  # Use the device the model lives on rather than a module-level `device`.
  first_param = next(model.parameters(), None)
  target_device = first_param.device if first_param is not None else torch.device('cpu')

  correct = torch.zeros((), device=target_device)
  total_pred = 0

  with torch.no_grad():
    for ba in dev_inputs:
      input_ids, attention_mask, y = (t.to(target_device) for t in ba)

      logits = model(input_ids, attention_mask)
      pred = torch.argmax(logits, dim=1)

      correct += torch.sum(pred == y)
      total_pred += len(y)

  if total_pred == 0:
    return correct  # nothing evaluated: report 0 instead of dividing by zero

  return torch.div(correct, total_pred)  # Acc on dev set

def test(model, te_inputs):
  model.eval()

  total_pred = 0.0
  correct = 0.0

  for s, ba in enumerate(te_inputs):
    b = tuple(t.to(device) for t in ba)
    input_ids, attention_mask, y = b

    with torch.no_grad():
      logits = model(input_ids, attention_mask)
      pred = torch.argmax(logits, dim=1)

      correct_pred = torch.sum(pred==y)
      correct += correct_pred
      total_pred += len(y)

  return torch.div(correct, total_pred)  # Test set acc.

# ---- Run configuration ------------------------------------------------------
SEED = 42
BATCH_SIZE = 64
LEARNING_RATE = 1e-04
SUPPORTED_DATASETS = ['20ng', 'r8', 'r52', 'oh', 'mr']

# Kept at module level on purpose: the training/evaluation helpers in this
# file read the global name `device`.
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')


def parse_args(argv=None):
    """Parse the command-line options of the script.

    ``--max_len`` is mandatory; the other options fall back to their defaults
    (previously they were marked as required, which made the defaults dead).

    Args:
        argv: argument list to parse; ``None`` means ``sys.argv[1:]``.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument('--dataset', type=str, default='mr', choices=SUPPORTED_DATASETS, help='dataset name')
    parser.add_argument('--max_len', required=True, type=int, help='maximum number of tokens per example')
    parser.add_argument('--epoch', type=int, default=50, help='maximum number of training epochs')
    parser.add_argument('--model', type=str, default='bert-base-uncased', help="e.g. 'bert-base-uncased' or 'roberta-base'")
    return parser.parse_args(argv)


def main(argv=None):
    """Tokenize the dataset, fine-tune the classifier and print test accuracy."""
    args = parse_args(argv)

    # Seed every random number generator used during training.
    torch.manual_seed(SEED)
    torch.cuda.manual_seed_all(SEED)
    np.random.seed(SEED)

    MAX_LEN = args.max_len
    EPOCH = args.epoch
    MODEL = args.model  # 'roberta-base',  'bert-base-uncased',
    tokenizer = AutoTokenizer.from_pretrained(MODEL)

    t_texts, t_labels, d_texts, d_labels, te_texts, te_labels, label2idx = data(args.dataset)

    def encode_labels(labels, split):
        """Map label strings to an index tensor, failing clearly on unseen labels."""
        unknown = sorted(set(labels) - set(label2idx))
        if unknown:
            raise ValueError(f'{split} split contains labels missing from the training split: {unknown}')
        return torch.tensor([label2idx[t] for t in labels])

    tr_y = encode_labels(t_labels, 'train')
    d_y = encode_labels(d_labels, 'dev')
    te_y = encode_labels(te_labels, 'test')

    train_inputs = tokenizer(t_texts, max_length=MAX_LEN, truncation=True, padding=True, return_tensors='pt')
    dev_inputs = tokenizer(d_texts, max_length=MAX_LEN, truncation=True, padding=True, return_tensors='pt')
    test_inputs = tokenizer(te_texts, max_length=MAX_LEN, truncation=True, padding=True, return_tensors='pt')
    print(train_inputs.keys())

    inputs_tr = batch(train_inputs, tr_y, BATCH_SIZE, shuffle=True)  # Batching for training
    inputs_dev = batch(dev_inputs, d_y, BATCH_SIZE, shuffle=False)
    inputs_te = batch(test_inputs, te_y, BATCH_SIZE, shuffle=False)

    model = BERTClass(len(label2idx), MODEL)
    model.to(device)

    # NOTE(review): `train` returns the model as it is after the last epoch,
    # so the accuracy below is not necessarily that of the best dev checkpoint.
    final_model = train(model, inputs_tr, inputs_dev, epoch=EPOCH, batch_size=BATCH_SIZE, learning_rate=LEARNING_RATE, data_name=args.dataset, model_name=MODEL)

    res = test(final_model, inputs_te)
    print(f'Test accuracy: {float(res):.4f}')


# Guard the entry point so importing this module does not start a training run.
if __name__ == '__main__':
    main()

Overwriting parsing.py


# Run

Set `--model` to `'bert-base-uncased'` or `'roberta-base'`.

In [None]:
!python parsing.py --dataset='oh' --max_len=100 --epoch=50 --model='roberta-base'

Train size: 3021
Dev size: 336
Test size: 4043
labels: {'C23': 0, 'C18': 1, 'C20': 2, 'C17': 3, 'C22': 4, 'C01': 5, 'C05': 6, 'C08': 7, 'C16': 8, 'C09': 9, 'C19': 10, 'C03': 11, 'C15': 12, 'C21': 13, 'C07': 14, 'C11': 15, 'C06': 16, 'C04': 17, 'C14': 18, 'C13': 19, 'C02': 20, 'C12': 21, 'C10': 22}
dict_keys(['input_ids', 'attention_mask'])
Some weights of the model checkpoint at roberta-base were not used when initializing RobertaModel: ['lm_head.dense.weight', 'lm_head.decoder.weight', 'lm_head.layer_norm.weight', 'lm_head.bias', 'lm_head.layer_norm.bias', 'lm_head.dense.bias']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model 

In [None]:
# Scratch check: for positive operands, truncating a true division gives the
# same integer as floor division.
a = 5 // 2
a

2