## Imports and configs

In [1]:
from tqdm import tqdm
import pandas as pd
import numpy as np
import os

from transformers import AutoTokenizer, AutoModelForTokenClassification, AutoConfig
from torch.utils.data import Dataset, DataLoader
import torch

In [2]:
from torch import cuda

# Training hyperparameters and runtime settings, gathered in a single dictionary.
config = dict(
    model_name='google/bigbird-roberta-base',
    max_length=1024,
    train_batch_size=4,
    valid_batch_size=4,
    epochs=5,
    # Five values, the same count as `epochs` (presumably one learning rate per epoch).
    learning_rates=[2.5e-5, 2.5e-5, 2.5e-6, 2.5e-6, 2.5e-7],
    max_grad_norm=10,
)

# Device preference order: CUDA, then Apple Metal (MPS), then plain CPU.
if cuda.is_available():
    config['device'] = 'cuda'
elif torch.backends.mps.is_available():
    config['device'] = 'mps'
else:
    config['device'] = 'cpu'

print(f"Training on {config['device']}")

Training on cuda


In [3]:
# Load the preprocessed essays; the labels column is stored as text and parsed later by the dataset.
full_df = pd.read_csv(filepath_or_buffer='../dataset.csv')
full_df.head(n=5)

Unnamed: 0,id,content,labels
0,73DC1D49FAD5,eletoral college can be a very good thing caus...,[ 3 4 4 4 4 4 4 4 4 4 4 4 4 4 4 ...
1,D840AC3957E5,"STUDENT_NAME\n\nADDRESS_NAME\n\nFebruary 22, 2...",[ 0 0 0 0 0 0 0 3 4 4 4 4 4 4 4 ...
2,753E320B186B,In my opinion as a student: I don't agree at t...,[1 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2...
3,C2ABDAC2BC2C,When it comes to at home learning and attendin...,[ 3 4 4 4 4 4 4 4 4 4 4 4 4 4 4 ...
4,B2DDBAAC084C,Y\n\nou can ask many different people for advi...,[ 3 4 4 4 4 4 4 4 4 4 4 4 4 4 4 ...


## Labels

Because we have 15k+ essays, each with hundreds of labeled words, we store the labels as integers rather than strings to drastically decrease disk and memory usage (see `preprocessing.ipynb` for how this was done). Additionally, we would have needed to do this for training regardless to build the one-hot arrays. These can be easily converted to/from their original string labels with the dictionaries below.

In [4]:
# Integer id -> string label. The position in the list below IS the id, so the order must not change.
# NOTE(review): 'Unnanotated' is misspelled but is a runtime value; it is kept as-is on purpose.
id2label = dict(enumerate([
    'Unnanotated',
    'B-Lead', 'I-Lead',
    'B-Position', 'I-Position',
    'B-Evidence', 'I-Evidence',
    'B-Claim', 'I-Claim',
    'B-Concluding_Statement', 'I-Concluding_Statement',
    'B-Counterclaim', 'I-Counterclaim',
    'B-Rebuttal', 'I-Rebuttal',
]))

# String label -> integer id, derived from the mapping above so that the two always agree.
label2id = {label: idx for idx, label in id2label.items()}

## Dataset

Since we're using the PyTorch backend, it is convenient to define a torch `Dataset` (and later a `DataLoader`), so that the model can easily consume the data without depending on how we chose to store it.

In [5]:
class EssayDataset(Dataset):
    """Torch dataset that tokenizes one essay per item and aligns word labels with tokens.

    Args:
        df: DataFrame with a `content` column (essay text) and, in training
            mode, a `labels` column holding one integer label per
            whitespace-separated word of the essay.
        tokenizer: Callable invoked with the list of words and
            `is_split_into_words=True`; its result must expose `word_ids()`
            and behave like a mapping (presumably a Hugging Face fast tokenizer).
        max_len: Length every encoding is padded/truncated to.
        get_word_ids: False for training (items carry `labels`), True for
            validation (items carry `word_ids` instead).
    """

    def __init__(self, df, tokenizer, max_len, get_word_ids):
        self.len = len(df)
        self.df = df
        self.tokenizer = tokenizer
        self.max_len = max_len
        self.get_word_ids = get_word_ids  # For validation.

    @staticmethod
    def _parse_word_labels(raw_labels):
        """Convert a stored label value into a list of ints.

        Accepts either a string such as "[ 3  4  4]" / "3 4 4" (brackets are
        optional) or any iterable of integer-like values.

        Raises:
            ValueError: if a piece of the string is not an integer.
        """
        if isinstance(raw_labels, str):
            # Labels read back from the CSV are array-like strings ("[ 3  4  4 ...]"):
            # the brackets must be dropped, otherwise int('[') fails.
            raw_labels = raw_labels.replace('[', ' ').replace(']', ' ').split()
        return [int(label) for label in raw_labels]

    def __getitem__(self, index):
        """Return the tensors of the essay at *position* `index`.

        Returns:
            dict mapping every key of the tokenizer output to a tensor, plus
            `labels` (training mode; -100 on special tokens) or `word_ids`
            (validation mode; -1 on special tokens).
        """
        # Positional access: works whether or not the frame's index was reset.
        # Split essay by words before tokenizing.
        essay_words = self.df['content'].iloc[index].split()

        encoding = self.tokenizer(essay_words,
                                  is_split_into_words=True,  # Necessary to keep correspondence between words and labels constructed previously.
                                  padding='max_length',
                                  truncation=True,
                                  max_length=self.max_len)

        # From the tokenizer's docs about word_ids:
        #     A list indicating the word corresponding to each token. Special tokens added by
        #     the tokenizer are mapped to None and other tokens are mapped to the index of their
        #     corresponding word (several tokens will be mapped to the same word index if they
        #     are parts of that word).
        # This is needed to match the correct labels with the tokens, which may not have a
        # 1:1 correspondence with the original words.
        word_ids = encoding.word_ids()

        if not self.get_word_ids:
            # Training: every token inherits the label of the word it belongs to.
            word_labels = self._parse_word_labels(self.df['labels'].iloc[index])
            # `None` marks a special/reserved token; -100 is the value ignored by
            # CrossEntropyLoss, so those positions do not contribute to the loss.
            encoding['labels'] = [-100 if word_idx is None else word_labels[word_idx]
                                  for word_idx in word_ids]
        else:
            # Validation: labels are being predicted, so only the token -> word
            # correspondence is needed. -1 replaces None so it fits in a tensor.
            encoding['word_ids'] = [-1 if word_idx is None else word_idx
                                    for word_idx in word_ids]

        return {key: torch.as_tensor(value) for key, value in encoding.items()}

    def __len__(self):
        """Number of essays in the dataset."""
        return self.len

### Create training and validation datasets + DataLoaders

We will use an 85%/15% split between the training and validation sets.

In [6]:
# Fraction of the essays held out for validation.
validation_split_size = 0.15
# Arbitrary fixed seed: keeps the split identical from one run to the next.
dataset_split_seed = 33

#### Split dataset into training and validation

In [7]:
from sklearn.model_selection import train_test_split

train_df, val_df = train_test_split(
    full_df,
    test_size=validation_split_size,
    random_state=dataset_split_seed,
)

# Training only needs the text and its labels, so the id column is dropped there.
# Both frames get a fresh 0..n-1 index.
train_df = train_df.loc[:, ['content', 'labels']].reset_index(drop=True)
val_df = val_df.reset_index(drop=True)

print("train_df shape: ", train_df.shape)
print("val_df shape: ", val_df.shape)

train_df shape:  (13254, 2)
val_df shape:  (2340, 3)


#### Download/cache all models necessary

In [15]:
model_save_path = './model/'

# Create the local cache directory; `exist_ok=True` keeps the cell safe to re-run.
os.makedirs(model_save_path, exist_ok=True)

# Single source of truth for the checkpoint name (from Huggingface's ModelHub).
model_name = config['model_name']

tokenizer = AutoTokenizer.from_pretrained(model_name, add_prefix_space=True)
tokenizer.save_pretrained(model_save_path)

# The label mappings belong to the model configuration (not the tokenizer): storing them
# here means the saved model carries the real label names.
config_model = AutoConfig.from_pretrained(model_name)
config_model.num_labels = len(label2id)
config_model.id2label = id2label
config_model.label2id = label2id
config_model.save_pretrained(model_save_path)

# The token-classification head is newly initialized for `num_labels` classes and must be trained.
backbone = AutoModelForTokenClassification.from_pretrained(model_name,
                                                           config=config_model)
backbone.save_pretrained(model_save_path)

Some weights of BigBirdForTokenClassification were not initialized from the model checkpoint at google/bigbird-roberta-base and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


#### Initialize Dataset and tokenizer

In [10]:
# Reload the tokenizer from the local cache directory (`model_save_path`) instead of a
# second hard-coded copy of the path.
tokenizer = AutoTokenizer.from_pretrained(model_save_path)

# Training items carry token-level labels; validation items carry word ids instead.
training_set = EssayDataset(df=train_df, tokenizer=tokenizer, max_len=config['max_length'], get_word_ids=False)
testing_set = EssayDataset(df=val_df, tokenizer=tokenizer, max_len=config['max_length'], get_word_ids=True)

In [11]:
# Keyword arguments for the training DataLoader (shuffled).
# NOTE(review): with num_workers > 0 the dataset must be usable from worker processes; confirm this works on the target platform.
train_params = dict(
    batch_size=config['train_batch_size'],
    shuffle=True,
    num_workers=2,
)

# Keyword arguments for the validation DataLoader (order preserved).
validation_params = dict(
    batch_size=config['valid_batch_size'],
    shuffle=False,
    num_workers=2,
)

#### Instantiate Dataloader (from torch)

In [12]:
# Wrap each dataset in a DataLoader using the parameter dictionaries defined above.
training_loader = DataLoader(dataset=training_set, **train_params)
validation_loader = DataLoader(dataset=testing_set, **validation_params)

Optionally save the DataLoaders so that they can be loaded quickly later (the calls below are commented out; uncomment them to enable saving).

In [13]:
# torch.save(training_loader, 'training_loader.pth')
# torch.save(validation_loader, 'validation_loader.pth')