## Imports and configs

In [1]:
from tqdm import tqdm
import pandas as pd
import numpy as np
import os

from transformers import AutoTokenizer, AutoModelForTokenClassification, AutoConfig
from torch.utils.data import Dataset, DataLoader
import torch

In [2]:
config = {'model_name': 'google/bigbird-roberta-base', # From Huggingface's ModelHub.
          'model_save_path': './model/',
          'model_chkpt_path': './model/model_chkpt/',
          'max_length': 1024,
          'train_batch_size': 4,
          'valid_batch_size': 4,
          'epochs':5,
          'learning_rates': [2.5e-5, 2.5e-5, 2.5e-6, 2.5e-6, 2.5e-7],
          'max_grad_norm': 10,
          'device': 'cuda' if torch.cuda.is_available() else 'mps' if torch.backends.mps.is_available() else 'cpu'} # Added support for Apple Metal acceleration.

print(f"Training on {config['device']}")

Training on cuda


In [3]:
full_df = pd.read_pickle('../dataset.csv')
full_df.head()

Unnamed: 0_level_0,content,labels
id,Unnamed: 1_level_1,Unnamed: 2_level_1
73DC1D49FAD5,eletoral college can be a very good thing caus...,"[3, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, ..."
D840AC3957E5,"STUDENT_NAME\n\nADDRESS_NAME\n\nFebruary 22, 2...","[0, 0, 0, 0, 0, 0, 0, 3, 4, 4, 4, 4, 4, 4, 4, ..."
753E320B186B,In my opinion as a student: I don't agree at t...,"[1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, ..."
C2ABDAC2BC2C,When it comes to at home learning and attendin...,"[3, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, ..."
B2DDBAAC084C,Y\n\nou can ask many different people for advi...,"[3, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, ..."


## Labels

Because we have 15k+ essays, each with hundreds of labeled words, we store the labels as integers rather than strings to drastically decrease disk and memory usage (see `preprocessing.ipynb` for how this was done). Additionally, we would have needed to do this for training regardless to build the one-hot arrays. These can be easily converted to/from their original string labels with the dictionaries below.

In [4]:
id2label = {
  0:  'Unnanotated',
  1:  'B-Lead',
  2:  'I-Lead',
  3:  'B-Position',
  4:  'I-Position',
  5:  'B-Evidence',
  6:  'I-Evidence',
  7:  'B-Claim',
  8:  'I-Claim',
  9:  'B-Concluding_Statement',
  10: 'I-Concluding_Statement',
  11: 'B-Counterclaim',
  12: 'I-Counterclaim',
  13: 'B-Rebuttal',
  14: 'I-Rebuttal'
}

label2id = {
  'Unnanotated': 0,
  'B-Lead': 1,
  'I-Lead': 2,
  'B-Position': 3,
  'I-Position': 4,
  'B-Evidence': 5,
  'I-Evidence': 6,
  'B-Claim': 7,
  'I-Claim': 8,
  'B-Concluding_Statement': 9,
  'I-Concluding_Statement': 10,
  'B-Counterclaim': 11,
  'I-Counterclaim': 12,
  'B-Rebuttal': 13,
  'I-Rebuttal': 14
}

## Dataset

Since we're using the PyTorch backend, it is convenient to define the torch Dataset (and later the Dataloader), so that the model can easily intake the data without any dependencies on how the we chose to store the data.

In [5]:
class EssayDataset(Dataset):
  def __init__(self, df, tokenizer, max_len, get_word_ids):
        self.len = len(df)
        self.df = df
        self.tokenizer = tokenizer
        self.max_len = max_len
        self.get_word_ids = get_word_ids # For validation.

  def __getitem__(self, index):
        essay_words = self.df.content[index].split() # Split essay by words before tokenizing.

        # Makes a dict with keys: input_ids, attention_mask.
        encoding = self.tokenizer(essay_words,
                             is_split_into_words=True, # Necessary to keep correspondance between words and labels contructed previously.
                             padding='max_length',
                             truncation=True,
                             max_length=self.max_len)
        """
        From Tokenizer's docs about word_ids:
            A list indicating the word corresponding to each token. Special tokens added by the tokenizer are mapped to None and other tokens
            are mapped to the index of their corresponding word (several tokens will be mapped to the same word index if they are parts of that word).

        This is needed to match the correct labels with the tokens, which may not have a 1:1 correspondence with the original words.
        """
        word_ids = encoding.word_ids()
        word_labels = None

        # If we're training, we want to know the labels corresponding to each word_id.
        if not self.get_word_ids:
            # Get original word label array for this essay.
            word_labels = self.df.labels[index] #[int(label) for label in self.df.labels[index].split()]
            label_ids = []

            # Correct for tokenization mismatch.
            for word_idx in word_ids:
                # 'None' means that this is a special/reserved token, mark as -100 to be ignored later in training.
                if word_idx is None:
                    label_ids.append(-100) # Magic number, automatically ignored by CrossEntropyLoss.
                else:
                    label_ids.append(word_labels[word_idx])

            encoding['labels'] = label_ids

        # Otherwise, it does not matter since we are predicting the labels, and we jus need to know the token-id correspondence for label attribution.
        else:
            word_ids2 = [w if w is not None else -1 for w in word_ids]
            encoding['word_ids'] = torch.as_tensor(word_ids2)

        item = {k: torch.as_tensor(v) for k, v in encoding.items()}

        # if self.get_word_ids:
        #     word_ids2 = [w if w is not None else -1 for w in word_ids]
        #     item['word_ids'] = torch.as_tensor(word_ids2)

        return item

  def __len__(self):
        return self.len

### Create training and validation datasets + DataLoaders

We will use a 85%/15% split between training and validation sets.

In [6]:
validation_split_size = 0.15
dataset_split_seed = 33 # Any seed, here for inter-run consistency.

#### Split dataset into training and validation

In [7]:
from sklearn.model_selection import train_test_split

train_df, val_df = train_test_split(full_df, test_size=validation_split_size, random_state=dataset_split_seed)

# Drop id column from train dataframe
train_df = train_df[['content', 'labels']]

train_df.reset_index(drop=True, inplace=True)
val_df.reset_index(drop=True, inplace=True)

print("train_df shape: ", train_df.shape)
print("val_df shape: ", val_df.shape)

train_df shape:  (13254, 2)
val_df shape:  (2340, 2)


#### Download/cache all models necessary

In [8]:
if not os.path.exists(config['model_save_path']):
    os.mkdir(config['model_save_path'])

AutoTokenizer.from_pretrained(config['model_name'], add_prefix_space=True, id2label=id2label).save_pretrained(config['model_save_path'])

config_model = AutoConfig.from_pretrained(config['model_name']) 
config_model.num_labels = len(label2id)
config_model.save_pretrained(config['model_save_path'])

AutoModelForTokenClassification.from_pretrained(config['model_name'], 
                                                           config=config_model).save_pretrained(config['model_save_path'])

del config_model

Some weights of BigBirdForTokenClassification were not initialized from the model checkpoint at google/bigbird-roberta-base and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


#### Initialize Dataset, Dataloader, and tokenizer

In [9]:
train_params = {'batch_size': config['train_batch_size'],
                'shuffle': True,
                # 'num_workers': 2,
                }

validation_params = {'batch_size': config['valid_batch_size'],
               'shuffle': False,
            #    'num_workers': 2,
               }

In [10]:
tokenizer = AutoTokenizer.from_pretrained(config['model_save_path'])

training_set = EssayDataset(df=train_df, tokenizer=tokenizer, max_len=config['max_length'], get_word_ids=False)
training_loader = DataLoader(training_set, **train_params)

testing_set = EssayDataset(df=val_df, tokenizer=tokenizer, max_len=config['max_length'], get_word_ids=True)
validation_loader = DataLoader(testing_set, **validation_params)

In [19]:
training_set[0]['labels'].view(-1)

tensor([-100,    1,    2,  ..., -100, -100, -100])

Save DataLoaders to be able to quickly load them later.

In [12]:
# torch.save(training_loader, 'training_loader.pth')
# torch.save(validation_loader, 'validation_loader.pth')

### Initialize all necesary models/objects

In [13]:
config_model = AutoConfig.from_pretrained(f"{config['model_save_path']}config.json") 
model = AutoModelForTokenClassification.from_pretrained(
                   f"{config['model_save_path']}model.safetensors",config=config_model).to(config['device'])
optimizer = torch.optim.Adam(params=model.parameters(), lr=config['learning_rates'][0])

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Av

### Training loop

In [14]:
from sklearn.metrics import accuracy_score

def train_one_epoch(epoch, verbose=False):
    # Since we're batching, and the last batch may not be the full size,
    # keep track of precise # steps and # seen examples for metrics computations.
    epoch_loss = 0
    epoch_accuracy = 0
    num_train_examples = 0
    num_train_steps = 0
    
    # Set model to training mode (torch).
    model.train()
    
    for idx, batch in tqdm(enumerate(training_loader), total=len(training_loader)): # TODO: Customize tqdm for better displaying.
        
        # The dictionary returned by trainin_loader has 3 keys: input_ids, attention_mask (both made by the Tokenizer),
        # and labels (which we add in the Dataset abstraction, since this is a training run).
        input_ids = batch['input_ids'].to(config['device'], dtype=torch.long)
        attention_mask = batch['attention_mask'].to(config['device'], dtype=torch.long)
        labels = batch['labels'].to(config['device'], dtype=torch.long)

        # Run batch through model.
        loss, train_logits = model(input_ids=input_ids,
                                   attention_mask=attention_mask,
                                   labels=labels,
                                   return_dict=False)
        
        # Increment epoch loss by this batch's loss.
        epoch_loss += loss.item()

        # Increment counters.
        num_train_steps += 1
        num_train_examples += labels.size(0)
        
        # Debugging.
        if verbose and (idx % 100 == 0):
            print(f"Idx: {idx:04d}, step loss: {epoch_loss/num_train_steps}")
           
        # Compute training accuracy
        flattened_logits = train_logits.view(-1, model.num_labels) # (batch_size, sequence_length, num_labels) -> (batch_size * seq_length, num_labels)
        flattened_predictions = torch.argmax(flattened_logits, axis=1) # Find predicted label.
        label_mask = labels.view(-1) != -100 # If our label is -100 (as sdiscussed above), it should be ignored as it is a special token.
        
        # Mask both predictions and ground truth.
        labels = torch.masked_select(input=labels.view(-1), mask=label_mask)
        predictions = torch.masked_select(input=flattened_predictions, mask=label_mask)

        # Use accuracy function from sklear.
        epoch_accuracy += accuracy_score(labels.cpu().numpy(), predictions.cpu().numpy())
    
        # Gradient clipping.
        torch.nn.utils.clip_grad_norm_(parameters=model.parameters(), max_norm=config['max_grad_norm'])
        
        # Finally, optimize.
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        # Model and dataset is pretty big, so alleviate some GPU memory usage by releasing unused cached data at every batch.
        torch.cuda.empty_cache()

    # Normalize loss and accuracy by num steps.
    epoch_loss /= num_train_steps
    epoch_accuracy /= num_train_steps

    print(f"Training loss epoch: {epoch_loss}")
    print(f"Training accuracy epoch: {epoch_accuracy}")

#### Finally, train model

In [16]:
if not os.path.exists(config['model_chkpt_path']):
    os.mkdir(config['model_chkpt_path'])

for epoch in range(config['epochs']):
    train_one_epoch(epoch)
    torch.cuda.empty_cache()

    torch.save(model.state_dict(), f"{config['model_chkpt_path']}{config['model_name'].replace('/', '-')}-maxlen={config['max_length']}-batchsize={config['train_batch_size']}-lr={str(config['learning_rates'])}-maxgrad={config['max_grad_norm']}-epoch={epoch}")

100%|██████████| 3314/3314 [36:59<00:00,  1.49it/s]


Training loss epoch: 0.7546362035790336
Training accuracy epoch: 0.7529057776726666


100%|██████████| 3314/3314 [36:53<00:00,  1.50it/s]


Training loss epoch: 0.5935034313580324
Training accuracy epoch: 0.7960994502283752


100%|██████████| 3314/3314 [36:56<00:00,  1.49it/s]


Training loss epoch: 0.5060606728508555
Training accuracy epoch: 0.824042095085337


100%|██████████| 3314/3314 [36:54<00:00,  1.50it/s]


Training loss epoch: 0.4273301211821436
Training accuracy epoch: 0.8491220668471002


100%|██████████| 3314/3314 [36:56<00:00,  1.50it/s]


Training loss epoch: 0.3547257536580459
Training accuracy epoch: 0.8746208288345277


In [None]:
torch.cuda.empty_cache()