In [15]:
print("[INFO]: Importing libraries.")
from pathlib import Path
import pandas as pd
import numpy as np
import math
import pickle

import torch
import torch.nn as nn

from transformers import (
    AutoModelForSequenceClassification, AutoConfig, 
    AutoTokenizer,
    AdamW, get_linear_schedule_with_warmup,
    DataCollatorWithPadding,
    EarlyStoppingCallback,
    IntervalStrategy,
    Trainer, TrainingArguments
)

from transformers.modeling_outputs import SequenceClassifierOutput

print("[INFO]: Defining custom classes and functions.")

# configuration
CONFIG = {
    "model_name": "distilbert-base-uncased",
    "device": 'cuda' if torch.cuda.is_available() else 'cpu',
    "max_length": 512,
    "train_batch_size": 16, # dataset[train] is 739 long. That gives 93 batches/steps (last batch only has 3 data entries)
    "valid_batch_size": 158, # 16 originally
    "epochs": 2,
    "max_grad_norm": 1000,
    "weight_decay": 1e-6, # Btwn 0-0.1. "The higher the value, the less likely your model will overfit. However, if set too high, your model might not be powerful enough."
    "learning_rate": 2e-5,
    "loss_type": "rmse",
    "n_accumulate" : 1,
    "label_cols" : ['Coherence', 'Empathy', 'Surprise', 'Engagement', 'Complexity'],
    "early_stopping_patience": 2,
    "early_stopping_threshold": 0.001,
    "seed": 50 
    
}

def tokenize(examples):
    '''
    A function to be used with the map method of the dataset class. 
    Tokenizes the text and returns a dictionary of tensors.
    '''
    labels = examples['label']
    tokens = tokenizer(examples['text'], 
                       padding='max_length', 
                       truncation=True, 
                       max_length=CONFIG['max_length'], 
                       return_tensors='pt',
                       return_attention_mask=True)
    res = {
        'input_ids': tokens['input_ids'].to(CONFIG.get('device')).squeeze(),
        'attention_mask': tokens['attention_mask'].to(CONFIG.get('device')).squeeze(),
        'labels': torch.tensor(labels)
    }

    return res

def compute_metrics(eval_pred):
    '''
    A custom function that allows calculating the RMSE of each of the six metrics separately.
    '''
    predictions, labels = eval_pred
    colwise_rmse = np.sqrt(np.mean((labels - predictions) ** 2, axis=0))
    res = {
        f"{analytic.upper()}_RMSE" : colwise_rmse[i]
        for i, analytic in enumerate(CONFIG["label_cols"])
    }
    res["MCRMSE"] = np.mean(colwise_rmse)
    return res

class RegressionModel(nn.Module):
    '''
    A custom model that takes a pretrained model and adds a dropout layer and a linear layer on top.
    '''
    def __init__(self, model_name):
        super(RegressionModel, self).__init__()
        self.config = AutoConfig.from_pretrained(model_name)
        self.config.hidden_dropout_prob = 0
        self.config.attention_probs_dropout_prob = 0
        self.model = AutoModelForSequenceClassification.from_pretrained(model_name, config=self.config)
        self.drop = nn.Dropout(p=0.2)
        self.fc = nn.Linear(self.config.hidden_size, len(CONFIG['label_cols']))
        
    def forward(self, input_ids, attention_mask):
        out = self.model(input_ids=input_ids, # out should be of type SequenceClassifierOutput
                        attention_mask=attention_mask, 
                        output_hidden_states=True)
        cls_token = out.hidden_states[-1][:, 0, :].to(CONFIG.get('device'))
        out = self.drop(cls_token )
        outputs = self.fc(out) # outputs should be regression scores
        return SequenceClassifierOutput(logits=outputs)

class RMSELoss(nn.Module):
    """
    Defines the loss function to be fed into the CustomTrainer.
    Code taken from Y Nakama's notebook (https://www.kaggle.com/code/yasufuminakama/fb3-deberta-v3-base-baseline-train)
    """
    def __init__(self, eps=1e-9):
        super().__init__()
        self.mse = nn.MSELoss(reduction='mean')
        self.eps = eps

    def forward(self, predictions, targets):
        loss = torch.sqrt(self.mse(predictions, targets) + self.eps)
        return loss

class CustomTrainer(Trainer):
    '''
    A custom trainer class that overwrites the compute_loss method of Trainer to use RMSE loss
    '''
    def compute_loss(self, model, inputs, return_outputs=False):
        outputs = model(inputs['input_ids'], inputs['attention_mask']) # model outputs are of type SequenceClassifierOutput
        loss_func = RMSELoss()
        loss = loss_func(outputs.logits.float(), inputs['labels'].float()) # predictions, targets
        return (loss, outputs) if return_outputs else loss



if __name__ == '__main__':

    # paths
    data_path = "../story_eval_dataset_dict.pkl"
    models_path = "../models/run2"

    # check if models_path exists, if not create it
    #if not models_path.exists():
    #    models_path.mkdir(parents=True, exist_ok=True)

    # tokenize data
    with open(data_path, 'rb') as f:
        dataset = pickle.load(f)

    tokenizer = AutoTokenizer.from_pretrained(CONFIG['model_name'])

    print('[INFO]: Tokenizing data.')
    for split in dataset.keys():
        dataset[split] = dataset[split].map(tokenize)

    # calculate bactches per epoch
    bacthes_per_epoch = math.ceil(len(dataset['train'])/(CONFIG['train_batch_size'] * CONFIG['n_accumulate']))

    # define the training arguments
    training_args = TrainingArguments(
        output_dir=models_path,
        evaluation_strategy=IntervalStrategy.STEPS,
        save_strategy=IntervalStrategy.STEPS, # save checkpoint for each save_steps
        eval_steps=bacthes_per_epoch, # compute metrics after each epoch
        save_steps=bacthes_per_epoch,
        logging_steps=bacthes_per_epoch,
        logging_first_step=False,
        logging_dir=models_path,
        per_device_train_batch_size=CONFIG['train_batch_size'],
        per_device_eval_batch_size=CONFIG['valid_batch_size'],
        num_train_epochs=CONFIG['epochs'],
        learning_rate=CONFIG['learning_rate'],
        weight_decay=CONFIG['weight_decay'],
        gradient_accumulation_steps=CONFIG['n_accumulate'],
        use_cpu=True if CONFIG['device'] == 'cpu' else False,
        use_ipex=True if CONFIG['device'] == 'cpu' else False,
        bf16=True if CONFIG['device'] == 'cpu' else False,
        seed=CONFIG['seed'],
        group_by_length=True,
        max_grad_norm=CONFIG['max_grad_norm'],
        metric_for_best_model='eval_MCRMSE',
        load_best_model_at_end=True, # always save best checkpoint at end of training. May exceed save_total_limit if best and last model are different.
        greater_is_better=False,
        save_total_limit=1,
        label_names=["labels"] 
    )

    # data collator for dynamic padding
    collate_fn = DataCollatorWithPadding(tokenizer=tokenizer)

    # define early stopping criteria
    early_stop = EarlyStoppingCallback(early_stopping_patience = CONFIG['early_stopping_patience'], 
                                       early_stopping_threshold = CONFIG['early_stopping_threshold'])

    # init model
    model = RegressionModel(CONFIG['model_name'])
    model.to(CONFIG['device'])

    # count number of trainable params (total and in head)
    #total = sum(p.numel() for p in model.parameters() if p.requires_grad)
    #modelhead = nn.Linear(model.config.hidden_size, len(CONFIG['label_cols']))
    #head = sum(p.numel() for p in modelhead.parameters() if p.requires_grad)

    # SET THE OPITMIZER AND THE SCHEDULER
    # no decay for bias and normalization layers
    param_optimizer = list(model.named_parameters())
    no_decay = ["bias", "LayerNorm.weight"]
    optimizer_parameters = [
    {
            "params": [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], # get all the params except those in no_decay
            "weight_decay": CONFIG['weight_decay'],
    },
    {
            "params": [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], # get all the params that are in no_decay
            "weight_decay": 0.0,
    },
    ]
    optimizer = AdamW(optimizer_parameters, lr=CONFIG['learning_rate'])
    
    num_training_steps = bacthes_per_epoch * CONFIG['epochs']
    #num_training_steps = (len(dataset['train']) * CONFIG['epochs']) // (CONFIG['train_batch_size'] * CONFIG['n_accumulate'])
    scheduler = get_linear_schedule_with_warmup(
        optimizer,
        num_warmup_steps=0.1*num_training_steps,
        num_training_steps=num_training_steps
    )

    # init trainer
    trainer = CustomTrainer(
            model=model,
            args=training_args,
            train_dataset=dataset['train'],
            eval_dataset=dataset['validation'],
            data_collator=collate_fn,
            optimizers=(optimizer, scheduler),
            compute_metrics=compute_metrics,
            callbacks=[early_stop])
    
    # train
    print("[INFO]: Training model.")
    trainer.train()

[INFO]: Importing libraries.
[INFO]: Defining custom classes and functions.
[INFO]: Tokenizing data.


Map: 100%|██████████| 739/739 [00:00<00:00, 897.87 examples/s]
Map: 100%|██████████| 159/159 [00:00<00:00, 868.30 examples/s]
Map: 100%|██████████| 158/158 [00:00<00:00, 845.10 examples/s]
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.weight', 'pre_classifier.weight', 'pre_classifier.bias', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


[INFO]: Training model.


You're using a DistilBertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Step,Training Loss,Validation Loss


In [16]:
trainer.state.log_history

[{'loss': 1.1758,
  'learning_rate': 1.1111111111111113e-05,
  'epoch': 1.0,
  'step': 47},
 {'eval_loss': 0.6899921298027039,
  'eval_COHERENCE_RMSE': 0.7001960277557373,
  'eval_EMPATHY_RMSE': 0.6809112429618835,
  'eval_SURPRISE_RMSE': 0.6896708607673645,
  'eval_ENGAGEMENT_RMSE': 0.6782934665679932,
  'eval_COMPLEXITY_RMSE': 0.7005736827850342,
  'eval_MCRMSE': 0.6899290680885315,
  'eval_runtime': 19.4441,
  'eval_samples_per_second': 8.126,
  'eval_steps_per_second': 0.051,
  'epoch': 1.0,
  'step': 47},
 {'loss': 0.6679, 'learning_rate': 0.0, 'epoch': 2.0, 'step': 94},
 {'eval_loss': 0.6900824904441833,
  'eval_COHERENCE_RMSE': 0.6998785138130188,
  'eval_EMPATHY_RMSE': 0.6810686588287354,
  'eval_SURPRISE_RMSE': 0.689216136932373,
  'eval_ENGAGEMENT_RMSE': 0.6788548231124878,
  'eval_COMPLEXITY_RMSE': 0.7010861039161682,
  'eval_MCRMSE': 0.6900208592414856,
  'eval_runtime': 19.1825,
  'eval_samples_per_second': 8.237,
  'eval_steps_per_second': 0.052,
  'epoch': 2.0,
  'step':

In [17]:
from transformers.modelcard import parse_log_history
log_history = parse_log_history(trainer.state.log_history)
log_history

({'train_runtime': 703.6932,
  'train_samples_per_second': 2.1,
  'train_steps_per_second': 0.134,
  'total_flos': 0.0,
  'train_loss': 0.9218511378511469,
  'epoch': 2.0,
  'step': 94},
 [{'Training Loss': 1.1758,
   'Epoch': 1.0,
   'Step': 47,
   'Validation Loss': 0.6899921298027039,
   'Coherence Rmse': 0.7001960277557373,
   'Empathy Rmse': 0.6809112429618835,
   'Surprise Rmse': 0.6896708607673645,
   'Engagement Rmse': 0.6782934665679932,
   'Complexity Rmse': 0.7005736827850342,
   'Mcrmse': 0.6899290680885315},
  {'Training Loss': 0.6679,
   'Epoch': 2.0,
   'Step': 94,
   'Validation Loss': 0.6900824904441833,
   'Coherence Rmse': 0.6998785138130188,
   'Empathy Rmse': 0.6810686588287354,
   'Surprise Rmse': 0.689216136932373,
   'Engagement Rmse': 0.6788548231124878,
   'Complexity Rmse': 0.7010861039161682,
   'Mcrmse': 0.6900208592414856}],
 {'Loss': 0.6900824904441833,
  'Coherence Rmse': 0.6998785138130188,
  'Empathy Rmse': 0.6810686588287354,
  'Surprise Rmse': 0.6892

In [21]:
with open("../models/run3/log_history", 'rb') as f:
    log_history = pickle.load(f)

In [25]:
log_history[1][-1]

{'Training Loss': 0.8029,
 'Epoch': 10.0,
 'Step': 240,
 'Validation Loss': 0.8540864586830139,
 'Coherence Rmse': 0.9171026349067688,
 'Empathy Rmse': 0.8103721141815186,
 'Surprise Rmse': 0.7883448004722595,
 'Engagement Rmse': 0.8322986364364624,
 'Complexity Rmse': 0.9139639735221863,
 'Mcrmse': 0.8524163961410522}