### This notebook is an example of how I trained one of my BERT-style models (DeBERTa-large). It uses a decaying learning rate (one fixed learning rate per epoch, lowered at each epoch) instead of the layer-wise learning rate mostly used in other BERT models. I tried a lot of different heads for DeBERTa, but the best-performing one seems to be a mean-pooling head.

# Imports

In [None]:
# Colab only: mount Google Drive at /content/drive.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
pip install transformers

In [None]:
import pandas as pd
import numpy as np

import random
from sklearn.model_selection import StratifiedKFold

import torch
import torch.nn as nn

import os
from collections import defaultdict
from tqdm import tqdm_notebook

from transformers import AutoConfig, AutoTokenizer, AutoModel
import transformers
from transformers import AdamW
from transformers import get_linear_schedule_with_warmup, get_cosine_schedule_with_warmup

import torch.nn.init as init
import torch.nn.functional as F
from torch.nn import Parameter
from torch.autograd.function import InplaceFunction
import math

from torch.utils.data import Sampler, Dataset, DataLoader
import random

from more_itertools import chunked, flatten

In [None]:
%cd drive/MyDrive/CommonLit

/content/drive/MyDrive/CommonLit


# Get folds

In [None]:
# Training data with precomputed folds; later cells read its `excerpt`, `target` and `kfold` columns.
df = pd.read_csv("train_folds.csv")

# Seed Everything

In [None]:
def seed_everything(seed=12):
    """
    Try to make the result as reproducible as possible.

    Seeds Python's `random`, NumPy and PyTorch (CPU and every CUDA device),
    exports PYTHONHASHSEED and puts cuDNN in deterministic mode.

    Args:
        seed (int) : seed shared by all the random number generators
    """
    random.seed(seed)
    # Hash randomization of the running interpreter is fixed at start-up, so this
    # only takes effect in processes spawned afterwards.
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True
    # The cuDNN auto-tuner may pick a different kernel from one run to the next,
    # so it has to be disabled for the runs to be comparable.
    torch.backends.cudnn.benchmark = False

seed_everything(seed=12)

# Configuration

In [None]:
MAX_LEN = 256  # token length used when tokenizing every excerpt
EPOCHS = 3  # number of passes over the training dataloader
DEBERTA_PATH = "microsoft/deberta-large"  # checkpoint used for the tokenizer, the config and the weights
TRAIN_BATCH_SIZE = 4
VALID_BATCH_SIZE = 8
LEARNING_RATE = 2e-5  # NOTE(review): not referenced in the visible cells; train_fn uses its own per-epoch schedule
TOKENIZER = transformers.AutoTokenizer.from_pretrained(DEBERTA_PATH)

# Evaluation Scheduler

In [None]:
def evaluate(EVAL_STEPS, valid_interval, valid_loss, train_loss, final_train_loss, index, best_loss, epoch):
    """
    Just a function to follow the training process and save the best performing model.

    When the validation loss improves, the checkpoint is written to
    'Training/Models/Deberta/model{fold}.bin'. The weights and the fold number
    come from the notebook globals `model` and `fold`.

    Args:
        EVAL_STEPS (list) : (rmse, steps) pairs; the first pair whose rmse is below the
            new best validation loss gives the next validation interval
        valid_interval (int) : current number of iterations between two validations
        valid_loss (float) : validation loss of the current iteration
        train_loss (AvgCounter) : running training loss (its `avg()` is logged)
        final_train_loss (float) : training loss recorded at the last improvement
        index (int) : iteration number inside the epoch
        best_loss (float) : best validation loss so far
        epoch (int) : epoch number

    Returns:
        (valid_interval, best_loss, final_train_loss), updated only if the validation loss improved
    """
    print(f"Epoch:{epoch}| Batch {index} | Train Loss:{train_loss.avg()} | Validation loss:{valid_loss}")

    if valid_loss < best_loss:

        for rmse, steps in EVAL_STEPS:
            if valid_loss > rmse:
                valid_interval = steps
                break

        print(f"Validation loss decreased from {best_loss} to {valid_loss}.")
        final_train_loss = train_loss.avg()
        best_loss = valid_loss

        checkpoint_path = f'Training/Models/Deberta/model{fold}.bin'
        # torch.save does not create missing directories; losing the best
        # checkpoint in the middle of a run is expensive, so create it here.
        os.makedirs(os.path.dirname(checkpoint_path), exist_ok=True)
        torch.save(model.state_dict(), checkpoint_path)

    return valid_interval, best_loss, final_train_loss

# Training Function

In [None]:
def train_fn(train_dataloader, valid_dataloader, model, optimizer, device, scheduler):
    """
    Pytorch function to train a model and validate it. We do not validate the model at the end of each epoch
    but at the end of a specific number of iterations. (cf. Readme file of github)

    Relies on the notebook globals EPOCHS, loss_fn, AvgCounter, eval_fn and evaluate.

    Args:
        train_dataloader (pytorch dataloader) : training dataloader
        valid_dataloader (pytorch dataloader) : validation dataloader
        model (nn.Module) : the model that you want to train
        optimizer : optimizer
        device : cpu or gpu
        scheduler (callable) : called as scheduler(optimizer, lr) at the start of every epoch and
            expected to return the optimizer to use for that epoch

    Returns:
        (final_train_loss, best_loss) : training loss recorded at the best validation point and
        the best validation loss
    """
    EVAL_STEPS = [(0.50, 400), (0.49, 400), (0.48, 200), (-1., 200)]
    valid_interval = EVAL_STEPS[0][1]
    best_loss = np.inf
    final_train_loss = None
    accumulation_steps = 4
    lr_schedule = [2e-5, 5e-6, 2e-6]
    n_batches = len(train_dataloader)

    for epoch in range(EPOCHS):

        train_loss = AvgCounter()
        # Keep the last learning rate if EPOCHS is larger than the schedule.
        lr = lr_schedule[min(epoch, len(lr_schedule) - 1)]
        optimizer = scheduler(optimizer, lr)
        # Start every epoch from clean gradients.
        optimizer.zero_grad()

        for index, d in tqdm_notebook(enumerate(train_dataloader), total=n_batches):

            ids = d["ids"].to(device, dtype=torch.long)
            mask = d["mask"].to(device, dtype=torch.long)
            targets = d["targets"].to(device, dtype=torch.float)

            # eval_fn switches the model to eval mode, so switch back before every step.
            model.train()
            loss, _ = model(ids=ids, mask=mask, loss_fn=loss_fn, targets=targets)

            # Weight the running average by the real batch size (len(d) would be the
            # number of keys of the batch dict).
            train_loss.update(loss.item(), ids.size(0))
            (loss / accumulation_steps).backward()

            is_last_batch = (index + 1) == n_batches
            # Step once `accumulation_steps` micro-batches have been accumulated, and
            # flush whatever is left at the end of the epoch so no gradient is dropped
            # or carried over to the next epoch.
            if (index + 1) % accumulation_steps == 0 or is_last_batch:
                optimizer.step()
                optimizer.zero_grad()

            if index % valid_interval == 0 or is_last_batch:

                valid_loss = eval_fn(valid_dataloader, model, device)

                valid_interval, best_loss, final_train_loss = evaluate(
                    EVAL_STEPS, valid_interval, valid_loss, train_loss,
                    final_train_loss, index, best_loss, epoch,
                )

    return final_train_loss, best_loss

# Evaluation Function

In [None]:
def eval_fn(data_loader, model, device):
    """
    Evaluate the performance of our model.

    The model is switched to eval mode and is NOT switched back to train mode.
    Relies on the notebook globals loss_fn and AvgCounter.

    -------------------
    Args:
        data_loader (pytorch dataloader) : validation dataloader
        model (nn.Module) : the model you are training
        device : cpu or gpu
    -------------------
    Returns:
        Validation loss: the average, weighted by batch size, of the loss returned
        by the model for each batch
    """
    model.eval()
    valid_loss = AvgCounter()

    with torch.no_grad():
        for d in data_loader:
            ids = d["ids"].to(device, dtype=torch.long)
            mask = d["mask"].to(device, dtype=torch.long)
            targets = d["targets"].to(device, dtype=torch.float)

            loss, _ = model(ids=ids, mask=mask, loss_fn=loss_fn, targets=targets)

            # Use the real batch size as weight (len(d) would be the number of keys
            # of the batch dict), so a smaller last batch is not over-weighted.
            valid_loss.update(loss.item(), ids.size(0))

    return valid_loss.avg()

# Dataset

In [None]:
class DebertaDataset(Dataset):
    """
    Simple pytorch dataset class using deberta tokenizer from hugging face.

    Reads the `excerpt` and `target` columns of the dataframe. Each item is
    tokenized with the notebook global TOKENIZER and padded / truncated to the
    notebook global MAX_LEN.
    """

    def __init__(self, df):
        self.excerpt = df.excerpt.values
        self.target = df.target.values

    def __len__(self):
        return len(self.excerpt)

    def __getitem__(self, item):
        # Collapse every run of whitespace (including new lines) into one space.
        excerpt = " ".join(str(self.excerpt[item]).split())

        # padding="max_length" lets the tokenizer pad with its own pad token id
        # (padding=True pads to the longest sequence of the call, i.e. nothing
        # for a single excerpt).
        inputs = TOKENIZER(
            excerpt,
            add_special_tokens=True,
            max_length=MAX_LEN,
            padding="max_length",
            truncation=True,
            return_token_type_ids=True,
        )

        return {
            "ids": torch.tensor(inputs["input_ids"], dtype=torch.long),
            "mask": torch.tensor(inputs["attention_mask"], dtype=torch.long),
            "token_type_ids": torch.tensor(inputs["token_type_ids"], dtype=torch.long),
            "targets": torch.tensor(self.target[item], dtype=torch.float),
        }

# Model Building

In [None]:
class DebertaModel(nn.Module):
    """
    Simple Deberta Model with two possible heads: Mean pooling or attention pooling.

    The backbone is loaded from the notebook global DEBERTA_PATH with hidden
    dropout disabled.

    Args:
        model_type (str) : "mean" (masked mean pooling head) or "attention" (attention pooling head)

    Raises:
        ValueError : if model_type is not one of the two supported heads
    """

    def __init__(self, model_type="mean"):
        super().__init__()

        if model_type not in ("mean", "attention"):
            raise ValueError(f"Unknown model_type: {model_type!r} (expected 'mean' or 'attention')")
        self.model_type = model_type

        self.config = AutoConfig.from_pretrained(DEBERTA_PATH)
        self.config.update({"output_hidden_states": True,
                            "hidden_dropout_prob": 0.0,
                            "layer_norm_eps": 1e-7})

        self.deberta = transformers.AutoModel.from_pretrained(DEBERTA_PATH, config=self.config)

        # Take the width from the checkpoint config instead of hard-coding 1024,
        # so that another checkpoint size also works.
        hidden_size = self.config.hidden_size

        if model_type == "attention":

            self.attention = nn.Sequential(
                nn.Linear(hidden_size, hidden_size),
                nn.Tanh(),
                nn.Linear(hidden_size, 1),
                nn.Softmax(dim=1),
            )

            self.linear = nn.Linear(hidden_size, 1)

        else:

            self.layer_norm1 = nn.LayerNorm(hidden_size)
            self.linear1 = nn.Linear(hidden_size, 768)
            self.linear2 = nn.Linear(768, 1)
            self.layer_norm2 = nn.LayerNorm(768)

    def freeze(self):
        """Stop the gradient for every parameter of the backbone (the head stays trainable)."""
        for param in self.deberta.parameters():
            param.requires_grad = False

    def unfreeze(self):
        """Make every parameter of the backbone trainable again."""
        for param in self.deberta.parameters():
            param.requires_grad = True

    def forward(self, ids, mask, loss_fn=None, targets=None):
        """
        Args:
            ids : token ids, passed to the backbone as input_ids
            mask : attention mask, passed to the backbone and used by the mean pooling
            loss_fn (callable) : loss applied to the flattened logits and targets; its square
                root is returned as the loss. Required when targets is given.
            targets : regression targets (optional)

        Returns:
            (loss, logits) if targets is given, logits otherwise

        Raises:
            ValueError : if targets is given without loss_fn
        """
        # assumes the backbone returns (batch, seq, hidden) as first output — TODO confirm
        last_hidden_state = self.deberta(input_ids=ids, attention_mask=mask)[0]

        if self.model_type == "mean":

            # Average only over the positions where mask is 1.
            input_mask_expanded = mask.unsqueeze(-1).expand(last_hidden_state.size()).float()
            sum_embeddings = torch.sum(last_hidden_state * input_mask_expanded, 1)
            # The clamp avoids a division by zero for a row whose mask is all zeros.
            sum_mask = torch.clamp(input_mask_expanded.sum(1), min=1e-9)
            mean_embeddings = sum_embeddings / sum_mask
            logits = self.linear1(self.layer_norm1(mean_embeddings))
            logits = self.linear2(self.layer_norm2(logits))

        else:

            # NOTE(review): the softmax runs over every position of dim 1; `mask` is not
            # applied to the pooling weights, so padded positions also get a weight.
            weights = self.attention(last_hidden_state)
            context_vector = torch.sum(weights * last_hidden_state, dim=1)
            logits = self.linear(context_vector)

        if targets is None:
            return logits

        if loss_fn is None:
            raise ValueError("loss_fn is required when targets are provided")

        loss = torch.sqrt(loss_fn(logits.view(-1), targets.view(-1)))
        return loss, logits

# AvgCounter

In [None]:
class AvgCounter:
    """Running average of a loss, weighted by the number of samples of each update."""

    def __init__(self):
        self.reset()

    def reset(self):
        """Forget everything accumulated so far."""
        self.loss, self.n_samples = 0, 0

    def update(self, loss, n_samples):
        """Add a loss value that was computed over `n_samples` samples."""
        weighted = loss * n_samples
        self.loss += weighted
        self.n_samples += n_samples

    def avg(self):
        """Return the accumulated weighted loss divided by the accumulated sample count."""
        return self.loss / self.n_samples

# Useful Functions

In [None]:
# create pytorch dataloader
def create_dataloader(df, fold):
    """
    Create the training and validation dataloader for a specific fold number.
    The training set is composed of the training set from Commonlit and also external data
    (the rows of "External Data/pseudo_labels_fold_queries_{fold}.csv" whose to_keep is 1).

    -----------------------
    Args:
        df (dataframe) : dataframe with all sentences (commonlit data)
        fold (int) : fold number
    -----------------------
    Returns:
        training dataloader and validation dataloader

    """
    train = df[df.kfold != fold].reset_index(drop=True)
    print(train.shape)

    pseudo = pd.read_csv(f"External Data/pseudo_labels_fold_queries_{fold}.csv")
    # Build the external frame in one chain so that no column is assigned on a
    # filtered view (pandas SettingWithCopyWarning).
    pseudo = (
        pseudo.loc[pseudo.to_keep == 1, ["sentences", "predictions", "stdev"]]
        .rename(columns={"sentences": "excerpt", "predictions": "target", "stdev": "standard_error"})
        .assign(kfold=None)
    )

    # ignore_index avoids duplicated index labels in the concatenated frame.
    train = pd.concat([train, pseudo], axis=0, ignore_index=True)
    print(train.shape)
    valid = df[df.kfold == fold].reset_index(drop=True)

    train_dataset = DebertaDataset(train)
    valid_dataset = DebertaDataset(valid)

    sampler = torch.utils.data.RandomSampler(train_dataset)

    train_dataloader = torch.utils.data.DataLoader(train_dataset, batch_size=TRAIN_BATCH_SIZE, sampler=sampler)
    valid_dataloader = torch.utils.data.DataLoader(valid_dataset, batch_size=VALID_BATCH_SIZE)

    return train_dataloader, valid_dataloader

In [None]:
def create_model(device):
    """
    Build a DebertaModel with its default arguments and move it to `device`.
    """
    return DebertaModel().to(device)

In [None]:
# create the optimizer
def create_optimizer(model):
    """
    Implementation of layer wise learning rate for a deberta model.

    Every parameter of the model is put in exactly one group:
      - one group per encoder layer (names containing 'encoder.layer.<k>.'): the top layer
        gets 3e-5 and the rate is multiplied by 0.975 for each layer below it
      - the head (names without 'deberta') gets 2e-5
      - the rest of the backbone (embeddings and anything outside the layers) gets 2e-5

    Biases are included as well: filtering them out of the groups would leave them
    out of the optimizer entirely, i.e. they would never be updated.

    NOTE: train_fn overwrites the learning rate of every group at the start of each
    epoch through scheduler(optimizer, lr).

    Returns:
        AdamW optimizer with a specific learning rate for each layer of the deberta model
    """
    import re

    lr = 3e-5
    regressor_lr = 2e-5
    layer_decay = 0.975
    layer_pattern = re.compile(r'encoder\.layer\.(\d+)\.')

    params_by_layer = {}
    head_params = []
    other_backbone_params = []

    for name, param in model.named_parameters():
        match = layer_pattern.search(name)
        if match:
            params_by_layer.setdefault(int(match.group(1)), []).append(param)
        elif "deberta" not in name:
            head_params.append(param)
        else:
            other_backbone_params.append(param)

    parameters = []

    # From the top layer down to the first one, whatever the depth of the backbone.
    for layer in sorted(params_by_layer, reverse=True):
        parameters.append({'params': params_by_layer[layer], 'lr': lr})
        lr *= layer_decay

    if head_params:
        parameters.append({'params': head_params, 'lr': regressor_lr})

    if other_backbone_params:
        parameters.append({'params': other_backbone_params, 'lr': regressor_lr})

    return AdamW(parameters)

In [None]:
# create scheduler
def create_scheduler(optimizer, num_warmup_steps, num_train_steps, scheduler_name="get_cosine_schedule_with_warmup"):
    """
    Create a hugging face learning rate scheduler.

    Args:
        optimizer : optimizer driven by the scheduler
        num_warmup_steps (int) : number of warmup steps (honoured by both schedulers)
        num_train_steps (int) : total number of training steps
        scheduler_name (str) : "get_linear_schedule_with_warmup" or "get_cosine_schedule_with_warmup"

    Returns:
        the scheduler

    Raises:
        ValueError : if scheduler_name is unknown
    """
    if scheduler_name == "get_linear_schedule_with_warmup":
        scheduler = get_linear_schedule_with_warmup(
            optimizer, num_warmup_steps=num_warmup_steps, num_training_steps=num_train_steps
        )

    elif scheduler_name == "get_cosine_schedule_with_warmup":
        scheduler = get_cosine_schedule_with_warmup(
            optimizer, num_warmup_steps=num_warmup_steps, num_training_steps=num_train_steps
        )

    else:
        raise ValueError(f"Unknown scheduler: {scheduler_name}")

    return scheduler

In [None]:
def scheduler(optimizer, lr):
    """
    Fast and simple implementation of decay learning rate.
    At each epoch we change the learning rate. The learning rate is a hyper parameter that we need to tune.

    Args:
        optimizer : optimizer whose param groups are updated in place
        lr (float) : learning rate written to EVERY param group (per-group rates are overwritten)

    Returns:
        the same optimizer object
    """
    for param_group in optimizer.param_groups:
        param_group['lr'] = lr
    return optimizer

# Training

In [None]:
# Seed actually used for every fold. NOTE(review): the original final message said
# "seed 12" while the loop seeded with 42; the message now reports the seed in use.
SEED = 42
N_FOLDS = 5

loss_fn = nn.MSELoss()
loss = defaultdict(list)  # not used in the visible cells; kept in case a later cell reads it
results_val = {}
results_train = {}

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = None
optimizer = None

for fold in range(N_FOLDS):

    # Drop the previous fold's model and optimizer BEFORE building the next ones,
    # otherwise both copies of the model live on the device at the same time.
    model = None
    optimizer = None
    if torch.cuda.is_available():
        torch.cuda.empty_cache()

    seed_everything(SEED)

    model = create_model(device)

    print("################################")
    print(f"Training Fold {fold}")
    print("################################")

    train_dataloader, valid_dataloader = create_dataloader(df, fold)
    # Only needed by a step-based scheduler (create_scheduler); the per-epoch
    # learning rate decay used here does not read it.
    num_train_steps = len(train_dataloader) * EPOCHS

    optimizer = create_optimizer(model)

    # Seed again so that the data order does not depend on the random numbers
    # consumed while building the model and the optimizer.
    seed_everything(SEED)

    results_train[fold], results_val[fold] = train_fn(
        train_dataloader, valid_dataloader, model, optimizer, device, scheduler
    )

print("################################")
print("RESULTS")
print("################################")
cv_val = np.mean([results_val[i] for i in range(N_FOLDS)])
cv_train = np.mean([results_train[i] for i in range(N_FOLDS)])
print(f"Results of cross validation for seed {SEED}: Train : {cv_train}, Val : {cv_val}")