Description
====

This notebook demonstrates how to use the implemented models. As an example, the models are trained for binary classification on the [HuggingFace imdb dataset](https://huggingface.co/datasets/imdb).

In [1]:
!pip install datasets &> /dev/null
!pip install transformers &> /dev/null

In [2]:
import gc
import numpy as np
import random

import datasets
import tokenizers 
import torch
import torch.nn as nn
import torch.nn.functional as F
import transformers


# Auxiliary Functions


In [None]:
def set_seed(seed_val):
    """Seeds every random number generator used in this notebook.

    The same seed is handed, in turn, to Python's `random`, NumPy, torch
    (CPU and all Cuda devices) and transformers.

    Args:
      seed_val: (int) Seed for rng.
    """
    seeders = (
        random.seed,
        np.random.seed,
        torch.manual_seed,
        torch.cuda.manual_seed_all,
        transformers.set_seed,
    )
    for seed_fn in seeders:
        seed_fn(seed_val)


def get_device():
    """Picks the torch device to run on and reports the choice.

    Returns:
      (torch.device) The "cuda" device when Cuda is available, otherwise
        the "cpu" device.
    """
    if torch.cuda.is_available():
        print(f'Available GPU: {torch.cuda.get_device_name(0)}.')
        return torch.device("cuda")

    print('GPU unavailable.')
    return torch.device("cpu")


def padding_collator(batch):
    """Cuts a batch down to the length of its longest sequence.

    The length of a sample is the sum of its attention mask. Every sample is
    sliced to the largest such length in the batch.
    NOTE(review): slicing from the left keeps all valid tokens only if the
    padding sits at the end of each sequence - confirm for the tokenizer used.

    Args:
      batch: (list) Each element represents a single sample and consists of
        a dict with keys 'input_ids', 'attention_mask', 'label'.

    Returns:
      batch: (dict) Contains 4 elements:
        input_ids: (torch.Tensor of long)[batch_size, max_len]
        attention_mask: (torch.Tensor of long)[batch_size, max_len].
          Mask that takes the value 1 for valid tokens and 0 for padded ones.
        lengths: (torch.Tensor of long)[batch_size]
        labels: (torch.Tensor of float32)[batch_size]
    """
    lengths = [sum(sample['attention_mask']) for sample in batch]
    max_len = max(lengths)

    def _truncated(key):
        # One row per sample, cut to the longest valid length in the batch.
        rows = [sample[key][:max_len] for sample in batch]
        return torch.tensor(rows).long()

    return {
        'input_ids': _truncated('input_ids'),
        'attention_mask': _truncated('attention_mask'),
        'lengths': torch.tensor(lengths).long(),
        'labels': torch.tensor([sample['label'] for sample in batch]).float(),
    }

def free_memory():
    """(Maybe) prevents Cuda running out of memory.

    Runs Python's garbage collector first and then empties torch's Cuda
    cache. Returns nothing.
    """
    gc.collect()
    torch.cuda.empty_cache()

class Garbage_collector_callback(transformers.TrainerCallback):
    """Trainer callback that frees memory on every logging event.

    Whether this really prevents Cuda from running out of memory is unproven.
    Cuda on Colab is prone to memory leaks, especially after Ctrl + C
    interrupts, and the issue mostly went away once this callback was in use.
    Code based on
    https://huggingface.co/transformers/main_classes/callback.html
    """

    def on_log(self, args, state, control, logs=None, **kwargs):
        """Called every time the Trainer logs data.

        Frees memory and prints by how much the memory reserved on Cuda
        device 0 went down.
        """
        reserved_before = torch.cuda.memory_reserved(0)
        free_memory()
        freed = reserved_before - torch.cuda.memory_reserved(0)
        print(f'Freed {freed}.')

def count_parameters(model):
    """Counts the scalar parameters of a model.

    Args:
      model: Object whose `parameters()` yields tensors.

    Returns:
      (int) Sum of `numel()` over all parameters.
    """
    total = 0
    for param in model.parameters():
        total += param.numel()
    return total

def compute_metrics(p):
    """Calculates the accuracy metric from raw predictions.

    A prediction value >= 0 counts as the positive class. The result is the
    fraction of samples whose thresholded prediction equals the label.

    Adapted from:
    https://github.com/huggingface/transformers/blob/v4.5.1/examples/text-classification/run_glue.py

    Args:
      p: Object with array attributes `predictions` and `label_ids`.

    Returns:
      (dict) Single entry 'accuracy' holding a Python float.
    """
    is_positive = p.predictions >= 0
    hits = (is_positive == p.label_ids).astype(np.float32)
    return {'accuracy': hits.mean().item()}

In [None]:
# Seed all rngs before anything random happens. seed_val is reused further
# down for shuffling the datasets and as the Trainer seed.
seed_val = 42
set_seed(seed_val)
# Device the models are moved to before training.
device = get_device()

# Classification Model

In [None]:
from models.kernel_transformer import Kernel_transformer
from models.baseline import Baseline_transformer

class Binary_clf_model(nn.Module):
    """Transformer encoder with a single-logit head for binary classification.

    Args:
      model_type: (str) 'kernel' builds a Kernel_transformer encoder,
        'baseline' builds a Baseline_transformer encoder.
      model_args: (dict) Keyword arguments passed to the encoder constructor.
        Its 'd_model' entry is also the input width of the linear head.

    Raises:
      NotImplementedError: If model_type is neither 'kernel' nor 'baseline'.
    """

    def __init__(self, model_type, model_args):
        super().__init__()
        if model_type not in ('kernel', 'baseline'):
            raise NotImplementedError(
                "The only options for model_type are 'kernel' and 'baseline'.")

        encoder_cls = (
            Kernel_transformer if model_type == 'kernel'
            else Baseline_transformer)
        self.encoder = encoder_cls(**model_args)
        self.linear = nn.Linear(model_args['d_model'], 1)
        self.loss_fn = F.binary_cross_entropy_with_logits

    def forward(self, input_ids, labels, attention_mask=None, lengths=None):
        """Computes the loss and the logits for a batch.

        Args:
          input_ids: Passed to the encoder as `input_ids`.
          labels: Targets for the binary cross entropy loss, one per sample.
          attention_mask: Passed to the encoder as `attention_mask`.
          lengths: Passed to the encoder as `lengths`.

        Returns:
          (list) [loss, logits] with logits of shape [batch_size].
        """
        encoded = self.encoder(
            input_ids=input_ids,
            attention_mask=attention_mask,
            lengths=lengths)
        # Only the first position of every sequence feeds the classifier.
        first_token = encoded[:, 0, :]
        # first_token -> [batch_size, d_model]

        logits = self.linear(first_token).squeeze(-1)
        # logits -> [batch_size]

        return [self.loss_fn(logits, labels), logits]


# Dataset & Tokenizer

In [None]:
# Load the imdb 'train' and 'test' splits. The result is indexed by position
# further down: raw_dataset[0] becomes the train set, raw_dataset[1] the test set.
raw_dataset = datasets.load_dataset('imdb', split=['train', 'test'])

In [None]:
# Pretrained "bert-base-cased" tokenizer. Its vocab_size is reused below as
# the 'n_emb' entry of model_args.
tokenizer = transformers.AutoTokenizer.from_pretrained("bert-base-cased")
print(f'Vocab size = {tokenizer.vocab_size}')

28996


In [None]:
def tokenize_function(examples):
    """Tokenizes the 'text' entry of a batch of examples.

    Uses the module-level `tokenizer` with padding="max_length" and
    truncation enabled.

    Args:
      examples: Mapping with a "text" entry holding the raw texts.

    Returns:
      The tokenizer output for the given texts.
    """
    texts = examples["text"]
    return tokenizer(texts, padding="max_length", truncation=True)

# raw_dataset holds the train split first and the test split second.
train_split, test_split = raw_dataset

# Tokenize in batches, then shuffle each split with the notebook-wide seed.
train_dataset = (
    train_split
    .map(tokenize_function, batched=True)
    .shuffle(seed=seed_val)
)
test_dataset = (
    test_split
    .map(tokenize_function, batched=True)
    .shuffle(seed=seed_val)
)

# Training

In [None]:
# Encoder hyperparameters. Binary_clf_model forwards this dict unchanged as
# keyword arguments to Kernel_transformer / Baseline_transformer, and the same
# dict is used for both models below.
model_args = {
    'use_cos': True,
    'kernel': 'relu',
    # Also read by Binary_clf_model as the input width of its linear head.
    'd_model': 384,
    'n_heads': 6,
    'n_layers': 5,
    # Presumably the size of the embedding table; tied to the tokenizer
    # vocabulary - confirm against the encoder implementation.
    'n_emb': tokenizer.vocab_size,
    'ffn_ratio': 4,
    'rezero': True,
    'ln_eps': 1e-5,
    'denom_eps': 1e-5,
    'bias': False,
    'dropout': 0.2,    
    # NOTE(review): presumably has to cover the tokenizer's padded sequence
    # length - confirm.
    'max_len': 512,
    'xavier': True,
}

# Keyword arguments for transformers.TrainingArguments; the resulting
# training_args_ object is shared by the kernel and the baseline trainer.
training_args = {
    # Dirs
    'output_dir': 'results',          
    'logging_dir': 'logs',            
    # Epochs and batch sizes
    'num_train_epochs': 10,              
    'per_device_train_batch_size': 8,  
    'per_device_eval_batch_size': 8,   
    # Strategies
    'evaluation_strategy': 'epoch',     
    'logging_strategy': 'epoch',
    'save_strategy': 'no',
    # steps
    # NOTE(review): the evaluation and logging strategies above are both
    # 'epoch', so these two step intervals are presumably unused - confirm
    # against the TrainingArguments documentation.
    'logging_steps': int(1e3),               
    'eval_steps': int(1e3),       
    'warmup_steps': 300,                
    'learning_rate': 2e-4,
    'log_level': 'info',
    # Same seed as the one used for set_seed and for shuffling the datasets.
    'seed': seed_val,
    'disable_tqdm': False,
    # Optimizations
    # NOTE(review): tokenize_function pads with padding="max_length", so the
    # stored samples may all have the same length; verify that grouping by
    # length still has an effect.
    'group_by_length': True, 
}

training_args_ = transformers.TrainingArguments(**training_args)

## Kernel Transformer

In [None]:
# Build the classifier with the kernel encoder and move it to the device.
kernel_model = Binary_clf_model(model_type='kernel', model_args=model_args)
kernel_model = kernel_model.to(device)
print(f'{count_parameters(kernel_model)} params')

# Trainer inputs: shared training arguments, datasets, collator, callback
# and metric.
kernel_trainer_args = dict(
    model=kernel_model,
    args=training_args_,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    data_collator=padding_collator,
    callbacks=[Garbage_collector_callback],
    compute_metrics=compute_metrics,
)

kernel_trainer = transformers.Trainer(**kernel_trainer_args)

In [None]:
# Train the kernel model with the settings from training_args.
kernel_trainer.train()

## Baseline Model

In [None]:
# Build the classifier with the baseline encoder and move it to the device.
baseline_model = Binary_clf_model(model_type='baseline', model_args=model_args)
baseline_model = baseline_model.to(device)
print(f'{count_parameters(baseline_model)} params')

# Trainer inputs: shared training arguments, datasets, collator, callback
# and metric.
baseline_trainer_args = dict(
    model=baseline_model,
    args=training_args_,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    data_collator=padding_collator,
    callbacks=[Garbage_collector_callback],
    compute_metrics=compute_metrics,
)

baseline_trainer = transformers.Trainer(**baseline_trainer_args)

In [None]:
# Train the baseline model with the settings from training_args.
baseline_trainer.train()