# Final Training Loop

Authors: Complicated. The original code was taken from the internet and then modified by us.

Copyright: Complicated (see "Authors"). Hopefully Dyna Group will figure it out.

## Preparations

In [3]:
# IMPORTANT! Every team member must set a unique (!) name for their model.
# The name is the default artifact prefix in T5Trainer and is part of the output directory
# passed to it in the training cell, so reusing a name mixes the outputs of different runs.
THIS_MODEL_NAME = "T5-small-combined_fatma_dataset" # v1 - version 1

In [4]:
import argparse
import glob
import json
import os
import sys
import time
import logging
import random
import re

from itertools import chain
from string import punctuation
from tqdm import tqdm
from torch import cuda
from collections import defaultdict

import pandas as pd
import numpy as np

import torch

from torch.utils.data import Dataset, DataLoader

from transformers import (
    AdamW,
    T5ForConditionalGeneration,
    T5Tokenizer,
    get_linear_schedule_with_warmup
    )

In [6]:
# Add logging
# THIS_DIR is defined here (same expression as in the "Add metrics" cell further down)
# because this is the first cell that needs it; without it the cell raises NameError
# on a fresh kernel (Restart & Run All).
THIS_DIR = os.path.dirname(os.path.realpath('file'))
# logging.basicConfig opens the log file right away and fails if its directory is missing.
os.makedirs(f'{THIS_DIR}/logs', exist_ok=True)
logging.basicConfig(filename=f'{THIS_DIR}/logs/train.log', level=logging.INFO)
logging.info('Started')

In [8]:
# Upload datasets
# Reads the train / validation splits from <THIS_DIR>/data; THIS_DIR must already be defined
# when this cell runs.
# NOTE(review): the preview below shows three "Unnamed: 0*" columns - presumably index columns
# written by earlier to_csv calls; only "sentence" and "paraphrase" are passed to T5Trainer.
train_df = pd.read_csv(f"{THIS_DIR}/data/diverse_train.csv")
eval_df = pd.read_csv(f"{THIS_DIR}/data/diverse_val.csv")
train_df.head(3)

Unnamed: 0.2,Unnamed: 0,Unnamed: 0.1,sentence,paraphrase
0,6473,6473,How can I overcome the fear of failure?,How do I cope up with fear of failure?
1,4425,4425,"On Monday, EchoStar (DISH: news, chart, profil...","Shares of Littleton , Colorado-based EchoStar ..."
2,6574,6574,His protest led to a 47-hour standoff with pol...,His protest triggered a 47-hour standoff with ...


Load metrics and check that they work

In [None]:
# Add metrics
# THIS_DIR is the directory part of the resolved path of the relative name 'file',
# i.e. the current working directory of the kernel (a file called 'file' need not exist).
THIS_DIR = os.path.dirname(os.path.realpath('file'))
# Make modules under <THIS_DIR>/src importable (presumably where our_metrics lives - confirm).
sys.path.append(f"{THIS_DIR}/src")

In [None]:
from our_metrics import Metrics, BleurtModelsLinks

# Single shared Metrics instance; T5Trainer calls the_metrics.compute_metrics(...) after every epoch.
the_metrics = Metrics()

In [None]:
# Run this only once, to download the BLEURT model
# the_metrics.install_bleurt_model(bleurt_model_link=BleurtModelsLinks.BLEURT_20)

In [9]:
# Will take some time...
# data = the_metrics.get_dummy_data(1)
# the_metrics.compute_metrics(data, data, data)
# the_metrics.test_bert(1)

In [10]:
# Please test bleurt before using it (takes time)
# If it crashes, install smaller version (see line 2 in this cell)
# if cuda.is_available():
#     the_metrics.compute_bleurt(eval_df[:8].sentence, eval_df[:8].paraphrase, 8)

In [11]:
# Check nvidia, how much cuda memory you have
!nvidia

Wed Nov 24 23:52:55 2021       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 470.57.02    Driver Version: 470.57.02    CUDA Version: 11.4     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla K80           On   | 00000001:00:00.0 Off |                    0 |
| N/A   50C    P0    80W / 149W |      0MiB / 11441MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+---------------------------------------------------------------------------

Configure rich console for logging

In [12]:
from rich.table import Column, Table
from rich import box
from rich.console import Console

# record=True makes the console keep what it prints, so T5Trainer can dump it with console.save_text(...)
console = Console(record=True)

# to display dataframe in ASCII format
def display_df(df):
    """Print the first two columns of a dataframe as an ASCII table.

    Args:
        df: pandas DataFrame. Column 0 is shown under "source_text" and column 1
            under "target_text"; any further columns are ignored.
    """
    # A separate, non-recording console is used for the table (named differently so it
    # no longer shadows the module-level recording ``console``).
    table_console = Console()
    table = Table(
        Column("source_text", justify="center"),
        Column("target_text", justify="center"),
        title="Sample Data",
        pad_edge=False,
        box=box.ASCII,
    )

    for row in df.values.tolist():
        # rich renders only str-like cells (None becomes an empty cell); anything else,
        # e.g. an int index column or a NaN, is stringified so add_row does not raise.
        source_cell, target_cell = (
            cell if cell is None or isinstance(cell, str) else str(cell)
            for cell in (row[0], row[1])
        )
        table.add_row(source_cell, target_cell)

    table_console.print(table)

# training logger to log training progress
# train() appends one (epoch, step, loss) row on every 5000th step and re-prints the whole
# table to the console, so the table accumulates rows across epochs.
training_logger = Table(
    Column("Epoch", justify="center"),
    Column("Steps", justify="center"),
    Column("Loss", justify="center"),
    title="Training Status",
    pad_edge=False,
    box=box.ASCII,
)

# Setting up the device for GPU usage
# Falls back to the CPU when no CUDA device is available.
device = 'cuda' if cuda.is_available() else 'cpu'

## Code for training

In [13]:
# Paraphraser Dataset which we will use to load the data
class ParaphraseDataset(Dataset):
    """Pre-tokenized (source, target) sentence pairs for paraphrase fine-tuning.

    All rows are tokenized once in the constructor; ``__getitem__`` only returns the
    stored tensors.

    Args:
        tokenizer: tokenizer exposing ``batch_encode_plus``.
        data_dir: directory of the CSV file, or None when ``data_frame`` is used.
        filename: CSV file name without the ".csv" extension, or None.
        source_col_name: column holding the sentences to paraphrase.
        target_col_name: column holding the reference paraphrases.
        data_frame: DataFrame used when ``data_dir`` / ``filename`` are not both given.
        max_len: length every source and target sequence is padded / truncated to.

    Raises:
        ValueError: if neither a (data_dir, filename) pair nor a data_frame is given.
    """

    def __init__(self, tokenizer, data_dir, filename, source_col_name, target_col_name, data_frame, max_len=64):
        self.source_column = source_col_name
        self.target_column = target_col_name
        self.max_len = max_len
        self.tokenizer = tokenizer
        self.inputs = []
        self.targets = []
        if data_dir is not None and filename is not None:
            # As before, an explicit file location wins over data_frame.
            self.path = os.path.join(data_dir, filename + '.csv')
            self.data = pd.read_csv(self.path)
        elif data_frame is not None:
            self.data = data_frame
        else:
            raise ValueError("Provide either both data_dir and filename, or data_frame")
        self._build()

    def __len__(self):
        """Number of tokenized pairs."""
        return len(self.inputs)

    def __getitem__(self, index):
        """Return ids and attention masks of pair ``index`` as 1-D tensors of length max_len."""
        # squeeze(0) drops only the batch dimension added by batch_encode_plus; a bare
        # squeeze() would also collapse the sequence dimension when max_len == 1.
        source_ids = self.inputs[index]["input_ids"].squeeze(0)
        target_ids = self.targets[index]["input_ids"].squeeze(0)

        src_mask = self.inputs[index]["attention_mask"].squeeze(0)
        target_mask = self.targets[index]["attention_mask"].squeeze(0)

        return {"source_ids": source_ids, "source_mask": src_mask, "target_ids": target_ids, "target_mask": target_mask}

    def _build(self):
        """Tokenize every row into ``self.inputs`` / ``self.targets``."""
        print("Start of Building the dataloader: tokenization")
        # Read the columns positionally: label-based .loc[idx] over range(len(...)) fails
        # (or picks the wrong rows) as soon as the frame does not have a default 0..n-1
        # index, e.g. after filtering or sampling.
        sources = self.data[self.source_column].tolist()
        references = self.data[self.target_column].tolist()
        for idx in tqdm(range(len(self.data))):
            input_ = "paraphrase: " + str(sources[idx]) + ' </s>'
            target = str(references[idx]) + " </s>"

            # padding="max_length" replaces the deprecated pad_to_max_length=True, and
            # truncation=True states explicitly what was previously an implicit default
            # (announced by a warning on every run).
            tokenized_inputs = self.tokenizer.batch_encode_plus(
                [input_], max_length=self.max_len, padding="max_length", truncation=True, return_tensors="pt"
            )
            tokenized_targets = self.tokenizer.batch_encode_plus(
                [target], max_length=self.max_len, padding="max_length", truncation=True, return_tensors="pt"
            )

            self.inputs.append(tokenized_inputs)
            self.targets.append(tokenized_targets)
        print("End of Building the dataloader: tokenization")

In [14]:
# Optimizer
def get_optimizer(model, learning_rate, adam_epsilon=1e-6, weight_decay=0.0):
    """Build an AdamW optimizer with weight decay disabled for biases and layer-norm weights.

    Args:
        model: module whose ``named_parameters()`` are optimized.
        learning_rate: learning rate shared by both parameter groups.
        adam_epsilon: epsilon passed to AdamW.
        weight_decay: decay applied to every parameter outside the no-decay group.

    Returns:
        AdamW over two parameter groups: [decayed, not decayed].
    """
    # "LayerNorm.weight" is the BERT-style name; T5 names its layer norms "layer_norm" /
    # "final_layer_norm", so that suffix is listed too - otherwise T5's norm weights would
    # be decayed whenever weight_decay is non-zero.
    no_decay = ("bias", "LayerNorm.weight", "layer_norm.weight")

    decayed, not_decayed = [], []
    for name, param in model.named_parameters():
        if any(marker in name for marker in no_decay):
            not_decayed.append(param)
        else:
            decayed.append(param)

    optimizer_grouped_parameters = [
        {"params": decayed, "weight_decay": weight_decay},
        {"params": not_decayed, "weight_decay": 0.0},
    ]
    # Only the arguments are used here: the previous version additionally built (and threw
    # away) a second optimizer from the global ``model_params``.
    return AdamW(optimizer_grouped_parameters, lr=learning_rate, eps=adam_epsilon)

In [15]:
def train(epoch, tokenizer, model, device, loader, optimizer):
    """
    Run one training epoch over ``loader``.

    Args:
        epoch: epoch number, used only for the progress table.
        tokenizer: supplies ``pad_token_id`` for masking padded label positions.
        model: model called with input_ids / attention_mask / decoder_input_ids / labels;
            the first element of its output is used as the loss.
        device: device the batch tensors are moved to.
        loader: iterable of batches with "source_ids", "source_mask" and "target_ids".
        optimizer: optimizer stepped once per batch.
    """

    model.train()
    for step, data in enumerate(tqdm(loader), 0):
        y = data["target_ids"].to(device, dtype=torch.long)
        # Manual teacher forcing: the decoder sees tokens [0, n-1) and is scored on [1, n).
        # NOTE(review): this drops the first target token from the labels instead of shifting
        # the targets right behind a decoder start token; passing only ``labels`` would
        # presumably let the model build the decoder inputs itself - confirm before changing,
        # since it alters what is trained.
        y_ids = y[:, :-1].contiguous()
        lm_labels = y[:, 1:].clone().detach()
        # -100 marks padded positions so that they do not contribute to the loss.
        lm_labels[y[:, 1:] == tokenizer.pad_token_id] = -100
        ids = data["source_ids"].to(device, dtype=torch.long)
        mask = data["source_mask"].to(device, dtype=torch.long)

        outputs = model(
            input_ids=ids,
            attention_mask=mask,
            decoder_input_ids=y_ids,
            labels=lm_labels,
        )
        loss = outputs[0]

        if step % 5000 == 0:
            # Log the scalar value rather than the tensor repr (device, grad_fn, ...).
            training_logger.add_row(str(epoch), str(step), str(loss.item()))
            console.print(training_logger)

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

    # No explicit ``del`` of the batch locals: they are released when the function returns,
    # and deleting them raised UnboundLocalError when ``loader`` was empty.

def validate(tokenizer, model, device, loader, verbose=True):
    """
    Generate predictions for every batch of ``loader``.

    Args:
        tokenizer: used to decode generated, target and source token ids back to text.
        model: model exposing ``eval()`` and ``generate(...)``.
        device: device the batch tensors are moved to.
        loader: iterable of batches with "source_ids", "source_mask" and "target_ids".
        verbose: when True, print a progress line on every 5000th batch.

    Returns:
        Tuple ``(predictions, actuals, inputs)`` of equally ordered lists of strings;
        three empty lists when ``loader`` is empty.
    """
    task_prefix = "paraphrase: "

    def decode_batch(batch_ids):
        # Decode every sequence of a batch, dropping special tokens.
        return [
            tokenizer.decode(seq, skip_special_tokens=True, clean_up_tokenization_spaces=True)
            for seq in batch_ids
        ]

    model.eval()
    predictions = []
    actuals = []
    inputs = []
    with torch.no_grad():
        for step, data in enumerate(tqdm(loader), 0):
            y = data['target_ids'].to(device, dtype=torch.long)
            ids = data['source_ids'].to(device, dtype=torch.long)
            mask = data['source_mask'].to(device, dtype=torch.long)

            generated_ids = model.generate(
                input_ids=ids,
                attention_mask=mask,
                max_length=150,
                num_beams=2,
                repetition_penalty=2.5,
                length_penalty=1.0,
                early_stopping=True
            )
            preds = decode_batch(generated_ids)
            target = decode_batch(y)
            source = decode_batch(ids)

            # Strip only the first "paraphrase: " (the task prefix added when the dataset was
            # built); removing every occurrence would also alter sentences that happen to
            # contain that text themselves.
            source = [text.replace(task_prefix, '', 1).replace(' </s>', '') for text in source]
            target = [text.replace(" </s>", '') for text in target]

            if step % 5000 == 0 and verbose:
                console.print(f'Completed {step}')

            predictions.extend(preds)
            actuals.extend(target)
            inputs.extend(source)

    # No explicit ``del`` of the batch locals: they are released on return, and deleting
    # them raised UnboundLocalError when ``loader`` was empty.
    return predictions, actuals, inputs

In [16]:
# metrics_dict may be a defaultdict(list) or a plain dict
def add_to_metrics_dict(new_metrics, metrics_dict):
    """Append each value of ``new_metrics`` to the history list stored under the same key.

    Args:
        new_metrics: mapping metric name -> value of the latest evaluation.
        metrics_dict: mapping metric name -> list of values, updated in place.
    """
    for key, value in new_metrics.items():
        # setdefault creates the list on first use, so a plain dict works as well as
        # a defaultdict(list).
        metrics_dict.setdefault(key, []).append(value)

def print_metrics_dict(metrics_dict):
    """Print one line per metric: its name followed by its history rounded to 4 decimals."""
    for metric_name, history in metrics_dict.items():
        rounded_history = []
        for raw_value in history:
            # Round via the 4-decimal text form, then convert back to float for printing.
            rounded_history.append(float(f"{raw_value:.4f}"))
        print(f"{metric_name}: {rounded_history}")

In [17]:
def T5Trainer(
    train_dataset, val_dataset, source_text, target_text, model_params, output_dir="./outputs",
    model_name = THIS_MODEL_NAME, checkpoint_path=None
):

    """
    Fine-tune a T5 model on paraphrase pairs; after every epoch save predictions,
    metrics, a resumable checkpoint, the console log and the model.

    Args:
        train_dataset: DataFrame with the training pairs.
        val_dataset: DataFrame with the validation pairs.
        source_text: name of the column holding the input sentences.
        target_text: name of the column holding the reference paraphrases.
        model_params: dict of hyper-parameters. Keys read here: "MODEL", "SEED",
            "LEARNING_RATE", "ADAM_EPSILON", "WEIGHT_DECAY", "MAX_SOURCE_TEXT_LENGTH",
            "TRAIN_BATCH_SIZE", "VALID_BATCH_SIZE", "TRAIN_EPOCHS".
        output_dir: root directory; results/, checkpoints/ and models/ live below it.
        model_name: prefix used in every artifact file / directory name.
        checkpoint_path: optional checkpoint written by a previous run; when given,
            training resumes with the epoch after the stored one.
    """
    results_dir = f"{output_dir}/results"
    checkpoints_dir = f"{output_dir}/checkpoints"
    models_dir = f"{output_dir}/models"

    # exist_ok=True replaces the racy "check, then create" pair.
    os.makedirs(checkpoints_dir, exist_ok=True)
    os.makedirs(models_dir, exist_ok=True)

    # Set random seeds and deterministic pytorch for reproducibility
    torch.manual_seed(model_params["SEED"])  # pytorch random seed
    np.random.seed(model_params["SEED"])  # numpy random seed
    torch.backends.cudnn.deterministic = True

    # logging
    console.log(f"""[Model]: Loading {model_params["MODEL"]}...\n""")

    # tokenizer for encoding the text
    tokenizer = T5Tokenizer.from_pretrained(model_params["MODEL"])

    # If checkpoint is not given, start from the pretrained weights
    if checkpoint_path is None:

        # Pretrained encoder-decoder named by model_params["MODEL"]; it is moved to the
        # target device further down.
        model = T5ForConditionalGeneration.from_pretrained(model_params["MODEL"])

        # Defining the optimizer that will be used to tune the weights of the network in the training session.
        USE_PREPARED_OPTIMIZER = True
        if USE_PREPARED_OPTIMIZER:
            # This optimizer has weight decay for some params
            optimizer = get_optimizer(model, model_params["LEARNING_RATE"], model_params["ADAM_EPSILON"],
                                      model_params["WEIGHT_DECAY"])
        else:
            optimizer = AdamW(model.parameters(), lr=model_params["LEARNING_RATE"],
                              eps=model_params["ADAM_EPSILON"])

        # Fresh run: empty metric history, start at epoch 0
        metrics_dict = defaultdict(list)
        start_epoch = 0

    else:
        # Load data from checkpoint.
        # map_location lets a checkpoint written on a GPU be resumed on a CPU-only machine.
        # NOTE(review): the checkpoint is a pickle of whole objects - only load files you trust.
        checkpoint = torch.load(checkpoint_path, map_location=device)
        start_epoch = checkpoint['epoch'] + 1
        model = checkpoint['model']
        optimizer = checkpoint['optimizer']
        metrics_dict = checkpoint['metrics_dict']

    model = model.to(device)

    # logging
    console.log(f"[Data]: Reading data...\n")

    console.print(f"TRAIN Dataset: {train_dataset.shape}")
    console.print(f"TEST Dataset: {val_dataset.shape}\n")

    # Creating the Training and Validation dataset for further creation of Dataloader.
    # MAX_SOURCE_TEXT_LENGTH is the single max_len for sources AND targets;
    # MAX_TARGET_TEXT_LENGTH is not read here.
    training_set = ParaphraseDataset(
        tokenizer,
        data_dir=None,
        filename=None,
        source_col_name=source_text,
        target_col_name=target_text,
        data_frame=train_dataset,
        max_len=model_params["MAX_SOURCE_TEXT_LENGTH"],
    )
    val_set = ParaphraseDataset(
        tokenizer,
        data_dir=None,
        filename=None,
        source_col_name=source_text,
        target_col_name=target_text,
        data_frame=val_dataset,
        max_len=model_params["MAX_SOURCE_TEXT_LENGTH"],
    )

    # Keep the tokenized datasets next to the other outputs
    torch.save(
        {
            "training_set": training_set,
            "val_set": val_set
        },
        os.path.join(output_dir, f"sets.pth")
    )

    # Defining the parameters for creation of dataloaders
    train_params = {
        "batch_size": model_params["TRAIN_BATCH_SIZE"],
        "shuffle": True,
        "num_workers": 0,
    }

    val_params = {
        "batch_size": model_params["VALID_BATCH_SIZE"],
        "shuffle": False,
        "num_workers": 0,
    }

    # Creation of Dataloaders for training and validation.
    training_loader = DataLoader(training_set, **train_params)
    val_loader = DataLoader(val_set, **val_params)

    # Training loop
    console.log(f"[Initiating Fine Tuning]...\n")
    for epoch in range(start_epoch, model_params["TRAIN_EPOCHS"]):
        train(epoch, tokenizer, model, device, training_loader, optimizer)

        # Create current epoch directory.
        current_model_results_dir = os.path.join(results_dir, f"{model_name}-epoch-{epoch}")
        os.makedirs(current_model_results_dir, exist_ok=True)

        # Saving predictions and related data
        console.log(f"[Saving Predictions]...\n")
        predictions, actuals, inputs = validate(tokenizer, model, device, val_loader, verbose=False)
        final_df = pd.DataFrame({"input": inputs, "prediction": predictions, "reference": actuals})
        final_df.to_csv(os.path.join(current_model_results_dir, f"{model_name}-predictions-{epoch}.csv"))

        # Metrics are best effort: a failure is logged and training continues.
        console.log(f"[Try computing metrics]...\n")
        try:
            model = model.to("cpu") # Free gpu for metrics computes
            results = the_metrics.compute_metrics(inputs, predictions, actuals,
                                                  use_bertscore=True,
                                                  use_bleurt=True,
                                                  verbose=True,
                                                  bleurt_batch_size=8,
                                                  max_samples_bertscore=None,
                                                  max_samples_bleurt=None)
            add_to_metrics_dict(results, metrics_dict)
            print_metrics_dict(metrics_dict)
            metrics_path = os.path.join(current_model_results_dir, f"{model_name}-metrics-{epoch}.json")
            with open(metrics_path, 'w') as metrics_json:
                # default=float: metric values may be non-builtin numbers (e.g. numpy scalars -
                # TODO confirm against our_metrics) that json cannot serialize on its own.
                json.dump(metrics_dict, metrics_json, default=float)
        except Exception as exc:
            console.log(f"Exception occured while computing metrics:\n{exc}\nContinue training...")
        finally:
            # Always bring the model back, whether or not the metrics succeeded.
            model = model.to(device)

        # Saving the model and related state after every epoch
        console.log(f"[Saving Model And Optimizer]...\n")
        checkpoint = {
            'epoch': epoch,
            'model': model,
            'optimizer': optimizer,
            'metrics_dict': metrics_dict
        }
        torch.save(checkpoint, os.path.join(checkpoints_dir, f"{model_name}-checkpoint-{epoch}.pth"))

        console.save_text(os.path.join(current_model_results_dir, f"{model_name}-logs-{epoch}.txt"))
        # The directory name previously ended in a stray ")" that had slipped into the f-string.
        model.save_pretrained(os.path.join(models_dir, f"{model_name}-model-{epoch}"))

## Training

In [19]:
# True selects ADAM_EPSILON = 1e-8, False selects 1e-6 (see the "ADAM_EPSILON" entry below)
USE_FLORA_ADAM_EPSILON = True

# Hyper-parameters passed to T5Trainer
model_params = {
    "MODEL": "t5-small",  # pretrained checkpoint name, e.g. t5-small / t5-base / t5-large
    "TRAIN_BATCH_SIZE": 8,  # training batch size
    "VALID_BATCH_SIZE": 8,  # validation batch size
    "TRAIN_EPOCHS": 32,  # number of training epochs
    "VAL_EPOCHS": 1,  # number of validation epochs (not read by T5Trainer in this notebook)
    "LEARNING_RATE": 1e-4,  # learning rate
    "ADAM_EPSILON": 1e-8 if USE_FLORA_ADAM_EPSILON else 1e-6, # 1e-6 is get_optimizer's default epsilon
    "MAX_SOURCE_TEXT_LENGTH": 310,  # max token length; T5Trainer uses it for source AND target
    "MAX_TARGET_TEXT_LENGTH": 310,  # max length of target text (not read by T5Trainer in this notebook)
    "SEED": 42,  # set seed for reproducibility
    "WEIGHT_DECAY": 0.0 # Penalty parameter for high weights: weight_decay * ||w||^2
}

# Start fine-tuning; all artifacts go to <THIS_DIR>/models/<THIS_MODEL_NAME>
T5Trainer(
    train_dataset=train_df,
    val_dataset = eval_df, 
    source_text="sentence",
    target_text="paraphrase",
    model_params=model_params,
    output_dir=f"{THIS_DIR}/models/{THIS_MODEL_NAME}",
)

  0%|          | 0/12249 [00:00<?, ?it/s]Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.
100%|██████████| 12249/12249 [00:09<00:00, 1292.77it/s]
100%|██████████| 3063/3063 [00:02<00:00, 1284.36it/s]


Start of Building the dataloader: tokenization
End of Building the dataloader: tokenization
Start of Building the dataloader: tokenization
End of Building the dataloader: tokenization


  0%|          | 0/1532 [00:00<?, ?it/s]

 70%|███████   | 1078/1532 [13:55<05:49,  1.30it/s]