# Classification Task (Task 5)

## Load libraries

In [1]:
import numpy as np
import torch

device = torch.device("cuda:0") if torch.cuda.is_available() else torch.device("cpu")

import random

SEED = 595

def set_seed(seed):
    """Seed every random number generator this notebook relies on.

    Args:
        seed: Integer seed applied to Python's ``random``, NumPy's global
            generator and PyTorch (CPU, plus all CUDA devices when present).
    """
    # The CPU-side generators share the same one-argument seeding call.
    for seed_fn in (random.seed, np.random.seed, torch.manual_seed):
        seed_fn(seed)

    # CUDA generators are only seeded when a GPU is actually available.
    cuda_present = torch.cuda.is_available()
    if cuda_present:
        torch.cuda.manual_seed_all(seed)

! pip install transformers
! pip3 install datasets
! pip install evaluate
!pip install pytorch_lightning
!pip install sentencepiece

from transformers import pipeline, T5ForConditionalGeneration, T5Tokenizer,AutoTokenizer, BertForSequenceClassification
from datasets import load_dataset
from torch.utils.data import Dataset, DataLoader
from torch.optim import AdamW
from transformers import get_scheduler
import evaluate
from tqdm.auto import tqdm
import sentencepiece as spm
import textwrap
from tqdm.auto import tqdm
from sklearn import metrics
import argparse
import glob
import os
import json
import time
import logging
import random
import re
from itertools import chain
from string import punctuation

import nltk
nltk.download('punkt')
from nltk.tokenize import sent_tokenize
import pytorch_lightning as pl


from transformers import (
    AdamW,
    T5ForConditionalGeneration,
    T5Tokenizer,
    get_linear_schedule_with_warmup
)
import matplotlib.pyplot as plt

Collecting datasets
  Downloading datasets-2.15.0-py3-none-any.whl (521 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m521.2/521.2 kB[0m [31m4.3 MB/s[0m eta [36m0:00:00[0m
Collecting pyarrow-hotfix (from datasets)
  Downloading pyarrow_hotfix-0.6-py3-none-any.whl (7.9 kB)
Collecting dill<0.3.8,>=0.3.0 (from datasets)
  Downloading dill-0.3.7-py3-none-any.whl (115 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m115.3/115.3 kB[0m [31m6.1 MB/s[0m eta [36m0:00:00[0m
Collecting multiprocess (from datasets)
  Downloading multiprocess-0.70.15-py310-none-any.whl (134 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m134.8/134.8 kB[0m [31m7.5 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: pyarrow-hotfix, dill, multiprocess, datasets
Successfully installed datasets-2.15.0 dill-0.3.7 multiprocess-0.70.15 pyarrow-hotfix-0.6
Collecting evaluate
  Downloading evaluate-0.4.1-py3-none-any.whl (84 kB)
[2K     [90m━━━

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


## Load Model, tokenizer

In [None]:
# Load the pretrained 't5-small' weights and the matching tokenizer.
# NOTE: `model` is rebound to a T5FineTuner instance in the training cell;
# `tokenizer` is reused by the dataset preview and the evaluation cells.
model = T5ForConditionalGeneration.from_pretrained('t5-small')
tokenizer = T5Tokenizer.from_pretrained('t5-small')

## Load Data and setup tokenizer

In [None]:
# Load the "subtask5.english" configuration of "sem_eval_2018_task_1".
# TweetDataset._build reads this module-level `dataset` by name, so it must
# still refer to the loaded splits whenever a TweetDataset is constructed.
dataset = load_dataset("sem_eval_2018_task_1", "subtask5.english")
class TweetDataset(Dataset):
  """Text-to-text view of the tweet-emotion data held in the global ``dataset``.

  Every tweet becomes exactly one example. The source text is the tweet and
  the target text is the tweet's positive emotion labels joined with
  ``LABEL_SEPARATOR`` (in ``EMOTIONS`` order), or ``NO_EMOTION_LABEL`` when
  no emotion column is set for that tweet.

  Args:
    tokenizer: Object exposing ``batch_encode_plus`` that returns a mapping
      with ``"input_ids"`` and ``"attention_mask"`` tensors.
    type_path: ``"train"`` selects the train split; any other value selects
      the validation split.
    max_len: Length every source and target sequence is padded/truncated to.
  """

  EMOTIONS = (
    "anger", "anticipation", "disgust", "fear", "joy", "love",
    "optimism", "pessimism", "sadness", "surprise", "trust",
  )
  LABEL_SEPARATOR = ", "
  NO_EMOTION_LABEL = "neutral"

  def __init__(self, tokenizer, type_path,  max_len=512):
    self.max_len = max_len
    self.tokenizer = tokenizer
    self.type_path = type_path
    self.inputs = []
    self.targets = []

    self._build()

  def __len__(self):
    """Return the number of encoded examples."""
    return len(self.inputs)

  def __getitem__(self, index):
    """Return the encoded source/target ids and masks of one example."""
    source = self.inputs[index]
    target = self.targets[index]
    # Each encoding holds a batch of one; drop only that leading dimension.
    return {
      "source_ids": source["input_ids"].squeeze(0),
      "source_mask": source["attention_mask"].squeeze(0),
      "target_ids": target["input_ids"].squeeze(0),
      "target_mask": target["attention_mask"].squeeze(0),
    }

  def _encode(self, text):
    """Tokenize one string, padded and truncated to ``self.max_len``."""
    return self.tokenizer.batch_encode_plus(
      [text],
      max_length=self.max_len,
      padding="max_length",
      truncation=True,
      return_tensors="pt",
    )

  def _build(self):
    """Encode every tweet of the selected split together with its labels."""
    split = "train" if self.type_path == "train" else "validation"
    data = dataset[split]

    # Fetch each column once instead of re-reading it for every row.
    tweets = data["Tweet"]
    flags = {emotion: data[emotion] for emotion in self.EMOTIONS}

    for row, tweet in enumerate(tweets):
      # Labels are collected per tweet so they never leak between examples.
      labels = [emotion for emotion in self.EMOTIONS if flags[emotion][row]]
      if labels:
        target_text = self.LABEL_SEPARATOR.join(labels)
      else:
        target_text = self.NO_EMOTION_LABEL

      self.inputs.append(self._encode(tweet))
      self.targets.append(self._encode(target_text))
def get_dataset(tokenizer, type_path, args):
    """Create the TweetDataset for one split.

    Args:
        tokenizer: Tokenizer handed through to TweetDataset.
        type_path: Split selector handed through to TweetDataset.
        args: Object whose ``max_seq_length`` becomes the dataset's ``max_len``.

    Returns:
        The constructed TweetDataset.
    """
    sequence_limit = args.max_seq_length
    return TweetDataset(tokenizer=tokenizer, type_path=type_path, max_len=sequence_limit)


In [None]:
# Sanity check: encode the data once and decode a single example back to text.
args_dict=dict(max_seq_length=40)
args = argparse.Namespace(**args_dict)

# The preview reads the *train* split, so the variable is named for what it
# holds rather than `val_dataset`.
preview_dataset = get_dataset(tokenizer=tokenizer, type_path="train", args=args)
preview_example = preview_dataset[1]
print(tokenizer.decode(preview_example['source_ids']))
print(tokenizer.decode(preview_example['target_ids']))

## Training

In [None]:
# Training configuration. The dict is wrapped in an argparse.Namespace so the
# values can be read as attributes (args.learning_rate, ...) by T5FineTuner
# and by the Trainer setup.
args_dict = dict(
    output_dir="./", # path to save the checkpoints
    model_name_or_path='t5-small',
    tokenizer_name_or_path='t5-small',
    max_seq_length=50,  # passed to TweetDataset as max_len by get_dataset
    learning_rate=2e-4,
    weight_decay=0.0,
    adam_epsilon=1e-8,
    warmup_steps=0,  # warmup steps for the linear learning-rate schedule
    train_batch_size=16,
    eval_batch_size=8,
    num_train_epochs=50,  # used as the Trainer's max_epochs
    # gradient_accumulation_steps=4,
    n_gpu=1,
    # early_stop_callback=False,
    fp_16=False, # if you want to enable 16-bit training then install apex and set this to true
    opt_level='O1', # you can find out more on optimisation levels here https://nvidia.github.io/apex/amp.html#opt-levels-and-properties
    max_grad_norm=1.0, # if you enable 16-bit training then set this to a sensible value, 0.5 is a good default
    seed=42,
)
# NOTE(review): opt_level and max_grad_norm do not appear to be read by any
# active code in this notebook - confirm before relying on them.
args = argparse.Namespace(**args_dict)

class T5FineTuner(pl.LightningModule):
  """Lightning wrapper that fine-tunes a pretrained T5 model on the tweet data.

  Args:
    hparams: Namespace-like object providing ``model_name_or_path``,
      ``tokenizer_name_or_path``, ``learning_rate``, ``weight_decay``,
      ``adam_epsilon``, ``warmup_steps``, ``train_batch_size``,
      ``eval_batch_size`` and ``max_seq_length``.

  Notes:
    * ``training_step_outputs`` collects one detached loss per training
      step. This class never clears it; an epoch-end consumer (for example
      a callback) is expected to read and clear it.
    * ``validation_step_outputs`` collects detached validation losses and is
      averaged, logged as ``"val_loss"`` and cleared at the end of every
      validation epoch.
    * The learning-rate scheduler is returned from ``configure_optimizers``
      with a per-step interval, so Lightning advances it after every
      optimizer step.
  """

  def __init__(self, hparams):
    super().__init__()
    print(hparams.model_name_or_path)
    # Stored as `hparamss` (sic); presumably to avoid clashing with the
    # `hparams` attribute Lightning manages itself - TODO confirm.
    self.hparamss = hparams
    self.validation_step_outputs = []
    self.training_step_outputs = []
    # Both are populated by configure_optimizers.
    self.opt = None
    self.lr_scheduler = None

    self.model = T5ForConditionalGeneration.from_pretrained(hparams.model_name_or_path)
    self.tokenizer = T5Tokenizer.from_pretrained(hparams.tokenizer_name_or_path)

  def is_logger(self):
    """Return True when this process has global rank 0 (or lower)."""
    return self.trainer.global_rank <= 0

  def forward(
      self, input_ids, attention_mask=None, decoder_input_ids=None, decoder_attention_mask=None,labels=None
  ):
    """Run the wrapped T5 model and return its output object."""
    return self.model(
        input_ids,
        attention_mask=attention_mask,
        decoder_input_ids=decoder_input_ids,
        decoder_attention_mask=decoder_attention_mask,
        labels=labels,
    )

  def _step(self, batch):
    """Compute the sequence-to-sequence loss for one batch.

    Args:
      batch: Mapping with ``source_ids``, ``source_mask``, ``target_ids`` and
        ``target_mask`` tensors, as produced by TweetDataset.

    Returns:
      The loss tensor (first element of the model output).
    """
    # Clone first: masking padding with -100 in place would otherwise
    # overwrite the caller's batch["target_ids"].
    lm_labels = batch["target_ids"].clone()
    lm_labels[lm_labels == self.tokenizer.pad_token_id] = -100

    outputs = self(
        input_ids=batch["source_ids"],
        attention_mask=batch["source_mask"],
        labels=lm_labels,
        decoder_attention_mask=batch['target_mask']
    )

    return outputs[0]

  def training_step(self, batch, batch_idx):
    """Return the training loss and record it for logging."""
    loss = self._step(batch)
    # Keep only a detached copy so the autograd graph of every step is not
    # held alive until the end of the epoch.
    self.training_step_outputs.append(loss.detach())
    self.log("train_loss", loss, on_step=True, on_epoch=True, prog_bar=True)
    return loss

  def validation_step(self, batch, batch_idx):
    """Return the validation loss and buffer it for the epoch average."""
    loss = self._step(batch)
    self.validation_step_outputs.append(loss.detach())
    return loss

  def on_validation_epoch_end(self):
    """Log the mean validation loss as ``val_loss`` and reset the buffer."""
    if not self.validation_step_outputs:
      return
    avg_loss = torch.stack(self.validation_step_outputs).mean()
    self.log("val_loss", avg_loss, prog_bar=True)
    self.validation_step_outputs.clear()  # free memory

  def configure_optimizers(self):
    """Prepare optimizer and schedule (linear warmup and decay).

    Returns:
      A Lightning optimizer configuration dict holding the AdamW optimizer
      and a linear warmup/decay scheduler that is stepped once per
      optimizer step.
    """
    model = self.model
    # "LayerNorm.weight" is the usual BERT-style name; "layer_norm.weight"
    # is added because T5's normalisation parameters appear to use that
    # spelling - TODO confirm against model.named_parameters().
    no_decay = ["bias", "LayerNorm.weight", "layer_norm.weight"]
    decay_params = []
    no_decay_params = []
    for name, param in model.named_parameters():
      if any(nd in name for nd in no_decay):
        no_decay_params.append(param)
      else:
        decay_params.append(param)

    optimizer_grouped_parameters = [
        {"params": decay_params, "weight_decay": self.hparamss.weight_decay},
        {"params": no_decay_params, "weight_decay": 0.0},
    ]
    # Use the PyTorch implementation explicitly: the bare name `AdamW` in this
    # notebook resolves to the deprecated transformers class.
    optimizer = torch.optim.AdamW(
        optimizer_grouped_parameters,
        lr=self.hparamss.learning_rate,
        eps=self.hparamss.adam_epsilon,
    )
    self.opt = optimizer

    # Let Lightning report how many optimizer steps the run will perform so
    # the linear decay reaches zero exactly at the end of training.
    total_steps = self.trainer.estimated_stepping_batches
    scheduler = get_linear_schedule_with_warmup(
        optimizer,
        num_warmup_steps=self.hparamss.warmup_steps,
        num_training_steps=total_steps,
    )
    self.lr_scheduler = scheduler

    return {
        "optimizer": optimizer,
        "lr_scheduler": {"scheduler": scheduler, "interval": "step", "frequency": 1},
    }

  def get_tqdm_dict(self):
    """Return the loss and current learning rate formatted for a progress bar.

    NOTE(review): reads ``self.trainer.avg_loss``; nothing in this notebook
    calls this method - confirm the attribute exists before relying on it.
    """
    tqdm_dict = {"loss": "{:.3f}".format(self.trainer.avg_loss), "lr": self.lr_scheduler.get_last_lr()[-1]}

    return tqdm_dict

  def train_dataloader(self):
    """Build the shuffled training DataLoader (incomplete last batch dropped)."""
    train_dataset = get_dataset(tokenizer=self.tokenizer, type_path="train", args=self.hparamss)
    return DataLoader(
        train_dataset,
        batch_size=self.hparamss.train_batch_size,
        drop_last=True,
        shuffle=True,
        num_workers=2,
    )

  def val_dataloader(self):
    """Build the validation DataLoader."""
    val_dataset = get_dataset(tokenizer=self.tokenizer, type_path="val", args=self.hparamss)
    return DataLoader(val_dataset, batch_size=self.hparamss.eval_batch_size, num_workers=2)
# Module-level logger; in the visible code it is only referenced from
# commented-out callback hooks.
logger = logging.getLogger(__name__)

class LoggingCallback(pl.Callback):
    """Track the mean training loss per epoch and plot the curve after training.

    Args:
        plot_path: File the loss curve is saved to when training ends.

    Attributes:
        collection: List with one float (mean training loss) per epoch.
    """

    def __init__(self, plot_path='t5_2.png'):
        super().__init__()
        self.collection = []
        self.plot_path = plot_path

    def on_train_epoch_end(self, trainer, pl_module):
        """Average ``pl_module.training_step_outputs``, record it, then clear it."""
        step_losses = pl_module.training_step_outputs
        # torch.stack raises on an empty list, e.g. when no step was recorded.
        if not step_losses:
            return

        # Detach so the average never keeps an autograd graph alive.
        epoch_mean = torch.stack([loss.detach() for loss in step_losses]).mean()
        print(epoch_mean)
        self.collection.append(epoch_mean.cpu().item())
        pl_module.log("training_epoch_mean", epoch_mean)
        # free up the memory
        step_losses.clear()

    def on_train_end(self, trainer, pl_module):
        """Print the collected epoch means and save them as a line plot."""
        print(self.collection)
        # Draw on a dedicated figure instead of whatever figure is current.
        fig, ax = plt.subplots()
        ax.plot(range(len(self.collection)), self.collection)
        ax.set(title="Training loss per epoch", xlabel="epoch", ylabel="mean training loss")
        fig.savefig(self.plot_path)

#     def on_test_end(self, trainer, pl_module):
#         logger.info("***** Test results *****")

#         if pl_module.is_logger():
#           metrics = trainer.callback_metrics

#           # Log and save results to file
#           output_test_results_file = os.path.join(pl_module.hparamss.output_dir, "test_results.txt")
#           with open(output_test_results_file, "w") as writer:
#             for key in sorted(metrics):
#               if key not in ["log", "progress_bar"]:
#                 logger.info("{} = {}\n".format(key, str(metrics[key])))
#                 writer.write("{} = {}\n".format(key, str(metrics[key])))
args = argparse.Namespace(**args_dict)

# Seed Python, NumPy and PyTorch before the model and dataloaders are created
# so weight initialisation and shuffling are repeatable across runs.
set_seed(args.seed)

# NOTE: this checkpoint callback is created but not passed to the Trainer
# below. Adding it to `callbacks` requires the LightningModule to log a
# "val_loss" metric, otherwise the monitored key cannot be found.
checkpoint_callback = pl.callbacks.ModelCheckpoint(
     dirpath=args.output_dir,monitor="val_loss", mode="min", save_top_k=5
)

train_params = dict(
    max_epochs=args.num_train_epochs,
    precision= 16 if args.fp_16 else 32,
    callbacks=[LoggingCallback()],
)

# `model` is the Lightning wrapper from here on; the underlying T5 network is
# available as `model.model`.
model = T5FineTuner(args)
trainer = pl.Trainer(**train_params)
trainer.fit(model)

## Evaluation

In [None]:

# Qualitative check: generate predictions for one random batch and print them
# next to the tweet and the reference label.
#
# The encoded data gets its own name: TweetDataset reads the module-level
# `dataset`, so rebinding that name here would break every later
# TweetDataset(...) construction.
# NOTE: TweetDataset maps every type_path other than "train" to the
# validation split, so 'test' reads validation data.
test_dataset = TweetDataset(tokenizer, 'test', 40)
loader = DataLoader(test_dataset, batch_size=32, shuffle=True)
batch = next(iter(loader))

# Move the network explicitly so weights and inputs share one device,
# wherever the trainer left the module.
generator = model.model.to(device)
generator.eval()
outs = generator.generate(input_ids=batch['source_ids'].to(device),
                          attention_mask=batch['source_mask'].to(device),
                          max_length=40)

# Strip pad/eos tokens so the printed strings are directly comparable.
dec = [tokenizer.decode(ids, skip_special_tokens=True) for ids in outs]

texts = [tokenizer.decode(ids, skip_special_tokens=True) for ids in batch['source_ids']]
targets = [tokenizer.decode(ids, skip_special_tokens=True) for ids in batch['target_ids']]

# Iterate over what was actually returned; the batch may hold fewer than 32 rows.
for text, target, prediction in zip(texts, targets, dec):
    lines = textwrap.wrap("text:\n%s\n" % text, width=100)
    print("\n".join(lines))
    print("\nActual sentiment: %s" % target)
    print("predicted sentiment: %s" % prediction)
    print("=====================================================================\n")

In [None]:
# Quantitative check: generate a prediction for every example and score the
# decoded strings against the decoded reference labels.
#
# Use a dedicated name instead of rebinding the module-level `dataset`, which
# TweetDataset reads when it is constructed.
# NOTE: TweetDataset maps every type_path other than "train" to the
# validation split, so 'test' reads validation data.
test_dataset = TweetDataset(tokenizer, 'test', 40)
loader = DataLoader(test_dataset, batch_size=32, num_workers=4)

# Keep weights and inputs on the same device.
model.model.to(device)
model.model.eval()
outputs = []
targets = []
for batch in tqdm(loader):
  outs = model.model.generate(input_ids=batch['source_ids'].to(device),
                              attention_mask=batch['source_mask'].to(device),
                              max_length=40)

  # Special tokens (pad/eos) are skipped; otherwise predictions and
  # references can never compare equal as strings.
  dec = [tokenizer.decode(ids, skip_special_tokens=True).strip() for ids in outs]
  target = [tokenizer.decode(ids, skip_special_tokens=True).strip() for ids in batch["target_ids"]]

  outputs.extend(dec)
  targets.extend(target)

# Score once, after all batches have been collected.
print("accuracy: %.4f" % metrics.accuracy_score(targets, outputs))
print(metrics.classification_report(targets, outputs))