## Quantitative and Qualitative Evaluation

In [None]:
!pip install transformers
!pip install sentencepiece

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers
  Downloading transformers-4.25.1-py3-none-any.whl (5.8 MB)
[K     |████████████████████████████████| 5.8 MB 15.9 MB/s 
Collecting huggingface-hub<1.0,>=0.10.0
  Downloading huggingface_hub-0.11.1-py3-none-any.whl (182 kB)
[K     |████████████████████████████████| 182 kB 19.9 MB/s 
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1
  Downloading tokenizers-0.13.2-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.6 MB)
[K     |████████████████████████████████| 7.6 MB 29.2 MB/s 
Installing collected packages: tokenizers, huggingface-hub, transformers
Successfully installed huggingface-hub-0.11.1 tokenizers-0.13.2 transformers-4.25.1
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting sentencepiece
  Downloading sentencepiece-0.1.97-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)
[K   

In [None]:
from transformers import GPT2Tokenizer, GPT2LMHeadModel, AdamW, TextDataset, DataCollatorForLanguageModeling, Trainer, TrainingArguments
from transformers import T5Tokenizer, T5ForConditionalGeneration

import torch
import torch.nn as nn
import torch.nn.functional as F

from nltk.translate.bleu_score import sentence_bleu

from torch.utils.data import Dataset, DataLoader

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
import random

from tqdm.auto import tqdm

In [None]:
# Mount Google Drive; the model checkpoint loaded later lives under /content/drive.
from google.colab import drive

import shutil  # NOTE(review): not used by any cell visible in this notebook

drive.mount('/content/drive')


Mounted at /content/drive


In [None]:
# Run on the GPU when CUDA is available, otherwise fall back to the CPU.
device = 'cuda' if torch.cuda.is_available() else 'cpu'

In [None]:
# Show the GPU attached to this runtime (the command reports a driver error when there is none).
!nvidia-smi

NVIDIA-SMI has failed because it couldn't communicate with the NVIDIA driver. Make sure that the latest NVIDIA driver is installed and running.



In [None]:
# Read the source sentences and their labels, one example per line.
# `with` closes each file as soon as its lines have been read.
with open('/content/data_f.txt', 'r', encoding='UTF-8') as data_file_open:
    data_lines = data_file_open.readlines()
with open('/content/label_f.txt', 'r', encoding='UTF-8') as label_file_open:
    label_lines = label_file_open.readlines()

# Strip surrounding whitespace (including the trailing newline) from every line.
input_dataset = [item.strip() for item in data_lines]
output_dataset = [item.strip() for item in label_lines]

# Placeholder for the paired examples; filled in a later cell.
dataset = []

### Load the tokenizer and model (works for both `t5-small` and `t5-base`)

In [None]:
# Load the pretrained 't5-base' tokenizer and model and move the model to `device`.
# The model's pad_token_id is set to the tokenizer's EOS id at load time.
tokenizer = T5Tokenizer.from_pretrained('t5-base')
model = T5ForConditionalGeneration.from_pretrained('t5-base', pad_token_id=tokenizer.eos_token_id).to(device)

# Use EOS as the padding token on the tokenizer side as well.
# NOTE(review): presumably this mirrors the fine-tuning setup — confirm against the training notebook.
tokenizer.pad_token = tokenizer.eos_token

Downloading:   0%|          | 0.00/792k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.21k [00:00<?, ?B/s]

For now, this behavior is kept to avoid breaking backwards compatibility when padding/encoding with `truncation is True`.
- Be aware that you SHOULD NOT rely on t5-base automatically truncating your input to 512 when padding/encoding.
- If you want to encode/pad to sequences longer than 512 you can either instantiate this tokenizer with `model_max_length` or pass `max_length` when encoding/padding.


Downloading:   0%|          | 0.00/892M [00:00<?, ?B/s]

### Loading checkpoints

In [None]:
# Overwrite the pretrained weights with the checkpoint saved on Drive; tensors are mapped to CPU on load.
# NOTE(review): torch.load unpickles the file, so only load checkpoints from a trusted source.
model.load_state_dict(torch.load('/content/drive/MyDrive/ECE1786_Project/Models/T5_Base_Models/t5-base_0.0001lr_5epochs.pt', map_location=torch.device('cpu')))

<All keys matched successfully>

In [None]:
# Move the model (now holding the checkpoint weights) to the selected device.
model = model.to(device)

### Get Train & Validation Set

In [None]:
# Pair every input sentence with the label stored at the same position.
dataset = []

for i, source_sentence in enumerate(input_dataset):
    dataset.append([source_sentence, output_dataset[i]])

In [None]:
# Number of [input, target] pairs.
len(dataset)

42535

In [None]:
# 80/20 split with a fixed random_state: using the same random_state reproduces the
# train and validation sets that the model was trained and evaluated on.
train_dataset, val_dataset = train_test_split(dataset, test_size=0.2, random_state=0)

In [None]:
# Draw a fixed (seeded) 1000-example subset from each split; these are the sets scored with BLEU below.
_, train_sample = train_test_split(train_dataset, test_size=1000, random_state=0)
_, val_sample = train_test_split(val_dataset, test_size=1000, random_state=0)

## Quantitative Evaluation using BLEU Score

### Calculate BLEU Score

In [None]:
from math import exp
from typing import List, Sequence, Iterable

In [None]:
# Extract all contiguous subsequences (n-grams) of length n

def grouper(seq, n):
    """Return every contiguous length-``n`` slice of ``seq``, in order.

    The result is empty when ``seq`` is shorter than ``n``.
    """
    window_count = len(seq) - n + 1
    return [seq[start:start + n] for start in range(window_count)]

In [None]:
#Calculate the precision for a given order of n-gram

def n_gram_precision(reference, candidate, n):
    """Modified (clipped) n-gram precision of ``candidate`` against ``reference``.

    Each distinct candidate n-gram is credited at most as many times as it
    occurs in the reference, as BLEU requires. Without clipping, a candidate
    that repeats one matching token (e.g. "the the the the") would score 1.0.

    Args:
        reference: sequence of hashable tokens for the reference sentence.
        candidate: sequence of hashable tokens for the generated sentence.
        n: n-gram order.

    Returns:
        Clipped matches divided by the number of candidate n-grams, or 0.0
        when the candidate contains no n-gram of this order.
    """
    from collections import Counter  # local import keeps this cell self-contained

    def _ngram_counts(seq):
        # Tuples make the n-grams hashable so they can be counted.
        return Counter(tuple(seq[j:j + n]) for j in range(len(seq) - n + 1))

    candidate_counts = _ngram_counts(candidate)
    total = sum(candidate_counts.values())

    if total == 0:
        return 0.0

    reference_counts = _ngram_counts(reference)

    # Clip each candidate n-gram's count by its count in the reference.
    matched = sum(
        min(count, reference_counts[n_gram])
        for n_gram, count in candidate_counts.items()
    )

    return matched / total

In [None]:
#Calculate the brevity penalty between a reference and candidate

def brevity_penalty(reference, candidate):
    """Brevity penalty applied to a candidate that is not longer than its reference.

    Returns 0 for an empty candidate, 1 when the candidate is longer than the
    reference, and exp(1 - len(reference) / len(candidate)) otherwise.
    """
    candidate_len = len(candidate)

    # An empty candidate has no meaningful length ratio.
    if not candidate_len:
        return 0

    length_ratio = len(reference) / candidate_len

    return 1 if length_ratio < 1 else exp(1 - length_ratio)

In [None]:
# Calculate the BLEU score using n-gram orders 1 through n

def BLEU_score(reference, candidate, n):
    """Sentence-level BLEU built from n-gram orders 1 through ``n``.

    The score is the brevity penalty multiplied by the geometric mean of the
    per-order precisions, so a zero precision at any order gives a score of zero.
    """
    # Running product of the precisions for orders 1..n.
    precision_product = 1.0
    for order in range(1, n + 1):
        precision_product = precision_product * n_gram_precision(reference, candidate, order)

    # The n-th root of the product is the geometric mean.
    mean_precision = precision_product ** (1.0 / n)

    return brevity_penalty(reference, candidate) * mean_precision

In [None]:
# Sanity check on a toy pair whose sentences differ only in the last token.
BLEU_score(['Hello','I','am','a','boy'] , ['Hello','I','am','a','bot'],4)

0.668740304976422

### Evaluate using BLEU Score

In [None]:
def evaluate_BLEU(data):
    """Average sentence-level BLEU of the model's greedy outputs over ``data``.

    Each element of ``data`` is indexed as ``[input_text, target_text]``. The
    input is encoded with the module-level ``tokenizer`` and decoded greedily by
    the module-level ``model`` (at most 100 new tokens). The whitespace-split
    output is then scored against the whitespace-split target with ``BLEU_score``.

    The maximum n-gram order is the number of target tokens clamped to 1..3.
    An order larger than the target length would have zero precision and force
    the whole score to zero.

    Returns:
        Mean BLEU over all pairs, or 0.0 when ``data`` is empty.
    """
    sample_size = len(data)

    # Nothing to score; also avoids dividing by zero below.
    if sample_size == 0:
        return 0.0

    total_bleu = 0

    for i in range(sample_size):

        input_text = data[i][0].strip()
        target_text = data[i][1].strip()

        # Named input_ids (not `input`) so the built-in input() is not shadowed.
        input_ids = tokenizer.encode(input_text)
        encoded_output = model.generate(input_ids=torch.tensor([input_ids]).to(device), do_sample=False, max_new_tokens=100)
        output_text = tokenizer.decode(encoded_output[0], skip_special_tokens=True)

        target_split = target_text.split()
        output_split = output_text.split()

        # 1 for targets of up to one token, 2 for two tokens, 3 otherwise.
        n = max(1, min(len(target_split), 3))

        total_bleu += BLEU_score(target_split, output_split, n)

    return total_bleu / sample_size

#### Training Dataset

In [None]:
# Average BLEU over the 1000-example training sample (one generation call per example).
train_bleu = evaluate_BLEU(train_sample)

In [None]:
# Show the average BLEU on the training sample.
print(train_bleu)

0.2801343655550924


#### Validation Dataset

In [None]:
# Average BLEU over the 1000-example validation sample (one generation call per example).
val_bleu = evaluate_BLEU(val_sample)

In [None]:
# Show the average BLEU on the validation sample.
val_bleu

0.20825181909802717

## Qualitative Evaluation

### Inputs from Training set

In [None]:
# Look up one [input, target] pair from the training split.
train_dataset[2500]

['At what time tomorrow should I send the messenger to you?',
 'What o’clock tomorrow Shall I send to thee?']

In [None]:
# Sample one rewrite (temperature 0.9, at most 30 new tokens) for a single prompt.
input_text = "What I see and hear is like a dream!"
# Named input_ids (not `input`) so the built-in input() is not shadowed in the notebook.
input_ids = tokenizer.encode(input_text)
output = model.generate(input_ids=torch.tensor([input_ids]).to(device), do_sample=True, max_new_tokens=30, temperature=0.9)
print(input_text)
print(tokenizer.decode(output[0], skip_special_tokens=True))

What I see and hear is like a dream!
The dream is like, what I see and hear!


In [None]:
# Sample one rewrite (temperature 0.9, at most 30 new tokens) for a single prompt.
input_text = "At what time tomorrow should I send the messenger to you?"
# Named input_ids (not `input`) so the built-in input() is not shadowed in the notebook.
input_ids = tokenizer.encode(input_text)
output = model.generate(input_ids=torch.tensor([input_ids]).to(device), do_sample=True, max_new_tokens=30, temperature=0.9)
print(input_text)
print(tokenizer.decode(output[0], skip_special_tokens=True))

At what time tomorrow should I send the messenger to you?
What time of day should I send the messenger to thee?


In [None]:
# Greedy decoding (no sampling, at most 50 new tokens) for a single prompt.
input_text = "Wake up the lively and swift spirit of fun."
# Named input_ids (not `input`) so the built-in input() is not shadowed in the notebook.
input_ids = tokenizer.encode(input_text)
output = model.generate(input_ids=torch.tensor([input_ids]).to(device), do_sample=False, max_new_tokens=50)
print(input_text)
print(tokenizer.decode(output[0], skip_special_tokens=True))

Wake up the lively and swift spirit of fun.
Awake the lively and swift spirit of merry.


In [None]:
# Sample one rewrite (at most 50 new tokens) for a single prompt.
input_text = "It may be the last time you do so."
# Named input_ids (not `input`) so the built-in input() is not shadowed in the notebook.
input_ids = tokenizer.encode(input_text)
output = model.generate(input_ids=torch.tensor([input_ids]).to(device), do_sample=True, max_new_tokens=50)
print(input_text)
print(tokenizer.decode(output[0], skip_special_tokens=True))

It may be the last time you do so.
'Tis your last and last.


In [None]:
# Sample one rewrite (at most 50 new tokens) for a single prompt.
input_text = "Why are you looking for me?"
# Named input_ids (not `input`) so the built-in input() is not shadowed in the notebook.
input_ids = tokenizer.encode(input_text)
output = model.generate(input_ids=torch.tensor([input_ids]).to(device), do_sample=True, max_new_tokens=50)
print(input_text)
print(tokenizer.decode(output[0], skip_special_tokens=True))

Why are you looking for me?
How should you seek me?


### Inputs from Validation set

In [None]:
# Look up one [input, target] pair from the validation split.
val_dataset[33]

["I'll only confine myself to these clothes I'm wearing.",
 'I’ll confine myself no finer than I am.']

In [None]:
# Sample one rewrite (temperature 0.9, at most 50 new tokens) for a single prompt.
input_text = "I'll only confine myself to these clothes I'm wearing."
# Named input_ids (not `input`) so the built-in input() is not shadowed in the notebook.
input_ids = tokenizer.encode(input_text)
output = model.generate(input_ids=torch.tensor([input_ids]).to(device), do_sample=True, max_new_tokens=50, temperature=0.9)
print(input_text)
print(tokenizer.decode(output[0], skip_special_tokens=True))

I'll only confine myself to these clothes I'm wearing.
I’ll but be confined to these garments.


In [None]:
# Sample one rewrite (temperature 0.9, at most 50 new tokens) for a single prompt.
input_text = "Can you hear me, forest dweller?"
# Named input_ids (not `input`) so the built-in input() is not shadowed in the notebook.
input_ids = tokenizer.encode(input_text)
output = model.generate(input_ids=torch.tensor([input_ids]).to(device), do_sample=True, max_new_tokens=50, temperature=0.9)
print(input_text)
print(tokenizer.decode(output[0], skip_special_tokens=True))

Can you hear me, forest dweller?
Dare you hear me, forest dweller?


### Inputs from common English phrases

In [None]:
# Sample one rewrite (temperature 0.9, at most 50 new tokens) for a single prompt.
input_text = "Hello, how are you?"
# Named input_ids (not `input`) so the built-in input() is not shadowed in the notebook.
input_ids = tokenizer.encode(input_text)
output = model.generate(input_ids=torch.tensor([input_ids]).to(device), do_sample=True, max_new_tokens=50, temperature=0.9)
print(input_text)
print(tokenizer.decode(output[0], skip_special_tokens=True))

Hello, how are you?
How is ’t with you?


In [None]:
# Sample one rewrite (temperature 0.9, at most 50 new tokens) for a single prompt.
input_text = "Do not cry over spilt milk."
# Named input_ids (not `input`) so the built-in input() is not shadowed in the notebook.
input_ids = tokenizer.encode(input_text)
output = model.generate(input_ids=torch.tensor([input_ids]).to(device), do_sample=True, max_new_tokens=50, temperature=0.9)
print(input_text)
print(tokenizer.decode(output[0], skip_special_tokens=True))

Do not cry over spilt milk.
Weep not for spigot milk.


In [None]:
# Sample one rewrite (temperature 0.9, at most 50 new tokens) for a single prompt.
input_text = "Many congratulations to both of you!"
# Named input_ids (not `input`) so the built-in input() is not shadowed in the notebook.
input_ids = tokenizer.encode(input_text)
output = model.generate(input_ids=torch.tensor([input_ids]).to(device), do_sample=True, max_new_tokens=50, temperature=0.9)
print(input_text)
print(tokenizer.decode(output[0], skip_special_tokens=True))

Many congratulations to both of you!
Congratulations be to you both!


In [None]:
# Sample one rewrite (temperature 0.9, at most 50 new tokens) for a single prompt.
input_text = "I am hanging out with my dog."
# Named input_ids (not `input`) so the built-in input() is not shadowed in the notebook.
input_ids = tokenizer.encode(input_text)
output = model.generate(input_ids=torch.tensor([input_ids]).to(device), do_sample=True, max_new_tokens=50, temperature=0.9)
print(input_text)
print(tokenizer.decode(output[0], skip_special_tokens=True))

I am hanging out with my dog.
With my dog I am out.


In [None]:
# Sample one rewrite (temperature 0.9, at most 50 new tokens) for a single prompt.
input_text = "I am doing well, thanks for asking."
# Named input_ids (not `input`) so the built-in input() is not shadowed in the notebook.
input_ids = tokenizer.encode(input_text)
output = model.generate(input_ids=torch.tensor([input_ids]).to(device), do_sample=True, max_new_tokens=50, temperature=0.9)
print(input_text)
print(tokenizer.decode(output[0], skip_special_tokens=True))

I am doing well, thanks for asking. How about you?
I do well, 'cause I thank you.
