## Quantitative and Qualitative Evaluation of GPT2

In [None]:
!pip install transformers

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers
  Downloading transformers-4.25.1-py3-none-any.whl (5.8 MB)
[K     |████████████████████████████████| 5.8 MB 4.8 MB/s 
[?25hCollecting huggingface-hub<1.0,>=0.10.0
  Downloading huggingface_hub-0.11.1-py3-none-any.whl (182 kB)
[K     |████████████████████████████████| 182 kB 56.9 MB/s 
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1
  Downloading tokenizers-0.13.2-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.6 MB)
[K     |████████████████████████████████| 7.6 MB 48.4 MB/s 
Installing collected packages: tokenizers, huggingface-hub, transformers
Successfully installed huggingface-hub-0.11.1 tokenizers-0.13.2 transformers-4.25.1


In [None]:
from transformers import GPT2Tokenizer, GPT2LMHeadModel, AdamW, TextDataset, DataCollatorForLanguageModeling, Trainer, TrainingArguments
from transformers import T5Tokenizer, T5ForConditionalGeneration

import torch
import torch.nn as nn
import torch.nn.functional as F

from nltk.translate.bleu_score import sentence_bleu

from torch.utils.data import Dataset, DataLoader

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
import random

from tqdm.auto import tqdm

In [None]:
import shutil

from google.colab import drive

# Colab-only step: mount Google Drive; the saved checkpoint is read from under this path.
DRIVE_MOUNT_POINT = '/content/drive'
drive.mount(DRIVE_MOUNT_POINT)


Mounted at /content/drive


In [None]:
# Run on the GPU when PyTorch can see one, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'

In [None]:
# Print the GPU status of the runtime. The saved output below shows the command
# failing to reach an NVIDIA driver in the session that produced it.
!nvidia-smi

NVIDIA-SMI has failed because it couldn't communicate with the NVIDIA driver. Make sure that the latest NVIDIA driver is installed and running.



In [None]:
# Read the two parallel text files. Line i of the data file is later paired with
# line i of the label file, so both are loaded in file order.
# `with` guarantees the handles are closed even if reading fails.
with open('/content/data_f.txt', 'r', encoding='UTF-8') as data_file_open:
    data_lines = data_file_open.readlines()
with open('/content/label_f.txt', 'r', encoding='UTF-8') as label_file_open:
    label_lines = label_file_open.readlines()

# Drop the trailing newline and surrounding whitespace from every line.
input_dataset = [item.strip() for item in data_lines]
output_dataset = [item.strip() for item in label_lines]

# Filled in later, once inputs and labels are joined into single strings.
dataset = []

### Works for both GPT2 and GPT2-Medium

In [None]:
# Name of the pretrained checkpoint to start from.
# NOTE(review): per the section heading, 'gpt2-medium' should work here as well — confirm.
MODEL_NAME = 'gpt2'

tokenizer = GPT2Tokenizer.from_pretrained(MODEL_NAME)
# Reuse the end-of-sequence token as the padding token.
tokenizer.pad_token = tokenizer.eos_token

model = GPT2LMHeadModel.from_pretrained(MODEL_NAME, pad_token_id=tokenizer.eos_token_id)

### Loading saved model checkpoints

In [None]:
# Restore the fine-tuned weights. map_location remaps the stored tensors onto the
# device selected above, so a checkpoint saved on a GPU also loads on a CPU-only runtime.
model.load_state_dict(
    torch.load(
        '/content/drive/MyDrive/ECE1786_Project/Models/GPT 2/gpt2_0frozen_8e-05lr_5epochs.pt',
        map_location=device,
    )
)

<All keys matched successfully>

In [None]:
# Move the model onto the device selected earlier ('cuda' or 'cpu').
model = model.to(device)

### Get Train & Validation Set

In [None]:
# Join every input line with its label line into one "<input> = <label>" string.
dataset = [
    ' = '.join((input_dataset[row].strip(), output_dataset[row].strip()))
    for row in range(len(input_dataset))
]

In [None]:
# Number of "<input> = <label>" strings assembled above.
len(dataset)

42535

In [None]:
# Hold out 20% of the examples for validation; random_state is fixed so the split is repeatable.
train_dataset, val_dataset = train_test_split(dataset, test_size=0.2, random_state=0)

In [None]:
# Draw a fixed 1000-example subset from each split for evaluation.
# train_test_split returns (rest, subset); indexing with [1] keeps only the subset and
# avoids assigning to `_`, which IPython uses for the previous cell's output.
train_sample = train_test_split(train_dataset, test_size=1000, random_state=0)[1]
val_sample = train_test_split(val_dataset, test_size=1000, random_state=0)[1]

In [None]:
# Show the first example of each evaluation subset, one per line.
print(train_sample[0], val_sample[0], sep='\n')

What is my nation? = What ish my nation?
Go back again, you slave, and bring him home. = Go back again, thou slave, and fetch him home.


## Quantitative Evaluation using BLEU Score

### Calculate BLEU Score

In [None]:
from math import exp
from typing import List, Sequence, Iterable

In [None]:
#Extract all subsequences of length n

def grouper(seq, n):
    """Return every contiguous window of length ``n`` taken from ``seq``, left to right.

    Windows are produced by slicing, so each one has the same type as a slice
    of ``seq`` (a list for a list, a substring for a string). When ``n`` is
    larger than ``len(seq)`` the result is an empty list.
    """
    window_count = len(seq) - n + 1
    return [seq[start:start + n] for start in range(window_count)]

In [None]:
#Calculate the precision for a given order of n-gram

def n_gram_precision(reference, candidate, n):
    """Modified (clipped) n-gram precision of ``candidate`` against ``reference``.

    Each distinct candidate n-gram is credited at most as many times as it
    occurs in the reference, so repeating a correct word cannot inflate the
    score (the "modified precision" used by BLEU).

    Args:
        reference: sequence of reference tokens.
        candidate: sequence of candidate tokens.
        n: n-gram order.

    Returns:
        Clipped match count divided by the number of candidate n-grams, or
        0.0 when the candidate has no n-grams of this order.
    """
    from collections import Counter

    # n-grams come back as slices (lists); tuples make them hashable for counting.
    candidate_counts = Counter(tuple(gram) for gram in grouper(candidate, n))
    total = sum(candidate_counts.values())
    if total == 0:
        return 0.0

    reference_counts = Counter(tuple(gram) for gram in grouper(reference, n))

    # Clip every candidate count by the count of the same n-gram in the reference.
    matched = sum(
        min(count, reference_counts[gram])
        for gram, count in candidate_counts.items()
    )
    return matched / total

In [None]:
#Calculate the brevity penalty between a reference and candidate

def brevity_penalty(reference, candidate):
    """Penalty factor for candidates that are shorter than the reference.

    Returns 0 for an empty candidate, 1 when the candidate is strictly longer
    than the reference, and ``exp(1 - len(reference) / len(candidate))``
    otherwise (which equals 1.0 when both lengths match).
    """
    candidate_length = len(candidate)
    if candidate_length == 0:
        return 0

    length_ratio = len(reference) / candidate_length
    return 1 if length_ratio < 1 else exp(1 - length_ratio)

In [None]:
#Calculate the BLEU score for n gram

def BLEU_score(reference, candidate, n):
    """BLEU of ``candidate`` against a single ``reference`` using orders 1..n.

    The score is the geometric mean of the n-gram precisions for every order
    from 1 to ``n``, multiplied by the brevity penalty. Raises
    ZeroDivisionError when ``n`` is 0.
    """
    precision_product = 1.0
    for order in range(1, n + 1):
        precision_product = precision_product * n_gram_precision(reference, candidate, order)

    # n-th root of the product of the n precisions.
    geometric_mean = precision_product ** (1.0 / n)

    return brevity_penalty(reference, candidate) * geometric_mean

In [None]:
# Sanity check: 4-gram BLEU for a five-token candidate that differs from the reference only in its last token.
BLEU_score(['Hello','I','am','a','boy'] , ['Hello','I','am','a','bot'],4)

0.668740304976422

In [None]:
# Make sure the model is on the selected device before generating
# (repeats the call made after the checkpoint was loaded).
model = model.to(device)

### Evaluate using BLEU Score

In [None]:
def evaluate_BLEU(data, max_n=3):
    """Average BLEU of greedy generations over ``data``.

    Every item of ``data`` is a string of the form "<input> = <target>". The
    input plus the " = " separator is given to the notebook-level ``model`` as
    a prompt, the greedy continuation up to the next "=" is taken as the
    prediction, and it is scored against the target with ``BLEU_score`` on
    whitespace-separated tokens.

    Args:
        data: sequence of "<input> = <target>" strings.
        max_n: highest n-gram order used for BLEU. Targets with fewer tokens
            use their own token count instead (never less than 1).

    Returns:
        Mean BLEU over all items as a float; 0.0 when ``data`` is empty.

    Raises:
        ValueError: if an item contains no "=" separator.
    """
    sample_size = len(data)
    if sample_size == 0:
        # Nothing to score; avoids dividing by zero at the end.
        return 0.0

    total_bleu = 0.0

    for example in data:
        # Split on the first "=" only, so a target that itself contains "=" stays whole.
        source_text, separator, target_text = example.partition('=')
        if not separator:
            raise ValueError("expected '<input> = <target>' but got: %r" % (example,))

        prompt_text = source_text.strip() + ' = '
        target_text = target_text.strip()

        prompt_ids = tokenizer.encode(prompt_text)
        target_ids = tokenizer.encode(target_text)

        # Leave some slack beyond the target length so the prediction is not cut short.
        max_new_tokens = len(target_ids) + 10

        encoded_output = model.generate(
            input_ids=torch.tensor([prompt_ids]).to(device),
            do_sample=False,
            max_new_tokens=max_new_tokens,
        )

        # The returned sequence starts with the prompt; decode only the newly generated tokens.
        generated_ids = encoded_output[0][len(prompt_ids):]
        generated_text = tokenizer.decode(generated_ids, skip_special_tokens=True)

        # Keep only the text before the next "=": anything after it is a further continuation.
        output_text = generated_text.split('=')[0].strip()

        target_split = target_text.split()
        output_split = output_text.split()

        # Cap the n-gram order at max_n, and at the target length for short targets.
        n = max(1, min(len(target_split), max_n))

        total_bleu += BLEU_score(target_split, output_split, n)

    return total_bleu / sample_size

#### Training Dataset

In [None]:
# Score the training subset; this runs one generate() call per example.
train_bleu = evaluate_BLEU(train_sample)

In [None]:
# Mean BLEU over the training subset.
print(train_bleu)

0.22678554024444345


#### Validation Dataset

In [None]:
# Score the validation subset; this runs one generate() call per example.
val_bleu = evaluate_BLEU(val_sample)

In [None]:
# Mean BLEU over the validation subset.
val_bleu

0.17940444503692402

## Qualitative Evaluation

### Inputs from Train set

In [None]:
# NOTE(review): the dataset entries built earlier are single "<input> = <label>" strings,
# but the saved output of this cell shows a two-element list, so the output looks
# stale. Re-run to confirm.
train_dataset[2500]

['At what time tomorrow should I send the messenger to you?',
 'What o’clock tomorrow Shall I send to thee?']

In [None]:
# Fix the sampling seed so this example is reproducible; change it to see other samples.
torch.manual_seed(0)

input_text="What I see and hear is like a dream!"
# Named prompt_ids (not `input`) so the builtin input() is not shadowed.
prompt_ids = tokenizer.encode(input_text)
output = model.generate(input_ids=torch.tensor([prompt_ids]).to(device), do_sample=True, max_new_tokens=30, temperature=0.9)
print(input_text)
print(tokenizer.decode(output[0], skip_special_tokens=True))

What I see and hear is like a dream!
What I see and hear is like a dream! = As I see and hear in a dream! = as my wits bear, and hiss my tongue, a dreams, A dream  as


In [None]:
# Fix the sampling seed so this example is reproducible; change it to see other samples.
torch.manual_seed(0)

input_text="At what time tomorrow should I send the messenger to you?"
# Named prompt_ids (not `input`) so the builtin input() is not shadowed.
prompt_ids = tokenizer.encode(input_text)
output = model.generate(input_ids=torch.tensor([prompt_ids]).to(device), do_sample=True, max_new_tokens=30, temperature=0.9)
print(input_text)
print(tokenizer.decode(output[0], skip_special_tokens=True))

At what time tomorrow should I send the messenger to you?
At what time tomorrow should I send the messenger to you? = At what hour tomorrow should I send the herald to you? = Whence shall the herald come tomorrow? = What hour shall I send you tomorrow


In [None]:
# Fix the sampling seed so this example is reproducible; change it to see other samples.
torch.manual_seed(0)

# This prompt already ends with the " = " separator.
input_text="Wake up the lively and swift spirit of fun. = "
# Named prompt_ids (not `input`) so the builtin input() is not shadowed.
prompt_ids = tokenizer.encode(input_text)
output = model.generate(input_ids=torch.tensor([prompt_ids]).to(device), do_sample=True, max_new_tokens=50)
print(input_text)
print(tokenizer.decode(output[0], skip_special_tokens=True))

Wake up the lively and swift spirit of fun. = 
Wake up the lively and swift spirit of fun. = ! The lively and swift spirit Of fair amusement awakens! = Sleep, awake the spirit of fun. ! Sleep, awake, the spirit of jest! = Break, break, fall, lull! = Break, break, lull! Break


In [None]:
# Fix the sampling seed so this example is reproducible; change it to see other samples.
torch.manual_seed(0)

input_text="It may be the last time you do so."
# Named prompt_ids (not `input`) so the builtin input() is not shadowed.
prompt_ids = tokenizer.encode(input_text)
output = model.generate(input_ids=torch.tensor([prompt_ids]).to(device), do_sample=True, max_new_tokens=50)
print(input_text)
print(tokenizer.decode(output[0], skip_special_tokens=True))

It may be the last time you do so.
It may be the last time you do so. = 'Twere done then. = 'Tis oft gone.  'Tis ne’er so.  'Possible the last time.  'Tis yet the hour.  'Possible the time.  'P


In [None]:
# Fix the sampling seed so this example is reproducible; change it to see other samples.
torch.manual_seed(0)

input_text="Why are you looking for me?"
# Named prompt_ids (not `input`) so the builtin input() is not shadowed.
prompt_ids = tokenizer.encode(input_text)
output = model.generate(input_ids=torch.tensor([prompt_ids]).to(device), do_sample=True, max_new_tokens=50)
print(input_text)
print(tokenizer.decode(output[0], skip_special_tokens=True))

Why are you looking for me?
Why are you looking for me? = What, seek me for me? = Why are you looking for me? = What, seek I for me? = What search are you after of me? = What, seek I me? = Why art thou here thus watchful? =


### Inputs from Validation set

In [None]:
# NOTE(review): the dataset entries built earlier are single "<input> = <label>" strings,
# but the saved output of this cell shows a two-element list, so the output looks
# stale. Re-run to confirm.
val_dataset[33]

["I'll only confine myself to these clothes I'm wearing.",
 'I’ll confine myself no finer than I am.']

In [None]:
# Fix the sampling seed so this example is reproducible; change it to see other samples.
torch.manual_seed(0)

input_text="I'll only confine myself to these clothes I'm wearing."
# Named prompt_ids (not `input`) so the builtin input() is not shadowed.
prompt_ids = tokenizer.encode(input_text)
output = model.generate(input_ids=torch.tensor([prompt_ids]).to(device), do_sample=True, max_new_tokens=50, temperature=0.9)
print(input_text)
print(tokenizer.decode(output[0], skip_special_tokens=True))

I'll only confine myself to these clothes I'm wearing.
I'll only confine myself to these clothes I'm wearing. = I’ll confine myself to these garments. = My business is to this. = My business is to this. = I will but confine it me to these garments. = I shall be contented in these. = I w


In [None]:
# Fix the sampling seed so this example is reproducible; change it to see other samples.
torch.manual_seed(0)

# A single space as the prompt: shows what the model generates with almost no context.
input_text=" "
# Named prompt_ids (not `input`) so the builtin input() is not shadowed.
prompt_ids = tokenizer.encode(input_text)
output = model.generate(input_ids=torch.tensor([prompt_ids]).to(device), do_sample=True, max_new_tokens=50, temperature=0.9)
print(input_text)
print(tokenizer.decode(output[0], skip_special_tokens=True))

 
 ilius fides, come and welcome me here. = I prithee, marquis, come and welcome me hither.        LUCAS FIDES. = I prithee, marquis, come and


### Inputs from common English phrases

In [None]:
# Fix the sampling seed so this example is reproducible; change it to see other samples.
torch.manual_seed(0)

input_text="Hello, how are you?"
# Named prompt_ids (not `input`) so the builtin input() is not shadowed.
prompt_ids = tokenizer.encode(input_text)
output = model.generate(input_ids=torch.tensor([prompt_ids]).to(device), do_sample=True, max_new_tokens=50, temperature=0.9)
print(input_text)
print(tokenizer.decode(output[0], skip_special_tokens=True))

Hello, how are you?
Hello, how are you? = How now, how now!  O, how now!  O, how now! = How now, how now!  What, how now! O, how now!  O, how now!  O, how now! 


In [None]:
# Fix the sampling seed so this example is reproducible; change it to see other samples.
torch.manual_seed(0)

input_text="Do not cry over spilt milk."
# Named prompt_ids (not `input`) so the builtin input() is not shadowed.
prompt_ids = tokenizer.encode(input_text)
output = model.generate(input_ids=torch.tensor([prompt_ids]).to(device), do_sample=True, max_new_tokens=50, temperature=0.9)
print(input_text)
print(tokenizer.decode(output[0], skip_special_tokens=True))

Do not cry over spilt milk.
Do not cry over spilt milk. = Cry nought over crumbled milk. = Do not weep for slopp'd water. . .  . = Cry nought on splotched water. . . . .  To weep over spilt


In [None]:
# Fix the sampling seed so this example is reproducible; change it to see other samples.
torch.manual_seed(0)

input_text="Many congratulations to both of you!"
# Named prompt_ids (not `input`) so the builtin input() is not shadowed.
prompt_ids = tokenizer.encode(input_text)
output = model.generate(input_ids=torch.tensor([prompt_ids]).to(device), do_sample=True, max_new_tokens=50, temperature=0.9)
print(input_text)
print(tokenizer.decode(output[0], skip_special_tokens=True))

Many congratulations to both of you!
Many congratulations to both of you! = O happy king! = Very well to both! = Most happy king!    " ~ 'O happy king! '’s a happy day!   'Twere well to both!  'Twere well. 


In [None]:
# Fix the sampling seed so this example is reproducible; change it to see other samples.
torch.manual_seed(0)

input_text="I am hanging out with my dog."
# Named prompt_ids (not `input`) so the builtin input() is not shadowed.
prompt_ids = tokenizer.encode(input_text)
output = model.generate(input_ids=torch.tensor([prompt_ids]).to(device), do_sample=True, max_new_tokens=50, temperature=0.9)
print(input_text)
print(tokenizer.decode(output[0], skip_special_tokens=True))

I am hanging out with my dog.
I am hanging out with my dog. = I hang with my dog. = I wag with my dog. = I wag.    I'll hang with my dog. . .  . = I shall give away my life.  And I'll follow


### Surprising result: inputting an incomplete modern English phrase

In [None]:
# Fix the sampling seed so this example is reproducible; change it to see other samples.
torch.manual_seed(0)

# Deliberately incomplete sentence: the model has to continue the input before any " = " appears.
input_text="Can you"
# Named prompt_ids (not `input`) so the builtin input() is not shadowed.
prompt_ids = tokenizer.encode(input_text)
output = model.generate(input_ids=torch.tensor([prompt_ids]).to(device), do_sample=True, max_new_tokens=50, temperature=0.9)
print(input_text)
print(tokenizer.decode(output[0], skip_special_tokens=True))

Can you
Can you explain how you are so sure that I'm not a traitor? = Art thou convinced, Sir John, that I am not a traitor? = Canst thou tell how thou dost well be certain I am not traitor? = Art thou assured,
