This is the code I have written in order to implement the miniproject "Episodes' Title Generation" (ETG).

# Installation, import, global variables

In [None]:
!pip install evaluate > /dev/null
!pip install transformers > /dev/null
!pip install datasets > /dev/null
!pip install accelerate > /dev/null
!pip install rouge_score > /dev/null

In [None]:
from datasets import Dataset, load_dataset, load_metric
import math
import nltk
from nltk.tokenize import sent_tokenize,word_tokenize
import numpy as np
from numpy.linalg import norm
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, AutoConfig, \
                         DataCollatorForSeq2Seq, Seq2SeqTrainingArguments, Seq2SeqTrainer, \
                         DataCollatorForLanguageModeling, DataCollatorWithPadding, \
                         GPT2Tokenizer, GPT2LMHeadModel, Trainer, TrainingArguments
import evaluate
from sklearn.model_selection import train_test_split
from sklearn.metrics.pairwise import cosine_similarity
import pandas as pd
import torch

import zipfile
import gdown
import os

In [None]:
# Fetch the NLTK "punkt" tokenizer data, needed by nltk.sent_tokenize in the metric functions
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [None]:
# Switch between the two ways of running the notebook:
#   True  -> the fine-tuned models are downloaded and every training cell is skipped
#   False -> Google Drive is mounted and the models are fine-tuned in this notebook
already_trained = True

In [None]:
# Google Drive is needed only when training: the training cells write the
# fine-tuned models under /content/drive
if not already_trained:
  from google.colab import drive
  drive.mount('/content/drive', force_remount=True)

In [None]:
# Fetch from Google Drive the dataset archive (always) and the archives of the
# fine-tuned models (only when training is skipped), saving them locally.

FILE_ID_dataset = "14StcfnmUjQOvXI9xZAM9BeEyuzW3F5Nq"
downloads = [(FILE_ID_dataset, "./series.zip")]

if already_trained:
  FILE_ID_distilGPT2 = "150D-EpoMKozIwSKiS4sNG7Z4Je8mxODO"
  FILE_ID_T5 = "1wtta7aRBen2vJQlInTtPAJEqi2TvDsIg"
  downloads += [(FILE_ID_distilGPT2, "./model-distil-gpt2.zip"),
                (FILE_ID_T5, "./model-t5-base.zip")]

# One download per (file id, destination) pair, in the order listed above
for file_id, destination in downloads:
  gdown.download(f"https://drive.google.com/uc?export=download&id={file_id}", destination, quiet=False)

Downloading...
From: https://drive.google.com/uc?export=download&id=14StcfnmUjQOvXI9xZAM9BeEyuzW3F5Nq
To: /content/series.zip
100%|██████████| 916k/916k [00:00<00:00, 113MB/s]
Downloading...
From: https://drive.google.com/uc?export=download&id=150D-EpoMKozIwSKiS4sNG7Z4Je8mxODO
To: /content/model-distil-gpt2.zip
100%|██████████| 305M/305M [00:01<00:00, 153MB/s]
Downloading...
From: https://drive.google.com/uc?export=download&id=1wtta7aRBen2vJQlInTtPAJEqi2TvDsIg
To: /content/model-t5-base.zip
100%|██████████| 2.45G/2.45G [00:36<00:00, 67.4MB/s]


'./model-t5-base.zip'

In [None]:
def extraction_from_zip(path_zip,path_unzip):
    """Extract a zip archive into a directory and delete the archive afterwards.

    Args:
        path_zip: path of the zip archive to extract.
        path_unzip: directory the content of the archive is extracted into.

    When `path_zip` does not exist (for example the cell is re-run after the
    archive has already been extracted and removed, or the download failed)
    nothing is extracted and a notice is printed, instead of skipping silently.
    """
    if not os.path.exists(path_zip):
        print(f"Archive {path_zip} not found, skipping extraction.")
        return

    print(f"Extracting the archive {path_zip}...")
    with zipfile.ZipFile(path_zip, 'r') as zip_ref:
        zip_ref.extractall(path_unzip)
    print("Done.")
    # The archive is not needed any more once its content is on disk
    os.remove(path_zip)

# The dataset archive is always unpacked; the model archives are unpacked only
# when the fine-tuned models have been downloaded instead of being trained here.
extraction_from_zip("./series.zip","/content/")

if already_trained:
  for archive, target_dir in (("./model-distil-gpt2.zip", "/content/"),
                              ("./model-t5-base.zip", "/content/model-t5-base")):
    extraction_from_zip(archive, target_dir)

Extracting the archive ./series.zip...
Done.
Extracting the archive ./model-distil-gpt2.zip...
Done.
Extracting the archive ./model-t5-base.zip...
Done.


In [None]:
# Location of the extracted dataset file
dataset_path = "/content/series.csv" 

# Locations of the extracted fine-tuned models (when training, these two paths
# are set in the training cells instead)
if already_trained:
  model_path_gpt = "/content/model-distil-gpt2" 
  model_path_t5 = "/content/model-t5-base" 

In [None]:
# Load the tokenizer of the pretrained 'distilgpt2' checkpoint through the
# Hugging Face Transformers class "GPT2Tokenizer"
gpt_tokenizer = GPT2Tokenizer.from_pretrained('distilgpt2')

Downloading (…)olve/main/vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

Downloading (…)olve/main/merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/762 [00:00<?, ?B/s]

In [None]:
# Definition of the special tokens that delimit a training sample:
#   bos       -> placed before the plot
#   title_tkn -> separator between the plot and the title
#   eos       -> placed after the title
bos = '<|endoftext|>'
eos = '<|EOS|>'
title_tkn = '<|title|>'

special_tokens_dict = {'eos_token': eos, 'bos_token': bos, 'pad_token': '<|pad|>',
                       'sep_token': title_tkn}

# Add special tokens to the tokenizer
num_added_toks = gpt_tokenizer.add_special_tokens(special_tokens_dict)

# Add special tokens to the model using model configuration: the configuration
# carries the ids the tokenizer has assigned to the special tokens
config = AutoConfig.from_pretrained('distilgpt2', 
                                    bos_token_id=gpt_tokenizer.bos_token_id,
                                    eos_token_id=gpt_tokenizer.eos_token_id,
                                    pad_token_id=gpt_tokenizer.pad_token_id,
                                    sep_token_id=gpt_tokenizer.sep_token_id,
                                    output_hidden_states=False)

# Loading the model distil-GPT2 using the Huggingface Transformers class "GPT2LMHeadModel" and the custom configuration
gpt_model = GPT2LMHeadModel.from_pretrained('distilgpt2', config=config)

# Model embedding resizing: the embedding matrix must have one row per tokenizer
# entry, including the special tokens added above
gpt_model.resize_token_embeddings(len(gpt_tokenizer))

Downloading model.safetensors:   0%|          | 0.00/353M [00:00<?, ?B/s]

Downloading (…)neration_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

Embedding(50260, 768)

In [None]:
# Load the tokenizer of the pretrained 't5-base' checkpoint through the
# Hugging Face Transformers class "AutoTokenizer"
t5_tokenizer = AutoTokenizer.from_pretrained('t5-base')

Downloading (…)lve/main/config.json:   0%|          | 0.00/1.21k [00:00<?, ?B/s]

Downloading (…)ve/main/spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/1.39M [00:00<?, ?B/s]

For now, this behavior is kept to avoid breaking backwards compatibility when padding/encoding with `truncation is True`.
- Be aware that you SHOULD NOT rely on t5-base automatically truncating your input to 512 when padding/encoding.
- If you want to encode/pad to sequences longer than 512 you can either instantiate this tokenizer with `model_max_length` or pass `max_length` when encoding/padding.


In [None]:
# Load the pretrained T5-base model through the Hugging Face Transformers class
# "AutoModelForSeq2SeqLM". The model is moved to the GPU only when one is
# available, so that the cell also runs on a CPU-only runtime.
t5_model = AutoModelForSeq2SeqLM.from_pretrained('t5-base').to("cuda" if torch.cuda.is_available() else "cpu")

Downloading model.safetensors:   0%|          | 0.00/892M [00:00<?, ?B/s]

Downloading (…)neration_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

In [None]:
# Global variables for the training phase
batch_size = 8                  # per-device batch size, for both training and evaluation
num_epochs = 5                  # number of training epochs
learning_rate = 1e-5
weight_decay = 0.001
log_every = 50                  # logging interval in steps (T5 training)
eval_every = 50                 # evaluation interval in steps (T5 training)
lr_scheduler_type = "linear"

# Parameter for the generation of new titles when using T5 model:
# value passed as max_length to generate()
max_gen_length = 128

# Global variable for the beam search: number of beams
num_beams = 4

# Global variables for the Top-P, Top-K sampling
no_repeat_ngram_size = 2
repetition_penalty = 1.5
top_p=0.9
temperature=0.85
top_k=50

# Dataset

The data used to build the dataset has been collected with a custom-made scraper which gets entries from Wikipedia. The result of the scraper is a csv file made of two columns: TITLE and PLOT. I have collected data from 30 American television sitcoms, reaching a total of 4560 episodes and thus 4560 entries.

## Maximum length computations (for model T5)

Concerning the T5 model, I need to know what the maximum length both for tokenized plots and tokenized titles will be, in order to pass the parameter max_length to the tokenizer, to compute the padding.

In [None]:
# Load the tab-separated dataset file as a Hugging Face dataset (its rows end up in the 'train' split)
dataset = load_dataset("csv", data_files=dataset_path, sep='\t')

Downloading and preparing dataset csv/default to /root/.cache/huggingface/datasets/csv/default-e15b065befe7a53d/0.0.0/6954658bab30a358235fa864b05cf819af0e179325c740e4bc853bcc7ec513e1...


Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

Dataset csv downloaded and prepared to /root/.cache/huggingface/datasets/csv/default-e15b065befe7a53d/0.0.0/6954658bab30a358235fa864b05cf819af0e179325c740e4bc853bcc7ec513e1. Subsequent calls will reuse this data.


  0%|          | 0/1 [00:00<?, ?it/s]

In [None]:
# Collect all the titles and all the plots of the dataset into two parallel lists
titles = [row["TITLE"] for row in dataset['train']]
plots = [row["PLOT"] for row in dataset['train']]

In [None]:
def maxLengthComputation(items_set, tokenizer=None):
    """Return the length, in tokens, of the longest item of `items_set`.

    Args:
        items_set: iterable of strings to tokenize.
        tokenizer: object exposing `encode(text)`; when omitted, the global
            `t5_tokenizer` is used.

    Returns:
        The largest number of token ids returned by `tokenizer.encode` over
        all the items, or 0 when `items_set` is empty.
    """
    if tokenizer is None:
        tokenizer = t5_tokenizer
    return max((len(tokenizer.encode(item)) for item in items_set), default=0)

# Longest tokenized plot and longest tokenized title: they are passed as
# `max_length` when tokenizing the inputs and the targets for T5
MAX_SOURCE_LEN = maxLengthComputation(plots)
print(f"MAX_SOURCE_LEN: {MAX_SOURCE_LEN}")
MAX_TARGET_LEN = maxLengthComputation(titles)
print(f"MAX_TARGET_LEN: {MAX_TARGET_LEN}")

Token indices sequence length is longer than the specified maximum sequence length for this model (525 > 512). Running this sequence through the model will result in indexing errors


MAX_SOURCE_LEN: 629
MAX_TARGET_LEN: 26


## Dataset creation

Here I create datasets for training, validation and testing using *sklearn.model_selection.train_test_split* which splits into random train and test subsets. 

In [None]:
# Load the same tab-separated file as a pandas DataFrame, which is split into train/validation/test below
episodes_df = pd.read_csv(dataset_path, sep='\t', encoding = 'utf-8')

In [None]:
# Preview of the whole dataset (columns TITLE and PLOT)
episodes_df

Unnamed: 0,TITLE,PLOT
0,Pilot,Russian waitress Paulina is fired from the din...
1,And the Rich People Problems,Max is for once overwhelmed and impressed when...
2,And the '90s Horse Party,After discovering how many debts Max is dodgin...
3,And the Break-up Scene,Caroline thinks she is doing Max a favor when ...
4,And Strokes of Goodwill,Max takes Caroline thrift shopping. After seei...
...,...,...
4555,Body Glitter and a Mall Safety Kit,"Paige's parents get a divorce, leading to Paig..."
4556,An Entrepreneurialist and a Swat on the Bottom,"Dr. Linkletter, wanting to create additional o..."
4557,"A Live Chicken, a Fried Chicken, and Holy Matr...",Pastor Jeff and Officer Robin rush to get marr...
4558,"A Solar Calculator, a Game Ball, and a Cheerle...",Sheldon uses sports analytics to help his scho...


In [None]:
# Same preview of the whole dataset as in the previous cell
episodes_df

Unnamed: 0,TITLE,PLOT
0,Pilot,Russian waitress Paulina is fired from the din...
1,And the Rich People Problems,Max is for once overwhelmed and impressed when...
2,And the '90s Horse Party,After discovering how many debts Max is dodgin...
3,And the Break-up Scene,Caroline thinks she is doing Max a favor when ...
4,And Strokes of Goodwill,Max takes Caroline thrift shopping. After seei...
...,...,...
4555,Body Glitter and a Mall Safety Kit,"Paige's parents get a divorce, leading to Paig..."
4556,An Entrepreneurialist and a Swat on the Bottom,"Dr. Linkletter, wanting to create additional o..."
4557,"A Live Chicken, a Fried Chicken, and Holy Matr...",Pastor Jeff and Officer Robin rush to get marr...
4558,"A Solar Calculator, a Game Ball, and a Cheerle...",Sheldon uses sports analytics to help his scho...


In [None]:
# Split the dataset: 90% for training, then the remaining 10% is halved into
# validation and test sets. Both splits use a fixed seed, so that the three sets
# contain the same rows at every run (the titles generated at the end of the
# notebook are compared with the titles of the test set).
df_train, test_val = train_test_split(episodes_df, train_size = 0.9, random_state = 77)

df_val,df_test = train_test_split(test_val,test_size=0.5, random_state = 77)

In [None]:
# Preview of the training set
df_train

Unnamed: 0,TITLE,PLOT
3120,The Gothowitz Deviation,"Penny's bed in her apartment breaks, forcing h..."
42,And the Big Hole,After Han fires Caroline for disparaging the d...
234,How Oliver Got His Groove Back,With Oliver reluctant to get over Lindsey's di...
1343,I Get a Sidekick Out of You,Lane and Zach are getting married and they hav...
376,The Road Trip,When Jake and Amy have to stay at a B&B in ups...
...,...,...
1317,Jews and Chinese Food,"Still smarting from her split with Luke, Lorel..."
2283,Jerry's Painting,Feeling frustrated and powerless because of he...
2004,Flip Flop,"Gloria's ex-husband, Javiér introduces his new..."
3668,The Show Must Go On,After almost forgetting that Brick's middle-sc...


In [None]:
# Preview of the validation set
df_val

Unnamed: 0,TITLE,PLOT
186,Trust Me,After Taylor lies about adult supervision at a...
2499,My Number One Doctor,Dr. Kelso signs the hospital up to RateYourDoc...
1484,The Duel,When Lily decides to formally move into Marsha...
3458,It's Better to Have Loved and Lost It...,"Carlton meets Jo Ann, a beautiful woman on the..."
1546,Not a Father's Day,Lily and Marshall receive different views on w...
...,...,...
315,The Honeypot,"While trying to find a new assistant, Jake and..."
1944,Mother Tucker,Mitchell tries to tell Cameron how he feels un...
2385,Fastest Criminal in Reno,The Reno Sheriff's Department gets a chance to...
1551,Little Minnesota,"It's Christmas time and Ted's younger sister, ..."


In [None]:
# Preview of the test set
df_test

Unnamed: 0,TITLE,PLOT
4089,Finale,"One year after the airing of the documentary, ..."
2414,Back in Black,Dangle (and an unwilling Williams) take time t...
2409,Dangle's Wedding (Part 1),Dangle's ex-wife's husband proposes to him. A ...
3844,How to Lose a Mom in Ten Days,Mindy is tired of Annette hovering around her ...
1135,The One with Two Parts,"Joey falls for Phoebe's identical twin sister,..."
...,...,...
185,Cheaters Sometimes Win,Katie's mother Kathryn arrives and immediately...
2036,Sick Days & Spelling Bees,Ned must avoid getting sick during flu season ...
478,Abnormal Psychology,Frasier reluctantly agrees to a talk-show deba...
2991,The Helium Insufficiency,Swedish physicists are about to prove Sheldon ...


### Dataset for GPT-2

Here I make the dataset compatible with the GPT model, i.e. I prepare the actual data that will be passed to the model.

In [None]:
def prepare_text(x):
    """Build the GPT-2 text of a row: bos, PLOT, title separator, TITLE and eos joined by single spaces."""
    return ' '.join([bos, x['PLOT'], title_tkn, x['TITLE'], eos])

# I introduce a new column 'text' for each set. `assign` returns a new DataFrame
# instead of writing into the frames produced by train_test_split, which avoids
# pandas' SettingWithCopyWarning and keeps the cell safe to re-run.
df_train = df_train.assign(text=df_train.apply(prepare_text, axis=1))
df_val = df_val.assign(text=df_val.apply(prepare_text, axis=1))
df_test = df_test.assign(text=df_test.apply(prepare_text, axis=1))

In [None]:
# Preview of the training set with the new 'text' column
df_train

Unnamed: 0,TITLE,PLOT,text
3120,The Gothowitz Deviation,"Penny's bed in her apartment breaks, forcing h...",<|endoftext|> Penny's bed in her apartment bre...
42,And the Big Hole,After Han fires Caroline for disparaging the d...,<|endoftext|> After Han fires Caroline for dis...
234,How Oliver Got His Groove Back,With Oliver reluctant to get over Lindsey's di...,<|endoftext|> With Oliver reluctant to get ove...
1343,I Get a Sidekick Out of You,Lane and Zach are getting married and they hav...,<|endoftext|> Lane and Zach are getting marrie...
376,The Road Trip,When Jake and Amy have to stay at a B&B in ups...,<|endoftext|> When Jake and Amy have to stay a...
...,...,...,...
1317,Jews and Chinese Food,"Still smarting from her split with Luke, Lorel...",<|endoftext|> Still smarting from her split wi...
2283,Jerry's Painting,Feeling frustrated and powerless because of he...,<|endoftext|> Feeling frustrated and powerless...
2004,Flip Flop,"Gloria's ex-husband, Javiér introduces his new...","<|endoftext|> Gloria's ex-husband, Javiér intr..."
3668,The Show Must Go On,After almost forgetting that Brick's middle-sc...,<|endoftext|> After almost forgetting that Bri...


In [None]:
#https://huggingface.co/docs/datasets/v1.2.0/loading_datasets.html#from-a-pandas-dataframe
# As reported on the HuggingFace site, a datasets.Dataset can be loaded from a pandas dataframe.
# I consider only the column 'text', since each plot has already been concatenated with the corresponding title.
# preserve_index=False prevents the (shuffled) pandas index from being stored in the
# dataset as an extra '__index_level_0__' column, which the model has no use for.

train_dataset = Dataset.from_pandas(df_train[['text']], preserve_index=False)
val_dataset = Dataset.from_pandas(df_val[['text']], preserve_index=False)
test_dataset = Dataset.from_pandas(df_test[['text']], preserve_index=False)

In [None]:
#https://huggingface.co/docs/transformers/main_classes/tokenizer#transformers.PreTrainedTokenizer.__call__.padding

def tokenize_function(samples):
  """Tokenize the 'text' entries of a batch of samples with the GPT-2 tokenizer.

  With padding=True the tokenizer pads to the longest sequence of the batch it
  receives. Since `Dataset.map(batched=True)` passes the data in chunks rather
  than the entire column at once, the padded length can differ between chunks.
  truncation=True cuts the sequences that exceed the maximum length of the
  tokenizer, which the model would not be able to process.

  Args:
      samples: batch of dataset rows, with the texts under the key 'text'.

  Returns:
      The output of the tokenizer for the batch.
  """
  return gpt_tokenizer(samples['text'], padding=True, truncation=True)

In [None]:
# Final GPT-2 sets: every split is tokenized with tokenize_function and the raw
# 'text' column, no longer needed after tokenization, is dropped.
gpt_map_options = dict(batched=True, num_proc=1, remove_columns=['text'])

gpt_train_dataset = train_dataset.map(tokenize_function, **gpt_map_options)
gpt_val_dataset = val_dataset.map(tokenize_function, **gpt_map_options)
gpt_test_dataset = test_dataset.map(tokenize_function, **gpt_map_options)

Map:   0%|          | 0/4104 [00:00<?, ? examples/s]

Map:   0%|          | 0/228 [00:00<?, ? examples/s]

Map:   0%|          | 0/228 [00:00<?, ? examples/s]

In [None]:
# Features and number of rows of the tokenized GPT-2 training set
gpt_train_dataset

Dataset({
    features: ['__index_level_0__', 'input_ids', 'attention_mask'],
    num_rows: 4104
})

### Dataset for T5

Here I make the dataset compatible for the T5 model and thus I prepare the actual data that will be passed to the model. Note that here I start from df_train, df_val and df_test which were previously modified by adding the 'text' column, which is not important for the purposes of preparation anyway.

In [None]:
#https://huggingface.co/docs/datasets/v1.2.0/loading_datasets.html#from-a-pandas-dataframe
# As reported on the HuggingFace site, a datasets.Dataset can be loaded from a pandas dataframe.
# I consider both the columns PLOT and TITLE of each set.
# preserve_index=False prevents the (shuffled) pandas index from being stored in the
# dataset as an extra '__index_level_0__' column, which the model has no use for.

train_dataset = Dataset.from_pandas(df_train, preserve_index=False)
val_dataset = Dataset.from_pandas(df_val, preserve_index=False)
test_dataset = Dataset.from_pandas(df_test, preserve_index=False)

In [None]:
# https://huggingface.co/transformers/v3.1.0/model_doc/t5.html#training
# With this function I adjust the input data for the T5 model.
# For training, an input sequence and a target sequence are needed: in my case the input sequence is the tokenized plot
# and the target sequence is the tokenized title. As reported in HuggingFace, the target sequence corresponds to the labels.

def preprocess_data(sample):
    """Tokenize a batch of samples for T5.

    Args:
        sample: batch of dataset rows with the plots under 'PLOT' and the
            titles under 'TITLE'.

    Returns:
        The tokenized plots, extended with the key 'labels' holding the token
        ids of the titles, where every padding id is replaced by -100.
    """
    model_inputs = t5_tokenizer(sample['PLOT'], max_length=MAX_SOURCE_LEN, padding=True, truncation=True)

    # `text_target` tokenizes the titles as targets; it replaces the deprecated
    # `as_target_tokenizer()` context manager
    labels = t5_tokenizer(text_target=sample['TITLE'], max_length=MAX_TARGET_LEN, padding=True, truncation=True)

    # Replace all pad token ids in the labels with -100 to ignore padding in loss
    pad_id = t5_tokenizer.pad_token_id
    model_inputs['labels'] = [
        [(token_id if token_id != pad_id else -100) for token_id in label]
        for label in labels["input_ids"]
    ]

    return model_inputs

In [None]:
# Final T5 sets: every split goes through preprocess_data and the columns
# PLOT, TITLE and text, no longer needed after tokenization, are dropped.
t5_map_options = dict(batched=True, remove_columns=['PLOT', 'TITLE', 'text'], num_proc=1)

t5_train_dataset = train_dataset.map(preprocess_data, **t5_map_options)
t5_val_dataset = val_dataset.map(preprocess_data, **t5_map_options)
t5_test_dataset = test_dataset.map(preprocess_data, **t5_map_options)

Map:   0%|          | 0/4104 [00:00<?, ? examples/s]



Map:   0%|          | 0/228 [00:00<?, ? examples/s]

Map:   0%|          | 0/228 [00:00<?, ? examples/s]

In [None]:
# Features and number of rows of the tokenized T5 training set
t5_train_dataset

Dataset({
    features: ['__index_level_0__', 'input_ids', 'attention_mask', 'labels'],
    num_rows: 4104
})

### Useful lists

Here I create some lists that will be used later when new titles are generated in order to perform some checks.

In [None]:
# Original titles of each split
train_titles = df_train["TITLE"].tolist()
val_titles = df_val["TITLE"].tolist()
test_titles = df_test["TITLE"].tolist()

# Prompts for GPT-2: every test plot enclosed between the bos token and the title separator
test_plots_gpt = [bos + test_plot + title_tkn for test_plot in df_test["PLOT"]]

# T5 (and the final printouts) use the test plots as they are
test_plots_t5 = df_test["PLOT"].tolist()
test_plots = df_test["PLOT"].tolist()

# Training and evaluation

## GPT-2

Here's the code for the training of distil-GPT2 model. I use the HuggingFace Transformers API in order to get the data collator and trainer's objects.

In [None]:
# adapted from https://huggingface.co/docs/evaluate/transformers_integrations#seq2seqtrainer

# Define ROUGE metrics on evaluation data
metric = evaluate.load("rouge")


def compute_metrics(eval_preds):
    """Compute the ROUGE scores of the GPT-2 predictions against the labels.

    Args:
        eval_preds: pair (predictions, labels) handed over by the trainer.

    Returns:
        The dictionary of ROUGE scores returned by `metric.compute`.
    """
    preds, labels = eval_preds

    # The predictions may come as a tuple of model outputs: keep the first one
    if isinstance(preds, tuple):
        preds = preds[0]
    preds = np.asarray(preds)
    # A plain Trainer does not generate: the predictions of a language model are
    # expected to be logits with a trailing vocabulary axis, so they are reduced
    # to token ids before decoding
    if preds.ndim == 3:
        preds = preds.argmax(axis=-1)

    # decode preds and labels (-100 marks positions to ignore and cannot be decoded)
    pad_id = gpt_tokenizer.pad_token_id
    preds = np.where(preds != -100, preds, pad_id)
    labels = np.where(labels != -100, labels, pad_id)
    decoded_preds = gpt_tokenizer.batch_decode(preds, skip_special_tokens=True)
    decoded_labels = gpt_tokenizer.batch_decode(labels, skip_special_tokens=True)

    # rougeLSum expects newline after each sentence
    decoded_preds = ["\n".join(nltk.sent_tokenize(pred.strip())) for pred in decoded_preds]
    decoded_labels = ["\n".join(nltk.sent_tokenize(label.strip())) for label in decoded_labels]

    result = metric.compute(predictions=decoded_preds, references=decoded_labels, use_stemmer=True)
    return result

In [None]:
#https://huggingface.co/docs/transformers/main_classes/data_collator#transformers.DataCollatorForLanguageModeling
# By instantiating the DataCollatorForLanguageModeling class, I get the object that forms the batches.
# mlm=False disables masked language modeling.

if not already_trained:
  data_collator_gpt = DataCollatorForLanguageModeling(tokenizer=gpt_tokenizer,mlm=False)

In [None]:
# Training configuration of distil-GPT2; the output directory is on Google Drive
# and is also used for the logs
if not already_trained:
  model_path_gpt = "/content/drive/MyDrive/NUANS/miniproject/model-distil-gpt2"
  training_args_gpt = TrainingArguments(
      output_dir = model_path_gpt,          
      num_train_epochs = num_epochs,           
      per_device_train_batch_size = batch_size, 
      per_device_eval_batch_size = batch_size, 
      learning_rate=learning_rate,  
      warmup_steps = 200,               
      weight_decay = weight_decay,
      logging_dir = model_path_gpt,
      lr_scheduler_type = lr_scheduler_type,
      save_steps = 10000
  )

In [None]:
#https://huggingface.co/docs/transformers/main_classes/trainer
# Trainer of distil-GPT2: it receives the model, the training configuration, the
# data collator, the training and validation sets and the metric function
if not already_trained:
  trainer_gpt = Trainer(
    model=gpt_model,                         
    args=training_args_gpt,                  
    data_collator=data_collator_gpt,
    train_dataset=gpt_train_dataset,        
    eval_dataset=gpt_val_dataset,       
    compute_metrics=compute_metrics,
  )

In [None]:
# Fine-tune distil-GPT2 (skipped when the fine-tuned model has been downloaded)
if not already_trained:
  trainer_gpt.train()



Step,Training Loss
500,11.0768
1000,4.0092
1500,3.9112
2000,3.8668
2500,3.8387


TrainOutput(global_step=2565, training_loss=5.30222536202295, metrics={'train_runtime': 1990.5897, 'train_samples_per_second': 10.309, 'train_steps_per_second': 1.289, 'total_flos': 2725145467158528.0, 'train_loss': 5.30222536202295, 'epoch': 5.0})

In [None]:
# Save the fine-tuned model and, in the same directory, the tokenizer with its special tokens
if not already_trained:
  trainer_gpt.save_model()
  gpt_tokenizer.save_pretrained(model_path_gpt)

('/content/drive/MyDrive/NUANS/miniproject/model-distil-gpt2/tokenizer_config.json',
 '/content/drive/MyDrive/NUANS/miniproject/model-distil-gpt2/special_tokens_map.json',
 '/content/drive/MyDrive/NUANS/miniproject/model-distil-gpt2/vocab.json',
 '/content/drive/MyDrive/NUANS/miniproject/model-distil-gpt2/merges.txt',
 '/content/drive/MyDrive/NUANS/miniproject/model-distil-gpt2/added_tokens.json')

### Problems with GPU

Here I report both the attempt to compute the metric during the training phase using the validation set and the attempt to evaluate the model just trained. Both attempts ended with "CUDA out of memory", so I decided to exclude these steps.

#### Training

In [None]:
# training_args_gpt = TrainingArguments(
#     output_dir = model_path_gpt,          
#     num_train_epochs = num_epochs,           
#     per_device_train_batch_size = 5, 
#     per_device_eval_batch_size = 5,
#     evaluation_strategy ="steps",
#     eval_steps = 50,   
#     warmup_steps = 200,               
#     weight_decay = 0.01,
#     logging_dir = model_path_gpt,
#     save_steps = 10000
# )

# trainer_gpt = Trainer(
#     model=gpt_model,                         
#     args=training_args_gpt,                  
#     data_collator=data_collator_gpt,
#     train_dataset=gpt_train_dataset,        
#     eval_dataset=gpt_val_dataset,       
#     compute_metrics=compute_metrics,
# )

In [None]:
# trainer_gpt.train()

Step,Training Loss,Validation Loss


#### Evaluation

In [None]:
# trainer_gpt.evaluate(eval_dataset=gpt_test_dataset)

## T5

Here's the code for the training of T5 model. I use the HuggingFace Transformers API in order to get the data collator and the trainer's objects.

In [None]:
# adapted from https://huggingface.co/docs/evaluate/transformers_integrations#seq2seqtrainer

# Define ROUGE metrics on evaluation data
metric = evaluate.load("rouge")


def compute_metrics(eval_preds):
    """Compute the ROUGE scores of the titles generated by T5 against the labels.

    Args:
        eval_preds: pair (predictions, labels) handed over by the trainer.

    Returns:
        The dictionary of ROUGE scores returned by `metric.compute`.
    """
    preds, labels = eval_preds

    # The predictions may come as a tuple of model outputs: keep the first one
    if isinstance(preds, tuple):
        preds = preds[0]

    # decode preds and labels: -100 marks positions to ignore and cannot be
    # decoded, so it is replaced by the padding id in both arrays
    pad_id = t5_tokenizer.pad_token_id
    preds = np.where(preds != -100, preds, pad_id)
    labels = np.where(labels != -100, labels, pad_id)
    decoded_preds = t5_tokenizer.batch_decode(preds, skip_special_tokens=True)
    decoded_labels = t5_tokenizer.batch_decode(labels, skip_special_tokens=True)

    # rougeLSum expects newline after each sentence
    decoded_preds = ["\n".join(nltk.sent_tokenize(pred.strip())) for pred in decoded_preds]
    decoded_labels = ["\n".join(nltk.sent_tokenize(label.strip())) for label in decoded_labels]

    result = metric.compute(predictions=decoded_preds, references=decoded_labels, use_stemmer=True)
    return result

In [None]:
# Object that forms the batches for T5, built from the T5 tokenizer and model
if not already_trained:
  data_collator_t5 = DataCollatorForSeq2Seq(t5_tokenizer, model=t5_model)

In [None]:
# Training configuration of T5: the model is evaluated every `eval_every` steps
# and, thanks to predict_with_generate, the metric is computed on generated titles.
# `resume_from_checkpoint` is deliberately not set here: as a training argument it
# is not consumed by the trainer itself (resuming is requested through
# `train(resume_from_checkpoint=...)`), so it had no effect on this training.
if not already_trained:
  model_path_t5 = '/content/drive/MyDrive/NUANS/miniproject/model-t5-base'
  training_args_t5 = Seq2SeqTrainingArguments(
      output_dir=model_path_t5,
      evaluation_strategy="steps",
      eval_steps=eval_every,
      learning_rate=learning_rate,
      per_device_train_batch_size=batch_size,
      per_device_eval_batch_size=batch_size,
      weight_decay=weight_decay,
      num_train_epochs=num_epochs,
      predict_with_generate=True,
      logging_steps=log_every,
      group_by_length=True,
      lr_scheduler_type=lr_scheduler_type,
  )

In [None]:
# Trainer of T5: it receives the model, the training configuration, the training
# and validation sets, the data collator, the tokenizer and the metric function
if not already_trained:
  t5_trainer = Seq2SeqTrainer(
    t5_model,
    training_args_t5,
    train_dataset=t5_train_dataset,
    eval_dataset=t5_val_dataset,
    data_collator=data_collator_t5,
    tokenizer=t5_tokenizer,
    compute_metrics=compute_metrics,
  )

In [None]:
# Fine-tune T5 (skipped when the fine-tuned model has been downloaded)
if not already_trained:
  t5_trainer.train()

Step,Training Loss,Validation Loss,Rouge1,Rouge2,Rougel,Rougelsum
50,4.5902,4.25247,0.098718,0.025833,0.097681,0.096961
100,4.2882,3.899623,0.116279,0.021254,0.115807,0.116566
150,4.077,3.770224,0.140052,0.02405,0.138746,0.138873
200,3.8374,3.681719,0.167929,0.030702,0.167288,0.167396
250,3.8594,3.622344,0.170478,0.03655,0.169934,0.170229
300,3.8893,3.573097,0.181129,0.039975,0.179932,0.17959
350,3.8983,3.53628,0.182918,0.039975,0.180712,0.181438
400,3.6794,3.506391,0.183681,0.037907,0.18187,0.182303
450,3.7005,3.48696,0.18356,0.03655,0.181257,0.182348
500,3.6537,3.460644,0.189701,0.04623,0.187469,0.18857


CPU times: user 58min 12s, sys: 12min 4s, total: 1h 10min 16s
Wall time: 1h 17min 53s


TrainOutput(global_step=2565, training_loss=3.537028021561472, metrics={'train_runtime': 4671.1529, 'train_samples_per_second': 4.393, 'train_steps_per_second': 0.549, 'total_flos': 1.300563736326144e+16, 'train_loss': 3.537028021561472, 'epoch': 5.0})

### Evaluation

In [None]:
# Evaluate the fine-tuned T5 model on the test set
if not already_trained:
  t5_trainer.evaluate(eval_dataset=t5_test_dataset)

CPU times: user 19.7 s, sys: 421 ms, total: 20.1 s
Wall time: 20.5 s


{'eval_loss': 3.1731691360473633,
 'eval_rouge1': 0.18764824356929616,
 'eval_rouge2': 0.06558235867446394,
 'eval_rougeL': 0.18803508869298338,
 'eval_rougeLsum': 0.18787361517624673,
 'eval_runtime': 20.5103,
 'eval_samples_per_second': 11.116,
 'eval_steps_per_second': 1.414,
 'epoch': 5.0}

In [None]:
# When the model has just been trained, generation loads the checkpoint saved at
# step 2500. The suffix is appended only once, so that re-running this cell does
# not produce a path ending in "checkpoint-2500/checkpoint-2500".
if not already_trained and not model_path_t5.endswith("/checkpoint-2500"):
  model_path_t5 = f"{model_path_t5}/checkpoint-2500"

# Generation of new titles

In [None]:
# Generation runs on the first GPU when CUDA is available, otherwise on the CPU
dev = "cuda:0" if torch.cuda.is_available() else "cpu"
device = torch.device(dev)

## GPT-2

Since the generate method returns the input concatenated to the generated response, I need to set the max_length parameter to the length of the tokenized input plus the maximum number of new tokens. In this case I have decided that a generated title can be at most 6 tokens long.

In [None]:
def title_generation(model, tokenizer, input_text, device, beam_search=True, max_new_tokens=6):
    """Generate a title with the GPT-2 model for an already formatted prompt.

    Args:
        model: language model exposing `generate`.
        tokenizer: tokenizer used to encode the prompt and decode the output.
        input_text: prompt to be continued by the model.
        device: torch device the generation runs on.
        beam_search: True to decode with beam search, False to decode with
            Top-K, Top-P sampling.
        max_new_tokens: maximum number of tokens generated after the prompt.

    Returns:
        The decoded output of `generate` (prompt followed by the generated
        title), without special tokens.
    """
    text_ids = tokenizer.encode(input_text, return_tensors = 'pt').to(device)
    model = model.to(device)

    if beam_search:
      # `temperature` is a sampling parameter and is not used by beam search
      generation_args = dict(num_beams=num_beams, early_stopping=True)
    else:
      # The alternative to beam search is Top-K, Top-P sampling;
      # `early_stopping` is a beam search parameter and is not used here
      generation_args = dict(
          no_repeat_ngram_size = no_repeat_ngram_size,
          repetition_penalty = repetition_penalty,
          top_p = top_p,
          temperature = temperature,
          do_sample = True,
          top_k = top_k,
      )

    # generate() returns the prompt followed by the new tokens, so the total
    # length allowed is the length of the prompt plus the length of the title
    new_title = model.generate(
        text_ids,
        max_length = text_ids.shape[1] + max_new_tokens,
        **generation_args
    )

    title = tokenizer.decode(new_title[0], skip_special_tokens=True)
    return title

In [None]:
# Load the fine-tuned distil-GPT2 model and its tokenizer from the model directory
gpt_model = GPT2LMHeadModel.from_pretrained(model_path_gpt)
gpt_tokenizer = GPT2Tokenizer.from_pretrained(model_path_gpt)

# Read the special tokens back from the loaded tokenizer
bos = gpt_tokenizer.bos_token
eos = gpt_tokenizer.eos_token
title_tkn = gpt_tokenizer.sep_token

In [None]:
gpt_titles = []

for plot in test_plots_gpt:

    title = title_generation(gpt_model, gpt_tokenizer, plot, device, beam_search=False)

    # Actually "title" is the input sequence concatenated to the generated title.
    # To find where the generated part starts, the prompt is encoded and decoded
    # in the same way as the output: the decoded prompt is not guaranteed to be as
    # long as the raw plot (it appears that whitespace next to the special tokens
    # can be lost), and slicing by the raw plot length can cut the first
    # characters of the title.
    decoded_prompt = gpt_tokenizer.decode(gpt_tokenizer.encode(plot), skip_special_tokens=True)
    if title.startswith(decoded_prompt):
        generated = title[len(decoded_prompt):]
    else:
        # Fallback: length of the raw plot, i.e. the prompt without bos and title tokens
        generated = title[len(plot) - len(bos) - len(title_tkn):]
    gpt_titles.append(generated.strip())



In [None]:
# Sanity check: one generated title per test plot
print(len(test_plots_gpt))
print(len(gpt_titles))

228
228


In [None]:
# Printing the first 20 samples.
# The loop variables have their own names so that the global list `titles`
# (all the titles of the dataset) is not overwritten by the loop.
for plot, original_title, generated_title in zip(test_plots[:20], test_titles[:20], gpt_titles[:20]):
    print(f"Plot: {plot}")
    print(f"Original title: {original_title}")
    print(f"Generated title: {generated_title}")
    print()

Plot: One year after the airing of the documentary, past and present employees of Dunder Mifflin gather for Dwight and Angela's wedding. Dwight initially chooses Jim to be his best man, but Michael Scott shows up and takes his place. Finally, everyone comes together for a final round of interviews, during which Erin reunites with her biological parents and everyone is brought to tears. 
Original title: Finale
Generated title: he Interview

Plot: Dangle (and an unwilling Williams) take time to connect with his black half-brother and half-sister from his father's other family in Chicago.
Original title: Back in Black
Generated title: The Great Race

Plot: Dangle's ex-wife's husband proposes to him. A naked Wiegel gets stuck in a giant cake and goes into labor. Dangle holds his wedding at the hospital when Garcia bursts in and declares his love for Dangle's fiancee. Wiegel tells Dangle that she knows who her child's father is.
Original title: Dangle's Wedding (Part 1)
Generated title: The

## T5

In [None]:
# Load the fine-tuned T5 model (moved to the selected device) and its tokenizer from the model directory
t5_model = AutoModelForSeq2SeqLM.from_pretrained(model_path_t5).to(device)
t5_tokenizer = AutoTokenizer.from_pretrained(model_path_t5)

In [None]:
def title_generation(input,tokenizer,model,device,beam_search=True):
    """Generate a title for a single plot with a seq2seq model.

    The plot is tokenized (truncated to 512 tokens), the ids are moved to
    ``device`` and passed to ``model.generate``; the first returned sequence
    is decoded with special tokens skipped.

    Args:
        input: Plot text to generate a title for.
        tokenizer: Callable tokenizer that also provides ``decode``.
        model: Model exposing ``generate``.
        device: Device the input ids are moved to before generation.
        beam_search: If true, decode with beam search; otherwise use
            top-k / top-p sampling.

    Returns:
        The decoded title string.

    The decoding hyper-parameters (``num_beams``, ``temperature``,
    ``max_gen_length``, ``no_repeat_ngram_size``, ``repetition_penalty``,
    ``top_p``, ``top_k``) are read from notebook-level globals.

    NOTE(review): the GPT-2 cells above call ``title_generation(model,
    tokenizer, plot, device, ...)`` with a different argument order. This
    definition rebinds the same name, so those cells would pass their
    arguments in the wrong positions if re-run after this one.
    """
    encoded = tokenizer([input], max_length=512, return_tensors='pt', padding=True, truncation=True)
    input_ids = encoded['input_ids'].to(device)

    if beam_search:
        generation_kwargs = {
            'num_beams': num_beams,
            'temperature': temperature,
            'max_length': max_gen_length,
            'early_stopping': True,
        }
    else:
        # The alternative to beam search is Top-K Top-p sampling
        generation_kwargs = {
            'max_length': max_gen_length,
            'no_repeat_ngram_size': no_repeat_ngram_size,
            'repetition_penalty': repetition_penalty,
            'top_p': top_p,
            'temperature': temperature,
            'do_sample': True,
            'top_k': top_k,
            'early_stopping': True,
        }

    title_ids = model.generate(input_ids, **generation_kwargs)
    return tokenizer.decode(
        title_ids[0].tolist(),
        skip_special_tokens=True,
        clean_up_tokenization_spaces=False,
    )

In [None]:
# One title per T5 test prompt; beam_search=False selects the top-k / top-p sampling branch
t5_titles = [
    title_generation(plot, t5_tokenizer, t5_model, device, beam_search=False)
    for plot in test_plots_t5
]

In [None]:
# Sanity check: the number of prompts and the number of generated titles are printed side by side.
print(len(test_plots_t5))
print(len(t5_titles))

228
228


In [None]:
# Show the plot, the original title and the T5 title for the first 20 test samples
for plot, original_title, generated_title in zip(test_plots[:20], test_titles[:20], t5_titles[:20]):
    print(f"Plot: {plot}")
    print(f"Original title: {original_title}")
    print(f"Generated title: {generated_title}")
    print()

Plot: One year after the airing of the documentary, past and present employees of Dunder Mifflin gather for Dwight and Angela's wedding. Dwight initially chooses Jim to be his best man, but Michael Scott shows up and takes his place. Finally, everyone comes together for a final round of interviews, during which Erin reunites with her biological parents and everyone is brought to tears. 
Original title: Finale
Generated title: My Wedding

Plot: Dangle (and an unwilling Williams) take time to connect with his black half-brother and half-sister from his father's other family in Chicago.
Original title: Back in Black
Generated title: The One with a Black Brother and the Other

Plot: Dangle's ex-wife's husband proposes to him. A naked Wiegel gets stuck in a giant cake and goes into labor. Dangle holds his wedding at the hospital when Garcia bursts in and declares his love for Dangle's fiancee. Wiegel tells Dangle that she knows who her child's father is.
Original title: Dangle's Wedding (Pa

# Metrics

Since a generated title will rarely match the original one word for word, it is more appropriate to use metrics that capture semantics than metrics such as ROUGE, which compare surface words without considering their meaning or context. The generated title can be compared with the actual title in terms of context and catchiness, and with the plot in terms of context. The metrics below evaluate the two models along these lines.

## Catchiness score

I took the metric for measuring the catchiness of generated titles from the paper [TiZen: Neural Title Generation for Scientific Papers](https://harshiljain.in/pdf/TiZen_Paper.pdf). In the paper, the authors state that "*the basic intuition behind the definition of catchiness is that less
frequent or rare content words make a title catchy*", which leads to the following formulae:
> $TC_G=-\frac{\sum_{i=1}^m \textbf{plot_count}[actual[i]]}{m}$

> $TC_P=-\frac{\sum_{i=1}^n \textbf{plot_count}[predicted[i]]}{n}$

> $CS=TC_G-TC_P$

where $\textbf{plot_count}$ contains the counts of words
in the given plot, $\textbf{actual[i]}$ represents the
i-th word in the actual title whereas the $\textbf{predicted[i]}$ represents
the i-th word in the generated title, $\textbf{m}$ is the number of words
in the actual title and $\textbf{n}$ is the number of words in the generated title, $TC_G$ is the Title Catchiness Score of the actual title, $TC_P$ is the Title Catchiness Score of generated title, $CS$ is the Catchiness Score. 

In [None]:
# Tokens ignored when counting words: punctuation, symbols and the digits '7' and '5'.
stopwords = [' ', '.', '-', '7', '5', ',', "'", ';', '\n', '"', '(',')', ':', '—',
           '&', '–', '$', '#', '’', '/', '?', '“', '”', '!', '[', ']', '−', '+',
           '_', '%', '|', '=', '}', '̈', '́', '̀', '̃', '…']

# Count charged to a title word that never occurs in the plot.
_ABSENT_WORD_COUNT = 10

def _content_words(text):
  """Lower-case `text`, split it on whitespace and drop stopword tokens."""
  # split() without an argument never yields empty strings, so leading,
  # trailing or repeated spaces do not create phantom "words".
  return [w for w in text.lower().split() if w not in stopwords]

def title_catchiness(plot_count, title):
  """Return the Title Catchiness score (TC) of `title`.

  TC is minus the average, over the title's content words, of how often each
  word occurs in the plot. A word absent from `plot_count` is charged a count
  of 10.

  Args:
    plot_count: dict mapping lower-cased plot words to their occurrence count.
    title: title text to score.

  Returns:
    The TC value as a float; 0.0 when the title has no content words (empty
    or made only of stopwords), mirroring how `cos_similarity` scores
    sentences without embeddings, instead of raising ZeroDivisionError.
  """
  words = _content_words(title)
  if not words:
    return 0.0
  total = sum(plot_count.get(w, _ABSENT_WORD_COUNT) for w in words)
  return -total / len(words)

def catchiness_score(plot,original_title,generated_title):
  """Compute the TC of the original and of the generated title against `plot`.

  Args:
    plot: plot text used to build the word-count table.
    original_title: reference title.
    generated_title: title produced by a model.

  Returns:
    A tuple `(tc_g, tc_p)`: TC of the original title and TC of the generated
    title. The catchiness score CS = tc_g - tc_p is left to the caller.
  """
  plot_count = {}
  for w in _content_words(plot):
    plot_count[w] = plot_count.get(w, 0) + 1
  tc_g = title_catchiness(plot_count,original_title)
  tc_p = title_catchiness(plot_count,generated_title)
  return tc_g,tc_p
  

In [None]:
# Accumulate CS = TC_G - TC_P over the test samples for the GPT-2 titles
gpt_catchiness = 0

for plot_text, original, generated in zip(test_plots, test_titles, gpt_titles):
  actual_tc, generated_tc = catchiness_score(plot_text, original, generated)
  gpt_catchiness += actual_tc - generated_tc

In [None]:
# Accumulate CS = TC_G - TC_P over the test samples for the T5 titles
t5_catchiness = 0

for plot_text, original, generated in zip(test_plots, test_titles, t5_titles):
  actual_tc, generated_tc = catchiness_score(plot_text, original, generated)
  t5_catchiness += actual_tc - generated_tc

In [None]:
# Average catchiness score per model: accumulated CS divided by the number of generated titles.
print(f"The average catchiness score for GPT-2 is {gpt_catchiness/len(gpt_titles)}")
print(f"The average catchiness score for T5 is {t5_catchiness/len(t5_titles)}")

The average catchiness score for GPT-2 is 0.4921313700918962
The average catchiness score for T5 is -1.6447211779448612


## Cosine similarity

In [None]:
# Make `locale.getpreferredencoding()` always report UTF-8.
# NOTE(review): presumably a workaround so the `!` shell commands below run on
# Colab -- confirm it is still needed.
import locale
locale.getpreferredencoding = lambda: "UTF-8"

In [None]:
! wget -P /content/ http://nlp.stanford.edu/data/glove.6B.zip
! unzip /content/glove.6B.zip
! rm glove.6B.zip glove.6B.50d.txt glove.6B.100d.txt glove.6B.200d.txt  

--2023-06-09 07:29:23--  http://nlp.stanford.edu/data/glove.6B.zip
Resolving nlp.stanford.edu (nlp.stanford.edu)... 171.64.67.140
Connecting to nlp.stanford.edu (nlp.stanford.edu)|171.64.67.140|:80... connected.
HTTP request sent, awaiting response... 302 Found
Location: https://nlp.stanford.edu/data/glove.6B.zip [following]
--2023-06-09 07:29:23--  https://nlp.stanford.edu/data/glove.6B.zip
Connecting to nlp.stanford.edu (nlp.stanford.edu)|171.64.67.140|:443... connected.
HTTP request sent, awaiting response... 301 Moved Permanently
Location: https://downloads.cs.stanford.edu/nlp/data/glove.6B.zip [following]
--2023-06-09 07:29:24--  https://downloads.cs.stanford.edu/nlp/data/glove.6B.zip
Resolving downloads.cs.stanford.edu (downloads.cs.stanford.edu)... 171.64.64.22
Connecting to downloads.cs.stanford.edu (downloads.cs.stanford.edu)|171.64.64.22|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 862182613 (822M) [application/zip]
Saving to: ‘/content/glove.6B.z

In [None]:
# Read a GloVe text file into a word -> vector dictionary
def load_glove_embeddings(file_path):
    """Parse a whitespace-separated embedding file.

    Each line is expected to hold a word followed by its vector components.

    Args:
        file_path: Path of the UTF-8 text file to read.

    Returns:
        dict mapping each word to a float32 numpy array of its components.
    """
    with open(file_path, 'r', encoding='utf-8') as glove_file:
        rows = (line.split() for line in glove_file)
        return {row[0]: np.array(row[1:], dtype=np.float32) for row in rows}

# Normalize a sentence to lower case and tokenize it
def preprocess_sentence(sentence):
    """Return the `word_tokenize` tokens of the lower-cased `sentence`."""
    return word_tokenize(sentence.lower())

In [None]:
# Word -> vector dictionary built from the 300-dimensional GloVe 6B file.
glove_embeddings = load_glove_embeddings('/content/glove.6B.300d.txt')

In [None]:
def compute_embeddings(sentence1, sentence2, embeddings):
  """Look up the word vectors of two sentences.

  Each sentence is tokenized with `preprocess_sentence`; tokens that are not
  keys of `embeddings` are dropped.

  Args:
    sentence1: first sentence.
    sentence2: second sentence.
    embeddings: mapping from token to its vector.

  Returns:
    A tuple of two lists holding the vectors found for each sentence, in
    token order.
  """
  def lookup(sentence):
    return [embeddings[tok] for tok in preprocess_sentence(sentence) if tok in embeddings]

  return lookup(sentence1), lookup(sentence2)

In [None]:
def cos_similarity(embeddings1, embeddings2):
    """Average pairwise cosine similarity between two lists of word vectors.

    Args:
        embeddings1: vectors of the first sentence.
        embeddings2: vectors of the second sentence.

    Returns:
        The mean of the cosine-similarity matrix between the two sets of
        vectors, or 0.0 when either sentence has no vectors.
    """
    if embeddings1 and embeddings2:
        # Every vector of one sentence is compared with every vector of the
        # other; the score is the mean over all those pairs.
        pairwise = cosine_similarity(embeddings1, embeddings2)
        return pairwise.mean()

    # At least one sentence has no known word: score it as 0
    return 0.0

In [None]:
# Sum of the GPT-2 title vs. plot similarities over the test set
gpt_plot_title = 0

for plot_text, generated in zip(test_plots, gpt_titles):
  title_vecs, plot_vecs = compute_embeddings(generated, plot_text, glove_embeddings)
  gpt_plot_title += cos_similarity(title_vecs, plot_vecs)

In [None]:
# Sum of the T5 title vs. plot similarities over the test set
t5_plot_title = 0

for plot_text, generated in zip(test_plots, t5_titles):
  title_vecs, plot_vecs = compute_embeddings(generated, plot_text, glove_embeddings)
  t5_plot_title += cos_similarity(title_vecs, plot_vecs)

In [None]:
# Sum of the original title vs. GPT-2 title similarities over the test set
gpt_title_title = 0

for original, generated in zip(test_titles, gpt_titles):
  original_vecs, generated_vecs = compute_embeddings(original, generated, glove_embeddings)
  gpt_title_title += cos_similarity(original_vecs, generated_vecs)

In [None]:
# Sum of the original title vs. T5 title similarities over the test set
t5_title_title = 0

for original, generated in zip(test_titles, t5_titles):
  original_vecs, generated_vecs = compute_embeddings(original, generated, glove_embeddings)
  t5_title_title += cos_similarity(original_vecs, generated_vecs)

In [None]:
# Average generated-title vs. plot similarity per model: accumulated sum divided by the number of titles.
print(f"The average cosine similarity between plots and titles for GPT-2 is {gpt_plot_title/len(gpt_titles)}")
print(f"The average cosine similarity between plots and titles for T5 is {t5_plot_title/len(t5_titles)}")

The average cosine similarity between plots and titles for GPT-2 is 0.27216831143749387
The average cosine similarity between plots and titles T5 is 0.2391716864638096


In [None]:
# Average original-title vs. generated-title similarity per model: accumulated sum divided by the number of titles.
print(f"The average cosine similarity between titles and titles for GPT-2 is {gpt_title_title/len(gpt_titles)}")
print(f"The average cosine similarity between titles and titles for T5 is {t5_title_title/len(t5_titles)}")

The average cosine similarity between titles and titles for GPT-2 is 0.23295577256654373
The average cosine similarity between titles and titles for T5 is 0.22968915605145557


### Cosine similarity between plots and original titles

In [None]:
# Baseline: sum of the original title vs. plot similarities over the test set
cs_plot_title = 0

for plot_text, original in zip(test_plots, test_titles):
  title_vecs, plot_vecs = compute_embeddings(original, plot_text, glove_embeddings)
  cs_plot_title += cos_similarity(title_vecs, plot_vecs)

In [None]:
# Average original-title vs. plot similarity: accumulated sum divided by the number of test titles.
print(f"The average cosine similarity between plots and original titles is {cs_plot_title/len(test_titles)}")

The average cosine similarity between plots and original titles is 0.2160811990360615
