# Lab

##### Objective: The main purpose of this lab is to get familiar with NLP language models using the PyTorch library.

## Part 2 : Transformer (Text generation)

In [1]:
import torch
from transformers import GPT2Tokenizer, GPT2LMHeadModel
import numpy as np

import logging
logging.getLogger().setLevel(logging.CRITICAL)

import warnings
warnings.filterwarnings('ignore')

# Run on the GPU when PyTorch can see one; otherwise stay on the CPU.
device = 'cuda' if torch.cuda.is_available() else 'cpu'

In [2]:
# Fetch the pretrained GPT-2 medium tokenizer and language-model head, then
# place the model's parameters on the selected device in the same statement.
tokenizer = GPT2Tokenizer.from_pretrained('gpt2-medium')
model = GPT2LMHeadModel.from_pretrained('gpt2-medium').to(device)

model.safetensors:   0%|          | 0.00/1.52G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

In [3]:
def choose_from_top(probs, n=5):
    """Sample one index from the `n` largest entries of `probs`.

    The `n` highest-probability entries are renormalised so they sum to 1 and
    a single index is drawn from that restricted distribution using NumPy's
    global random state.

    Args:
        probs: 1-D array-like of non-negative scores (e.g. a softmax output).
        n: How many of the top entries to sample from. Values larger than
            ``len(probs)`` are clamped to ``len(probs)``.

    Returns:
        The chosen index into `probs`, as a Python ``int``.

    Raises:
        ValueError: If `probs` is empty or `n` is smaller than 1.
    """
    probs = np.asarray(probs)
    # argpartition raises when asked for more entries than exist, so clamp.
    n = min(int(n), probs.size)
    if n < 1:
        raise ValueError("choose_from_top needs a non-empty `probs` and n >= 1")

    # argpartition places the n largest values (in arbitrary order) last.
    top_ids = np.argpartition(probs, -n)[-n:]
    top_probs = probs[top_ids]
    top_probs = top_probs / np.sum(top_probs)  # Normalize
    picked = np.random.choice(n, 1, p=top_probs)[0]
    return int(top_ids[picked])

### Fine Tuning

To fine-tune the GPT-2 model, I will use the Quotes dataset from Kaggle: https://www.kaggle.com/datasets/alihasnainch/quotes-dataset

In [18]:
from torch.utils.data import Dataset
from torch.utils.data import Dataset, DataLoader
import os
import json
import csv

class QuotesDataset(Dataset):
    """Map-style dataset of quote strings read from a CSV file.

    Every item is a string of the form ``"QUOTE: <text><eos>"``, where
    ``<text>`` is taken from the second column of a CSV row and ``<eos>`` is
    the end-of-text marker.

    Args:
        quotes_dataset_path: Path to the CSV file. The first row is treated as
            a header and skipped. The quote text is read from column index 1
            (assumed to be the quote column of the Kaggle file; verify if a
            different CSV is used).
        end_of_text_token: String appended to each quote. When omitted, the
            notebook-level ``tokenizer.eos_token`` is used.
    """

    def __init__(self, quotes_dataset_path, end_of_text_token=None):
        super().__init__()

        # Only touch the global tokenizer when the caller did not pass a marker.
        if end_of_text_token is None:
            end_of_text_token = tokenizer.eos_token
        self.end_of_text_token = end_of_text_token
        self.quotes_list = []

        # newline='' is what the csv module expects so that line breaks inside
        # quoted fields survive; the encoding is pinned so that reading does
        # not depend on the platform's default locale.
        with open(quotes_dataset_path, newline='', encoding='utf-8') as csv_file:
            csv_reader = csv.reader(csv_file, delimiter=',')
            next(csv_reader, None)  # Skip header row (tolerates an empty file)
            for row in csv_reader:
                # Blank lines and truncated rows have no quote column.
                if len(row) < 2 or not row[1].strip():
                    continue
                quote_str = f"QUOTE: {row[1]}{self.end_of_text_token}"
                self.quotes_list.append(quote_str)

    def __len__(self):
        """Return the number of quotes that were loaded."""
        return len(self.quotes_list)

    def __getitem__(self, item):
        """Return the formatted quote string at position `item`."""
        return self.quotes_list[item]

# Read the quotes CSV from the working directory and wrap it in a loader that
# hands out one quote string per step, in a freshly shuffled order each pass.
quotes_dataset_path = 'Quotes_Dataset.csv'
dataset = QuotesDataset(quotes_dataset_path=quotes_dataset_path)
quote_loader = DataLoader(dataset=dataset, shuffle=True, batch_size=1)

In [19]:
# Fine-tuning hyperparameters.
BATCH_SIZE = 16        # packed sequences whose gradients are accumulated per optimizer step
EPOCHS = 5             # full passes over the quotes
LEARNING_RATE = 3e-5   # peak learning rate handed to the optimizer
WARMUP_STEPS = 5000    # optimizer steps over which the learning rate ramps up
MAX_SEQ_LEN = 400      # upper bound, in tokens, for one packed training sequence

# `transformers.AdamW` is deprecated and has been removed from recent
# transformers releases; the PyTorch implementation is the supported one.
# NOTE: torch.optim.AdamW defaults to weight_decay=0.01 whereas the transformers
# class defaulted to 0.0 -- pass weight_decay=0.0 to keep the old behaviour.
from torch.optim import AdamW
from transformers import get_linear_schedule_with_warmup

# Use the GPU when PyTorch can see one, otherwise fall back to the CPU.
device = 'cuda' if torch.cuda.is_available() else 'cpu'

In [20]:
# Fine-tune GPT-2 medium on the quotes. Several quotes are packed into each
# training sequence (up to MAX_SEQ_LEN tokens) and gradients are accumulated
# over BATCH_SIZE packed sequences before every optimizer step.
from transformers import get_constant_schedule_with_warmup

# Start from fresh pretrained weights so this cell can be re-run on its own.
model = GPT2LMHeadModel.from_pretrained('gpt2-medium').to(device)
tokenizer = GPT2Tokenizer.from_pretrained('gpt2-medium')
# from_pretrained() hands the model back in evaluation mode; switch to training
# mode so that dropout is active while fine-tuning.
model.train()

# weight_decay=0.0 is spelled out so the optimizer behaves the same whether the
# `AdamW` in scope comes from transformers (default 0.0) or torch.optim
# (default 0.01).
optimizer = AdamW(model.parameters(), lr=LEARNING_RATE, weight_decay=0.0)
# The number of optimizer steps is not known in advance because it depends on
# how the shuffled quotes pack together. A linear-decay schedule given
# num_training_steps=-1 evaluates to a learning-rate multiplier of 0 as soon as
# warmup ends, silently freezing the model, so warm up and then hold constant.
scheduler = get_constant_schedule_with_warmup(optimizer, num_warmup_steps=WARMUP_STEPS)
optimizer.zero_grad()

# Running state for gradient accumulation and progress reporting.
proc_seq_count = 0       # packed sequences back-propagated since the last optimizer step
sum_loss = 0.0           # loss summed since the last progress print
batch_count = 0          # optimizer steps since the last progress print
tmp_quotes_tens = None   # packed sequence currently being filled

# Create directory to save models
models_folder = "trained_models"
os.makedirs(models_folder, exist_ok=True)

# Training loop
# NOTE: the pack still being filled and any gradients accumulated since the
# last optimizer step carry over between epochs; whatever is left after the
# final epoch is not applied.
for epoch in range(EPOCHS):
    print(f"EPOCH {epoch} started" + '=' * 30)

    for quote in quote_loader:

        # The loader yields a one-element batch; encode it as a (1, seq_len) tensor.
        quote_tens = torch.tensor(tokenizer.encode(quote[0])).unsqueeze(0).to(device)

        # Skip sample if it is longer than MAX_SEQ_LEN
        if quote_tens.size(1) > MAX_SEQ_LEN:
            continue

        # First quote of a new packed sequence
        if not torch.is_tensor(tmp_quotes_tens):
            tmp_quotes_tens = quote_tens
            continue

        # Keep packing while the next quote still fits. The whole quote is
        # appended: the GPT-2 tokenizer adds no leading special token, so
        # slicing off the first position would cut into the "QUOTE:" prompt.
        if tmp_quotes_tens.size(1) + quote_tens.size(1) <= MAX_SEQ_LEN:
            tmp_quotes_tens = torch.cat([tmp_quotes_tens, quote_tens], dim=1)
            continue

        # The pack is full: train on it and start the next pack with this quote.
        work_quotes_tens = tmp_quotes_tens
        tmp_quotes_tens = quote_tens

        # Forward pass through the model; with labels supplied the first output is the loss.
        outputs = model(work_quotes_tens, labels=work_quotes_tens)
        loss = outputs[0]
        loss.backward()
        # .item() keeps a plain float rather than a growing chain of device tensors.
        sum_loss += loss.item()

        proc_seq_count += 1
        if proc_seq_count == BATCH_SIZE:
            proc_seq_count = 0
            batch_count += 1
            optimizer.step()
            scheduler.step()
            optimizer.zero_grad()

        # Report the accumulated loss every 100 optimizer steps.
        if batch_count == 100:
            print(f"sum loss {sum_loss}")
            batch_count = 0
            sum_loss = 0.0

    # Save the model after each epoch
    torch.save(model.state_dict(), os.path.join(models_folder, f"gpt2_medium_quotes_{epoch}.pt"))



In [None]:
import os

def choose_from_top(probs, n=5):
    """Draw one index at random from the `n` most probable entries of `probs`.

    This cell re-declares the sampling helper so that generation can be run
    without the earlier definition. The `n` largest entries are renormalised
    to sum to 1 and one of them is drawn using NumPy's global random state.

    Args:
        probs: 1-D array-like of non-negative scores (e.g. a softmax output).
        n: Number of top entries to sample from; clamped to ``len(probs)``
            when it is larger.

    Returns:
        The selected index into `probs`, as a Python ``int``.

    Raises:
        ValueError: If `probs` is empty or `n` is smaller than 1.
    """
    probs = np.asarray(probs)
    # Asking argpartition for more entries than exist raises, so clamp first.
    n = min(int(n), probs.size)
    if n < 1:
        raise ValueError("choose_from_top needs a non-empty `probs` and n >= 1")

    # The last n positions of the partition hold the n largest values (unordered).
    candidate_ids = np.argpartition(probs, -n)[-n:]
    candidate_probs = probs[candidate_ids]
    candidate_probs = candidate_probs / np.sum(candidate_probs)  # Normalize
    drawn = np.random.choice(n, 1, p=candidate_probs)[0]
    return int(candidate_ids[drawn])

# Load the fine-tuned weights saved at the end of the chosen epoch.
MODEL_EPOCH = 4
models_folder = "trained_models"
model_path = os.path.join(models_folder, f"gpt2_medium_quotes_{MODEL_EPOCH}.pt")
# map_location lets a checkpoint written on a GPU machine load on a CPU-only one.
model.load_state_dict(torch.load(model_path, map_location=device))
model.eval()

# File path for generated quotes
quotes_output_file_path = f'generated_{MODEL_EPOCH}.quotes'

NUM_QUOTES = 1000      # generation attempts
MAX_NEW_TOKENS = 100   # attempts that have not emitted end-of-text by then are dropped

# Loop-invariant values, computed once instead of on every generated token.
eos_token_id = tokenizer.eos_token_id
prompt_ids = tokenizer.encode("QUOTE:")

# Generate quotes. Opening in 'w' mode replaces any output from a previous run,
# and the explicit encoding keeps non-ASCII text writable on every platform.
with torch.no_grad(), open(quotes_output_file_path, 'w', encoding='utf-8') as f:
    for quote_idx in range(NUM_QUOTES):
        quote_finished = False

        # Start with the "QUOTE:" prompt as a (1, prompt_len) tensor
        cur_ids = torch.tensor(prompt_ids).unsqueeze(0).to(device)

        for i in range(MAX_NEW_TOKENS):
            # Without labels the first model output is the logits tensor, so
            # no loss is computed during generation.
            logits = model(cur_ids)[0]
            softmax_logits = torch.softmax(logits[0, -1], dim=0)  # Distribution for the next token

            # Sample from the top 20 candidates for the first three tokens,
            # then from the top 3 only.
            n = 20 if i < 3 else 3

            # Randomly select the next token from the top n probabilities
            next_token_id = choose_from_top(softmax_logits.to('cpu').numpy(), n=n)
            next_token = torch.tensor([[next_token_id]], device=device)
            cur_ids = torch.cat([cur_ids, next_token], dim=1)  # Append it to the sequence

            # Stop once the end-of-text token is produced
            if next_token_id == eos_token_id:
                quote_finished = True
                break

        # Only quotes that reached end-of-text are written out.
        if quote_finished:
            output_text = tokenizer.decode(cur_ids.squeeze().tolist())
            f.write(f"{output_text} \n\n")

print("Quote generation complete.")


