In [1]:
# This is developed from the unfinished code for 2023 submission 

In [19]:
%pip install scikit-learn

import os,sys,re
import torch
import numpy as np
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset
from torch.nn.utils.rnn import pad_sequence
from torch.utils.data import DataLoader, Subset
from tqdm import tqdm
from sklearn.model_selection import train_test_split
import nltk
from nltk.tokenize import word_tokenize, sent_tokenize
nltk.download('punkt')   # from nltk documentation in case it is not already downloaded
import wandb

Note: you may need to restart the kernel to use updated packages.


[nltk_data] Downloading package punkt to
[nltk_data]     /Users/charlesciumei/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [3]:
# Device configuration. Use Apple Silicon or AMD GPU if available
# from https://developer.apple.com/metal/pytorch/

if torch.backends.mps.is_available():
    mps_device = torch.device("mps")
    x = torch.ones(1, device=mps_device)
    print (x)
else:
    print ("MPS device not found.")

tensor([1.], device='mps:0')


In [78]:
# This builds a vocabulary. Text preprocessing handled in dataset class below 

class Vocab:
    def __init__(self, filepath, show_vocab=False):
        self.filepath = filepath
        self.word_to_idx = {"<PAD>": 0, "<UNK>": 1} # Reserve 0 and 1 indices for padding and unknown tokens
        self.idx_to_word = {0: "<PAD>", 1: "<UNK>"}
        self.show_vocab = show_vocab # boolean to print vocab for testing
        self._build_vocab()
        
    def _build_vocab(self):
        with open(self.filepath, 'r') as f:
            text = f.read().split()

        # Build the vocabulary, use sorted to ensure the same order
        unique_words = sorted(set(text))
        # print(unique_words) # used to test vocab contents
        for word in unique_words:
            if word not in self.word_to_idx:
                idx = len(self.word_to_idx)
                self.word_to_idx[word] = idx
                self.idx_to_word[idx] = word
        print('Vocabulary size: ', len(self.word_to_idx))
        if self.show_vocab:
            print(self.tokenised_list)
        else:
            pass

    def tokenize(self, text):
        # Split the text into words and convert to indices
        return [self.word_to_idx.get(word, self.word_to_idx["<UNK>"]) for word in text.split()]

    def detokenize(self, indices):
        # Convert indices to words and join them into a text
        return " ".join(self.idx_to_word.get(ix, "<UNK>") for ix in indices)

    def __len__(self):
        return len(self.word_to_idx)

# This loads and creates the dataset for training and testing and builds the vocabulary using Vocab class
# Code and use of nltk toolkit from 2023 Lab3 code and nltk documentation
# Dataset class use from Pytorch examples

class Textdataset(Dataset):
    def __init__(self, filepath, max_sequence_length=25, show_vocab=False):
        # self.start_token = "<BOS>"
        # self.end_token = "<EOS>"
        # self.text_tokenised = self._pre_process_text()
        self.filepath = filepath
        self.show_vocab = show_vocab
        self.vocab = Vocab(filepath)
        self.data = []
        self.max_sequence_length = max_sequence_length 
        self.sequence_builder()

    # Build the source and target sequences by iterating over tokenized text
    # length-1 otherwise last sequence is too short
    def sequence_builder(self):
        with open (self.filepath, 'r', encoding="utf-8") as f:
            text = self.vocab.tokenize(f.read())
        
        # make the sequences
        for i in range(len(text)-self.max_sequence_length-1): 
            source_sequence = text[i:i+self.max_sequence_length]
            # target=source shifted by 1
            target_sequence = text[i+1:i+self.max_sequence_length+1]
            self.data.append((source_sequence, target_sequence)) 

    def collate_fn(self, batch):
        # Pad the source sequences to the same length
        source_sequences, target_sequences = zip(*batch)
        source_sequences = pad_sequence(source_sequences, batch_first=True, padding_value=self.vocab.word_to_idx["<PAD>"])
        target_sequences = torch.stack(target_sequences, dim=0)
        return source_sequences, target_sequences
    
    def __len__(self):
        return len(self.data)
    
    def __getitem__(self, idx):
        # Return the item at the specified index
        # Returns a source_sequence and a target sequence pair
        source_sequence, target_sequence = self.data[idx]
        source_sequence = torch.tensor(source_sequence, dtype=torch.long)
        target_sequence = torch.tensor(target_sequence, dtype=torch.long)
        return source_sequence, target_sequence

    

In [79]:
# Test to show that the code above works to create dataset and a vocab.
# Uses print statement in Vocab class now commented out
# The test data is the first chapter of the the novel

dataset1 = Textdataset("dataset/Can_You_Forgive_Her_Ch1.txt", show_vocab=True)

Vocabulary size:  1019


In [80]:
dataset1[0]

(tensor([  85,   35,   17,   35,   59,   86,  140,   32,   23,   95,  697,  661,
          825,  979, 1014,  153,  921,  427,  522, 1014,  226,  313,  697,  313,
          669]),
 tensor([  35,   17,   35,   59,   86,  140,   32,   23,   95,  697,  661,  825,
          979, 1014,  153,  921,  427,  522, 1014,  226,  313,  697,  313,  669,
          192]))

In [54]:
# Construct model

class LSTMmodel(nn.Module):
    def __init__(self, vocab_size, embedding_dim, hidden_dim, n_layers):
        super(LSTMmodel, self).__init__()

        self.hidden_dim = hidden_dim
        self.n_layers = n_layers

        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.lstm = nn.LSTM(embedding_dim, hidden_dim, n_layers, batch_first=True)
        self.fc = nn.Linear(hidden_dim, vocab_size)

        self.vocab_size = vocab_size

    def forward(self, x):
        hidden = self.init_hidden(x.size(0))
        # Get word embeddings
        embedding_out = self.embedding(x)
        lstm_out, hidden = self.lstm(embedding_out, hidden)
        # Get predictions
        lstm_out = lstm_out.contiguous().view(-1, self.hidden_dim)
        out = self.fc(lstm_out)
        return out, hidden

    def init_hidden(self, batch_size):
        # Initialize hidden state with zero weights, and move to GPU if available
        weight = next(self.parameters()).data

        if torch.cuda.is_available():
            hidden = (weight.new(self.n_layers, batch_size, self.hidden_dim).zero_().cuda(),
                      weight.new(self.n_layers, batch_size, self.hidden_dim).zero_().cuda())
        else:
            hidden = (weight.new(self.n_layers, batch_size, self.hidden_dim).zero_(),
                      weight.new(self.n_layers, batch_size, self.hidden_dim).zero_())

        return hidden

In [51]:
class Logger:
    # define a logger for Wandb - taken from Lab 4 2024
    def __init__(self, project='CC_INM706_1'):
        logger = wandb.init(project=project)
        self.logger = logger
        return

    def get_logger(self):
        return self.logger


In [90]:
# train loop

def train(model, train_dataset, eval_dataset, epochs, batch_size, learning_rate):
    train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
    eval_loader = DataLoader(eval_dataset, batch_size=batch_size)
    criterion = nn.CrossEntropyLoss()
    optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)
    wandb_logger = Logger()
    logger = wandb_logger.get_logger()

    train_losses, train_perplexities, eval_losses, eval_perplexities = [], [], [], []
    for epoch in range(epochs):
        total_loss = 0.0
        total_tokens = 0

        model.train()
        for inputs, targets in tqdm(train_loader, 'Training', leave=False):
            # Move data to the proper device
            inputs = inputs.to(device)
            targets = targets.to(device)

            # Forward pass
            outputs, _ = model(inputs)
            targets = targets.view(-1)  # Reshape targets
            loss = criterion(outputs, targets)

            # Backward pass and optimization
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            # Keep track of total loss and tokens
            total_loss += loss.item() * targets.shape[0]
            total_tokens += targets.shape[0]

        train_loss = total_loss / total_tokens
        train_perplexity = np.exp(train_loss)
        train_losses.append(train_loss)
        train_perplexities.append(train_perplexity)

        # Evaluate after each epoch
        eval_loss, eval_perplexity = evaluate(model, eval_loader)
        eval_losses.append(eval_loss)
        eval_perplexities.append(eval_perplexity)

        print(f'Train Epoch {epoch+1}/{epochs}, Loss: {train_loss:.4f}, Perplexity: {train_perplexity:.2f}, Eval Loss: {eval_loss:.4f}, Eval Perplexity: {eval_perplexity:.2f}')

        # set up custom wandb log for eval perplexity based on code in wandb documentation
    
        # Set up data to log eval perplexity at each epoch
        if epoch == 0:
            plot_data = [] # to prevent resetting data to zero on every loop
        else:
            plot_data.append([epoch, eval_perplexity])
        
        # Create a table with the columns to plot
        table = wandb.Table(data=plot_data, columns=["epoch", "eval_p"])

        # Use the table to populate various custom charts
        line_plot = wandb.plot.line(table, x='epoch', y='eval_p', title='Line Plot')
  
        # Log custom tables, which will show up in customizable charts in the UI
        wandb.log({'line_1': line_plot,})

        # set up custom wandb log for training loss perplexity based on code in wandb documentation
    
        # Set up data to log eval loss at each epoch
        if epoch == 0:
            loss_data = [] # to prevent resetting data to zero on every loop
        else:
            loss_data.append([epoch, train_loss])
        
        # Create a table with the columns to plot
        loss_table = wandb.Table(data=loss_data, columns=["epoch", "loss"])

        # Use the table to populate various custom charts
        loss_plot = wandb.plot.line(loss_table, x='epoch', y='loss', title='Line Plot')
  
        # Log custom tables, which will show up in customizable charts in the UI
        wandb.log({'line_1': loss_plot,})


    # Save the model
    torch.save(model.state_dict(), 'LSTMmodel.pth')

# evaluate model to calculate perplexity of generated text
def evaluate(model, dataloader):
    criterion = nn.CrossEntropyLoss()

    model.eval()
    total_loss = 0.0
    total_tokens = 0


    with torch.no_grad():
        
        for inputs, targets in tqdm(dataloader, desc='Evaluating', leave=False):
            # Move data to the proper device
            inputs = inputs.to(device)
            targets = targets.to(device)

            # Forward pass
            outputs, _ = model(inputs)
            targets = targets.view(-1)  # Reshape targets
            loss = criterion(outputs, targets)

            # Keep track of total loss and tokens
            total_loss += loss.item() * targets.shape[0]
            total_tokens += targets.shape[0]


    eval_loss = total_loss / total_tokens
    eval_perplexity = np.exp(eval_loss)
    print(f'Eval Loss: {eval_loss:.4f}, Eval Perplexity: {eval_perplexity:.2f}')
    return eval_loss, eval_perplexity



In [92]:
# Running the model
dataset = Textdataset('dataset/Can_You_Forgive_Her_Chs_1_3.txt', max_sequence_length=25)
train_indices, eval_indices = train_test_split(list(range(len(dataset))), test_size=0.2, random_state=42)
train_dataset = Subset(dataset, train_indices)
eval_dataset = Subset(dataset, eval_indices)

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
vocab_size = len(dataset.vocab.word_to_idx)
embed_size = 128
hidden_size = 256
n_layers = 2

model = LSTMmodel(vocab_size, embed_size, hidden_size, n_layers)  #.to(device)
train(model, train_dataset, eval_dataset, epochs=25 , batch_size=32, learning_rate=0.001)

# Not where to put code to end wandb logging. From wandb documentation for use in Jupyter notebooks.
# run.finish()


Vocabulary size:  2845


                                                                                

Eval Loss: 5.6268, Eval Perplexity: 277.77
Train Epoch 1/25, Loss: 6.2966, Perplexity: 542.74, Eval Loss: 5.6268, Eval Perplexity: 277.77


                                                                                

Eval Loss: 4.3628, Eval Perplexity: 78.47
Train Epoch 2/25, Loss: 4.9731, Perplexity: 144.47, Eval Loss: 4.3628, Eval Perplexity: 78.47


                                                                                

Eval Loss: 3.2038, Eval Perplexity: 24.63
Train Epoch 3/25, Loss: 3.7423, Perplexity: 42.19, Eval Loss: 3.2038, Eval Perplexity: 24.63


                                                                                

Eval Loss: 2.2352, Eval Perplexity: 9.35
Train Epoch 4/25, Loss: 2.6390, Perplexity: 14.00, Eval Loss: 2.2352, Eval Perplexity: 9.35


                                                                                

Eval Loss: 1.5114, Eval Perplexity: 4.53
Train Epoch 5/25, Loss: 1.7728, Perplexity: 5.89, Eval Loss: 1.5114, Eval Perplexity: 4.53


                                                                                

Eval Loss: 1.0293, Eval Perplexity: 2.80
Train Epoch 6/25, Loss: 1.1655, Perplexity: 3.21, Eval Loss: 1.0293, Eval Perplexity: 2.80


                                                                                

Eval Loss: 0.7277, Eval Perplexity: 2.07
Train Epoch 7/25, Loss: 0.7737, Perplexity: 2.17, Eval Loss: 0.7277, Eval Perplexity: 2.07


                                                                                

Eval Loss: 0.5528, Eval Perplexity: 1.74
Train Epoch 8/25, Loss: 0.5397, Perplexity: 1.72, Eval Loss: 0.5528, Eval Perplexity: 1.74


                                                                                

Eval Loss: 0.4628, Eval Perplexity: 1.59
Train Epoch 9/25, Loss: 0.4068, Perplexity: 1.50, Eval Loss: 0.4628, Eval Perplexity: 1.59


                                                                                

Eval Loss: 0.4122, Eval Perplexity: 1.51
Train Epoch 10/25, Loss: 0.3296, Perplexity: 1.39, Eval Loss: 0.4122, Eval Perplexity: 1.51


                                                                                

Eval Loss: 0.3808, Eval Perplexity: 1.46
Train Epoch 11/25, Loss: 0.2823, Perplexity: 1.33, Eval Loss: 0.3808, Eval Perplexity: 1.46


                                                                                

Eval Loss: 0.3616, Eval Perplexity: 1.44
Train Epoch 12/25, Loss: 0.2510, Perplexity: 1.29, Eval Loss: 0.3616, Eval Perplexity: 1.44


                                                                                

Eval Loss: 0.3512, Eval Perplexity: 1.42
Train Epoch 13/25, Loss: 0.2297, Perplexity: 1.26, Eval Loss: 0.3512, Eval Perplexity: 1.42


                                                                                

Eval Loss: 0.3425, Eval Perplexity: 1.41
Train Epoch 14/25, Loss: 0.2131, Perplexity: 1.24, Eval Loss: 0.3425, Eval Perplexity: 1.41


                                                                                

Eval Loss: 0.3380, Eval Perplexity: 1.40
Train Epoch 15/25, Loss: 0.2006, Perplexity: 1.22, Eval Loss: 0.3380, Eval Perplexity: 1.40


                                                                                

Eval Loss: 0.3347, Eval Perplexity: 1.40
Train Epoch 16/25, Loss: 0.1911, Perplexity: 1.21, Eval Loss: 0.3347, Eval Perplexity: 1.40


                                                                                

Eval Loss: 0.3341, Eval Perplexity: 1.40
Train Epoch 17/25, Loss: 0.1831, Perplexity: 1.20, Eval Loss: 0.3341, Eval Perplexity: 1.40


                                                                                

Eval Loss: 0.3320, Eval Perplexity: 1.39
Train Epoch 18/25, Loss: 0.1771, Perplexity: 1.19, Eval Loss: 0.3320, Eval Perplexity: 1.39


                                                                                

Eval Loss: 0.3318, Eval Perplexity: 1.39
Train Epoch 19/25, Loss: 0.1722, Perplexity: 1.19, Eval Loss: 0.3318, Eval Perplexity: 1.39


                                                                                

Eval Loss: 0.3332, Eval Perplexity: 1.40
Train Epoch 20/25, Loss: 0.1681, Perplexity: 1.18, Eval Loss: 0.3332, Eval Perplexity: 1.40


                                                                                

Eval Loss: 0.3333, Eval Perplexity: 1.40
Train Epoch 21/25, Loss: 0.1647, Perplexity: 1.18, Eval Loss: 0.3333, Eval Perplexity: 1.40


                                                                                

Eval Loss: 0.3348, Eval Perplexity: 1.40
Train Epoch 22/25, Loss: 0.1615, Perplexity: 1.18, Eval Loss: 0.3348, Eval Perplexity: 1.40


                                                                                

Eval Loss: 0.3382, Eval Perplexity: 1.40
Train Epoch 23/25, Loss: 0.1589, Perplexity: 1.17, Eval Loss: 0.3382, Eval Perplexity: 1.40


                                                                                

Eval Loss: 0.3371, Eval Perplexity: 1.40
Train Epoch 24/25, Loss: 0.1569, Perplexity: 1.17, Eval Loss: 0.3371, Eval Perplexity: 1.40


                                                                                

Eval Loss: 0.3367, Eval Perplexity: 1.40
Train Epoch 25/25, Loss: 0.1550, Perplexity: 1.17, Eval Loss: 0.3367, Eval Perplexity: 1.40
