# Setup

## Import necessary libraries

In [1]:
# Bootstrap imports: importlib is used to reload the project modules below, while
# sys and os are used to extend the module search path
import importlib
import sys
import os

# Add the parent of the current working directory to sys.path so the project
# packages (`utils`, `classes`) can be imported from this notebook
sys.path.append(os.path.abspath(os.path.join(os.getcwd(), '..')))

# Import the project modules themselves first. At this point the names
# SpreadsheetDataLoader and TestRNN are handed to importlib.reload, so they are
# presumably the submodules of `classes`, not the classes -- TODO confirm
from utils import selfutil
from classes import SpreadsheetDataLoader, TestRNN

# Reload the modules so edits made on disk are picked up without restarting the kernel
importlib.reload(selfutil)
importlib.reload(SpreadsheetDataLoader)
importlib.reload(TestRNN)

# Pull the helper functions used throughout the notebook out of the reloaded module
from utils.selfutil import get_vocabulary, create_embeddings, to_gpu

# NOTE: these two imports rebind the names SpreadsheetDataLoader and TestRNN from
# whatever `from classes import ...` bound above to the objects defined inside
# each module; the rest of the notebook calls them as constructors
from classes.SpreadsheetDataLoader import SpreadsheetDataLoader
from classes.TestRNN import TestRNN

# Other regular imports
import torch.nn as nn
import torch
from tqdm import tqdm
import gc
import os
import pandas as pd
import math
import time

# Build Vocabulary

Get the vocabulary object from the helper function as well as the processed file paths.

In [2]:
# Location of the training spreadsheets
data_dir = '../data/train_small/'

# Build the vocabulary; the helper returns it together with the file paths
spreadsheet_vocab, file_paths = get_vocabulary(data_dir)

# Report the vocabulary size and the number of files in a single print call
print(
    f'\n\nVocabulary size: {len(spreadsheet_vocab._word2idx)}',
    f'Files Processed: {len(file_paths)}',
    sep='\n',
)

Processing Files in Parallel:   0%|                      | 0/10 [00:00<?, ?it/s]

  0%|          | 0/10 [00:00<?, ?it/s]



Vocabulary size: 10010
Files Processed: 10


# Word Embeddings

Create a vector representation for each word in the vocabulary using GloVe, which represents each word as a 50-dimensional vector; any word without a GloVe vector is assigned a normally distributed random vector instead.

**NOTE**: "Word embeddings" and "word vectors" refer to the same thing; the two terms are used interchangeably.

In [3]:
# Build the embedding matrix for the vocabulary (one row per vocabulary entry)
spreadsheet_wvs = create_embeddings(spreadsheet_vocab)

# Show the matrix shape followed by the row stored at index 0
print(
    f'Word Embeddings Shape: {spreadsheet_wvs.shape}',
    f'\nExample Embedding for <unk> at index 0:\n{spreadsheet_wvs[0]}',
    sep='\n',
)


  0%|                                                 | 0/10010 [00:00<?, ?it/s][A
100%|██████████████████████████████████| 10010/10010 [00:00<00:00, 80004.77it/s][A

Word Embeddings Shape: torch.Size([10010, 50])

Example Embedding for <unk> at index 0:
tensor([-1.1258, -1.1524, -0.2506, -0.4339,  0.8487,  0.6920, -0.3160, -2.1152,
         0.3223, -1.2633,  0.3500,  0.3081,  0.1198,  1.2377,  1.1168, -0.2473,
        -1.3527, -1.6959,  0.5667,  0.7935,  0.5988, -1.5551, -0.3414,  1.8530,
         0.7502, -0.5855, -0.1734,  0.1835,  1.3894,  1.5863,  0.9463, -0.8437,
        -0.6136,  0.0316,  1.0554,  0.1778, -0.2303, -0.3918,  0.5433, -0.3952,
         0.2055, -0.4503,  1.5210,  3.4105, -1.5312, -1.2341,  1.8197, -0.5515,
        -1.3253,  0.1886])





# Data Loader

The DataLoader standardizes the data into uniform batches. Each spreadsheet is represented as a 100x100x32 LongTensor: rows and columns are standardized to 100x100, and each cell holds exactly 32 tokens. Each entry of the `x_tok` list therefore corresponds to a single spreadsheet.

In [4]:
# Wrap the processed files and the vocabulary in the project data loader
check_loader = SpreadsheetDataLoader(file_paths, spreadsheet_vocab)

# Summarize how many spreadsheets were loaded and the per-spreadsheet tensor shapes
print(
    f'Spreadsheets Processed: {len(check_loader)}',
    f'x_tok Tensor Shape: {check_loader.x_tok[0].shape}',
    f'y_tok Tensor Shape: {check_loader.y_tok[0].shape}',
    sep='\n',
)


Processing files: 100%|███████████████████████| 10/10 [00:00<00:00, 5799.65it/s][A


Spreadsheets Processed: 10
x_tok Tensor Shape: torch.Size([100, 100, 32])
y_tok Tensor Shape: torch.Size([100, 100, 17])


# Tester

In [5]:
# import torch
# import torch.nn as nn

# class TestRNN(nn.Module):

#     # Constructor of the RNN_LM class, initializing the layers and weights
#     def __init__(self, hidden_state_dim, rnn_layers, embedding_matrix, dropout_rate=0.05, nonlinearity='relu'):

#         # Ensures functions of parent class nn.Module are called in subclass RNN_LM
#         super(TestRNN, self).__init__()

#         # Rows of embed matrix = Each word in the vocabulary
#         self.vocab_size = embedding_matrix.shape[0]  # vocab_size = 34057

#         # Cols of embed matrix = Length of each embedding vector
#         self.embedding_dim = embedding_matrix.shape[1]  # embed_dim = 50

#         # The dimension of the hidden state vector 'h' for each step/token
#         self.hidden_dim = hidden_state_dim  # hid_dim = 100

#         # Number of recurrent layers we will use
#         self.rnn_layers = rnn_layers  # rnn_layers = 2

#         # Creates an embedding layer from the pre-trained embedding matrix that maps input tokens to their corresponding word vectors
#         # If freezing then embeddings don't change during training, we need False because we need them to finetune to our task
#         self._embed = nn.Embedding.from_pretrained(embedding_matrix, freeze=False)

#         # Randomly zeroes out a percentage of input units determined by dropout_rate for each update during training
#         self._drop = nn.Dropout(dropout_rate)

#         # RNN layer with 'relu' nonlinearity but not managing exploding gradients, dropout and multiple recurrent layers
#         self._rnn = nn.RNN(
#             self.embedding_dim,
#             self.hidden_dim,
#             self.rnn_layers,
#             nonlinearity=nonlinearity,
#             dropout=dropout_rate
#         )

#         # Linear layer to map the concatenated hidden states to logits (1 to predict bold or not)
#         self._pred = nn.Linear(2 * self.hidden_dim, 1)

#     def cell_hs(self, x):

#         torch.manual_seed(0)

#         """
#         Create a single CPU tensor to reduce GPU load to near zero and return H_global directly by calculating 
#         using this tensor and then casting it to the GPU device
#         """

#         H_local = torch.stack(
#             [
#                 self._rnn(
#                     self._drop(
#                         self._embed(
#                             x[:, cell // x.shape[2], cell % x.shape[2], :]  # batch_size x tokens = 10 x 32
#                         )  # batch_size x tokens x embed_dim = 10 x 32 x 50
#                     )
#                 )[1][-1, -1, :].view(-1)  # hidden_dim = 100
#                 for cell in range(x.shape[1] * x.shape[2])  # cells x hidden_dim = 10000 x 100
#             ]  # cells x hidden_dim = 10000 x 100
#         )  # cells x hidden_dim = 10000 x 100


#         # Calculate global sum directly and cast to GPU then return
#         return H_local.sum(dim=0, keepdim=True) - H_local  # cells x hidden_dim = 10000 x 100

#     def forward(self, x):
#         torch.manual_seed(0)

#         # Global hidden states containing info around current cell
#         H_global = self.cell_hs(x)  # Move H_global to GPU

#         S_cube = torch.zeros((x.shape[0], x.shape[1], x.shape[2]), device=x.device)

#         for cell in range(x.shape[1] * x.shape[2]):

#             # Compute the predictions for each cell using list comprehension
#             S_cube[:, cell // x.shape[2], cell % x.shape[2]] = self._pred(
#                 self._drop(
#                     torch.cat(
#                         (
#                             self._rnn(
#                                 self._drop(
#                                     self._embed(
#                                         x[:, cell // x.shape[2], cell % x.shape[2], :]
#                                     )
#                                 )
#                             )[0][:, -1, :].squeeze(1),
#                             H_global[cell].unsqueeze(0).expand(x.shape[0], -1)
#                         ),
#                         dim=1
#                     )
#                 )
#             ).view(-1)

#         # Clean up to free memory
#         del H_global

#         # Return final 3D tensor containing vocab tensors for a single batch
#         return S_cube

# #     # Function to calculate the last hidden state for each cell
# #     def init_hidden(self, batch_size):
# #         weight = next(self.parameters())
# #         return weight.new_zeros(self.rnn_layers, batch_size, self.hidden_dim).detach().cpu()


In [6]:
# # Create a DataLoader from your check_loader
# test_loader = torch.utils.data.DataLoader(check_loader, batch_size=5, shuffle=False)

# # Get one batch from the DataLoader
# batch = next(iter(test_loader))

# exfile = to_gpu(batch['x_tok'],2)

# # Define a new neural network model to be trained and transfer it to GPU
# hidden_state_dim = 100
# rnn_layers = 2
# rnn_model = to_gpu(TestRNN(hidden_state_dim, rnn_layers, spreadsheet_wvs),2)

# out = rnn_model.forward(exfile)

# # Print the shape of S_cube
# print("S_cube shape:", out.shape)

# Training Function

In [7]:
def train_test(model, train_data, batch_size=8, lr=1.4e-5, mu=0.25, max_epochs=5, patience=2, save_int = 1, save_dir='../models/'):
    """
    Train `model` on `train_data` with Adagrad and a class-weighted
    BCE-with-logits loss, stopping early once the training perplexity stops
    improving. Progress is printed and appended to a timestamped log file in
    `save_dir`; the model weights are checkpointed to the same directory.

    Args:
        model: Module called as `model(x_tok_batch)`; its output is flattened
            and compared against the flattened labels.
        train_data: Dataset whose batches are dicts with 'x_tok' and 'y_tok'
            entries. Index 6 of the last 'y_tok' axis is used as the binary
            (bold / not bold) target.
        batch_size (int): Number of spreadsheets per batch.
        lr (float): Adagrad learning rate.
        mu (float): Maximum gradient norm used for gradient clipping.
        max_epochs (int): Upper bound on the number of epochs.
        patience (int): Number of consecutive epochs without a new best
            perplexity after which training stops early.
        save_int (int): Save a checkpoint every `save_int` epochs. Values <= 0
            disable the periodic checkpoint; the final one is still written.
        save_dir (str): Existing directory for the checkpoint and log file.

    Returns:
        The same `model` object after training.

    Raises:
        ValueError: If `save_dir` does not exist.
    """

    # The log file and the final checkpoint are written regardless of save_int,
    # so the directory is always required
    if not os.path.exists(save_dir):
        raise ValueError(f"Directory '{save_dir}' DNE")

    # Define the path where to save model and logfile
    timestamp = time.strftime("%Y%m%d_%H%M%S")
    model_path = os.path.join(save_dir, f"rnnsmall_{timestamp}.pth")
    log_file = os.path.join(save_dir, f"rnnsmall_{timestamp}.txt")

    def _append_log(text):
        # Append one entry to this run's log file
        with open(log_file, 'a') as log:
            log.write(text)

    # Setup optimizer
    opt = torch.optim.Adagrad(model.parameters(), lr=lr)

    # Convert incoming training data into shuffled batches
    train_loader = torch.utils.data.DataLoader(train_data, batch_size=batch_size, shuffle=True)

    # Count bold cells and total cells from the labels themselves, so a partial
    # last batch or a different grid size cannot skew the class ratio
    num_bold_cells = 0
    num_cells = 0
    for batch in train_loader:
        labels = batch['y_tok'][:, :, :, 6]
        num_bold_cells += int((labels == 1).sum())
        num_cells += labels.numel()

    # Weight positives by the non-bold:bold ratio. With no bold cells at all the
    # ratio is undefined (it would make the loss NaN), so fall back to no reweighting
    if num_bold_cells > 0:
        pos_weight_value = (num_cells - num_bold_cells) / num_bold_cells
    else:
        pos_weight_value = 1.0

    # Binary Cross-Entropy Loss with Logits
    # NOTE(review): the second argument of to_gpu is presumably a GPU index -- confirm
    loss_fn = nn.BCEWithLogitsLoss(
        pos_weight=to_gpu(torch.tensor([pos_weight_value], dtype=torch.float), 2)
    )

    # Best average training loss / perplexity seen so far and the epoch they came from
    best_avgtrloss = float('inf')
    best_perp = float('inf')
    best_epoch = 0

    # Epochs without improvement counter
    nimp_ctr = 0

    for epoch in range(max_epochs):

        # Print the epoch number and write to file also
        print(f'Epoch {epoch}')
        _append_log(f"\nEpoch {epoch}\n")

        # Turn on training mode which enables dropout.
        model.train()

        # Sum of the per-batch losses for this epoch
        curr_trloss = 0

        for batch in tqdm(train_loader, desc='Batch Processing'):

            # Clear any remaining gradients
            model.zero_grad()

            # Predicted logits from the model and the matching labels, both flattened
            logits = model(to_gpu(batch['x_tok'], 2)).view(-1)
            targets = to_gpu(batch['y_tok'][:, :, :, 6], 2).view(-1).float()
            loss = loss_fn(logits, targets)

            # Accumulate the training loss
            curr_trloss += loss.item()

            # Backpropagate, clip the gradient norm to `mu`, then update the parameters
            loss.backward()
            torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=mu)
            opt.step()

            # Drop references to this batch's tensors before the next batch is built
            del logits, targets, loss

        # Calculate average training loss for this epoch
        curr_avgtrloss = curr_trloss / len(train_loader)

        # Perplexity = e^(summed batch loss / total number of label cells).
        # NOTE(review): with the loss's default 'mean' reduction each batch loss
        # is already a per-cell average, so this value stays very close to 1; it
        # is still monotonic in the loss, which is all early stopping relies on
        curr_perp = math.exp(curr_trloss / num_cells)

        # Print the average training loss, perplexity for this current epoch and write to file
        print(f'Train Loss: {curr_avgtrloss}, Perplexity: {curr_perp}')
        _append_log(f'Train Loss: {curr_avgtrloss}, Perplexity: {curr_perp}\n')

        # Save the model and log if current epoch is a multiple of save_int
        if save_int > 0 and (epoch + 1) % save_int == 0:
            torch.save(model.state_dict(), model_path)
            print("Model Saved")
            _append_log("Model Saved\n")

        # Track the best epoch; any non-improving epoch counts against patience
        if curr_perp < best_perp:
            best_perp = curr_perp
            best_avgtrloss = curr_avgtrloss
            best_epoch = epoch
            nimp_ctr = 0
        else:
            nimp_ctr += 1

        # Stop once the epochs without improvement reach the patience threshold
        stop_early = nimp_ctr >= patience
        if stop_early:
            early_msg = f"\nEARLY STOP Epoch {epoch}, Best: Epoch = {best_epoch}, Train Loss = {best_avgtrloss}, Perplexity = {best_perp}"
            print(early_msg)
            _append_log(early_msg)

        # Blank line between epochs
        print()

        if stop_early:
            break

    # Save the final weights at the end of training (or early stopping)
    torch.save(model.state_dict(), model_path)

    # Print training complete message
    done_msg = f"\nTRAINING DONE Best: Epoch = {best_epoch}, Train Loss = {best_avgtrloss}, Perplexity = {best_perp}"
    print(done_msg)
    _append_log(done_msg)

    # Return trained model at the end
    return model

Processing Files in Parallel:   0%|                      | 0/10 [00:20<?, ?it/s]


In [8]:
# Define a new neural network model to be trained
hidden_state_dim = 100
rnn_layers = 3
test_model = TestRNN(hidden_state_dim, rnn_layers, spreadsheet_wvs)

# # Load the state_dict from the saved file
# state_dict = torch.load('../models/rnnsmall_20240930_070804.pth')

# # Load the weights into the model
# test_model.load_state_dict(state_dict)

# Move the model to GPU device 2 and keep the handle that to_gpu returns, so the
# object that gets trained is the one that was moved
test_model = to_gpu(test_model, 2)

# Train the model; the name `trained_testmodel` is only bound once training has run.
# With save_int=5 and max_epochs=2 no periodic checkpoint fires, only the final save
trained_testmodel = train_test(
    test_model, check_loader,
    batch_size=5, lr=1.4e-5, mu=0.25, max_epochs=2, patience=1,
    save_int=5, save_dir='../models/'
)

Epoch 0


Batch Processing: 100%|███████████████████████████| 2/2 [01:03<00:00, 31.61s/it]


Train Loss: 328.30101013183594, Perplexity: 1.0065876237705587

Epoch 1


Batch Processing: 100%|███████████████████████████| 2/2 [01:01<00:00, 30.72s/it]

Train Loss: 313.88185119628906, Perplexity: 1.0062973826842998


TRAINING DONE Best: Epoch = 1, Train Loss = 313.88185119628906, Perplexity = 1.0062973826842998





# Inference

In [9]:
def infer(trained_model, infer_loader):
    """
    Run the trained model on the first spreadsheet held by `infer_loader` and
    return the predicted bold probabilities alongside the ground truth.

    The raw logits are printed at high precision for inspection.

    Args:
        trained_model (nn.Module): The trained PyTorch model. It is switched to
            eval mode by this function.
        infer_loader: Loader exposing indexable `x_tok`, `y_tok` and
            `file_paths` attributes; only index 0 of each is used.

    Returns:
        tuple: `(pred_probs, actual, file_path)` where
            - `pred_probs` is a CPU tensor of sigmoid probabilities (the model
              output with its leading batch dimension squeezed away),
            - `actual` is `infer_loader.y_tok[0][:, :, 6]`, the label channel
              used as the bold target,
            - `file_path` is `infer_loader.file_paths[0]`.
    """

    # Move the model to eval mode
    trained_model.eval()

    # Take the first spreadsheet and add a batch dimension of size 1
    x_infer = to_gpu(infer_loader.x_tok[0].unsqueeze(0), 2)

    # Forward pass only; no gradients are needed for inference
    with torch.no_grad():
        predictions = trained_model(x_infer)

    # Remove the batch dimension to get the per-cell logit grid
    pred_grid = predictions.squeeze(0)

    # Print the logits with extra decimal places, then put the global print
    # precision back to torch's documented default (4) so the setting does not
    # leak into every later tensor print in the notebook
    torch.set_printoptions(precision=20)
    try:
        print(pred_grid)
    finally:
        torch.set_printoptions(precision=4)

    # Convert logits to probabilities
    pred_probs = torch.sigmoid(pred_grid)

    # Return the probability grid, the actual labels and the source file path
    return pred_probs.detach().cpu(), infer_loader.y_tok[0][:, :, 6], infer_loader.file_paths[0]

In [10]:
# Hyperparameters (hidden state size, number of RNN layers); same values as the training cell
hidden_state_dim, rnn_layers = 100, 3

# Optional path, currently disabled: rebuild the model, restore its weights from a
# saved checkpoint and move it to GPU device 2
#test_model = TestRNN(hidden_state_dim, rnn_layers, spreadsheet_wvs)
#state_dict = torch.load('../models/rnnsmall_20240930_065323.pth')
#test_model.load_state_dict(state_dict)
#trained_testmodel = to_gpu(test_model, 2)

In [11]:
# Define the directory to be used for inference
infer_dir = '../data/infer_small/'

# List all files in the directory and append the full path
infer_files = [
    os.path.join(infer_dir, filename)
    for filename in os.listdir(infer_dir)
    if filename.lower().endswith(('.xls', '.xlsx', '.csv'))  # Adjust the file extensions as needed
]

# Define the dataloader for inference
infer_loader = SpreadsheetDataLoader(infer_files, spreadsheet_vocab)

Processing files: 100%|██████████████████████████| 1/1 [00:00<00:00, 989.92it/s]


In [12]:
# Run inference on the first spreadsheet: predicted probabilities, actual labels, file path
pred_bold, act_bold, file = infer(trained_testmodel, infer_loader)

# Convert to pandas DataFrames for better display
pred_df = pd.DataFrame(pred_bold.numpy())
actual_df = pd.DataFrame(act_bold.numpy())

# Print the filename
print(f"\nFilename: {file}")

# Show the predicted probabilities (infer returns probabilities, not 0/1 labels)
print("\nPredicted bold probabilities (> 0.5 = Bold):")
display(pred_df)

# Print the actual grid
print("\nActual Grid (1 = Bold, 0 = Not Bold):")
display(actual_df)

# Find the cells predicted bold. The predictions are probabilities, so they are
# thresholded at 0.5; comparing against exactly 1 would miss almost every bold cell
bold_mask = (pred_df > 0.5).stack()
bold_pred_locations = bold_mask[bold_mask].index.tolist()

# Print the bold predictions
if bold_pred_locations:
    print(f"Bold predictions at the following row, col locations: {bold_pred_locations}")
else:
    print("No bold predictions were made by the model.")

tensor([[-208.17042541503906250000, -208.17042541503906250000,
         -208.17042541503906250000,  ...,
         -208.17042541503906250000, -208.17042541503906250000,
         -208.17042541503906250000],
        [-208.17042541503906250000, -208.17042541503906250000,
         -208.17042541503906250000,  ...,
         -208.17042541503906250000, -208.17042541503906250000,
         -208.17042541503906250000],
        [-208.17042541503906250000, -208.17042541503906250000,
         -208.17042541503906250000,  ...,
         -208.17042541503906250000, -208.17042541503906250000,
         -208.17042541503906250000],
        ...,
        [-208.17042541503906250000, -208.17042541503906250000,
         -208.17042541503906250000,  ...,
         -208.17042541503906250000, -208.17042541503906250000,
         -208.17042541503906250000],
        [-208.17042541503906250000, -208.17042541503906250000,
         -208.17042541503906250000,  ...,
         -208.17042541503906250000, -208.17042541503906250000,

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,90,91,92,93,94,95,96,97,98,99
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
96,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
97,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
98,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0



Actual Grid (1 = Bold, 0 = Not Bold):


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,90,91,92,93,94,95,96,97,98,99
0,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
96,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
97,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
98,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


No bold predictions were made by the model.
