# Setup

## Import necessary libraries

In [1]:
# Import importlib to reload modules and sys and os to add the path for other imports
import importlib
import sys
import os

# Append the parent directory to the path to import the necessary modules
sys.path.append(os.path.abspath(os.path.join(os.getcwd(), '..')))

# Import the utilities and the dataloader
from utils import selfutil
from classes import SpreadsheetDataLoader, TestRNN

# Now reload the modules to ensure they are up-to-date
importlib.reload(selfutil)
importlib.reload(SpreadsheetDataLoader)
importlib.reload(TestRNN)

# Import the funcs needed from utils
from utils.selfutil import get_vocabulary, create_embeddings, to_gpu

# Import the SpreadsheetDataLoader class
from classes.SpreadsheetDataLoader import SpreadsheetDataLoader
from classes.TestRNN import TestRNN

# Other regular imports
import torch.nn as nn
import torch
from tqdm import tqdm
import gc
import os
import pandas as pd
import math
import time
import numpy as np
import random

# Setup device as a global constant
devstr = "cuda:1" # "cpu"

# CUDA is only usable when it was requested AND this runtime actually exposes it
_cuda_ok = (devstr != 'cpu') and torch.cuda.is_available()

if devstr == 'cpu':
    # Explicit CPU request (kept as the plain string, exactly as before)
    DEVICE = 'cpu'
elif not _cuda_ok:
    # A GPU was requested but none is available: fall back to the CPU
    DEVICE = torch.device('cpu')
elif devstr:
    DEVICE = torch.device(devstr)
else:
    # Empty device string: use the currently selected CUDA device, wrapped in a
    # torch.device so DEVICE never ends up as a bare integer index
    DEVICE = torch.device('cuda', torch.cuda.current_device())

# True only when DEVICE really is a GPU, so the flag can never disagree with DEVICE
gpu = _cuda_ok
print(DEVICE)

cuda:1


# Vocab and DataLoader 

## Vocabulary

Get the vocabulary object from the helper function as well as the processed file paths.

In [2]:
# Directory that holds the training spreadsheets
data_dir = '../data/train_big/'

# Build the vocabulary; the helper also hands back the paths of the files it processed
spreadsheet_vocab, file_paths = get_vocabulary(data_dir)

# Report how many distinct tokens and how many files were found
vocab_size = len(spreadsheet_vocab._word2idx)
print(
    f'\n\nVocabulary size: {vocab_size}',
    f'Files Processed: {len(file_paths)}',
    sep='\n',
)

PARA - Processing Files:   0%|          | 0/804 [00:00<?, ?it/s]



Vocabulary size: 105105
Files Processed: 804


## Word Embeddings

Create a vector representation for each word in the vocabulary using GloVe: a word covered by GloVe gets its 50-dimensional GloVe vector, and any other word gets a normally distributed random vector.

**NOTE**: "Word embeddings" and "word vectors" mean the same thing here; the two terms are used interchangeably.

In [3]:
# Build one embedding row per vocabulary entry
spreadsheet_wvs = create_embeddings(spreadsheet_vocab)

# Summarise the embedding matrix, then show the row stored at index 0
wvs_shape = spreadsheet_wvs.shape
print(f'Word Embeddings Shape: {wvs_shape}')

unk_embedding = spreadsheet_wvs[0]
print(f'\nExample Embedding for <unk> at index 0:\n{unk_embedding}')

  0%|          | 0/105105 [00:00<?, ?it/s]

Word Embeddings Shape: torch.Size([105105, 50])

Example Embedding for <unk> at index 0:
tensor([-1.1258, -1.1524, -0.2506, -0.4339,  0.8487,  0.6920, -0.3160, -2.1152,
         0.3223, -1.2633,  0.3500,  0.3081,  0.1198,  1.2377,  1.1168, -0.2473,
        -1.3527, -1.6959,  0.5667,  0.7935,  0.5988, -1.5551, -0.3414,  1.8530,
         0.7502, -0.5855, -0.1734,  0.1835,  1.3894,  1.5863,  0.9463, -0.8437,
        -0.6136,  0.0316,  1.0554,  0.1778, -0.2303, -0.3918,  0.5433, -0.3952,
         0.2055, -0.4503,  1.5210,  3.4105, -1.5312, -1.2341,  1.8197, -0.5515,
        -1.3253,  0.1886])


## Data Loader

The DataLoader standardizes the data into uniform batches. Each spreadsheet is represented as a 100x100x32 LongTensor: rows and columns are standardized to 100x100, and each cell holds exactly 32 tokens. Each index in the `x_tok` list therefore corresponds to a single spreadsheet.

In [4]:
# Tokenise every training spreadsheet with the shared vocabulary
check_loader = SpreadsheetDataLoader(file_paths, spreadsheet_vocab)

# Report how many spreadsheets were loaded and the shape of the first input/label tensors
print(f'Spreadsheets Processed: {len(check_loader)}')
for tensor_name, tensor_list in (('x_tok', check_loader.x_tok), ('y_tok', check_loader.y_tok)):
    print(f'{tensor_name} Tensor Shape: {tensor_list[0].shape}')

Processing files: 100%|███████████████████████| 804/804 [00:36<00:00, 22.05it/s]


Spreadsheets Processed: 804
x_tok Tensor Shape: torch.Size([100, 100, 32])
y_tok Tensor Shape: torch.Size([100, 100, 17])


In [5]:
# NOTE(review): this whole cell is disabled (commented out). It is kept as a template for
# building a validation loader from a random sample of `file_length` files found in `val_dir`.
# Caution before re-enabling: the chained assignment `val_paths = file_paths = [...]` below
# would also overwrite the training `file_paths` returned by get_vocabulary.
# The two extra arguments (10, 10) passed to SpreadsheetDataLoader are not used anywhere else
# in this notebook — presumably they set the row/column limits; confirm against the class.
# # Validation file paths
# val_dir = '../data/val_big/'
# file_length = 10
# val_paths = file_paths = [ 
#     os.path.join(val_dir, filename) 
#     for filename in os.listdir(val_dir) 
#     if filename.lower().endswith(('.xls', '.xlsx', '.csv')) and os.path.isfile(os.path.join(val_dir, filename)) 
# ]
# val_paths = random.sample(val_paths, file_length)
# print(val_paths)

# # Dataloader for val
# val_loader = SpreadsheetDataLoader(val_paths, spreadsheet_vocab, 10, 10)
# print(f'Spreadsheets Processed: {len(val_loader)}')
# print(f'x_tok Tensor Shape: {val_loader.x_tok[0].shape}')
# print(f'y_tok Tensor Shape: {val_loader.y_tok[0].shape}')

Check the DataLoader with model by using a single file

In [6]:
# NOTE(review): this whole cell is disabled (commented out). It is a smoke test that pushes
# one batch of 5 spreadsheets through an untrained TestRNN to check that shapes line up.
# It places tensors with to_gpu(..., 1) rather than with the DEVICE constant from the setup
# cell; presumably the second argument is a GPU index — confirm before re-enabling.
# # Create a DataLoader from your check_loader
# test_loader = torch.utils.data.DataLoader(check_loader, batch_size=5, shuffle=False)

# # Get one batch from the DataLoader
# batch = next(iter(test_loader))

# # Move the extracted x_tok to gpu
# exfile = to_gpu(batch['x_tok'],1)

# # Define a new neural network model to be trained and transfer it to GPU
# hidden_state_dim = 100
# rnn_layers = 2
# rnn_model = to_gpu(TestRNN(hidden_state_dim, rnn_layers, spreadsheet_wvs),1)

# # Observe the model
# rnn_model.forward(exfile)

# Training Function

In [8]:
def _append_log(log_file, text):
    """Append `text` to `log_file`; a `log_file` of None means logging is disabled."""
    if log_file is None:
        return
    with open(log_file, 'a') as log:
        log.write(text)


def train_test(model, train_loader, batch_size=8, lr=1.4e-5, mu=0.25, max_epochs=4, patience=1, save_int = 2, save_dir='../models/'):
    """
    Train `model` to predict, for every spreadsheet cell, whether the cell is bold.

    The dataset is wrapped in a shuffling DataLoader, a class-weighted binary
    cross-entropy loss is built from the bold / non-bold cell counts, and the model is
    optimised with Adagrad and gradient-norm clipping. Training stops after `max_epochs`
    epochs, or earlier once the training perplexity has failed to improve for `patience`
    consecutive epochs.

    Args:
        model (nn.Module): Maps a batch of `x_tok` tensors to one logit per cell. Batches
            are moved to whichever device the model's parameters already live on.
        train_loader: The training *dataset* (despite the name); every item must be a dict
            with the keys 'x_tok' and 'y_tok'. Index 6 of the last axis of 'y_tok' is
            used as the bold label.
        batch_size (int): Number of spreadsheets per batch.
        lr (float): Adagrad learning rate.
        mu (float): Maximum gradient norm used for clipping.
        max_epochs (int): Upper bound on the number of epochs.
        patience (int): Epochs without improvement tolerated before stopping early.
        save_int (int): Save a checkpoint every `save_int` epochs (and once more when
            training ends). A value <= 0 disables every file write (checkpoints and log).
        save_dir (str): Existing directory for the checkpoint and the log file.

    Returns:
        nn.Module: The same `model` object, trained in place.

    Raises:
        ValueError: If saving is enabled and `save_dir` does not exist, or if the dataset
            yields no batches.
    """

    # File output (checkpoints + log) is only active for a positive save interval
    saving = save_int > 0
    if saving and not os.path.exists(save_dir):
        raise ValueError(f"Directory '{save_dir}' DNE")

    # Define the path where to save model and logfile
    timestamp = time.strftime("%Y%m%d_%H%M%S")
    model_path = os.path.join(save_dir, f"rnnbig_{timestamp}.pth")
    log_file = os.path.join(save_dir, f"rnnbig_{timestamp}.txt") if saving else None

    # Setup optimizer
    opt = torch.optim.Adagrad(model.parameters(), lr=lr)

    # Inputs and loss weights follow the model instead of a notebook-level global, so the
    # function works wherever the caller has placed the model
    device = next(model.parameters()).device

    # Convert incoming training dataset into shuffled batches
    batch_loader = torch.utils.data.DataLoader(train_loader, batch_size=batch_size, shuffle=True)
    num_batches = len(batch_loader)
    if num_batches == 0:
        raise ValueError("train_loader is empty")

    # Count bold cells and total cells from the labels themselves. Deriving the total from
    # len(batch_loader) * batch_size would over-count whenever the last batch is partial.
    num_bold_cells = 0
    num_cells = 0
    for batch in batch_loader:
        bold_labels = batch['y_tok'][:, :, :, 6]
        num_bold_cells += int((bold_labels == 1).sum())
        num_cells += bold_labels.numel()

    # Ratio of non-bold to bold cells. Without any bold cell the ratio is undefined (an
    # infinite weight turns the loss into NaN), so fall back to an unweighted loss.
    pos_weight = ((num_cells - num_bold_cells) / num_bold_cells) if num_bold_cells > 0 else 1.0

    # Binary Cross-Entropy Loss with Logits (default reduction: mean over all cells)
    loss_fn = nn.BCEWithLogitsLoss(
        pos_weight=torch.tensor([pos_weight], dtype=torch.float).to(device)
    )

    # Best values seen so far and the early-stopping bookkeeping
    epoch = 0
    best_avgtrloss = float('inf')
    best_perp = float('inf')
    best_epoch = 0
    nimp_ctr = 0
    training = True

    # Loop while model is in training mode and the epoch is less than max_epochs given
    while training and (epoch < max_epochs):

        # Print the epoch number and write to file also
        print(f'Epoch {epoch}')
        _append_log(log_file, f"\nEpoch {epoch}\n")

        # Turn on training mode which enables dropout.
        model.train()

        # Sum of the per-batch mean losses for this epoch
        curr_trloss = 0.0

        for batch in tqdm(batch_loader, desc='Batch Processing'):

            # Clear any remaining gradients
            model.zero_grad()

            # One logit and one label per cell, both flattened to 1-D
            logits = model(batch['x_tok'].to(device)).reshape(-1)
            labels = batch['y_tok'][:, :, :, 6].to(device).reshape(-1).float()
            loss = loss_fn(logits, labels)

            # Accumulate the training loss
            curr_trloss += loss.item()

            # Backpropagate, clip the gradient norm, then update the parameters
            loss.backward()
            torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=mu)
            opt.step()

        # Calculate average training loss for this epoch
        curr_avgtrloss = curr_trloss / num_batches

        # Perplexity = e^(mean per-cell loss). The loss is already averaged over the cells
        # of each batch, so it must not be divided by batch_size * cells a second time
        # (doing so pins the value at ~1.0 and makes it useless for early stopping).
        try:
            curr_perp = math.exp(curr_avgtrloss)
        except OverflowError:
            curr_perp = float('inf')

        # Print the average training loss, perplexity for this current epoch and write to file
        print(f'Train Loss: {curr_avgtrloss}, Perplexity: {curr_perp}')
        _append_log(log_file, f'Train Loss: {curr_avgtrloss}, Perplexity: {curr_perp}\n')

        # Save the model and log if current epoch is a multiple of save_int
        if saving and (epoch + 1) % save_int == 0:
            torch.save(model.state_dict(), model_path)
            print(f"Model Saved")
            _append_log(log_file, f"Model Saved\n")

        # Update the best loss, perplexity, epoch if current is best
        if curr_perp < best_perp:
            best_perp, best_avgtrloss, best_epoch, nimp_ctr = curr_perp, curr_avgtrloss, epoch, 0
        else:
            nimp_ctr += 1

        # Check if epochs without improvement have crossed the patience threshold
        if nimp_ctr >= patience:
            early_stop_msg = f"\nEARLY STOP Epoch {epoch}, Best: Epoch = {best_epoch}, Train Loss = {best_avgtrloss}, Perplexity = {best_perp}"
            print(early_stop_msg)
            _append_log(log_file, early_stop_msg)
            training = False

        # Increment the epoch and print a new line
        epoch += 1
        print()

    # Save the final weights at the end of training (or early stopping)
    if saving:
        torch.save(model.state_dict(), model_path)

    # Print training complete message
    done_msg = f"\nTRAINING DONE Best: Epoch = {best_epoch}, Train Loss = {best_avgtrloss}, Perplexity = {best_perp}"
    print(done_msg)
    _append_log(log_file, done_msg)

    # Return trained model at the end
    return model

In [None]:
# Hyper-parameters of the network that is about to be trained
hidden_state_dim = 100
rnn_layers = 2

# Build a fresh model and place it on the configured DEVICE
test_model = TestRNN(hidden_state_dim, rnn_layers, spreadsheet_wvs).to(DEVICE)


# Optional warm start (disabled): load previously saved weights into the model
# state_dict = torch.load('../models/rnnsmall_20240930_070804.pth')
# test_model.load_state_dict(state_dict)

# Alternative placement through the project helper (disabled)
#trained_testmodel = to_gpu(test_model, 1)

# Training settings, gathered in one place so they are easy to tweak
train_settings = dict(
    batch_size=8, lr=1.4e-5, mu=0.25, max_epochs=6, patience=1,
    save_int=2, save_dir='../models/',
)

# Train the model; the returned object is the same model, updated in place
trained_testmodel = train_test(test_model, check_loader, **train_settings)



Epoch 0


Batch Processing:  46%|██████████▉             | 46/101 [41:56<49:17, 53.78s/it]

# Inference

In [45]:
def infer(trained_model, infer_loader, loc = 0, threshold = 2.2e-7, verbose = True):
    """
    Predict the bold / not-bold grid for one spreadsheet of a loader.

    Args:
        trained_model (nn.Module): The trained PyTorch model; it is switched to eval mode.
        infer_loader: Loader object exposing the indexable attributes `x_tok`, `y_tok`
            and `file_paths`.
        loc (int): Index of the spreadsheet inside `infer_loader` to run inference on.
        threshold (float): A cell is labelled bold (1) when its sigmoid probability is
            >= this value. The default keeps the cutoff this function has always used;
            pass 0.5 for the conventional decision boundary.
        verbose (bool): When True, raise torch's print precision to 20 digits (a global,
            persistent setting) and print the probability grid.

    Returns:
        tuple: `(pred_labels, actual_labels, file_path)` where `pred_labels` is a CPU
        LongTensor of 0/1 predictions with the batch dimension removed, `actual_labels`
        is `infer_loader.y_tok[loc][:, :, 6]` (the stored bold labels) and `file_path`
        is `infer_loader.file_paths[loc]`.
    """

    # Move the model to eval mode
    trained_model.eval()

    # Add a batch dimension and place the input on the model's own device, so inference
    # is not tied to one hard-coded GPU index
    x_infer = infer_loader.x_tok[loc].unsqueeze(0)
    first_param = next(trained_model.parameters(), None)
    if first_param is not None:
        x_infer = x_infer.to(first_param.device)

    # Pass the input through the model (no gradient needed for inference)
    with torch.no_grad():
        predictions = trained_model(x_infer)

    # Drop the batch dimension and convert logits to probabilities
    pred_probs = torch.sigmoid(predictions.squeeze(0))

    if verbose:
        # Set the print options to display more decimal places
        torch.set_printoptions(precision=20)
        print(pred_probs)

    # Convert probabilities to binary labels (1 for bold, 0 for not bold)
    pred_labels = (pred_probs >= threshold).long()

    # Return the predicted labels, the actual labels and the source file path
    return pred_labels.detach().cpu(), infer_loader.y_tok[loc][:, :, 6], infer_loader.file_paths[loc]

In [46]:
# Rebuild the network; its layer sizes must match the checkpoint for load_state_dict to succeed
hidden_state_dim = 100
rnn_layers = 2
test_model2 = TestRNN(hidden_state_dim, rnn_layers, spreadsheet_wvs)

# Load the state_dict from the saved file. map_location='cpu' makes the load independent of
# the GPU the checkpoint was saved from (the model is still on the CPU at this point).
state_dict = torch.load('../models/rnnsmall_20241023_065704.pth', map_location='cpu')

# Load the weights into the model
test_model2.load_state_dict(state_dict)

# Move the model to the GPU via the project helper
# (the second argument is presumably the GPU index, i.e. cuda:1 — confirm against to_gpu)
trained_testmodel2 = to_gpu(test_model2, 1)

In [8]:
# Define the directory to be used for inference
infer_dir = '../data/infer_small/'

# Collect the spreadsheet files in the directory with their full path.
# os.listdir returns entries in arbitrary order, so the names are sorted to keep the
# position of each spreadsheet inside the loader reproducible between runs, and
# non-file entries (e.g. a directory whose name ends in .csv) are skipped.
infer_files = [
    os.path.join(infer_dir, filename)
    for filename in sorted(os.listdir(infer_dir))
    if filename.lower().endswith(('.xls', '.xlsx', '.csv'))  # Adjust the file extensions as needed
    and os.path.isfile(os.path.join(infer_dir, filename))
]

# Define the dataloader for inference
infer_loader = SpreadsheetDataLoader(infer_files, spreadsheet_vocab)

Processing files: 100%|██████████████████████████| 1/1 [00:00<00:00, 897.37it/s]


In [47]:
# Run inference on one spreadsheet.
# NOTE(review): this reads spreadsheet 212 of check_loader (the training loader), not of
# infer_loader — confirm that evaluating on a training file is intended.
pred_bold, act_bold, file = infer(trained_testmodel2, check_loader, 212)

# Convert to pandas DataFrames for better display
pred_df = pd.DataFrame(pred_bold.numpy())
actual_df = pd.DataFrame(act_bold.numpy())

# Print the filename
print(f"\nFilename: {file}")

# Print the predictions
print("\nPredictions (1 = Bold, 0 = Not Bold):")
display(pred_df)

# Print the actual grid
print("\nActual Grid (1 = Bold, 0 = Not Bold):")
display(actual_df)

# Convert both DataFrames to numpy arrays for element-wise work
pred_np = pred_df.to_numpy()
actual_np = actual_df.to_numpy()

# (row, col) positions predicted bold, read straight from the array. This avoids relying
# on DataFrame.stack() silently dropping NaN cells, which the newer stack implementation
# no longer does.
bold_pred_locations = [tuple(position) for position in np.argwhere(pred_np == 1).tolist()]

# Only report the case where nothing was predicted bold; uncomment to list every location
#print(f"Bold predictions at the following row, col locations: {bold_pred_locations}")
if not bold_pred_locations:
    print("No bold predictions were made by the model.")

# Compare the predictions with the actual values (element-wise comparison)
correct_predictions = (pred_np == actual_np)

# Calculate accuracy: (number of correct predictions) / (total number of cells)
accuracy = correct_predictions.sum() / correct_predictions.size

# Print accuracy
print(f"\nAccuracy: {accuracy * 100:.2f}%")


tensor([[2.22218005774266202934e-07, 2.22218005774266202934e-07,
         2.22218005774266202934e-07,  ...,
         2.11253890824991685804e-07, 2.11253890824991685804e-07,
         2.11253890824991685804e-07],
        [2.22218005774266202934e-07, 2.22218005774266202934e-07,
         2.22218005774266202934e-07,  ...,
         2.11253890824991685804e-07, 2.11253890824991685804e-07,
         2.11253890824991685804e-07],
        [2.22218005774266202934e-07, 2.22218005774266202934e-07,
         2.22218005774266202934e-07,  ...,
         2.11253890824991685804e-07, 2.11253890824991685804e-07,
         2.11253890824991685804e-07],
        ...,
        [2.11253890824991685804e-07, 2.11253890824991685804e-07,
         2.11253890824991685804e-07,  ...,
         2.11253890824991685804e-07, 2.11253890824991685804e-07,
         2.11253890824991685804e-07],
        [2.11253890824991685804e-07, 2.11253890824991685804e-07,
         2.11253890824991685804e-07,  ...,
         2.11253890824991685804e-07

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,64,65,66,67,68,69,70,71,72,73,74,75,76,77,78,79,80,81,82,83,84,85,86,87,88,89,90,91,92,93,94,95,96,97,98,99
0,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
3,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
4,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
5,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
6,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
7,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
8,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
9,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0



Actual Grid (1 = Bold, 0 = Not Bold):


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,64,65,66,67,68,69,70,71,72,73,74,75,76,77,78,79,80,81,82,83,84,85,86,87,88,89,90,91,92,93,94,95,96,97,98,99
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,0,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
5,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
6,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
7,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
8,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
9,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0



Accuracy: 91.82%


In [32]:
# Adjust display settings to show the full DataFrame
pd.set_option('display.max_rows', None)  # Show all rows
pd.set_option('display.max_columns', None)  # Show all columns
pd.set_option('display.expand_frame_repr', False)  # Do not wrap columns

# Row and column labels that contain at least one predicted 1
rows_with_1 = pred_df.index[pred_df.eq(1).any(axis=1).to_numpy()]
cols_with_1 = pred_df.columns[pred_df.eq(1).any(axis=0).to_numpy()]

# Last row / column with a 1, or None when the model predicted no 1 at all
# (indexing [-1] on an empty selection would raise IndexError)
last_row_with_1 = rows_with_1[-1] if len(rows_with_1) else None
last_col_with_1 = cols_with_1[-1] if len(cols_with_1) else None

# Print the row and column up to which 1s are predicted
print(f"Last row with 1: {last_row_with_1}")
print(f"Last column with 1: {last_col_with_1}")


# Slice the DataFrame up to where 1s occur; with no 1s the subset is empty
# (a None bound in .loc would otherwise select the whole frame)
if last_row_with_1 is None or last_col_with_1 is None:
    subset_df = pred_df.iloc[:0, :0]
else:
    subset_df = pred_df.loc[:last_row_with_1, :last_col_with_1]
display(subset_df)

Last row with 1: 1
Last column with 1: 27


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27
0,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1
1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1


In [25]:
# Find locations of all 0 values in subset_df
zero_locations = list(zip(*np.where(subset_df == 0)))

# Print the list of (row, col) locations
print(f"Locations of 0 values: {zero_locations}")


Locations of 0 values: [(0, 28), (5, 0), (54, 1), (55, 1), (59, 1), (60, 1)]


In [21]:
# Find the last column that contains a 1, or None when no 1 was predicted at all
# (indexing [-1] on an empty selection would raise IndexError)
cols_with_1 = pred_df.columns[pred_df.eq(1).any(axis=0).to_numpy()]
last_col_with_1 = cols_with_1[-1] if len(cols_with_1) else None

# Print the row and column up to which 1s are predicted
# (last_row_with_1 is the value computed when subset_df was built)
print(f"Last row with 1: {last_row_with_1}")
print(f"Last column with 1: {last_col_with_1}")


Last row with 1: 60
Last column with 1: 28
