In [1]:
# For viewing and manipulating data
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Importing the necessary libraries
import re
import math
import string
import nltk
import spacy
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
import gensim.downloader as api
from gensim.models import KeyedVectors # >> alternative to gensim.downloader
import matplotlib.pyplot as plt

# Getting particular functions from these libraries 
from torch import Tensor
from sklearn.utils import resample
from gensim.models import KeyedVectors
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from torch.utils.data import random_split, DataLoader, TensorDataset, Dataset

# Using the NLTK to tokenize the text
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
data_filename = ''
dict_filename = ''
for dirname, _, filenames in os.walk('/kaggle/input/'):
    for filename in filenames:
        file_name = os.path.join(dirname, filename)
        if file_name.endswith('dataset-2000-200'):
            data_filename = file_name
        elif file_name.endswith('dict-2000-200'):
            dict_filename = file_name
        else:
            print(f'Unidentified file: {file_name}')
                
print(f'Preprocessed data file: {data_filename}; dictionary file: {dict_filename}')      

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session
VERBOSE = True
def printv(text):
    if VERBOSE: print('VERBOSE:', text)
    return

def showV(text):
    '''unconditional verbose output'''
    print('VERBOSE:', text)
    return

DEV = False
def printd(text):
    if DEV: print('DEV:', text)
    return

def showD(text):
    '''unconditional DEV output'''
    print('DEV:', text)  #<< 4/12/24 changed "VERBOSE" to "DEV"
    return

showCellCompletion = True  #<< 4/12/24 set default to True
def showC(text):
    if showCellCompletion:
        print('Cell complete:', text)
    return

import subprocess
showNv = True
accelerator = True

def printNv():
    if not showNv or not accelerator: return
    mem_usage = subprocess.run(['nvidia-smi'], stdout=subprocess.PIPE)
    print(mem_usage.stdout.decode('utf-8'))

showMemoryAllocation = True
def printM():
    if not showMemoryAllocation: return
    print(f"Total allocated memory: {torch.cuda.memory_allocated()} bytes")

Unidentified file: /kaggle/input/preprocessed-data-1000-100/preprocessed_dataset-1000-100
Unidentified file: /kaggle/input/preprocessed-data-1000-100/preprocessed_dict-1000-100
Unidentified file: /kaggle/input/preprocessed-dataset/preprocessed_dataset
Unidentified file: /kaggle/input/preprocessed-dataset/preprocessed_dataset.json
Preprocessed data file: /kaggle/input/preprocessed-2000-per-category-200-max-review-size/preprocessed_dataset-2000-200; dictionary file: /kaggle/input/preprocessed-2000-per-category-200-max-review-size/preprocessed_dict-2000-200


In [2]:
# Checks if a CUDA enabled GPU is available and prints out its information
if torch.cuda.is_available():
    print("CUDA is available!")
    for i in range(torch.cuda.device_count()):
        print(f"GPU {i}: {torch.cuda.get_device_name(i)}")
        
    device = torch.device("cuda:0")
    accelerator = True

else:
    accelerator = False
    print("CUDA is not available.")
    device = torch.device("cpu")
    print(device)

CUDA is available!
GPU 0: Tesla P100-PCIE-16GB


In [3]:
import pickle
with open(data_filename, 'rb') as dataset_file:
    dataset = pickle.load(dataset_file)
    
with open(dict_filename, 'rb') as dict_file:
    data_params = pickle.load(dict_file)
 
print(f'Dataset description: {data_params["description"]}')
max_sequence_length  = data_params["max_sequence_length"]
    
train_len = int(0.8 * len(dataset))
val_len = len(dataset) - train_len

# Random split
train_data, val_data = random_split(dataset, [train_len, val_len])

printv(f"The amount of data we have to train with is {len(train_data)} reviews") 
printv(f"The amount of data we have to validate with is {len(val_data)} reviews")
#print(f"The amount of data we have to validate with is on {train_data.device}")
#print(f"The amount of data we have to validate with is on {val_data.device}")

# DataLoader for training data
train_loader = DataLoader(train_data, batch_size = 32, shuffle = True)  # Use shuffle for training

# DataLoader for validation data
val_loader = DataLoader(val_data, batch_size = 32, shuffle = False)  # No need to shuffle for validation

Dataset description: 2000 reviews per category; 200 max review size; removes punctuation, symbols, and numeric strings
VERBOSE: The amount of data we have to train with is 8000 reviews
VERBOSE: The amount of data we have to validate with is 2000 reviews


In [4]:
# HyperParameters for the module
d_model = 300  # Should match the embedding dimension of your word embeddings
seq_len = max_sequence_length #<<100 #<< 4/13/24 100  # Maximum sequence length
dropout = 0.1  # Adjust the dropout if needed

eps     = 1e-6 # epsilon value to prevent the standard deviation from becoming zero
num_classes = 5  # Replace with your number of classes
epochs = 200 #<< 1000
learning_rate = 0.01
num_layers = 6
hidden_size = d_model

input_size = d_model  # Adjust this based on the output size of your feed-forward network
# input_size = len(train_data[0])  # Adjust based on your input size (should match the output size of your model)
showC('Hyperparameters defined')

Cell complete: Hyperparameters defined


In [5]:
class NeuralNetClassifier(nn.Module):
    def __init__(self, r_size, v_size, num_classes):
        # r_size is the number of tokens in a review, 100.
        # v_size is the number of values in an embedding vector, 300.
        super(NeuralNetClassifier, self).__init__()
        
        # The input to fc will be a 2D tensor with with n rows and
        # r_size * v_size columns, where n >= 1; and the output will be a 2D tensor
        # with n rows and num_classes columns.
        self.hidden_layer1 = nn.Linear(r_size * v_size, hidden_size)
        self.hidden_layer2 = nn.Linear(hidden_size, hidden_size)
        self.hidden_layer3 = nn.Linear(hidden_size, num_classes)
        self.relu = nn.ReLU()
        self.dropout = nn.Dropout(dropout) #>> 0.2 seems to work OK
        #self.softmax = nn.Softmax(dim=1)  # Softmax with dim=1 for class probabilities

    def forward(self, x):
        x = self.hidden_layer1(x)
        x = self.relu(x)
        x = self.dropout(x)
        x = self.hidden_layer2(x)
        x = self.relu(x)
        x = self.dropout(x) #>> dropout rate 0.1 may result in underfitting?
        #x = self.softmax(x)  # Apply softmax after the output layer
        x = self.hidden_layer3(x)
        return x
    
classifier = NeuralNetClassifier(seq_len, d_model, num_classes + 1).to(device)
showC(f'{classifier} defined')

Cell complete: NeuralNetClassifier(
  (hidden_layer1): Linear(in_features=60000, out_features=300, bias=True)
  (hidden_layer2): Linear(in_features=300, out_features=300, bias=True)
  (hidden_layer3): Linear(in_features=300, out_features=6, bias=True)
  (relu): ReLU()
  (dropout): Dropout(p=0.1, inplace=False)
) defined


In [None]:
'''
class Classifier(nn.Module):
    def __init__(self, r_size,v_size, num_classes):
        # r_size is the number of tokens in a review, 100.
        # v_size is the number of values in an embedding vector, 300.
        super(Classifier, self).__init__()
        
        # The input to fc will be a 2D tensor with with n rows and
        # r_size * v_size columns, where n >= 1; and the output will be a 2D tensor
        # with n rows and num_classes columns.
        self.fc = nn.Linear(r_size * v_size, num_classes)

    def forward(self, x1):
        # Pass input through the linear layer
        return self.fc(x1)

# Create the classifier
classifier = Classifier(seq_len, d_model, num_classes + 1).to(device)

showC(f'{Classifier} defined')
'''

In [6]:
class EarlyStopping():
    """
    Early stopping to stop the training when the loss does not improve after
    certain epochs.
    Credit:
    https://debuggercafe.com/using-learning-rate-scheduler-and-early-stopping-with-pytorch/
    """
    def __init__(self, patience=5, min_delta=0):
        """
        :param patience: how many epochs to wait before stopping when loss is
               not improving
        :param min_delta: minimum difference between new loss and old loss for
               new loss to be considered as an improvement
        """
        self.patience = patience
        self.min_delta = min_delta
        self.counter = 0
        self.best_loss = None
        self.early_stop = False
        
    def __call__(self, val_loss):
        if self.best_loss == None:
            self.best_loss = val_loss
        elif self.best_loss - val_loss > self.min_delta:
            self.best_loss = val_loss
            # reset counter if validation loss improves
            self.counter = 0
        elif self.best_loss - val_loss < self.min_delta:
            self.counter += 1
            #printd(f"INFO: Early stopping counter {self.counter} of {self.patience}")
            if self.counter >= self.patience:
                printv(f'Early stopping: counter={self.counter}; patience={self.patience}')
                self.early_stop = True
early_stopping = EarlyStopping(patience=16) #patience=30, min_delta=.01) 
showC(f'{EarlyStopping} object defined')

Cell complete: <class '__main__.EarlyStopping'> object defined


In [7]:
def checkpoint(model, filename):
    torch.save(model.state_dict(), filename)
    
def restore_from_checkpoint(model, filename):
    model.load_state_dict(torch.load(filename))

best_epoch = -1
lowest_loss = 100

def set_best(model,loss, epoch,filename): 
    global best_epoch
    global lowest_loss
    if loss < lowest_loss:
        lowest_loss = loss
        best_epoch = epoch
        checkpoint(model,filename)

In [8]:
# Define Cross-Entropy loss
criterion = nn.CrossEntropyLoss() # nn.CategoricalCrossentropy() #nn.Softmax() 

# Define SGD optimizer
# Is Adam better? Didn't seem so based on 4/23 heuristics
optimizer = optim.SGD(classifier.parameters(), lr=learning_rate)

DEV = True
# Training loop (adjust this to match your data and DataLoader)
losses = {} #<< track losses
for epoch in range(epochs):
    for inputs, targets in train_loader :  # Assuming you have a DataLoader
        # for batch_data in train_loader:  # Assuming you have a DataLoader
        # inputs, targets = batch_data  # Assuming your DataLoader provides input data and targets    
    
        #printd(f'inputs shape: {inputs.shape}')
        #printd(f'targets shape: {targets.shape}')
        #printd(f'targets: {targets}')

        optimizer.zero_grad()
    
        # keep nn.linear happy by combining the last two dimensions of inputs.
        inputs.to(device)
        inputs = torch.reshape(inputs, (inputs.size(0), -1)).to(device) # get current batch size
        #inputs = torch.reshape(inputs, (32,30000))
        
        #printd(f'Reshaped inputs: {inputs.shape}')
        
        outputs = classifier(inputs).to(device)
        #printd(f'outputs shape {outputs.shape}')
        
        # output is a 32 x 6 tensor of floats,
        # targets will be a 32 x 1 tensor of ints
        loss = criterion(outputs.to(device), targets.to(device))
        # print(f'loss.item: {loss.item()}')
        loss.backward(retain_graph=True)
        optimizer.step()
        
    set_best(classifier,loss.item(), epoch,'best_model.pth')
    losses[loss.item()] = epoch+1
    early_stopping(loss)    
    if early_stopping.early_stop:
       printv(f'Stopping early at epoch [{epoch+1}/{epochs}] Loss: {loss.item()}')
       break    
        

    if epoch % 50 == 0:
        printv(f'Epoch [{epoch+1}/{epochs}] Loss: {loss.item()}')
        
restore_from_checkpoint(classifier, 'best_model.pth')
print(f'Using classifer trained at epoch {best_epoch}, loss = {lowest_loss} ')    

if VERBOSE:
    printv(f'Last loss: Epoch [{epoch+1}/{epochs}] Loss: {loss.item()}')
    smallest_losses = sorted(list(losses.keys()))
    printv('Smallest losses')
    for idx in range(5):
        l = smallest_losses[idx]
        printv(f'    Loss: {l}, epoch = {losses[l]}')
showC(f'training complete')

VERBOSE: Epoch [1/200] Loss: 1.6922454833984375
VERBOSE: Epoch [51/200] Loss: 0.009921971708536148
VERBOSE: Epoch [101/200] Loss: 0.002677323529496789
VERBOSE: Early stopping: counter=16; patience=16
VERBOSE: Stopping early at epoch [103/200] Loss: 0.003515572752803564
Using classifer trained at epoch 86, loss = 0.001787966233678162 
VERBOSE: Last loss: Epoch [103/200] Loss: 0.003515572752803564
VERBOSE: Smallest losses
VERBOSE:     Loss: 0.001787966233678162, epoch = 87
VERBOSE:     Loss: 0.0018257200717926025, epoch = 102
VERBOSE:     Loss: 0.0018786011496558785, epoch = 94
VERBOSE:     Loss: 0.002018197439610958, epoch = 95
VERBOSE:     Loss: 0.0021247644908726215, epoch = 71
Cell complete: training complete


In [9]:
# Put model in evaluation mode
classifier.eval() 

# Tracking variables
predictions = []
actuals = []


# Evaluate on validation set
with torch.no_grad():
    for inputs, targets in val_loader:
        inputs = inputs.reshape(inputs.shape[0], -1).to(device)

        outputs = classifier(inputs)
        _, predicted = torch.max(outputs, 1)

        predictions.extend(predicted.tolist())
        actuals.extend(targets.tolist())


# Print predicted and actual values for all samples
#>> print("Predicted | Actual")
#>>for pred, actual in zip(predictions, actuals):
#>>    pass #printd(f"{pred} | {actual}")

# Calculate validation accuracy
#>> 4/12/24 Maybe it would help to see how close we came in each category?
#>> For example, for category 5 predictions, show the actual results in each 
#>> category. And where's there's a large disrepancy, show  the reviews.
r_by_category = [0,0,0,0,0]
r = list('12345')

for idx in range(5):
    r[idx] = r_by_category[:]
       
for p,a in zip(predictions, actuals):
    r[p-1][a-1] += 1 # Record the actual results for each category prediction

DEV = False
num_correct = 0
for idx in range (5):
    printv(f'Categrory {idx+1} predictions actual results: ' +\
           f'1. {r[idx][0]}; 2. {r[idx][1]}; 3. {r[idx][2]}; 4. {r[idx][3]}; 5. {r[idx][4]}')
    total_category = r[idx][0] + r[idx][1] + r[idx][2] + r[idx][3] + r[idx][4]
    printd(f'{total_category} in category {idx+1}')

    num_correct += r[idx][idx]
# num_correct = sum([p == a for p, a in zip(predictions, actuals)]) 
val_accuracy = num_correct / len(predictions)
print(f'Validation Accuracy: {val_accuracy:.2f}')

VERBOSE: Categrory 1 predictions actual results: 1. 177; 2. 94; 3. 60; 4. 39; 5. 51
VERBOSE: Categrory 2 predictions actual results: 1. 87; 2. 129; 3. 75; 4. 32; 5. 31
VERBOSE: Categrory 3 predictions actual results: 1. 64; 2. 72; 3. 116; 4. 94; 5. 58
VERBOSE: Categrory 4 predictions actual results: 1. 29; 2. 60; 3. 98; 4. 144; 5. 103
VERBOSE: Categrory 5 predictions actual results: 1. 39; 2. 37; 3. 40; 4. 87; 5. 184
Validation Accuracy: 0.38
