In [1]:
# For viewing and manipulating data
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Importing the necessary libraries
import re
import math
import string
import nltk
import spacy
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
import gensim.downloader as api
from gensim.models import KeyedVectors # >> alternative to gensim.downloader
import matplotlib.pyplot as plt

# Getting particular functions from these libraries 
from torch import Tensor
from sklearn.utils import resample
from gensim.models import KeyedVectors
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from torch.utils.data import random_split, DataLoader, TensorDataset, Dataset
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix


# Using the NLTK to tokenize the text
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
dataset_file_name = ''
for dirname, _, filenames in os.walk('/kaggle/input/'):
    for filename in filenames:
        file_name = os.path.join(dirname, filename)
        if file_name.endswith('hdf5'):
            dataset_file_name = file_name
        else:
            print(f'Found unexpected file: {file_name}')
                
print(f'Preprocessed data file: {dataset_file_name}')

# Checks if a CUDA enabled GPU is available and prints out its information
if torch.cuda.is_available():
    print("CUDA is available!")
    for i in range(torch.cuda.device_count()):
        print(f"GPU {i}: {torch.cuda.get_device_name(i)}")
        
    device = torch.device("cuda:0")
    accelerator = True

else:
    accelerator = False
    print("CUDA is not available.")
    device = torch.device("cpu")
    print(device)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session
VERBOSE = True
def printv(text):
    if VERBOSE: print('VERBOSE:', text)
    return

def showV(text):
    '''unconditional verbose output'''
    print('VERBOSE:', text)
    return

DEV = False
def printd(text):
    if DEV: print('DEV:', text)
    return

def showD(text):
    '''unconditional DEV output'''
    print('DEV:', text)  #<< 4/12/24 changed "VERBOSE" to "DEV"
    return

showCellCompletion = True  #<< 4/12/24 set default to True
def showC(text):
    if showCellCompletion:
        print('Cell complete:', text)
    return

import subprocess
showNv = True
accelerator = True

def printNv():
    if not showNv or not accelerator: return
    mem_usage = subprocess.run(['nvidia-smi'], stdout=subprocess.PIPE)
    print(mem_usage.stdout.decode('utf-8'))

showMemoryAllocation = True
def printM():
    if not showMemoryAllocation: return
    print(f"Total allocated memory: {torch.cuda.memory_allocated()} bytes")

Preprocessed data file: /kaggle/input/80000/extracted_data.hdf5
CUDA is available!
GPU 0: Tesla P100-PCIE-16GB


In [2]:
import h5py

file_path = '/kaggle/input/80000/extracted_data.hdf5'
with h5py.File(file_path, 'r') as hf:
    # Access the datasets within the HDF5 file
    text_reviews_dataset = hf['text_reviews']
    ratings_dataset = hf['ratings']

    # Convert the datasets to PyTorch tensors
    text_reviews = torch.from_numpy(text_reviews_dataset[:])
    ratings = torch.from_numpy(ratings_dataset[:])

# Use the loaded tensors as needed
print(text_reviews.shape)
print(ratings.shape)

torch.Size([80000, 100, 300])
torch.Size([80000])


In [3]:
# import pickle
# with open(dataset_file_name, 'rb') as dataset_file:
#     dataset = pickle.load(dataset_file)

In [4]:
dataset = TensorDataset(text_reviews, ratings)

# Assuming your data is stored in 'dataset' as a PyTorch dataset object
X = dataset.tensors[0]  # Assuming the reviews are stored at index 1 in the dataset tensors
y = dataset.tensors[1]  # Assuming the ratings are stored at index 0 in the dataset tensors

print("Shape of X (reviews):", X.shape)
print("Shape of y (ratings):", y.shape)

X_train = []
X_val = []
y_train = []
y_val = []

# Perform stratified splitting for each rating class
for rating in torch.unique(y):
    X_rating = X[y == rating]
    y_rating = y[y == rating]

    X_train_rating, X_val_rating, y_train_rating, y_val_rating = train_test_split(
        X_rating, y_rating, test_size = 0.2, random_state = 42, stratify = y_rating)

    X_train.append(X_train_rating)
    X_val.append(X_val_rating)
    y_train.append(y_train_rating)
    y_val.append(y_val_rating)

# Combine the split data from all rating classes
X_train = torch.cat(X_train)
X_val = torch.cat(X_val)
y_train = torch.cat(y_train)
y_val = torch.cat(y_val)

# Create new datasets using the split data
train_data = TensorDataset(X_train, y_train)
val_data = TensorDataset(X_val, y_val)

print("Training Set:")
print("Number of ratings:", len(y_train))
print("Number of reviews:", len(X_train))
print("Number of reviews per rating:")
for rating in torch.unique(y_train):
    count = torch.sum(y_train == rating).item()
    print(f"Rating {rating}: {count} reviews")

print("\nValidation Set:")
print("Number of ratings:", len(y_val))
print("Number of reviews:", len(X_val))
print("Number of reviews per rating:")
for rating in torch.unique(y_val):
    count = torch.sum(y_val == rating).item()
    print(f"Rating {rating}: {count} reviews")

printv(f"The amount of data we have to train with is {len(train_data)} reviews") 
printv(f"The amount of data we have to validate with is {len(val_data)} reviews")

# DataLoader for training data
train_loader = DataLoader(train_data, batch_size = 32, shuffle = True)  # Use shuffle for training

# DataLoader for validation data
val_loader = DataLoader(val_data, batch_size = 32, shuffle = False)  # No need to shuffle for validation

Shape of X (reviews): torch.Size([80000, 100, 300])
Shape of y (ratings): torch.Size([80000])
Training Set:
Number of ratings: 64000
Number of reviews: 64000
Number of reviews per rating:
Rating 1: 12800 reviews
Rating 2: 12800 reviews
Rating 3: 12800 reviews
Rating 4: 12800 reviews
Rating 5: 12800 reviews

Validation Set:
Number of ratings: 16000
Number of reviews: 16000
Number of reviews per rating:
Rating 1: 3200 reviews
Rating 2: 3200 reviews
Rating 3: 3200 reviews
Rating 4: 3200 reviews
Rating 5: 3200 reviews
VERBOSE: The amount of data we have to train with is 64000 reviews
VERBOSE: The amount of data we have to validate with is 16000 reviews


In [5]:
# HyperParameters for the model
d_model = 300  # Should match the embedding dimension of your word embeddings
seq_len = 100 #<< 4/13/24 100  # Maximum sequence length
dropout = 0.1  # Adjust the dropout if needed

num_layers = 10 # depth of our network
input_size = d_model  # match the output dim of your ff_net
num_classes = 5  # our ratings (1 - 5)
hidden_size = 1024 # 2^n

eps    = 1e-05 # epsilon value to prevent the standard deviation from becoming zero
epochs = 200 #<< 1000
learning_rate = 0.001
weight_decay  = 0.01

showC('Hyperparameters defined')

Cell complete: Hyperparameters defined


In [6]:
class NeuralNetClassifier(nn.Module):
    def __init__(self, r_size, v_size, num_classes, hidden_size = hidden_size, num_layers = num_layers, dropout = dropout):
        super(NeuralNetClassifier, self).__init__()
        
        self.hidden_layers = nn.ModuleList()
        self.hidden_layers.append(nn.Linear(r_size * v_size, hidden_size))
        self.hidden_layers.append(nn.BatchNorm1d(hidden_size))
        
        for _ in range(num_layers - 1):
            self.hidden_layers.append(nn.Linear(hidden_size, hidden_size))
            self.hidden_layers.append(nn.BatchNorm1d(hidden_size))
        
        self.output_layer = nn.Linear(hidden_size, num_classes)
        self.relu = nn.LeakyReLU()
        self.dropout = nn.Dropout(dropout)
        
    def forward(self, x):
        for layer in self.hidden_layers:
            x = layer(x)
            x = self.relu(x)
            x = self.dropout(x)
        
        x = self.output_layer(x)
        return x

In [7]:
classifier = NeuralNetClassifier(seq_len, d_model, num_classes, hidden_size, num_layers, dropout)
classifier = classifier.to(device)

print(classifier)

NeuralNetClassifier(
  (hidden_layers): ModuleList(
    (0): Linear(in_features=30000, out_features=1024, bias=True)
    (1): BatchNorm1d(1024, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (2): Linear(in_features=1024, out_features=1024, bias=True)
    (3): BatchNorm1d(1024, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (4): Linear(in_features=1024, out_features=1024, bias=True)
    (5): BatchNorm1d(1024, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (6): Linear(in_features=1024, out_features=1024, bias=True)
    (7): BatchNorm1d(1024, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (8): Linear(in_features=1024, out_features=1024, bias=True)
    (9): BatchNorm1d(1024, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (10): Linear(in_features=1024, out_features=1024, bias=True)
    (11): BatchNorm1d(1024, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (12): Linea

# neural net with rnn

In [8]:
# class RecurrentNeuralNetClassifier(nn.Module):
#     def __init__(self, r_size, v_size, num_classes, hidden_size = hidden_size, 
#                  num_layers = num_layers, dropout = dropout, rnn_hidden_size = 256, rnn_num_layers = 1):
#         super(RecurrentNeuralNetClassifier, self).__init__()

#         self.rnn = nn.RNN(r_size * v_size, rnn_hidden_size, rnn_num_layers, batch_first=True)
        
#         self.hidden_layers = nn.ModuleList()
#         self.hidden_layers.append(nn.Linear(rnn_hidden_size, hidden_size))
#         self.hidden_layers.append(nn.BatchNorm1d(hidden_size))
        
#         for _ in range(num_layers - 1):
#             self.hidden_layers.append(nn.Linear(hidden_size, hidden_size))
#             self.hidden_layers.append(nn.BatchNorm1d(hidden_size))
        
#         self.output_layer = nn.Linear(hidden_size, num_classes)
#         self.relu = nn.LeakyReLU()
#         self.dropout = nn.Dropout(dropout)
        
#     def forward(self, x):
#         # Reshape the input to match the expected shape for RNN
#         x = x.view(x.size(0), -1, x.size(-1))
        
#         # Pass the input through the RNN layer
#         x, _ = self.rnn(x)
        
#         # Take the last output of the RNN
#         x = x[:, -1, :]
        
#         for layer in self.hidden_layers:
#             x = layer(x)
#             x = self.relu(x)
#             x = self.dropout(x)
        
#         x = self.output_layer(x)
#         return x

In [9]:
# classifier = RecurrentNeuralNetClassifier(seq_len, d_model, num_classes, hidden_size, num_layers, dropout)
# classifier = classifier.to(device)

# print(classifier)

# early neural nets

In [10]:
# class NeuralNetClassifier(nn.Module):
#     def __init__(self, r_size, v_size, num_classes):
#         # r_size is the number of tokens in a review, 100.
#         # v_size is the number of values in an embedding vector, 300.
#         super(NeuralNetClassifier, self).__init__()
        
#         # The input to fc will be a 2D tensor with with n rows and
#         # r_size * v_size columns, where n >= 1; and the output will be a 2D tensor
#         # with n rows and num_classes columns.
#         self.hidden_layer1 = nn.Linear(r_size * v_size, hidden_size)
#         self.hidden_layer2 = nn.Linear(hidden_size, hidden_size)
#         self.hidden_layer3 = nn.Linear(hidden_size, num_classes)
#         self.relu = nn.ReLU()
#         self.dropout = nn.Dropout(dropout) #>> 0.2 seems to work OK
#         #self.softmax = nn.Softmax(dim=1)  # Softmax with dim=1 for class probabilities

#     def forward(self, x):
#         x = self.hidden_layer1(x)
#         x = self.relu(x)
#         x = self.dropout(x)
#         x = self.hidden_layer2(x)
#         x = self.relu(x)
#         x = self.dropout(x) #>> dropout rate 0.1 may result in underfitting?
#         #x = self.softmax(x)  # Apply softmax after the output layer
#         x = self.hidden_layer3(x)
#         return x
    
# classifier = NeuralNetClassifier(seq_len, d_model, num_classes + 1).to(device)
# showC(f'{classifier} defined')

'''
class Classifier(nn.Module):
    def __init__(self, r_size,v_size, num_classes):
        # r_size is the number of tokens in a review, 100.
        # v_size is the number of values in an embedding vector, 300.
        super(Classifier, self).__init__()
        
        # The input to fc will be a 2D tensor with with n rows and
        # r_size * v_size columns, where n >= 1; and the output will be a 2D tensor
        # with n rows and num_classes columns.
        self.fc = nn.Linear(r_size * v_size, num_classes)

    def forward(self, x1):
        # Pass input through the linear layer
        return self.fc(x1)

# Create the classifier
classifier = Classifier(seq_len, d_model, num_classes + 1).to(device)

showC(f'{Classifier} defined')
'''

"\nclass Classifier(nn.Module):\n    def __init__(self, r_size,v_size, num_classes):\n        # r_size is the number of tokens in a review, 100.\n        # v_size is the number of values in an embedding vector, 300.\n        super(Classifier, self).__init__()\n        \n        # The input to fc will be a 2D tensor with with n rows and\n        # r_size * v_size columns, where n >= 1; and the output will be a 2D tensor\n        # with n rows and num_classes columns.\n        self.fc = nn.Linear(r_size * v_size, num_classes)\n\n    def forward(self, x1):\n        # Pass input through the linear layer\n        return self.fc(x1)\n\n# Create the classifier\nclassifier = Classifier(seq_len, d_model, num_classes + 1).to(device)\n\nshowC(f'{Classifier} defined')\n"

# stoppage

In [11]:
# class EarlyStopping():
#     """
#     Early stopping to stop the training when the loss does not improve after
#     certain epochs.
#     Credit:
#     https://debuggercafe.com/using-learning-rate-scheduler-and-early-stopping-with-pytorch/
#     """
#     def __init__(self, patience=5, min_delta=0):
#         """
#         :param patience: how many epochs to wait before stopping when loss is
#                not improving
#         :param min_delta: minimum difference between new loss and old loss for
#                new loss to be considered as an improvement
#         """
#         self.patience = patience
#         self.min_delta = min_delta
#         self.counter = 0
#         self.best_loss = None
#         self.early_stop = False
        
#     def __call__(self, val_loss):
#         if self.best_loss == None:
#             self.best_loss = val_loss
#         elif self.best_loss - val_loss > self.min_delta:
#             self.best_loss = val_loss
#             # reset counter if validation loss improves
#             self.counter = 0
#         elif self.best_loss - val_loss < self.min_delta:
#             self.counter += 1
#             #printd(f"INFO: Early stopping counter {self.counter} of {self.patience}")
#             if self.counter >= self.patience:
#                 printv(f'Early stopping: counter={self.counter}; patience={self.patience}')
#                 self.early_stop = True
# early_stopping = EarlyStopping(patience=20) #patience=30, min_delta=.01) 
# showC(f'{EarlyStopping} object defined')

In [12]:
# Define loss function
criterion = nn.CrossEntropyLoss()

# Define optimizer
# Is Adam better? Didn't seem so based on 4/23 heuristics
# optimizer = optim.SGD(classifier.parameters(), lr = learning_rate)
optimizer = optim.AdamW(classifier.parameters(), 
                        lr = learning_rate, weight_decay = weight_decay)

DEV = True

# Training loop
losses = {} #<< track losses
for epoch in range(epochs):
    for inputs, targets in train_loader : 

        optimizer.zero_grad()

        # keep nn.linear happy by combining the last two dimensions of inputs.
        inputs.to(device)
        targets = targets.to(device) - 1  # Convert ratings from [1, 5] to [0, 4]
        inputs = torch.reshape(inputs, (inputs.size(0), -1)).to(device) # get current batch size

        outputs = classifier(inputs).to(device)

        # output is a 32 x 6 tensor of floats,
        # targets will be a 32 x 1 tensor of ints
        # loss = criterion(outputs.to(device), targets.to(device))
        loss = criterion(outputs.to(device), targets.to(device).long())
        loss.backward(retain_graph = True)

        optimizer.step()

    losses[loss.item()] = epoch + 1
#    early_stopping(loss)

#     if early_stopping.early_stop:
#         printv(f'Stopping early at epoch [{epoch + 1} / {epochs}] Loss: {loss.item()}')
#         break    

    # if epoch % 50 == 0:
    printv(f'Epoch [{epoch + 1} / {epochs}] Loss: {loss.item()}')

if VERBOSE:
    printv(f'Last loss: Epoch [{epoch + 1} / {epochs}] Loss: {loss.item()}')
    smallest_losses = sorted(list(losses.keys()))
    printv('Smallest losses')
    for idx in range(3):
        l = smallest_losses[idx]
        printv(f'    Loss: {l}, epoch = {losses[l]}')
showC(f'training complete')

VERBOSE: Epoch [1 / 200] Loss: 1.6648303270339966
VERBOSE: Epoch [2 / 200] Loss: 1.7436070442199707
VERBOSE: Epoch [3 / 200] Loss: 1.2785296440124512
VERBOSE: Epoch [4 / 200] Loss: 1.335050344467163
VERBOSE: Epoch [5 / 200] Loss: 1.1342482566833496
VERBOSE: Epoch [6 / 200] Loss: 1.284279227256775
VERBOSE: Epoch [7 / 200] Loss: 1.4356555938720703
VERBOSE: Epoch [8 / 200] Loss: 0.6907355189323425
VERBOSE: Epoch [9 / 200] Loss: 1.1115167140960693
VERBOSE: Epoch [10 / 200] Loss: 0.6038281321525574
VERBOSE: Epoch [11 / 200] Loss: 0.8765528798103333
VERBOSE: Epoch [12 / 200] Loss: 0.4267658293247223
VERBOSE: Epoch [13 / 200] Loss: 0.331423282623291
VERBOSE: Epoch [14 / 200] Loss: 0.23514196276664734
VERBOSE: Epoch [15 / 200] Loss: 0.735434889793396
VERBOSE: Epoch [16 / 200] Loss: 0.29894667863845825
VERBOSE: Epoch [17 / 200] Loss: 0.2077620029449463
VERBOSE: Epoch [18 / 200] Loss: 0.3255316913127899
VERBOSE: Epoch [19 / 200] Loss: 0.31575867533683777
VERBOSE: Epoch [20 / 200] Loss: 0.1762191

In [13]:
# Put model in evaluation mode
classifier.eval() 

# Tracking variables
predictions = []
actuals = []

# Evaluate on validation set
with torch.no_grad():
    for inputs, targets in val_loader:
        inputs = inputs.reshape(inputs.shape[0], -1).to(device)
        targets = targets.to(device) - 1  # Convert ratings from [1, 5] to [0, 4]

        outputs = classifier(inputs)
        _, predicted = torch.max(outputs, 1)

        predictions.extend(predicted.tolist())
        actuals.extend(targets.tolist())

# Calculate evaluation metrics
accuracy = accuracy_score(actuals, predictions)
precision = precision_score(actuals, predictions, average='weighted')
recall = recall_score(actuals, predictions, average='weighted')
f1 = f1_score(actuals, predictions, average='weighted')

# Print evaluation metrics
print(f"Validation Accuracy: {accuracy:.4f}")
print(f"Validation Precision: {precision:.4f}")
print(f"Validation Recall: {recall:.4f}")
print(f"Validation F1-score: {f1:.4f}")

# Calculate confusion matrix
cm = confusion_matrix(actuals, predictions)
print("Confusion Matrix:")
print(cm)

# # Analyze predictions by category
# num_categories = len(cm)
# for idx in range(num_categories):
#     print(f"Category {idx+1} predictions actual results:")
#     for j in range(num_categories):
#         print(f"{j+1}. {cm[idx][j]}")

# Assess bias and variance
if accuracy < 0.7:  # Adjust the threshold as per your requirements
    print("The model may have high bias (underfitting). Consider increasing model complexity.")
elif accuracy > 0.95:  # Adjust the threshold as per your requirements
    print("The model may have high variance (overfitting). Consider regularization techniques.")
else:
    print("The model seems to have a good balance between bias and variance.")
    
# r_by_category = [0,0,0,0,0]
# r = list('12345')

# for idx in range(5):
#     r[idx] = r_by_category[:]

# for p,a in zip(predictions, actuals):
#     r[p-1][a-1] += 1 # Record the actual results for each category prediction

# num_correct = 0
# for idx in range (5):
#     printv(f'Categrory {idx+1} predictions actual results: ' +\
#            f'1. {r[idx][0]}; 2. {r[idx][1]}; 3. {r[idx][2]}; 4. {r[idx][3]}; 5. {r[idx][4]}')
#     num_correct += r[idx][idx]

# # num_correct = sum([p == a for p, a in zip(predictions, actuals)]) 
# val_accuracy = num_correct / len(predictions)
# print(f'Validation Accuracy: {val_accuracy:.2f}')

Validation Accuracy: 0.5069
Validation Precision: 0.5129
Validation Recall: 0.5069
Validation F1-score: 0.5060
Confusion Matrix:
[[1857  653  178  221  291]
 [ 552 1729  282  323  314]
 [ 377  552 1278  571  422]
 [ 223  369  314 1498  796]
 [ 205  246  171  830 1748]]
The model may have high bias (underfitting). Consider increasing model complexity.


In [14]:
# # Put model in evaluation mode
# classifier.eval() 

# # Tracking variables
# predictions = []
# actuals = []

# # Evaluate on validation set
# with torch.no_grad():
#     for inputs, targets in val_loader:
#         inputs = inputs.reshape(inputs.shape[0], -1).to(device)

#         outputs = classifier(inputs)
#         _, predicted = torch.max(outputs, 1)

#         predictions.extend(predicted.tolist())
#         actuals.extend(targets.tolist())

# # Print predicted and actual values for all samples
# #>> print("Predicted | Actual")
# #>>for pred, actual in zip(predictions, actuals):
# #>>    pass #printd(f"{pred} | {actual}")

# # Calculate validation accuracy
# #>> 4/12/24 Maybe it would help to see how close we came in each category?
# #>> For example, for category 5 predictions, show the actual results in each 
# #>> category. And where's there's a large disrepancy, show  the reviews.
# r_by_category = [0,0,0,0,0]
# r = list('12345')

# for idx in range(5):
#     r[idx] = r_by_category[:]

# for p,a in zip(predictions, actuals):
#     r[p-1][a-1] += 1 # Record the actual results for each category prediction

# num_correct = 0
# for idx in range (5):
#     printv(f'Categrory {idx+1} predictions actual results: ' +\
#            f'1. {r[idx][0]}; 2. {r[idx][1]}; 3. {r[idx][2]}; 4. {r[idx][3]}; 5. {r[idx][4]}')
#     num_correct += r[idx][idx]

# # num_correct = sum([p == a for p, a in zip(predictions, actuals)]) 
# val_accuracy = num_correct / len(predictions)
# print(f'Validation Accuracy: {val_accuracy:.2f}')