In [1]:
# For viewing and manipulating data
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Importing the necessary libraries
import re
import math
import string
import nltk
import spacy
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
import gensim.downloader as api
from gensim.models import KeyedVectors # >> alternative to gensim.downloader
import matplotlib.pyplot as plt

# Getting particular functions from these libraries 
from torch import Tensor
from sklearn.utils import resample
from gensim.models import KeyedVectors
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from torch.utils.data import random_split, DataLoader, TensorDataset, Dataset
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix


# Using the NLTK to tokenize the text
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
dataset_file_name = ''
for dirname, _, filenames in os.walk('/kaggle/input/'):
    for filename in filenames:
        file_name = os.path.join(dirname, filename)
        if file_name.endswith('hdf5'):
            dataset_file_name = file_name
        else:
            print(f'Found unexpected file: {file_name}')
                
print(f'Preprocessed data file: {dataset_file_name}')

# Checks if a CUDA enabled GPU is available and prints out its information
if torch.cuda.is_available():
    print("CUDA is available!")
    for i in range(torch.cuda.device_count()):
        print(f"GPU {i}: {torch.cuda.get_device_name(i)}")
        
    device = torch.device("cuda:0")
    accelerator = True

else:
    accelerator = False
    print("CUDA is not available.")
    device = torch.device("cpu")
    print(device)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session
VERBOSE = True
def printv(text):
    if VERBOSE: print('VERBOSE:', text)
    return

def showV(text):
    '''unconditional verbose output'''
    print('VERBOSE:', text)
    return

DEV = False
def printd(text):
    if DEV: print('DEV:', text)
    return

def showD(text):
    '''unconditional DEV output'''
    print('DEV:', text)  #<< 4/12/24 changed "VERBOSE" to "DEV"
    return

showCellCompletion = True  #<< 4/12/24 set default to True
def showC(text):
    if showCellCompletion:
        print('Cell complete:', text)
    return

import subprocess
showNv = True
accelerator = True

def printNv():
    if not showNv or not accelerator: return
    mem_usage = subprocess.run(['nvidia-smi'], stdout=subprocess.PIPE)
    print(mem_usage.stdout.decode('utf-8'))

showMemoryAllocation = True
def printM():
    if not showMemoryAllocation: return
    print(f"Total allocated memory: {torch.cuda.memory_allocated()} bytes")
    
import h5py
from sklearn.model_selection import StratifiedShuffleSplit
from collections import Counter

Preprocessed data file: /kaggle/input/140000/review_data.hdf5
CUDA is available!
GPU 0: Tesla P100-PCIE-16GB


In [2]:
file_path = '/kaggle/input/140000/review_data.hdf5'
with h5py.File(file_path, 'r') as hf:
    # Access the datasets within the HDF5 file
    text_reviews_dataset = hf['text_reviews']
    ratings_dataset = hf['ratings']

    # Convert the datasets to PyTorch tensors
    text_reviews = torch.from_numpy(text_reviews_dataset[:])
    ratings = torch.from_numpy(ratings_dataset[:])

# check loaded tensors
print(text_reviews.shape)
print(ratings.shape)

torch.Size([140000, 100, 300])
torch.Size([140000])


In [3]:
class ReviewDataset(Dataset):
    def __init__(self, text_reviews, ratings):
        self.text_reviews = text_reviews
        self.ratings = ratings

    def __len__(self):
        return len(self.ratings)

    def __getitem__(self, index):
        review = self.text_reviews[index]
        rating = self.ratings[index]
        return review, rating

# Create the dataset
dataset = ReviewDataset(text_reviews, ratings)

# Perform stratified splitting on indices
sss = StratifiedShuffleSplit(n_splits = 1, test_size = 0.2, random_state = 42)
train_indices, val_indices = next(sss.split(range(len(dataset)), dataset.ratings))

# Create subsets using the indices
train_dataset = torch.utils.data.Subset(dataset, train_indices)
val_dataset = torch.utils.data.Subset(dataset, val_indices)

# Create data loaders
train_loader = DataLoader(train_dataset, batch_size = 32, shuffle = True)
val_loader = DataLoader(val_dataset, batch_size = 32, shuffle = False)

print (len(train_indices))
print (len(val_indices))

112000
28000


In [4]:
print("Training Set:")
print("Number of reviews:", len(train_dataset))
print("Number of reviews per rating:")
train_ratings = [rating.item() for _, rating in train_dataset]
train_rating_counts = Counter(train_ratings)
for rating, count in sorted(train_rating_counts.items()):
    print(f"Rating {rating}: {count} reviews")

print("\nValidation Set:")
print("Number of reviews:", len(val_dataset))
print("Number of reviews per rating:")
val_ratings = [rating.item() for _, rating in val_dataset]
val_rating_counts = Counter(val_ratings)
for rating, count in sorted(val_rating_counts.items()):
    print(f"Rating {rating}: {count} reviews")
    
assert len(train_dataset) == 112000, f"Expected 112000 training reviews, got {len(train_dataset)}"
assert len(val_dataset) == 28000, f"Expected 28000 validation reviews, got {len(val_dataset)}"

for rating in sorted(train_rating_counts.keys()):
    train_count = train_rating_counts[rating]
    val_count = val_rating_counts[rating]
    assert train_count == val_count * 4, f"Rating {rating}: Expected 4 times more reviews in training set than validation set"

Training Set:
Number of reviews: 112000
Number of reviews per rating:
Rating 1: 22400 reviews
Rating 2: 22400 reviews
Rating 3: 22400 reviews
Rating 4: 22400 reviews
Rating 5: 22400 reviews

Validation Set:
Number of reviews: 28000
Number of reviews per rating:
Rating 1: 5600 reviews
Rating 2: 5600 reviews
Rating 3: 5600 reviews
Rating 4: 5600 reviews
Rating 5: 5600 reviews


In [5]:
# HyperParameters for the model
d_model = 300  # Should match the embedding dimension of your word embeddings
seq_len = 100 #<< 4/13/24 100  # Maximum sequence length
dropout = 0.1  # Adjust the dropout if needed

num_layers = 25 # depth of our network
input_size = d_model  # match the output dim of your ff_net
num_classes = 5  # our ratings (1 - 5)
hidden_size = 2048 # 2^n

eps    = 1e-05 # epsilon value to prevent the standard deviation from becoming zero
epochs = 200 #<< 100
learning_rate = 0.001
weight_decay  = 0.01

showC('Hyperparameters defined')

class NeuralNetClassifier(nn.Module):
    def __init__(self, r_size, v_size, num_classes, hidden_size = hidden_size, num_layers = num_layers, dropout = dropout):
        super(NeuralNetClassifier, self).__init__()
        
        self.hidden_layers = nn.ModuleList()
        self.hidden_layers.append(nn.Linear(r_size * v_size, hidden_size))
        self.hidden_layers.append(nn.BatchNorm1d(hidden_size))
        
        for _ in range(num_layers - 1):
            self.hidden_layers.append(nn.Linear(hidden_size, hidden_size))
            self.hidden_layers.append(nn.BatchNorm1d(hidden_size))
        
        self.output_layer = nn.Linear(hidden_size, num_classes)
        self.relu = nn.LeakyReLU()
        self.dropout = nn.Dropout(dropout)
        
    def forward(self, x):
        for layer in self.hidden_layers:
            x = layer(x)
            x = self.relu(x)
            x = self.dropout(x)
        
        x = self.output_layer(x)
        return x
    
classifier = NeuralNetClassifier(seq_len, d_model, num_classes, hidden_size, num_layers, dropout)
classifier = classifier.to(device)

print(classifier)

Cell complete: Hyperparameters defined
NeuralNetClassifier(
  (hidden_layers): ModuleList(
    (0): Linear(in_features=30000, out_features=2048, bias=True)
    (1): BatchNorm1d(2048, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (2): Linear(in_features=2048, out_features=2048, bias=True)
    (3): BatchNorm1d(2048, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (4): Linear(in_features=2048, out_features=2048, bias=True)
    (5): BatchNorm1d(2048, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (6): Linear(in_features=2048, out_features=2048, bias=True)
    (7): BatchNorm1d(2048, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (8): Linear(in_features=2048, out_features=2048, bias=True)
    (9): BatchNorm1d(2048, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (10): Linear(in_features=2048, out_features=2048, bias=True)
    (11): BatchNorm1d(2048, eps=1e-05, momentum=0.1, affine=True, tr

In [6]:
# Define loss function
criterion = nn.CrossEntropyLoss()

# Define optimizer

optimizer = optim.AdamW(classifier.parameters(), 
                        lr = learning_rate, weight_decay = weight_decay)

DEV = True

# Training loop
losses = {} #<< track losses
for epoch in range(epochs):
    for inputs, targets in train_loader : 

        optimizer.zero_grad()

        inputs.to(device)
        targets = targets.to(device) - 1  # Convert ratings from [1, 5] to [0, 4]
        inputs = torch.reshape(inputs, (inputs.size(0), -1)).to(device) # get current batch size

        outputs = classifier(inputs).to(device)

        # output is a 32 x 6 tensor of floats,
        # targets will be a 32 x 1 tensor of ints
        loss = criterion(outputs.to(device), targets.to(device).long())
        loss.backward(retain_graph = True)

        optimizer.step()

    losses[loss.item()] = epoch + 1

    printv(f'Epoch [{epoch + 1} / {epochs}] Loss: {loss.item()}')

if VERBOSE:
    printv(f'Last loss: Epoch [{epoch + 1} / {epochs}] Loss: {loss.item()}')
    smallest_losses = sorted(list(losses.keys()))
    printv('Smallest losses')
    for idx in range(3):
        l = smallest_losses[idx]
        printv(f'    Loss: {l}, epoch = {losses[l]}')
showC(f'training complete')

VERBOSE: Epoch [1 / 200] Loss: 1.5971527099609375
VERBOSE: Epoch [2 / 200] Loss: 1.6561734676361084
VERBOSE: Epoch [3 / 200] Loss: 1.697770118713379
VERBOSE: Epoch [4 / 200] Loss: 1.6198837757110596
VERBOSE: Epoch [5 / 200] Loss: 1.6218584775924683
VERBOSE: Epoch [6 / 200] Loss: 1.6017833948135376
VERBOSE: Epoch [7 / 200] Loss: 1.6059505939483643
VERBOSE: Epoch [8 / 200] Loss: 1.6104587316513062
VERBOSE: Epoch [9 / 200] Loss: 1.604677438735962
VERBOSE: Epoch [10 / 200] Loss: 1.5903916358947754
VERBOSE: Epoch [11 / 200] Loss: 1.6307064294815063
VERBOSE: Epoch [12 / 200] Loss: 1.6124794483184814
VERBOSE: Epoch [13 / 200] Loss: 1.5857295989990234
VERBOSE: Epoch [14 / 200] Loss: 1.575007677078247
VERBOSE: Epoch [15 / 200] Loss: 1.5454907417297363
VERBOSE: Epoch [16 / 200] Loss: 1.6017513275146484
VERBOSE: Epoch [17 / 200] Loss: 1.5645109415054321
VERBOSE: Epoch [18 / 200] Loss: 1.3751521110534668
VERBOSE: Epoch [19 / 200] Loss: 1.3982983827590942
VERBOSE: Epoch [20 / 200] Loss: 1.547610402

In [7]:
# Put model in evaluation mode
classifier.eval() 

# Tracking variables
predictions = []
actuals = []

# Evaluate on validation set
with torch.no_grad():
    for inputs, targets in val_loader:
        inputs = inputs.reshape(inputs.shape[0], -1).to(device)
        targets = targets.to(device) - 1  # Convert ratings from [1, 5] to [0, 4]

        outputs = classifier(inputs)
        _, predicted = torch.max(outputs, 1)

        predictions.extend(predicted.tolist())
        actuals.extend(targets.tolist())

# Calculate evaluation metrics
accuracy = accuracy_score(actuals, predictions)
precision = precision_score(actuals, predictions, average='weighted')
recall = recall_score(actuals, predictions, average='weighted')
f1 = f1_score(actuals, predictions, average='weighted')

# Print evaluation metrics
print(f"Validation Accuracy: {accuracy:.4f}")
print(f"Validation Precision: {precision:.4f}")
print(f"Validation Recall: {recall:.4f}")
print(f"Validation F1-score: {f1:.4f}")

# Calculate confusion matrix
cm = confusion_matrix(actuals, predictions)
print("Confusion Matrix:")
print(cm)

# Assess bias and variance
if accuracy < 0.7:  # Adjust the threshold as per your requirements
    print("The model may have high bias (underfitting). Consider increasing model complexity.")
elif accuracy > 0.95:  # Adjust the threshold as per your requirements
    print("The model may have high variance (overfitting). Consider regularization techniques.")
else:
    print("The model seems to have a good balance between bias and variance.")

Validation Accuracy: 0.5354
Validation Precision: 0.5568
Validation Recall: 0.5354
Validation F1-score: 0.5351
Confusion Matrix:
[[3648  847  349  584  172]
 [ 990 3016  472  913  209]
 [ 641  781 2382 1446  350]
 [ 500  448  413 3430  809]
 [ 677  251  214 1943 2515]]
The model may have high bias (underfitting). Consider increasing model complexity.
