In [3]:
import re
from collections import Counter

import nltk
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torchvision.transforms as transforms
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from torch.utils.data import DataLoader, Dataset
from torchvision.datasets import CIFAR10

In [4]:
# Fetch the NLTK resources this notebook relies on: the 'punkt' tokenizer models
# for word_tokenize and the 'stopwords' corpus read by stopword_cleansing.
# NOTE(review): recent NLTK releases may look for 'punkt_tab' instead of 'punkt';
# add it to the tuple if word_tokenize raises a LookupError.
for resource in ("punkt", "stopwords"):
    nltk.download(resource)

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Steve\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [5]:
# Seed PyTorch's global random number generator so runs are repeatable.
SEED = 1
torch.manual_seed(SEED);

In [6]:
# Start from an empty token-frequency counter.
word_counter = Counter()

In [7]:
# Read the reviews CSV; the path is relative to the notebook's working directory.
file = "../Reviews.csv"
df = pd.read_csv(filepath_or_buffer=file)

In [8]:
# Limit the rows down to the first 1/10 of the entire dataset (56845 rows per the original note).
# .copy() makes the subset an independent frame, so the column assignments in later
# cells write to a real DataFrame instead of a slice (avoids SettingWithCopyWarning).
df = df.iloc[: len(df) // 10].copy()

In [None]:
def text_cleansing(text):
    """Lowercase ``text`` and drop everything except ASCII letters and whitespace.

    Digits, punctuation and any non-ASCII characters are removed outright (not
    replaced by a space), so ``"it's"`` becomes ``"its"``. Whitespace, including
    newlines, is kept unchanged.

    Args:
        text: The raw string to clean.

    Returns:
        The cleaned, lowercased string.
    """
    lowered = text.lower()

    # After lowercasing no 'A'-'Z' can remain, so the character class only needs a-z.
    return re.sub(r"[^a-z\s]", "", lowered)

In [None]:
# Stop-word sets already loaded from the NLTK corpus, keyed by language name.
_STOP_WORD_CACHE = {}


def _stop_words_for(language):
    """Return the NLTK stop-word set for ``language``, reading the corpus only once."""
    if language not in _STOP_WORD_CACHE:
        _STOP_WORD_CACHE[language] = frozenset(stopwords.words(language))
    return _STOP_WORD_CACHE[language]


def stopword_cleansing(text, stop_words=None):
    """Clean ``text``, tokenize it and drop stop words.

    The text is first normalized with ``text_cleansing``, then split with
    ``word_tokenize``; tokens found in the stop-word collection are discarded.

    Args:
        text: The raw string to process.
        stop_words: Optional collection of tokens to drop. When omitted, the
            NLTK English stop-word list is used. It is loaded once and reused,
            because this function is applied to every row of the DataFrame.

    Returns:
        A list of the remaining tokens, in their original order.
    """
    if stop_words is None:
        stop_words = _stop_words_for("english")

    # Cleanse the text, then tokenize the cleaned result.
    tokens = word_tokenize(text_cleansing(text))

    # Remove stopwords.
    return [token for token in tokens if token not in stop_words]

In [9]:
# The review body is read from the 'Text' column; stop right away when it is absent.
if 'Text' not in df.columns:
    raise KeyError("'Text' column not found in DataFrame")

# Missing reviews become empty strings and every entry is coerced to str.
df['Text'] = df['Text'].fillna('').astype(str)

# Clean, tokenize and strip stop words from each review with the custom helper.
df['tokenized'] = df['Text'].apply(stopword_cleansing)

In [10]:
# Indexing and numericalization, step 1: count every token across all tokenized reviews.
word_counter = Counter(token for tokens in df['tokenized'] for token in tokens)

In [11]:
# Word indices start at 2 (most frequent word first); 0 and 1 are reserved
# for the padding and unknown-word tokens.
vocab = {
    word: index
    for index, (word, _count) in enumerate(word_counter.most_common(), start=2)
}
vocab.update({'<PAD>': 0, '<UNK>': 1})

In [12]:
def _to_indices(tokens):
    """Map each token to its vocabulary index, using the <UNK> index for unseen tokens."""
    return [vocab.get(token, vocab['<UNK>']) for token in tokens]


df['numericalized'] = df['tokenized'].apply(_to_indices)

In [13]:
# Right-pad every index sequence with the <PAD> index up to the longest sequence in the frame.
max_len = max(len(sequence) for sequence in df['numericalized'])


def _pad_to_max(sequence):
    """Return ``sequence`` extended with <PAD> indices to ``max_len`` entries."""
    missing = max_len - len(sequence)
    return sequence + [vocab['<PAD>']] * missing


df['padded'] = df['numericalized'].apply(_pad_to_max)

In [14]:
class CustomDataset(Dataset):
    """Dataset that serves (feature, label) pairs from two parallel indexable containers."""

    def __init__(self, X, y):
        """Store the features ``X`` and the labels ``y`` without copying them."""
        super().__init__()
        self.X, self.y = X, y

    def __len__(self):
        # The number of samples is taken from the feature container.
        return len(self.X)

    def __getitem__(self, index):
        """Return the ``(X[index], y[index])`` pair."""
        sample = self.X[index]
        target = self.y[index]
        return sample, target

In [15]:
# Shift the scores down by one so they can be used as class indices starting at 0
# (assumes 'Score' holds the ratings 1-5 — TODO confirm against the CSV).
# The shift is applied to a temporary Series instead of being written back to
# df['Score'], so re-running this cell can never shift the labels a second time.
zero_based_scores = df["Score"] - 1

# Load the data, padded versions.
X = torch.tensor(df["padded"].tolist())
y = torch.tensor(zero_based_scores.tolist()).long()

In [16]:
# Wrap the feature and label tensors in the Dataset and record the total sample count.
dataset = CustomDataset(X=X, y=y)
n = len(dataset)

In [17]:
batch_size = 100

# Batches are loaded in the main process. Worker processes are started with "spawn"
# on Windows and have to re-import the Dataset class, which fails or hangs for a class
# that is defined inside a notebook cell.
num_workers = 0

# 75% / 15% / remainder split; the test set absorbs the rounding so the sizes sum to n.
train_size = int(0.75 * n)
validation_size = int(0.15 * n)
test_size = n - train_size - validation_size

# A dedicated, seeded generator gives the same split every time this cell is re-run.
train_dataset, val_dataset, test_dataset = torch.utils.data.random_split(
    dataset,
    [train_size, validation_size, test_size],
    generator=torch.Generator().manual_seed(1),
)

# Create DataLoader for each train, validation, and test datasets.
# Only the training data is shuffled; evaluation does not depend on sample order.
train_dataloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True, num_workers=num_workers)
validation_dataloader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False, num_workers=num_workers)
test_dataloader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False, num_workers=num_workers)

In [18]:
class Model(nn.Module):
    """LSTM text classifier: embedding -> (bi)LSTM -> dropout -> batch norm -> two-layer head.

    Inputs are right-padded batches of token indices. Padding positions are
    skipped by packing the sequences, so the sequence representation is the
    LSTM state after the last *real* token (and, for the backward direction,
    after reading the real tokens in reverse). The result therefore does not
    depend on how much padding a batch carries.

    Args:
        vocab_size: Number of rows in the embedding table.
        embedding_dim: Size of each token embedding.
        hidden_dim: LSTM hidden size per direction; also the width of ``fc1``'s output.
        output_dim: Number of output logits (classes).
        bidirectional: Whether the LSTM runs in both directions.
        num_layers: Number of stacked LSTM layers.
        dropout: Dropout probability applied before batch norm and between the linear layers.
        padding_idx: Integer index of the padding token in the input. Its embedding
            is kept at zero and it is used to derive each sequence's true length.
    """

    def __init__(self, vocab_size, embedding_dim, hidden_dim, output_dim, bidirectional = True, num_layers = 2, dropout = 0.2, padding_idx = 0):
        super(Model, self).__init__()
        self.bidirectional = bidirectional
        self.hidden_dim = hidden_dim
        self.padding_idx = padding_idx

        # Width of the sequence representation handed to the classification head.
        feature_dim = hidden_dim * 2 if bidirectional else hidden_dim

        self.embedding = nn.Embedding(vocab_size, embedding_dim, padding_idx=padding_idx)
        self.lstm = nn.LSTM(embedding_dim, hidden_dim, num_layers, batch_first=True, bidirectional=bidirectional)
        self.dropout = nn.Dropout(dropout)
        self.batch_norm = nn.BatchNorm1d(feature_dim)
        self.fc1 = nn.Linear(feature_dim, hidden_dim)
        self.fc2 = nn.Linear(hidden_dim, output_dim)

    def forward(self, x):
        """Compute class logits for a batch of padded index sequences.

        Args:
            x: Integer tensor of shape (batch, seq_len); padding is expected
               only at the end of each row.

        Returns:
            Float tensor of shape (batch, output_dim) holding unnormalized logits.
        """
        embedded = self.embedding(x)

        # True length of every row. Rows made only of padding are treated as length 1,
        # because packing rejects zero-length sequences. Lengths must live on the CPU.
        lengths = (x != self.padding_idx).sum(dim=1).clamp(min=1).cpu()
        packed = nn.utils.rnn.pack_padded_sequence(
            embedded, lengths, batch_first=True, enforce_sorted=False
        )
        _, (h_n, _) = self.lstm(packed)

        # h_n is ordered (layer0_fwd, layer0_bwd, ..., lastlayer_fwd, lastlayer_bwd) when
        # bidirectional, so the last two entries are the final states of the top layer.
        if self.bidirectional:
            features = torch.cat((h_n[-2], h_n[-1]), dim=1)
        else:
            features = h_n[-1]

        features = self.dropout(features)
        features = self.batch_norm(features)

        # Without a nonlinearity, fc1 followed by fc2 would collapse into a single linear map.
        out = torch.relu(self.fc1(features))
        out = self.dropout(out)
        out = self.fc2(out)

        return out

In [19]:
# Model hyperparameters.
vocab_size = len(vocab)  # one embedding row per vocabulary entry, <PAD> and <UNK> included
embedding_dim, hidden_dim, output_dim = 100, 128, 5

In [20]:
# Build the classifier; the remaining constructor arguments keep their defaults.
model = Model(
    vocab_size=vocab_size,
    embedding_dim=embedding_dim,
    hidden_dim=hidden_dim,
    output_dim=output_dim,
)

In [21]:
# Cross-entropy loss on the class logits, optimized with RMSprop.
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.RMSprop(params=model.parameters(), lr=1e-4)

In [22]:
num_epochs = 10
log_every = 50  # report the running batch loss every this many batches

for epoch in range(num_epochs):
    print(f'Starting epoch {epoch+1}')
    model.train()  # enable dropout and batch-norm statistics updates
    total_loss = 0.0  # sum of the batch losses seen in this epoch

    for batch_idx, (inputs, labels) in enumerate(train_dataloader, start=1):
        optimizer.zero_grad()  # clear gradients left over from the previous step

        # Forward pass. X and y are already built as int64 tensors, so no cast is needed.
        outputs = model(inputs)

        # Errors are deliberately not caught here: a failing batch should stop the
        # run with its traceback instead of being printed and repeated every epoch.
        loss = criterion(outputs, labels)

        # Backward pass and optimization
        loss.backward()
        optimizer.step()

        batch_loss = loss.item()
        total_loss += batch_loss

        # Periodic progress line instead of several prints per batch.
        if batch_idx % log_every == 0:
            print(f'  Batch {batch_idx}/{len(train_dataloader)}, Loss: {batch_loss}')

    # Print average loss for each epoch
    print(f'Epoch {epoch+1}/{num_epochs}, Loss: {total_loss/len(train_dataloader)}')

Starting epoch 1
