In [78]:
import torch
import torch.nn as nn
import pandas as pd
import numpy as np

import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords

import torchvision.transforms as transforms
from torchvision.datasets import CIFAR10
from torch.utils.data import DataLoader, Dataset

from collections import Counter

import re

import progressbar

In [79]:
# Fetch the NLTK resources needed by the tokenizer (word_tokenize), the stop-word
# list (stopwords.words) and the POS tagger (nltk.pos_tag) used later in this notebook.
# The trailing ';' on the first call only suppresses that call's return value in the cell output.
nltk.download('punkt');
nltk.download('stopwords')
nltk.download('averaged_perceptron_tagger')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Steve\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Steve\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\Steve\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


True

In [80]:
# Seed PyTorch's global random number generator so random operations run after this cell are repeatable.
torch.manual_seed(1);

In [81]:
# Empty token counter. NOTE(review): it is re-created and filled from df['adjectives']
# further down, so this early initialisation looks redundant.
word_counter = Counter();

In [82]:
# Location of the reviews CSV, relative to the notebook's working directory.
file = "../Reviews.csv";

# Read the whole CSV into a DataFrame.
df = pd.read_csv(file);

In [83]:
# Keep only the first tenth of the rows (56845 rows for this dataset) to limit
# preprocessing and training time. `.copy()` detaches the subset from the frame it
# was sliced from, so the column assignments made in later cells write to an
# independent DataFrame instead of triggering pandas' SettingWithCopyWarning.
df = df.iloc[:len(df) // 10].copy()

In [84]:
def text_cleansing(text):
    """Normalise raw text ahead of tokenisation.

    The text is lower-cased, then every character that is neither an ASCII
    letter nor whitespace is deleted (not replaced by a space). Digits and
    punctuation therefore vanish, e.g. "Don't" becomes "dont".

    Args:
        text: The string to clean.

    Returns:
        The lower-cased string containing only letters and whitespace.
    """
    lowered = text.lower()
    letters_and_spaces = re.sub(r"[^a-zA-Z\s]", "", lowered)
    return letters_and_spaces

In [98]:
# Lazily-built cache of the English stop-word set. Building it reads the NLTK
# corpus, so it is done once instead of once per review.
_ENGLISH_STOP_WORDS = None


def stopword_cleansing(text):
    """Clean a piece of text, tokenise it and drop English stop words.

    Args:
        text: Raw text; it is normalised with `text_cleansing` first.

    Returns:
        List of tokens (in original order) that are not in NLTK's English
        stop-word list.

    NOTE(review): `text_cleansing` deletes apostrophes before this lookup, so any
    stop-word entries that contain an apostrophe can never match a token here.
    Confirm whether that is intended.
    """
    global _ENGLISH_STOP_WORDS
    if _ENGLISH_STOP_WORDS is None:
        _ENGLISH_STOP_WORDS = frozenset(stopwords.words("english"))

    # Tokenise the cleaned text.
    tokens = word_tokenize(text_cleansing(text))

    # Remove stop words.
    return [token for token in tokens if token not in _ENGLISH_STOP_WORDS]

In [86]:
def extract_adjectives(text):
    """Return the words that NLTK's POS tagger labels as adjectives.

    Args:
        text: Passed straight to `nltk.pos_tag`; in this notebook it receives
            the token lists from the 'tokenized' column.

    Returns:
        List of words tagged JJ, JJR or JJS, in their original order.
    """
    adjective_tags = {'JJ', 'JJR', 'JJS'}
    adjectives = []
    for word, tag in nltk.pos_tag(text):
        if tag in adjective_tags:
            adjectives.append(word)
    return adjectives

In [87]:
# Fail fast when the review text column is missing.
if 'Text' not in df.columns:
    raise KeyError("'Text' column not found in DataFrame")

# Replace missing reviews with empty strings and force every value to str.
df['Text'] = df['Text'].fillna('').astype(str)

# Clean, tokenise and strip stop words from each review.
df['tokenized'] = df['Text'].apply(stopword_cleansing)

# Keep only the adjectives, which are the words most likely to describe sentiment.
df['adjectives'] = df['tokenized'].apply(extract_adjectives)

In [88]:
# Reset pandas' row-display option to its default before previewing the frame.
pd.reset_option("display.max_rows")
# Show the frame, which now carries the 'tokenized' and 'adjectives' columns.
# NOTE(review): the rendered preview includes reviewer names and user ids; consider
# `df.head()` on selected columns before sharing the notebook.
df

Unnamed: 0,Id,ProductId,UserId,ProfileName,HelpfulnessNumerator,HelpfulnessDenominator,Score,Time,Summary,Text,tokenized,adjectives
0,1,B001E4KFG0,A3SGXH7AUHU8GW,delmartian,1,1,5,1303862400,Good Quality Dog Food,I have bought several of the Vitality canned d...,"[bought, several, vitality, canned, dog, food,...","[several, dog, good, finicky]"
1,2,B00813GRG4,A1D87F6ZCVE5NK,dll pa,0,0,1,1346976000,Not as Advertised,Product arrived labeled as Jumbo Salted Peanut...,"[product, arrived, labeled, jumbo, salted, pea...","[labeled, small, unsalted, sure, represent]"
2,3,B000LQOCH0,ABXLMWJIXXAIN,"Natalia Corres ""Natalia Corres""",1,1,4,1219017600,"""Delight"" says it all",This is a confection that has been around a fe...,"[confection, around, centuries, light, pillowy...","[light, tiny, powdered, tiny, mouthful, flavor..."
3,4,B000UA0QIQ,A395BORC6FGVXV,Karl,3,3,2,1307923200,Cough Medicine,If you are looking for the secret ingredient i...,"[looking, secret, ingredient, robitussin, beli...","[secret, good]"
4,5,B006K2ZZ7K,A1UQRSCLF8GW1T,"Michael D. Bigham ""M. Wassir""",0,0,5,1350777600,Great taffy,Great taffy at a great price. There was a wid...,"[great, taffy, great, price, wide, assortment,...","[great, taffy, great, wide, quick]"
...,...,...,...,...,...,...,...,...,...,...,...,...
56840,56841,B000GG0BNO,A2GTMNIXL0MDMC,Kristine Johanek,0,0,5,1318464000,Delicious tea,"Mango Green Tea is my daughter's favorite tea,...","[mango, green, tea, daughters, favorite, tea, ...","[green, available, many, bad, fast, good]"
56841,56842,B000GG0BNO,A70CKQ0ZITPIC,Cbeee,0,0,5,1315958400,I absolutely love this tea,This is my current favorite tea. I do add som...,"[current, favorite, tea, add, stevia, sweeten,...","[current, favorite, stevia, sweeten, green, sl..."
56842,56843,B000GG0BNO,A1IN8OWQ76JMQR,Mary Ferjan,0,0,5,1299369600,"Best Mango tea I have tasted, smooth green tea",This is by far the best mango flavored tea I h...,"[far, best, mango, flavored, tea, tried, bonus...","[best, tried, green, green, smooth, bitterness..."
56843,56844,B000C4MU9I,A1ZD3RIYJIBYLN,Sunny in the Northwest,9,9,5,1189468800,Good chocolate.,The chocolate flavor shines in this bitterswee...,"[chocolate, flavor, shines, bittersweet, choco...","[right, good, worth]"


In [89]:
# Flag null entries in the 'adjectives' column (one boolean per row).
# NOTE(review): the rendered output is truncated to the first and last rows, so it does
# not by itself show that every row is False; `df["adjectives"].isnull().any()` would
# give a definitive answer.
df["adjectives"].isnull() # Author's reading of the output: False throughout, cleaning status OK.

0        False
1        False
2        False
3        False
4        False
         ...  
56840    False
56841    False
56842    False
56843    False
56844    False
Name: adjectives, Length: 56845, dtype: bool

In [90]:
# Indexing and numericalization, step 1: count how often each adjective occurs
# across all reviews (a fresh counter is built on every run of this cell).
word_counter = Counter(
    adjective
    for row_adjectives in df['adjectives']
    for adjective in row_adjectives
)

In [55]:
# Print every (word, count) pair, most frequent first.
# NOTE(review): this cell carries an older execution count than its neighbours and
# repeats the print in the following cell, so it is a candidate for removal.
print(word_counter.most_common())



In [91]:
# Print the (word, count) ranking from the counter rebuilt above, most frequent first.
# NOTE(review): most_common() without an argument lists the whole vocabulary; pass a
# number (e.g. most_common(20)) to keep the output short.
print(word_counter.most_common())



In [92]:
# Words are numbered from 2 upward in descending frequency order; the two lowest
# indices are reserved for the special tokens added afterwards.
vocab = dict(
    zip(
        (word for word, _ in word_counter.most_common()),
        range(2, len(word_counter) + 2),
    )
)
vocab.update({
    '<PAD>': 0,  # Padding token
    '<UNK>': 1,  # Unknown token
})

In [93]:
# Replace each adjective with its vocabulary index; words missing from `vocab` map to the <UNK> index.
df['numericalized'] = df['adjectives'].apply(lambda x: [vocab.get(token, vocab['<UNK>']) for token in x])

In [94]:
# Inspect the index sequences produced for each review (lengths vary per row).
df["numericalized"]

0                                        [48, 110, 2, 417]
1                                [2395, 16, 833, 33, 3761]
2          [81, 149, 653, 149, 1253, 107, 285, 466, 10908]
3                                                 [669, 2]
4                                    [3, 2396, 3, 474, 87]
                               ...                        
56840                              [29, 61, 8, 23, 224, 2]
56841                         [376, 17, 335, 694, 29, 304]
56842    [6, 59, 29, 29, 98, 838, 106, 91, 17, 74, 1058...
56843                                          [55, 2, 68]
56844                         [1531, 497, 963, 19, 43, 68]
Name: numericalized, Length: 56845, dtype: object

In [95]:
# Padding: right-pad every index sequence with the <PAD> index so that all rows
# end up as long as the longest sequence in the column.
max_len = max(len(indices) for indices in df['numericalized'])
df['padded'] = df['numericalized'].apply(
    lambda indices: indices + [vocab['<PAD>']] * (max_len - len(indices))
)

In [96]:
class CustomDataset(Dataset):
    """Dataset pairing a collection of inputs with a parallel collection of targets.

    Both collections are stored exactly as given and are indexed with the same key.
    """

    def __init__(self, X, y):
        """Store the inputs `X` and the targets `y`."""
        super().__init__()
        self.X, self.y = X, y

    def __len__(self):
        """Return the number of inputs."""
        # Only X is consulted; y is presumably the same length — verify at call sites.
        return len(self.X)

    def __getitem__(self, index):
        """Return the `(input, target)` pair stored at `index`."""
        sample = self.X[index]
        target = self.y[index]
        return sample, target

In [97]:
# Inputs: the padded index sequences as one integer tensor (one row per review).
X = torch.tensor(df["padded"].tolist())

# Targets: Score shifted down by one so that class ids start at 0, as
# nn.CrossEntropyLoss expects. The shift is applied to a temporary Series rather
# than written back to df['Score'], so re-running this cell cannot shift the
# labels a second time.
y = torch.tensor((df["Score"] - 1).tolist(), dtype=torch.long)

In [16]:
# Wrap the input and label tensors so they can be split and batched together.
dataset = CustomDataset(X, y);
# Total number of samples; used to size the train/validation/test split.
n = len(dataset);

In [17]:
batch_size = 100
train_size = int(0.75 * n)
validation_size = int(0.15 * n)
# The test split takes the remainder, so the three sizes always add up to n.
test_size = n - train_size - validation_size

# A dedicated, seeded generator makes the split identical on every run, no matter
# how much of the global RNG stream earlier cells consumed.
train_dataset, val_dataset, test_dataset = torch.utils.data.random_split(
    dataset,
    [train_size, validation_size, test_size],
    generator=torch.Generator().manual_seed(1),
)

# Create a DataLoader for each of the train, validation and test datasets.
# num_workers=0 loads batches in the main process. Worker processes would have to
# re-import CustomDataset, which is defined inside this notebook; on Windows that
# makes iteration fail or stall (consistent with the recorded training output that
# stops right after "Starting epoch 1"). The data is already in memory, so workers
# would not speed anything up.
# Only the training loader is shuffled; loss and accuracy do not depend on batch order.
train_dataloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True, num_workers=0)
validation_dataloader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False, num_workers=0)
test_dataloader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False, num_workers=0)

In [18]:
class Model(nn.Module):
    """LSTM text classifier: embedding -> (bi)LSTM -> dropout -> batch norm -> fc1 -> dropout -> fc2.

    Args:
        vocab_size: Number of rows in the embedding table.
        embedding_dim: Size of each embedding vector.
        hidden_dim: LSTM hidden size per direction; also the output width of fc1.
        output_dim: Number of logits produced per sample.
        bidirectional: If True, run the LSTM in both directions and concatenate
            the final forward and backward states.
        num_layers: Number of stacked LSTM layers.
        dropout: Dropout probability applied to the LSTM features and between fc1 and fc2.
        padding_idx: Token index used to right-pad sequences (default 0). Its
            embedding is fixed at zero and padded positions are skipped by the
            LSTM. Pass None if inputs are not padded.

    NOTE(review): there is no non-linearity between fc1 and fc2, so apart from
    dropout the two layers compose into a single affine map.
    """

    def __init__(self, vocab_size, embedding_dim, hidden_dim, output_dim, bidirectional = True, num_layers = 2, dropout = 0.2, padding_idx = 0):
        super().__init__()
        self.padding_idx = padding_idx
        self.embedding = nn.Embedding(vocab_size, embedding_dim, padding_idx=padding_idx)
        self.bidirectional = bidirectional
        self.hidden_dim = hidden_dim
        self.lstm = nn.LSTM(embedding_dim, hidden_dim, num_layers, batch_first=True, bidirectional=bidirectional)
        self.dropout = nn.Dropout(dropout)
        # Width of the feature vector taken from the LSTM (both directions when bidirectional).
        feature_dim = hidden_dim * 2 if bidirectional else hidden_dim
        self.batch_norm = nn.BatchNorm1d(feature_dim)
        self.fc1 = nn.Linear(feature_dim, hidden_dim)
        self.fc2 = nn.Linear(hidden_dim, output_dim)

    def forward(self, x):
        """Return class logits of shape (batch, output_dim) for token indices `x` of shape (batch, seq_len)."""
        embedded = self.embedding(x)

        # Number of real (non-padding) tokens per row. Rows made up entirely of
        # padding are treated as length 1 because packing rejects empty sequences.
        if self.padding_idx is None:
            lengths = torch.full((x.size(0),), x.size(1), dtype=torch.long)
        else:
            lengths = (x != self.padding_idx).sum(dim=1).clamp(min=1).cpu()

        # Packing stops the LSTM at each sequence's true end. Reading the output at
        # the last timestep of a padded batch would instead return a forward state
        # that has also consumed all the trailing padding.
        packed = nn.utils.rnn.pack_padded_sequence(
            embedded, lengths, batch_first=True, enforce_sorted=False
        )
        _, (h_n, _) = self.lstm(packed)

        # h_n is laid out as (num_layers * num_directions, batch, hidden_dim); the
        # last entries belong to the top layer (forward first, then backward).
        if self.bidirectional:
            features = torch.cat((h_n[-2], h_n[-1]), dim=1)
        else:
            features = h_n[-1]

        features = self.dropout(features)
        features = self.batch_norm(features)
        out = self.fc1(features)
        out = self.dropout(out)
        out = self.fc2(out)

        return out

In [19]:
# Defining hyperparameters
# Embedding rows: one per vocabulary entry, including <PAD> and <UNK>.
vocab_size = len(vocab);
# Size of each word-embedding vector.
embedding_dim = 100;
# LSTM hidden size (per direction).
hidden_dim = 128;
# Number of logits the model outputs per sample.
output_dim = 5;

In [20]:
# Instantiate the classifier with the hyperparameters above; the remaining arguments
# keep their defaults (bidirectional, 2 layers, dropout 0.2).
model = Model(vocab_size, embedding_dim, hidden_dim, output_dim);

In [21]:
# Cross-entropy over the raw logits; targets are integer class ids.
criterion = nn.CrossEntropyLoss();
# RMSprop over all model parameters with learning rate 1e-3.
optimizer = torch.optim.RMSprop(model.parameters(), lr=0.001);

In [22]:
num_epochs = 10

for epoch in range(num_epochs):
    print(f'Starting epoch {epoch+1}')
    model.train()  # Training mode: dropout active, batch-norm statistics updated
    total_loss = 0.0  # Sum of the per-batch losses for this epoch

    for inputs, labels in train_dataloader:
        optimizer.zero_grad()  # Clear gradients left over from the previous step

        # Forward pass (indices and class ids are cast to int64)
        outputs = model(inputs.to(torch.int64))
        loss = criterion(outputs, labels.to(torch.int64))

        # Backward pass and optimization. Errors are deliberately not caught here:
        # a failure should stop training with a traceback rather than be printed
        # and followed by further epochs and a misleading average loss.
        loss.backward()
        optimizer.step()

        total_loss += loss.item()

    # Print average loss for each epoch
    print(f'Epoch {epoch+1}/{num_epochs}, Loss: {total_loss/len(train_dataloader)}')

Starting epoch 1


In [None]:
def evaluate(dataloader, net=None, loss_fn=None):
    """Compute the average loss and the accuracy of a classifier over a dataloader.

    Args:
        dataloader: Iterable yielding `(inputs, targets)` batches.
        net: Model to evaluate. Defaults to the notebook-level `model`.
        loss_fn: Loss callable taking `(outputs, targets)`. Defaults to the
            notebook-level `criterion`.

    Returns:
        Tuple `(avg_loss, accuracy)`: the sample-weighted mean loss and the
        fraction of samples whose arg-max prediction equals the target.

    Raises:
        ValueError: If the dataloader yields no samples.
    """
    net = model if net is None else net
    loss_fn = criterion if loss_fn is None else loss_fn

    # Use the device the model's parameters live on, so the function does not
    # depend on a separately defined global `device`.
    device = next(net.parameters()).device

    net.eval()  # Evaluation mode: dropout off, batch norm uses running statistics
    total_loss = 0.0
    correct_predictions = 0
    total_predictions = 0

    with torch.no_grad():  # No gradients are needed for evaluation
        for inputs, targets in dataloader:
            inputs, targets = inputs.to(device), targets.to(device)

            outputs = net(inputs)

            # The loss is multiplied by the batch size so that, for a mean-reduced
            # loss, batches of different sizes are weighted correctly.
            loss = loss_fn(outputs, targets)
            total_loss += loss.item() * inputs.size(0)

            # Predicted class = index of the largest logit
            predicted = outputs.argmax(dim=1)
            correct_predictions += (predicted == targets).sum().item()
            total_predictions += targets.size(0)

    if total_predictions == 0:
        raise ValueError("evaluate() received a dataloader with no samples")

    avg_loss = total_loss / total_predictions
    accuracy = correct_predictions / total_predictions

    return avg_loss, accuracy

# Score the model on the validation and test splits
# (requires 'validation_dataloader' and 'test_dataloader' from the data-loading cell).
validation_loss, validation_accuracy = evaluate(validation_dataloader)
test_loss, test_accuracy = evaluate(test_dataloader)

# Report loss and accuracy for both splits, rounded to four decimals.
print(f"Validation Loss: {validation_loss:.4f}, Validation Accuracy: {validation_accuracy:.4f}")
print(f"Test Loss: {test_loss:.4f}, Test Accuracy: {test_accuracy:.4f}")