In [1]:
import torch
import torch.nn as nn
import pandas as pd
import numpy as np
from nltk.tokenize import word_tokenize

import torchvision.transforms as transforms
from torchvision.datasets import CIFAR10
from torch.utils.data import DataLoader, Dataset

from collections import Counter

In [2]:
# Seed every random number generator this notebook uses so runs are repeatable.
# NumPy is seeded as well because a later cell shuffles indices with np.random.
SEED = 1
np.random.seed(SEED)
torch.manual_seed(SEED);  # trailing ';' hides the returned Generator repr

In [3]:
# Empty token-frequency table; the indexing cell further down rebinds this
# name to a fresh Counter before any counting happens.
word_counter = Counter()

In [4]:
# Path to the reviews CSV, relative to the notebook's working directory.
file = "../archive/Reviews.csv"

# Read the whole CSV into a DataFrame.
df = pd.read_csv(filepath_or_buffer=file)

In [5]:
# Display the loaded DataFrame; the notebook renders a truncated view
# (first and last rows) rather than every row.
df

Unnamed: 0,Id,ProductId,UserId,ProfileName,HelpfulnessNumerator,HelpfulnessDenominator,Score,Time,Summary,Text
0,1,B001E4KFG0,A3SGXH7AUHU8GW,delmartian,1,1,5,1303862400,Good Quality Dog Food,I have bought several of the Vitality canned d...
1,2,B00813GRG4,A1D87F6ZCVE5NK,dll pa,0,0,1,1346976000,Not as Advertised,Product arrived labeled as Jumbo Salted Peanut...
2,3,B000LQOCH0,ABXLMWJIXXAIN,"Natalia Corres ""Natalia Corres""",1,1,4,1219017600,"""Delight"" says it all",This is a confection that has been around a fe...
3,4,B000UA0QIQ,A395BORC6FGVXV,Karl,3,3,2,1307923200,Cough Medicine,If you are looking for the secret ingredient i...
4,5,B006K2ZZ7K,A1UQRSCLF8GW1T,"Michael D. Bigham ""M. Wassir""",0,0,5,1350777600,Great taffy,Great taffy at a great price. There was a wid...
...,...,...,...,...,...,...,...,...,...,...
568449,568450,B001EO7N10,A28KG5XORO54AY,Lettie D. Carter,0,0,5,1299628800,Will not do without,Great for sesame chicken..this is a good if no...
568450,568451,B003S1WTCU,A3I8AFVPEE8KI5,R. Sawyer,0,0,2,1331251200,disappointed,I'm disappointed with the flavor. The chocolat...
568451,568452,B004I613EE,A121AA1GQV751Z,"pksd ""pk_007""",2,2,5,1329782400,Perfect for our maltipoo,"These stars are small, so you can give 10-15 o..."
568452,568453,B004I613EE,A3IBEVCTXKNOH,"Kathy A. Welch ""katwel""",1,1,5,1331596800,Favorite Training and reward treat,These are the BEST treats for training and rew...


In [6]:
# Tokenize each review's text with NLTK and keep the result of every row
# in a new "tokenized" column.
df["tokenized"] = df["Text"].map(word_tokenize)

In [7]:
# Indexing and Numericalization
# Count how many times each token occurs across all tokenized rows.
word_counter = Counter(
    token
    for tokens in df['tokenized']
    for token in tokens
)

In [8]:
# Token -> integer id lookup. Corpus tokens are numbered from 2 upward in
# descending frequency order; ids 0 and 1 are kept for the special tokens.
vocab = {
    word: idx
    for idx, (word, _) in enumerate(word_counter.most_common(), start=2)
}
vocab.update({
    '<PAD>': 0,  # Padding token
    '<UNK>': 1,  # Unknown token
})

In [9]:
def _to_ids(tokens):
    """Translate a list of tokens into vocabulary ids (<UNK> id when a token is absent)."""
    unk_id = vocab['<UNK>']
    return [vocab.get(token, unk_id) for token in tokens]


# One list of integer ids per row, parallel to the "tokenized" column.
df['numericalized'] = df['tokenized'].apply(_to_ids)

In [10]:
# Padding (with truncation)
# Padding every row to the length of the single longest review produces a
# dense (rows x longest) matrix, which is dominated by padding and can exhaust
# memory once converted to a tensor. Cap the sequence length instead: longer
# reviews are cut to their first MAX_SEQ_LEN ids, shorter ones are right-padded.
MAX_SEQ_LEN = 256  # tunable upper bound on ids kept per review

# default=0 keeps this cell from raising on an empty frame.
longest_len = max(map(len, df['numericalized']), default=0)
max_len = min(longest_len, MAX_SEQ_LEN)

pad_id = vocab['<PAD>']
# For rows longer than max_len the multiplier is negative, which yields an
# empty list, so no padding is appended to a truncated row.
df['padded'] = df['numericalized'].apply(
    lambda x: x[:max_len] + [pad_id] * (max_len - len(x))
)

In [11]:
class CustomDataset(Dataset):
    """Pairs up two parallel indexable containers as (input, label) examples.

    Attributes:
        X: container holding the inputs.
        y: container holding the labels, indexed in step with ``X``.
    """

    def __init__(self, X, y):
        """Keep references to the inputs ``X`` and the labels ``y``."""
        super().__init__()
        self.X, self.y = X, y

    def __len__(self):
        """Return the number of examples, taken from the length of ``X``."""
        return len(self.X)

    def __getitem__(self, index):
        """Return the ``(X[index], y[index])`` pair."""
        sample = self.X[index]
        target = self.y[index]
        return sample, target

In [26]:
# Build zero-based class labels in a separate column. Writing the shifted
# values back into df['Score'] made this cell non-idempotent: every re-run
# subtracted 1 again and produced negative labels, which CrossEntropyLoss
# rejects. Score looks like a 1-5 rating here (output_dim is 5) - confirm.
df['label'] = df['Score'] - 1

# Load the data, padded versions.
X = torch.tensor(df["padded"].tolist())
y = torch.tensor(df["label"].tolist()).long()

In [27]:
# Wrap the input and label tensors so they can be indexed as (input, label) pairs.
dataset = CustomDataset(X=X, y=y)

# Total number of examples; used to size the splits.
n = len(dataset)

In [28]:
batch_size = 32

# 75% / 15% split sizes; the test split takes whatever is left so the three
# sizes always add up to n despite the int() truncation.
train_size = int(0.75 * n)
validation_size = int(0.15 * n)
test_size = n - train_size - validation_size

train_dataset, val_dataset, test_dataset = torch.utils.data.random_split(
    dataset, [train_size, validation_size, test_size]
)

# Create DataLoader for each train, validation, and test datasets.
# Only the training loader is shuffled: reshuffling evaluation data brings no
# benefit and makes per-batch evaluation output differ from run to run.
train_dataloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
validation_dataloader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)
test_dataloader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

In [29]:
class Model(nn.Module):
    """LSTM classifier over right-padded sequences of token ids.

    Pipeline: embedding -> LSTM -> linear layer applied to the hidden state
    of the last *real* (non-padding) token of each sequence.

    Args:
        vocab_size: number of rows in the embedding table.
        embedding_dim: size of each token embedding.
        hidden_dim: size of the LSTM hidden state.
        output_dim: number of output logits per sequence.
        num_layers: number of stacked LSTM layers.
        padding_idx: token id used for padding. Its embedding is fixed at
            zero and it is excluded when measuring sequence lengths.
    """

    def __init__(self, vocab_size, embedding_dim, hidden_dim, output_dim, num_layers = 1, padding_idx = 0):
        super().__init__()
        self.padding_idx = padding_idx
        self.embedding = nn.Embedding(vocab_size, embedding_dim, padding_idx=padding_idx)
        self.lstm = nn.LSTM(embedding_dim, hidden_dim, num_layers, batch_first=True)
        self.fc = nn.Linear(hidden_dim, output_dim)

    def forward(self, x):
        """Return logits of shape (batch, output_dim) for a (batch, seq_len) id tensor.

        Padding is assumed to be trailing only, so the count of non-padding
        ids in a row is that row's true length.
        """
        # Reading lstm_out[:, -1, :] would pick the state after the LSTM has
        # consumed all trailing padding; packing stops each row at its real end.
        # clamp(min=1) keeps all-padding rows packable; lengths must live on CPU.
        lengths = (x != self.padding_idx).sum(dim=1).clamp(min=1).cpu()

        embedded = self.embedding(x)
        packed = nn.utils.rnn.pack_padded_sequence(
            embedded, lengths, batch_first=True, enforce_sorted=False
        )
        _, (h_n, _) = self.lstm(packed)

        # h_n[-1] is the top layer's final hidden state for every sequence.
        return self.fc(h_n[-1])

In [34]:
# Defining hyperparameters
vocab_size = len(vocab)               # one embedding row per vocabulary entry
embedding_dim, hidden_dim = 100, 128  # embedding width, LSTM hidden-state width
output_dim = 5                        # number of logits produced per example

In [35]:
# Instantiate the classifier from the hyperparameters defined above
# (num_layers is left at its default).
model = Model(
    vocab_size=vocab_size,
    embedding_dim=embedding_dim,
    hidden_dim=hidden_dim,
    output_dim=output_dim,
)

In [36]:
# Multi-class loss computed from the model's raw logits and integer class labels.
criterion = nn.CrossEntropyLoss()

# Adam over all of the model's parameters.
optimizer = torch.optim.Adam(params=model.parameters(), lr=1e-3)

In [37]:
num_epochs = 10

for epoch in range(num_epochs):
    model.train()   # put the module in training mode
    total_loss = 0  # sum of the per-batch losses seen this epoch

    for inputs, labels in train_dataloader:
        # Make sure both token ids and class labels are int64.
        inputs = inputs.long()
        labels = labels.long()

        # Drop gradients left over from the previous step.
        optimizer.zero_grad()

        # Forward pass, then the loss against the true labels.
        outputs = model(inputs)
        loss = criterion(outputs, labels)

        # Backpropagate and apply one optimizer update.
        loss.backward()
        optimizer.step()

        total_loss += loss.item()

    # Report the mean per-batch loss for this epoch.
    print(f'Epoch {epoch+1}/{num_epochs}, Loss: {total_loss/len(train_dataloader)}')

KeyboardInterrupt: 

In [10]:
# Shuffle the row positions and carve them into train / validation / test subsets.
# NOTE(review): this cell rebinds n, train_dataset, val_dataset and test_dataset,
# which the random_split cell also defines; here the subsets wrap the raw
# df["Text"] strings rather than the padded tensors.
n = len(df)

# Generate and shuffle the indices with a locally seeded generator so the
# split is the same on every run.
rng = np.random.default_rng(1)
indices = rng.permutation(n)

# 75% / 15% / 10% split. The three slices share their boundaries, so every row
# lands in exactly one subset (previously the 85%-90% slice was dropped and
# the training share was 70% instead of the stated 75%).
train_end = int(0.75 * n)
val_end = int(0.90 * n)

train_indices = indices[:train_end]
val_indices = indices[train_end:val_end]
test_indices = indices[val_end:]

train_dataset = torch.utils.data.Subset(df["Text"], train_indices)
val_dataset = torch.utils.data.Subset(df["Text"], val_indices)
test_dataset = torch.utils.data.Subset(df["Text"], test_indices)

<torch.utils.data.dataset.Subset at 0x334951d80>