In [1]:
# https://www.kaggle.com/datasets/bittlingmayer/amazonreviews

import pandas as pd
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from sklearn.metrics import accuracy_score
import spacy
import numpy as np
from collections import Counter

# from torchtext.data import get_tokenizer
# from torchtext.vocab import build_vocab_from_iterator
# import nltk
# nltk.download('punkt')

In [2]:
# Load the small English model
nlp = spacy.load("en_core_web_sm")

In [3]:
def create_dataframe_from_file(filename):
    with open(filename, 'r', encoding='utf-8') as f:
        lines = f.readlines()

    data = []
    for line in lines:
        label, text = line.split(' ', 1)
        label_number = int(label.replace('__label__', ''))
        data.append({'label': label_number, 'text': text.strip()})

    return pd.DataFrame(data)

# Create dataframes for the training and testing data
train_df = create_dataframe_from_file('data/train.ft.txt')
test_df = create_dataframe_from_file('data/test.ft.txt')

In [4]:
train_df.head()

Unnamed: 0,label,text
0,2,Stuning even for the non-gamer: This sound tra...
1,2,The best soundtrack ever to anything.: I'm rea...
2,2,Amazing!: This soundtrack is my favorite music...
3,2,Excellent Soundtrack: I truly like this soundt...
4,2,"Remember, Pull Your Jaw Off The Floor After He..."


In [5]:
train_df['label'] = train_df['label'] - 1
test_df['label'] = test_df['label'] - 1
train_df['label'].value_counts(), test_df['label'].value_counts()

(label
 1    1800000
 0    1800000
 Name: count, dtype: int64,
 label
 1    200000
 0    200000
 Name: count, dtype: int64)

In [6]:
sampled_train_df = train_df.sample(n=100000, random_state=42)
sampled_test_df = test_df.sample(n=20000, random_state=42)

In [7]:
# 1. Preprocess the text data
# Define the tokenizer
def tokenizer(text):
    return [token.text for token in nlp(text)]

# Tokenize the input text
def tokenize_iterator(data_iter):
    for text in data_iter:
        yield tokenizer(text)

def build_vocab(data_iter, min_freq=10, specials=['<unk>', '<pad>']):
    counter = Counter()
    for text in data_iter:
        tokens = tokenizer(text)
        counter.update(tokens)

    word_to_index = {word: i + len(specials) for i, (word, _) in enumerate(counter.most_common()) if word not in specials and counter[word] >= min_freq}
    for i, special in enumerate(specials):
        word_to_index[special] = i
    return word_to_index

vocab = build_vocab(sampled_train_df['text'])

# Define the text_pipeline function
def text_pipeline(text):
    return [vocab.get(token, vocab['<unk>']) for token in tokenizer(text)]

def collate_batch(batch):
    label_list, text_list = [], []
    for _label, _text in batch:
        label_list.append(_label)
        text_list.append(torch.tensor(text_pipeline(_text)))  # Convert list to tensor
    return torch.tensor(label_list, dtype=torch.int64), nn.utils.rnn.pad_sequence(text_list, padding_value=vocab['<pad>'], batch_first=True)


In [8]:
# 2. Use train_df for training data and test_df for validation data
train_data, val_data = sampled_train_df, sampled_test_df

# 3. Create a PyTorch Dataset and DataLoader
class TextDataset(Dataset):
    def __init__(self, data):
        self.data = data

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        return self.data.iloc[idx]

train_dataset = TextDataset(train_data)
val_dataset = TextDataset(val_data)

train_dataloader = DataLoader(train_dataset, batch_size=64, shuffle=True, collate_fn=collate_batch)
val_dataloader = DataLoader(val_dataset, batch_size=64, shuffle=False, collate_fn=collate_batch)

In [9]:
# Set the device to 'cuda' if available, else 'cpu'
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# 4. Define an LSTM model architecture
class LSTMModel(nn.Module):
    def __init__(self, vocab_size, embed_dim, hidden_dim, num_classes):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embed_dim)
        self.lstm = nn.LSTM(embed_dim, hidden_dim, batch_first=True)
        self.fc = nn.Linear(hidden_dim, num_classes)

    def forward(self, text):
        embedded = self.embedding(text)
        lstm_out, _ = self.lstm(embedded)
        out = self.fc(lstm_out[:, -1, :])
        return out

vocab_size = len(vocab)
embed_dim = 128
hidden_dim = 256
num_classes = 3

model = LSTMModel(vocab_size, embed_dim, hidden_dim, num_classes).to(device)

# Move the model to the appropriate device
model = model.to(device)

In [10]:
print("Unique labels in train_data:", np.unique(train_df["label"]))
print("Unique labels in test_data:", np.unique(test_df["label"]))


Unique labels in train_data: [0 1]
Unique labels in test_data: [0 1]


In [11]:
# Create a DataLoader for the test dataset
test_data = TextDataset(test_df)
test_dataloader = DataLoader(test_data, batch_size=64, collate_fn=collate_batch)

In [12]:
# 5. Train the model
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.005)

# Training loop
num_epochs = 10

for epoch in range(num_epochs):
    model.train()
    train_loss = 0.0
    train_acc = 0.0
    
    # Training
    for labels, texts in train_dataloader:
        labels, texts = labels.to(device), texts.to(device)

        optimizer.zero_grad()
        outputs = model(texts)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()

        train_loss += loss.item()
        train_acc += (outputs.argmax(1) == labels).sum().item()

    train_loss /= len(train_data)
    train_acc /= len(train_data)
    
    # Switch the model to evaluation mode
    model.eval()
    test_loss = 0.0
    test_acc = 0.0
    
    # Calculate the test loss and accuracy during evaluation
    with torch.no_grad():
        for labels, texts in val_dataloader:
            labels, texts = labels.to(device), texts.to(device)
            
            outputs = model(texts)
            loss = criterion(outputs, labels)

            test_loss += loss.item()
            test_acc += (outputs.argmax(1) == labels).sum().item()
    
    test_loss /= len(val_data)
    test_acc /= len(val_data)

    # Print or store the test results
    print(f"Epoch {epoch+1}/{num_epochs}, Train Loss: {train_loss:.4f}, Train Acc: {train_acc:.4f}, Test Loss: {test_loss:.4f}, Test Acc: {test_acc:.4f}")
    
    # Switch the model back to training mode
    model.train()


Epoch 1/10, Train Loss: 0.0109, Train Acc: 0.5006, Test Loss: 0.0108, Test Acc: 0.4988
Epoch 2/10, Train Loss: 0.0075, Train Acc: 0.7516, Test Loss: 0.0053, Test Acc: 0.8568
Epoch 3/10, Train Loss: 0.0044, Train Acc: 0.8880, Test Loss: 0.0046, Test Acc: 0.8841
Epoch 4/10, Train Loss: 0.0036, Train Acc: 0.9116, Test Loss: 0.0043, Test Acc: 0.8944
Epoch 5/10, Train Loss: 0.0031, Train Acc: 0.9244, Test Loss: 0.0042, Test Acc: 0.8990
Epoch 6/10, Train Loss: 0.0028, Train Acc: 0.9336, Test Loss: 0.0042, Test Acc: 0.9010
Epoch 7/10, Train Loss: 0.0025, Train Acc: 0.9414, Test Loss: 0.0041, Test Acc: 0.9003
Epoch 8/10, Train Loss: 0.0023, Train Acc: 0.9472, Test Loss: 0.0041, Test Acc: 0.9036
Epoch 9/10, Train Loss: 0.0022, Train Acc: 0.9511, Test Loss: 0.0044, Test Acc: 0.8984
Epoch 10/10, Train Loss: 0.0021, Train Acc: 0.9533, Test Loss: 0.0044, Test Acc: 0.9012
