In [1]:
import pandas as pd

# Load CSVs after extraction
train_df = pd.read_csv(r"D:\Machine Learning\Recurrent Neural Network\data\archive\train.csv")
valid_df = pd.read_csv(r"D:\Machine Learning\Recurrent Neural Network\data\archive\valid.csv")
test_df  = pd.read_csv(r"D:\Machine Learning\Recurrent Neural Network\data\archive\test.csv")

# Preview
print(train_df.head())

                                                text  label
0  I grew up (b. 1965) watching and loving the Th...      0
1  When I put this movie in my DVD player, and sa...      0
2  Why do people who do not know what a particula...      0
3  Even though I have great interest in Biblical ...      0
4  Im a die hard Dads Army fan and nothing will e...      1


In [2]:
def simple_tokenizer(text):
    return text.lower().split()

In [3]:
from collections import Counter

# Count word frequencies
counter = Counter()
for text in train_df['text']:
    counter.update(simple_tokenizer(text))

# Build vocab dictionary (top 20k words)
vocab_size = 20000
vocab = {word: idx+2 for idx, (word, _) in enumerate(counter.most_common(vocab_size))}
vocab["<pad>"] = 0
vocab["<unk>"] = 1

In [4]:
import torch
from torch.utils.data import Dataset

class IMDBDataset(Dataset):
    def __init__(self, df, vocab, tokenizer, maxlen=200):
        self.texts = df['text'].tolist()
        self.labels = df['label'].tolist()
        self.vocab = vocab
        self.tokenizer = tokenizer
        self.maxlen = maxlen

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        tokens = self.tokenizer(self.texts[idx])
        ids = [self.vocab.get(token, self.vocab["<unk>"]) for token in tokens][:self.maxlen]
        if len(ids) < self.maxlen:
            ids += [self.vocab["<pad>"]] * (self.maxlen - len(ids))
        return torch.tensor(ids), torch.tensor(self.labels[idx])

In [5]:
from torch.utils.data import DataLoader

train_dataset = IMDBDataset(train_df, vocab, simple_tokenizer)
valid_dataset = IMDBDataset(valid_df, vocab, simple_tokenizer)
test_dataset  = IMDBDataset(test_df, vocab, simple_tokenizer)

train_loader = DataLoader(train_dataset, batch_size=64, shuffle=True)
valid_loader = DataLoader(valid_dataset, batch_size=64)
test_loader  = DataLoader(test_dataset, batch_size=64)

In [6]:
import torch.nn as nn

class SentimentLSTM(nn.Module):
    def __init__(self, vocab_size, embed_dim=128, hidden_dim=128):
        super(SentimentLSTM, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embed_dim)
        self.lstm = nn.LSTM(embed_dim, hidden_dim, batch_first=True)
        self.fc = nn.Linear(hidden_dim, 1)
        self.sigmoid = nn.Sigmoid()

    def forward(self, x):
        embedded = self.embedding(x)
        _, (hidden, _) = self.lstm(embedded)
        out = self.fc(hidden[-1])
        return self.sigmoid(out)

In [7]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

model = SentimentLSTM(len(vocab)).to(device)
criterion = nn.BCELoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

for epoch in range(5):
    model.train()
    total_loss = 0
    for X, y in train_loader:
        X, y = X.to(device), y.to(device).float().unsqueeze(1)
        optimizer.zero_grad()
        outputs = model(X)
        loss = criterion(outputs, y)
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
    print(f"Epoch {epoch+1}, Loss: {total_loss/len(train_loader):.4f}")

Epoch 1, Loss: 0.6927
Epoch 2, Loss: 0.6908
Epoch 3, Loss: 0.5155
Epoch 4, Loss: 0.2973
Epoch 5, Loss: 0.2163


In [8]:
model.eval()
correct, total = 0, 0
with torch.no_grad():
    for X, y in test_loader:
        X, y = X.to(device), y.to(device).float().unsqueeze(1)
        preds = (model(X) >= 0.5).int()
        correct += (preds == y.int()).sum().item()
        total += y.size(0)

print(f"Test Accuracy: {100 * correct/total:.2f}%")

Test Accuracy: 86.44%


In [9]:
import pickle

# Save vocab dictionary to file
with open("vocab.pkl", "wb") as f:
    pickle.dump(vocab, f)

In [9]:
torch.save(model.state_dict(), "sentiment_model.pth")

In [10]:
def predict_sentiment(text):
    tokens = simple_tokenizer(text)
    ids = [vocab.get(token, vocab["<unk>"]) for token in tokens][:200]
    if len(ids) < 200:
        ids += [vocab["<pad>"]] * (200 - len(ids))
    X = torch.tensor([ids]).to(device)
    with torch.no_grad():
        output = model(X)
        return "Positive" if output.item() >= 0.5 else "Negative"

print(predict_sentiment("This movie was amazing!"))
print(predict_sentiment("I hated every minute of it."))

Positive
Negative
