In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from torch.nn.utils.rnn import pad_sequence, pack_padded_sequence
import torch.nn as nn
import nltk
from nltk import sent_tokenize, word_tokenize
from nltk.stem import SnowballStemmer
from nltk.corpus import stopwords
import re
import nltk
from nltk import word_tokenize
from collections import Counter
from torch.utils.data import Dataset, DataLoader, random_split
import numpy as np
import torch

In [None]:
# Fetch the NLTK data packages this notebook uses: tokenizer data
# ('punkt', 'punkt_tab') for word_tokenize and the stop-word lists.
for resource in ('punkt', 'punkt_tab', 'stopwords'):
    nltk.download(resource)

# Shared English Snowball stemmer, used by the preprocessing step.
stemmer = SnowballStemmer('english')

In [None]:
# Load the IMDB CSV; the code below reads its 'review' and 'sentiment' columns.
df = pd.read_csv('//content//IMDB Dataset.csv')

# Raw review texts as a plain Python list, in row order.
reviews = df['review'].tolist()

# Binary targets as a pandas Series: 'positive' -> 1, 'negative' -> 0.
sentiments = df['sentiment'].map({'positive': 1, 'negative': 0})

In [None]:
stop_words = set(stopwords.words('english'))
def preprocess(texts, max_length=300):
    """Clean, tokenize, stem and truncate a collection of raw review strings.

    For each text: HTML line-break tags are replaced by a space, every
    character that is neither a word character nor whitespace is removed,
    the text is lower-cased and tokenized, stop words are filtered out, the
    remaining tokens are stemmed, and the result is cut to ``max_length``.

    Args:
        texts: Iterable of raw review strings.
        max_length: Maximum number of tokens kept per review.

    Returns:
        A list containing one list of stemmed tokens per input text.
    """
    processed = []
    for text in texts:
        # `\s` (single backslash) is the whitespace class, so `<br>`, `<br/>`
        # and `<br />` all match. A doubled backslash in this raw string would
        # demand a literal '\' character, never match a real tag, and leave a
        # stray "br" token behind once the punctuation is stripped below.
        text = re.sub(r'<br\s*/?>', ' ', text, flags=re.IGNORECASE)
        text = re.sub(r'[^\w\s]', '', text)
        tokens = word_tokenize(text.lower())
        tokens = [stemmer.stem(w) for w in tokens if w not in stop_words]
        processed.append(tokens[:max_length])
    return processed

In [None]:
# Run the cleaning pipeline over every review, using preprocess's default
# cap of 300 tokens per review.
tokenized_reviews = preprocess(reviews)

In [None]:
# Token -> integer index. '<unk>' is pinned to index 0; every other token gets
# the next free index the first time it is seen, in corpus order.
vocabulary = {'<unk>': 0}
for tokens in tokenized_reviews:
    for token in tokens:
        # setdefault leaves known tokens alone and assigns the current
        # vocabulary size to unseen ones.
        vocabulary.setdefault(token, len(vocabulary))

In [None]:
def text_to_indices(tokens, vocab):
    """Translate a token sequence into vocabulary indices.

    Args:
        tokens: Iterable of token strings.
        vocab: Mapping from token to index; must contain the '<unk>' key
            whenever ``tokens`` is non-empty.

    Returns:
        A list of indices, one per token, where tokens missing from ``vocab``
        receive the index stored under '<unk>'.
    """
    indices = []
    for token in tokens:
        indices.append(vocab.get(token, vocab['<unk>']))
    return indices

In [None]:
# Encode each tokenized review as a 1-D int64 tensor of vocabulary indices.
review_tensors = [torch.tensor(text_to_indices(r, vocabulary), dtype=torch.long) for r in tokenized_reviews]
# Right-pad every review with index 0 up to the longest one: (num_reviews, max_len).
padded_reviews = pad_sequence(review_tensors, batch_first=True, padding_value=0)
# True (unpadded) token count of each review, needed later for packing.
review_lengths = torch.tensor([len(r) for r in review_tensors])
# Go through NumPy explicitly: the label Series becomes one float32 array and
# the tensor is built from that buffer, instead of handing the pandas object
# itself to torch.tensor.
labels = torch.tensor(np.asarray(sentiments, dtype=np.float32))

In [None]:
class ReviewDataset(Dataset):
    """Index-aligned bundle of reviews, their labels and their true lengths."""

    def __init__(self, data, labels, lengths):
        """Keep references to the three parallel containers.

        Args:
            data: Indexable collection of encoded reviews.
            labels: Indexable collection of targets, aligned with ``data``.
            lengths: Indexable collection of review lengths, aligned with ``data``.
        """
        self.data = data
        self.labels = labels
        self.lengths = lengths

    def __len__(self):
        """Return the number of samples, as reported by ``data``."""
        n_samples = len(self.data)
        return n_samples

    def __getitem__(self, idx):
        """Return the ``(review, label, length)`` triple stored at ``idx``."""
        review = self.data[idx]
        label = self.labels[idx]
        length = self.lengths[idx]
        return review, label, length


In [None]:
# Wrap the padded reviews, labels and lengths, then hold out 20% for validation.
dataset = ReviewDataset(padded_reviews, labels, review_lengths)
train_size = int(0.8 * len(dataset))
val_size = len(dataset) - train_size
# Seed the split so re-running this cell always yields the same membership.
# Without a fixed generator, re-executing the cell after training would move
# already-seen training rows into the validation set.
split_generator = torch.Generator().manual_seed(42)
train_ds, val_ds = random_split(dataset, [train_size, val_size], generator=split_generator)
# Only the training loader shuffles; validation order does not matter.
train_dl = DataLoader(train_ds, batch_size=32, shuffle=True)
val_dl = DataLoader(val_ds, batch_size=32)


In [None]:
class LSTMModel(nn.Module):
    """Single-layer LSTM binary classifier over padded index sequences.

    Pipeline: embedding (index 0 is the padding index) -> LSTM over the packed
    sequence -> dropout on the last layer's final hidden state -> linear layer
    producing one raw logit per sample.

    Args:
        vocab_size: Number of rows in the embedding table.
        embedding_dim: Size of each token embedding.
        hidden_dim: Size of the LSTM hidden state.
    """

    def __init__(self, vocab_size, embedding_dim=50, hidden_dim=100):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim, padding_idx=0)
        self.lstm = nn.LSTM(embedding_dim, hidden_dim, batch_first=True)
        self.dropout = nn.Dropout(0.3)
        self.fc = nn.Linear(hidden_dim, 1)

    def forward(self, x, lengths):
        """Compute one logit per sequence.

        Args:
            x: Integer tensor of token indices, batch first: (batch, seq_len).
            lengths: Integer tensor with the unpadded length of each row of ``x``.

        Returns:
            Tensor of shape (batch,) holding raw logits (no sigmoid applied).
        """
        embedded = self.embedding(x)
        # pack_padded_sequence wants CPU lengths and rejects zeros; clamping
        # lets a fully padded row run as a one-step sequence of the (zero)
        # padding embedding instead of aborting the whole batch.
        safe_lengths = lengths.cpu().clamp(min=1)
        packed = pack_padded_sequence(embedded, safe_lengths, batch_first=True, enforce_sorted=False)
        _, (hn, _) = self.lstm(packed)
        hn = self.dropout(hn[-1])
        # Drop only the trailing size-1 feature axis. A bare squeeze() would
        # also remove the batch axis when batch == 1, producing a 0-d tensor
        # whose shape no longer matches a (1,) target.
        return self.fc(hn).squeeze(-1)

In [None]:
#model = LSTMModel(len(vocabulary))

In [None]:
# Prefer the GPU when PyTorch reports one, otherwise stay on the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

# One embedding row per vocabulary entry; the model lives on the chosen device.
model = LSTMModel(len(vocabulary))
model = model.to(device)

# The model emits raw logits, so the loss applies the sigmoid internally.
criterion = nn.BCEWithLogitsLoss()

# Adam with a small weight-decay penalty.
optimizer = torch.optim.Adam(
    model.parameters(),
    lr=0.001,
    weight_decay=1e-5,
)

In [None]:
epochs = 20
for epoch in range(epochs):
    # Training mode: dropout is active for this pass.
    model.train()

    # Per-epoch accumulators. total_loss is the sum of the per-batch loss
    # values, not an average.
    total_loss = 0
    correct = 0
    total = 0

    for xb, yb, lens in train_dl:
        xb = xb.to(device)
        yb = yb.to(device)
        lens = lens.to(device)

        # Standard step: clear old gradients, forward, backward, update.
        optimizer.zero_grad()
        out = model(xb, lens)
        loss = criterion(out, yb)
        loss.backward()
        optimizer.step()

        total_loss += loss.item()
        # Logits -> probabilities -> hard 0/1 predictions at the 0.5 threshold.
        preds = (torch.sigmoid(out) > 0.5).float()
        correct += (preds == yb).sum().item()
        total += yb.size(0)

    print(f"Epoch {epoch+1}: Train Loss = {total_loss:.4f}, Accuracy = {correct/total:.4f}")



In [None]:
   # Validation
# Evaluation mode: dropout is disabled while scoring the held-out set.
model.eval()

# Accumulators over the whole validation loader; val_loss is a sum of the
# per-batch loss values.
val_loss = 0
val_correct = 0
val_total = 0

# No gradients are needed for evaluation.
with torch.no_grad():
    for xb, yb, lens in val_dl:
        xb = xb.to(device)
        yb = yb.to(device)
        lens = lens.to(device)

        out = model(xb, lens)
        batch_loss = criterion(out, yb)
        val_loss += batch_loss.item()

        # Same 0.5 probability threshold as in training.
        preds = (torch.sigmoid(out) > 0.5).float()
        val_correct += (preds == yb).sum().item()
        val_total += yb.size(0)

print(f"         Val Loss = {val_loss:.4f}, Accuracy = {val_correct/val_total:.4f}")

In [None]:
def predict_sentiment(text, model, vocab, device, max_len=300):
    """Print the predicted sentiment and probability for one raw review.

    The text goes through the same ``preprocess`` pipeline that produced the
    training tokens (tag/punctuation stripping, lower-casing, stop-word
    removal, stemming, truncation), so the tokens looked up here have the
    same form as the keys stored in ``vocab``.

    Args:
        text: Raw review string.
        model: Trained model called as ``model(indices, lengths)`` and
            returning a single logit for a batch of one.
        vocab: Token -> index mapping containing an '<unk>' entry.
        device: Device the input tensors are moved to.
        max_len: Maximum number of tokens fed to the model.

    Returns:
        None. The result is printed.
    """
    model.eval()

    # Reuse the training-time pipeline rather than re-implementing it here:
    # a hand-copied variant without stemming would send most inflected words
    # ("movies", "loved", ...) to '<unk>', because the vocabulary keys are stems.
    tokens = preprocess([text], max_length=max_len)[0]
    if not tokens:
        # A zero-length sequence cannot be packed for the LSTM, so report it
        # clearly instead of failing inside the model.
        print("Prediction: unavailable (no usable tokens in the input)")
        return

    indices = text_to_indices(tokens, vocab)
    tensor = torch.tensor(indices, dtype=torch.long).unsqueeze(0).to(device)
    length = torch.tensor([len(indices)]).to(device)

    with torch.no_grad():
        output = model(tensor, length)
        prob = torch.sigmoid(output).item()

    prediction = "Positive" if prob > 0.5 else "Negative"
    print(f"Prediction: {prediction} ({prob:.4f})")

# Read one review from the user and print the model's verdict for it.
string = input("Enter review:")
predict_sentiment(string, model, vocabulary, device)


In [None]:
'''
Main Problems Identified
1. Model Uses Only the Final Hidden State
Example (python):
output = self.fc(final_hidden_state.squeeze(0))
You're feeding only the final hidden state into the output layer. This can work, but:

If the input is padded, the final hidden state may correspond to padding.

It ignores rich intermediate information from the sequence.

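✅ Fix: Take each sequence's output at its last real (non-padded) time step, i.e. index lengths - 1 for every row; intermediate_hidden_states[:, -1, :] is only correct when nothing is padded, because for shorter sequences it lands on padding. Or better: use attention or mean pooling over the outputs.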

2. Padding Not Masked in LSTM
The LSTM doesn't know which parts are padding. It treats all time steps equally.

✅ Fix: Use pack_padded_sequence:

Example (python):
from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence

# In forward:
lengths = (x != 0).sum(dim=1)  # assuming 0 is the padding
embedded = self.embedding(x)
packed = pack_padded_sequence(embedded, lengths.cpu(), batch_first=True, enforce_sorted=False)
_, (final_hidden_state, _) = self.lstm(packed)
output = self.fc(final_hidden_state.squeeze(0))
3. Sentiment Tensor Shape is Wrong
Example (python):
sentiments.append([data["sentiment"][i]])
You're wrapping each label in an extra list, making shape [N, 1]. Not wrong per se, but awkward and may cause shape mismatches.

✅ Fix: Just append data["sentiment"][i] directly.

4. No Validation Split or Metric
You are training and printing accuracy on the training set only, so:

You're not checking generalization.

The model may overfit or be undertrained and you wouldn't know.

✅ Fix:
Split dataset into train/test using train_test_split and check validation accuracy separately.

5. No Token Limit or Truncation
Some reviews may be very long, and some very short:

This causes inefficient padding (long tail of very long sequences).

Your model has to learn irrelevant zero-padded info.

✅ Fix: Truncate or limit token count, e.g.,:

Example (python):
MAX_LEN = 300
# Build a new list; appending to reviews_vectors while iterating over it would never terminate.
reviews_vectors = [sentence[:MAX_LEN] for sentence in reviews_vectors]
6. Poor Tokenization and No Cleaning
You're using basic word_tokenize and just lowercasing:

No lemmatization/stemming (Snowball was imported but unused).

No stopword removal.

re.sub is minimal (e.g., punctuation, HTML tags, etc.)

✅ Fix: Improve preprocessing — e.g., remove stopwords, lemmatize, etc.

7. Learning Rate Might Be Too Low
You're using lr=0.001 with Adam. This is okay, but if training loss doesn't decrease, try 0.0005 or 0.0001.

8. Batch Size
You're using batch_size=64 — this is generally fine, but with large padded sequences, it can become memory-heavy.

✅ In Summary — Suggestions
Use pack_padded_sequence with real lengths.

Track validation accuracy and loss separately.

Improve tokenization (stopword removal, lemmatization).

Truncate long reviews.

Consider using all hidden outputs (mean-pooling, attention, etc.).

Add dropout or regularization if model overfits.
'''