<a href="https://colab.research.google.com/github/developerabhi14/ML-Notebooks/blob/main/Sentiment%20analysis%20IMDB%20Movie%20Review%20Dataset.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
import torch
import pandas as pd

In [4]:
df=pd.read_csv("IMDB Dataset.csv")

In [5]:
df.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


In [8]:
from sklearn.model_selection import train_test_split

# Mapping the dataset labels
df['sentiment']=df['sentiment'].map({"negative":0, "positive":1})
train_df, test_df=train_test_split(df, test_size=0.2, random_state=42, stratify=df['sentiment'])

In [11]:
import nltk
from nltk.tokenize import word_tokenize
nltk.download('punkt')
nltk.download('punkt_tab')
import re

def clean_text(text):
    text = text.lower()  # Convert to lowercase
    text = re.sub(r"<.*?>", "", text)  # Remove HTML tags
    text = re.sub(r"[^a-zA-Z0-9\s]", "", text)  # Remove special characters
    tokens = word_tokenize(text)  # Tokenize text
    return tokens

# Apply cleaning and tokenization
train_df['review'] = train_df['review'].apply(clean_text)
test_df['review'] = test_df['review'].apply(clean_text)

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


In [12]:
train_df.head()

Unnamed: 0,review,sentiment
47808,"[i, caught, this, little, gem, totally, by, ac...",1
20154,"[i, cant, believe, that, i, let, myself, into,...",0
43069,"[spoiler, alert, it, just, gets, to, me, the, ...",0
19413,"[if, theres, one, thing, ive, learnt, from, wa...",0
13673,"[i, remember, when, this, was, in, theaters, r...",0


In [13]:
# word embedding

import gensim.downloader as api
import numpy as np

word2vec = api.load("word2vec-google-news-300")  # 300-dimensional vectors
embedding_dim = 300  # Word2Vec vector size

def get_embedding(tokens, embedding_dim=300):
    vectors = [word2vec[word] for word in tokens if word in word2vec]
    if len(vectors) == 0:
        return np.zeros(embedding_dim)  # Return zero vector if no words found
    return np.mean(vectors, axis=0)  # Average word vectors

# Convert reviews to embeddings
train_df['vector'] = train_df['review'].apply(lambda x: get_embedding(x))
test_df['vector'] = test_df['review'].apply(lambda x: get_embedding(x))



In [14]:
# Create a pytorch dataset
from torch.utils.data import Dataset, DataLoader
class ReviewDataset(Dataset):
  def __init__(self, df):
    self.reviews=torch.tensor(df['vector'].tolist(), dtype=torch.float32)
    self.labels=torch.tensor(df['sentiment'].tolist(), dtype=torch.float32)

  def __len__(self):
    return len(self.reviews)

  def __getitem__(self, index):
     return self.reviews[index], self.labels[index]

# Create dataset
train_dataset=ReviewDataset(train_df)
test_dataset=ReviewDataset(test_df)

# Create Dataloder
train_loader=DataLoader(train_dataset, batch_size=32, shuffle=True)
test_loader=DataLoader(test_dataset, batch_size=32, shuffle=True)


  self.reviews=torch.tensor(df['vector'].tolist(), dtype=torch.float32)


In [37]:
# Define a neural network model

import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F

class SentimentModel(nn.Module):
    def __init__(self, input_dim):
        super(SentimentModel, self).__init__()
        self.fc1 = nn.Linear(input_dim, 512)  # Increased neurons
        self.bn1 = nn.BatchNorm1d(512)  # Added BatchNorm
        self.fc2 = nn.Linear(512, 256)
        self.bn2 = nn.BatchNorm1d(256)
        self.fc3 = nn.Linear(256, 128)
        self.fc4 = nn.Linear(128, 1)
        self.dropout = nn.Dropout(0.3)  # Added Dropout
        self.sigmoid = nn.Sigmoid()

    def forward(self, x):
        x = F.relu(self.bn1(self.fc1(x)))
        x = self.dropout(x)
        x = F.relu(self.bn2(self.fc2(x)))
        x = self.dropout(x)
        x = F.relu(self.fc3(x))
        x = self.fc4(x)
        return self.sigmoid(x)


# class SentimentModel(nn.Module):
#     def __init__(self, input_dim):
#         super(SentimentModel, self).__init__()
#         self.fc1 = nn.Linear(input_dim, 256)
#         self.relu = nn.ReLU()
#         self.fc2 = nn.Linear(256, 128)
#         self.fc3 = nn.Linear(128, 32)
#         self.fc4 = nn.Linear(32, 1)
#         self.sigmoid = nn.Sigmoid()

#     def forward(self, x):
#         x = self.fc1(x)
#         x = self.relu(x)
#         x = self.fc2(x)
#         x = self.relu(x)
#         x = self.fc3(x)
#         x = self.relu(x)
#         x = self.fc4(x)
#         return self.sigmoid(x)

# Initialize Model
model = SentimentModel(embedding_dim)
# Loss and Optimizer
criterion = nn.BCELoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

In [38]:
# Training Loop with Accuracy Evaluation
epochs = 10
for epoch in range(epochs):
    total_loss = 0
    correct_train = 0
    total_train = 0

    model.train()  # Set model to training mode
    for reviews, labels in train_loader:
        labels = labels.float().unsqueeze(1)  # Convert labels to float
        optimizer.zero_grad()
        outputs = model(reviews)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()

        total_loss += loss.item()

        # Compute training accuracy
        predicted = (outputs > 0.5).float()
        correct_train += (predicted == labels).sum().item()
        total_train += labels.size(0)

    train_accuracy = (correct_train / total_train) * 100  # Convert to percentage

    # Evaluate on test set
    model.eval()  # Set model to evaluation mode
    correct_test = 0
    total_test = 0
    with torch.no_grad():
        for reviews, labels in test_loader:
            labels = labels.float().unsqueeze(1)
            outputs = model(reviews)
            predicted = (outputs > 0.5).float()
            correct_test += (predicted == labels).sum().item()
            total_test += labels.size(0)

    test_accuracy = (correct_test / total_test) * 100  # Convert to percentage

    print(f"Epoch {epoch+1}, Loss: {total_loss/len(train_loader):.4f}, "
          f"Train Acc: {train_accuracy:.2f}%, Test Acc: {test_accuracy:.2f}%")


Epoch 1, Loss: 0.3718, Train Acc: 83.75%, Test Acc: 85.03%
Epoch 2, Loss: 0.3406, Train Acc: 85.27%, Test Acc: 86.46%
Epoch 3, Loss: 0.3278, Train Acc: 85.89%, Test Acc: 86.34%
Epoch 4, Loss: 0.3194, Train Acc: 86.28%, Test Acc: 85.74%
Epoch 5, Loss: 0.3113, Train Acc: 86.60%, Test Acc: 86.30%
Epoch 6, Loss: 0.3010, Train Acc: 87.11%, Test Acc: 86.47%
Epoch 7, Loss: 0.2938, Train Acc: 87.26%, Test Acc: 86.70%
Epoch 8, Loss: 0.2876, Train Acc: 87.68%, Test Acc: 86.78%
Epoch 9, Loss: 0.2826, Train Acc: 87.68%, Test Acc: 86.69%
Epoch 10, Loss: 0.2716, Train Acc: 88.27%, Test Acc: 86.39%


Test Accuracy: 0.8564


In [43]:
import torch

def predict_sentiment(model, review_text):
    # Preprocess the review
    tokens = clean_text(review_text)

    # Convert to embedding
    review_vector = get_embedding(tokens)

    # Convert to tensor
    review_tensor = torch.tensor(review_vector, dtype=torch.float32).unsqueeze(0)  # Add batch dimension

    # Make prediction
    model.eval()  # Set model to evaluation mode
    with torch.no_grad():
        output = model(review_tensor)

    # Interpret result
    sentiment = "Positive" if output.item() > 0.5 else "Negative"

    return sentiment, output.item()

# Example review
sample_review = "This movie was abysmal"
sentiment, confidence = predict_sentiment(model, sample_review)
print(f"Predicted Sentiment: {sentiment} (Confidence: {confidence:.4f})")


Predicted Sentiment: Negative (Confidence: 0.0000)
