In [None]:
# 1) Mount Google Drive and check for available GPU
from google.colab import drive
drive.mount('/content/drive')

import torch
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Running on device:", device)
if device.type == "cuda":
    print("GPU detected:", torch.cuda.get_device_name(0))
else:
    print("No GPU available; using CPU.")

# 2) Install required libraries
!pip install nltk tqdm bs4

# 3) Import libraries
import re
import pandas as pd
import numpy as np
import nltk
from bs4 import BeautifulSoup
from nltk.corpus import stopwords
from nltk.stem import SnowballStemmer
from tqdm import tqdm

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import (
    accuracy_score,
    confusion_matrix,
    classification_report,
    roc_auc_score
)

# 4) Download NLTK resources
nltk.download('punkt')
nltk.download('stopwords')

# 5) Load the IMDB dataset
csv_path = '/content/drive/MyDrive/IMDB_Dataset.csv'
df = pd.read_csv(csv_path)
print("Dataset shape (rows × columns):", df.shape)
print("Sentiment distribution:\n", df['sentiment'].value_counts())
df.head()

# 6) Define text preprocessing function
stemmer = SnowballStemmer('english')
stops = set(stopwords.words('english'))

def preprocess_text(text):
    """
    1. Remove HTML tags
    2. Remove non-alphabet characters and convert to lowercase
    3. Tokenize into words
    4. Remove stopwords and apply stemming
    5. Rejoin tokens into a single string
    """
    # 1) Strip HTML
    text = BeautifulSoup(text, 'html.parser').get_text()
    # 2) Remove non-letters & lowercase
    text = re.sub(r'[^a-zA-Z]', ' ', text).lower()
    # 3) Tokenize
    tokens = nltk.word_tokenize(text)
    # 4) Remove stopwords and stem
    tokens = [stemmer.stem(tok) for tok in tokens if tok not in stops and len(tok) > 1]
    # 5) Join tokens back into a string
    return ' '.join(tokens)

# 7) Apply preprocessing to all reviews
tqdm.pandas()
df['cleaned'] = df['review'].progress_map(preprocess_text)
df[['review','cleaned']].head()

# 8) Feature extraction
# a) Bag-of-Words
cv = CountVectorizer(max_features=10000)
X_bow = cv.fit_transform(df['cleaned'])
# b) TF–IDF (optional)
tfidf = TfidfVectorizer(max_features=10000)
X_tfidf = tfidf.fit_transform(df['cleaned'])

# 9) Encode target labels
y = df['sentiment'].map({'positive': 1, 'negative': 0}).values

# 10) Split into training and test sets (using BoW features)
X_train, X_test, y_train, y_test = train_test_split(
    X_bow, y, test_size=0.2, random_state=42
)

# 11) Train & evaluate Gaussian Naive Bayes on BoW
gnb = GaussianNB()
gnb.fit(X_train.toarray(), y_train)
y_pred_gnb = gnb.predict(X_test.toarray())

print("\n=== Gaussian Naive Bayes (BoW) Results ===")
print("Accuracy: ", accuracy_score(y_test, y_pred_gnb))
print("ROC AUC:  ", roc_auc_score(y_test, y_pred_gnb))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred_gnb))
print(classification_report(y_test, y_pred_gnb, target_names=['negative', 'positive']))

# 12) Train & evaluate Random Forest on BoW
rf = RandomForestClassifier(n_estimators=100, random_state=42, n_jobs=-1)
rf.fit(X_train, y_train)
y_pred_rf = rf.predict(X_test)

print("\n=== Random Forest (BoW) Results ===")
print("Accuracy: ", accuracy_score(y_test, y_pred_rf))
print("ROC AUC:  ", roc_auc_score(y_test, y_pred_rf))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred_rf))
print(classification_report(y_test, y_pred_rf, target_names=['negative', 'positive']))

# 13) Repeat training & evaluation using TF–IDF features
X_train_tfidf, X_test_tfidf, y_train_tfidf, y_test_tfidf = train_test_split(
    X_tfidf, y, test_size=0.2, random_state=42
)

# --- Gaussian Naive Bayes on TF–IDF ---
gnb_tfidf = GaussianNB()
gnb_tfidf.fit(X_train_tfidf.toarray(), y_train_tfidf)
y_pred_gnb_tfidf = gnb_tfidf.predict(X_test_tfidf.toarray())

print("\n=== Gaussian Naive Bayes (TF–IDF) Results ===")
print("Accuracy: ", accuracy_score(y_test_tfidf, y_pred_gnb_tfidf))
print("ROC AUC:  ", roc_auc_score(y_test_tfidf, y_pred_gnb_tfidf))
print("Confusion Matrix:\n", confusion_matrix(y_test_tfidf, y_pred_gnb_tfidf))
print(classification_report(y_test_tfidf, y_pred_gnb_tfidf, target_names=['negative', 'positive']))

# --- Random Forest on TF–IDF ---
rf_tfidf = RandomForestClassifier(n_estimators=100, random_state=42, n_jobs=-1)
rf_tfidf.fit(X_train_tfidf, y_train_tfidf)
y_pred_rf_tfidf = rf_tfidf.predict(X_test_tfidf)

print("\n=== Random Forest (TF–IDF) Results ===")
print("Accuracy: ", accuracy_score(y_test_tfidf, y_pred_rf_tfidf))
print("ROC AUC:  ", roc_auc_score(y_test_tfidf, y_pred_rf_tfidf))
print("Confusion Matrix:\n", confusion_matrix(y_test_tfidf, y_pred_rf_tfidf))
print(classification_report(y_test_tfidf, y_pred_rf_tfidf, target_names=['negative', 'positive']))

# 14) Interactive prediction example
user_review = input("Please enter a movie review: ")
cleaned_review = preprocess_text(user_review)
vectorized_review = cv.transform([cleaned_review])
prediction = rf.predict(vectorized_review)[0]
print("Predicted sentiment:", 'positive' if prediction == 1 else 'negative')


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
Running on device: cpu
No GPU available; using CPU.


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Dataset shape (rows × columns): (50000, 2)
Sentiment distribution:
 sentiment
positive    25000
negative    25000
Name: count, dtype: int64


100%|██████████| 50000/50000 [02:26<00:00, 342.07it/s]



=== Gaussian Naive Bayes (BoW) Results ===
Accuracy:  0.6882
ROC AUC:   0.6895336312261238
Confusion Matrix:
 [[4269  692]
 [2426 2613]]
              precision    recall  f1-score   support

    negative       0.64      0.86      0.73      4961
    positive       0.79      0.52      0.63      5039

    accuracy                           0.69     10000
   macro avg       0.71      0.69      0.68     10000
weighted avg       0.71      0.69      0.68     10000


=== Random Forest (BoW) Results ===
Accuracy:  0.8464
ROC AUC:   0.8464093755464082
Confusion Matrix:
 [[4205  756]
 [ 780 4259]]
              precision    recall  f1-score   support

    negative       0.84      0.85      0.85      4961
    positive       0.85      0.85      0.85      5039

    accuracy                           0.85     10000
   macro avg       0.85      0.85      0.85     10000
weighted avg       0.85      0.85      0.85     10000


=== Gaussian Naive Bayes (TF–IDF) Results ===
Accuracy:  0.7578
ROC AUC:   0