In [3]:
# 📌 Cell 1: Import Libraries
import os
import numpy as np
import matplotlib.pyplot as plt
from sklearn.datasets import load_files
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score


In [4]:
# 📌 Cell 2: Load IMDb Dataset
# Update path if needed
dataset_path = 'aclImdb/train'  # <-- Change this if it's in a different location

data = load_files(dataset_path, categories=['pos', 'neg'], encoding='utf-8')
X, y = data.data, data.target

print(f"Loaded {len(X)} reviews: {sum(y==1)} positive, {sum(y==0)} negative")


Loaded 22146 reviews: 11036 positive, 11110 negative


In [5]:
# 📌 Cell 3: Vectorize with TF-IDF
vectorizer = TfidfVectorizer(max_features=5000, stop_words='english')
X_vect = vectorizer.fit_transform(X)

# Split into training and validation set
X_train, X_val, y_train, y_val = train_test_split(X_vect, y, test_size=0.2, random_state=42)


In [6]:
# 📌 Cell 4: Train Model (Logistic Regression)
model = LogisticRegression(max_iter=1000)
model.fit(X_train, y_train)

y_pred = model.predict(X_val)


In [7]:
# 📌 Cell 5: Evaluate Model
print("✅ Accuracy:", accuracy_score(y_val, y_pred))
print("\n📋 Classification Report:\n", classification_report(y_val, y_pred))
print("\n🧮 Confusion Matrix:\n", confusion_matrix(y_val, y_pred))


✅ Accuracy: 0.8848758465011287

📋 Classification Report:
               precision    recall  f1-score   support

           0       0.90      0.87      0.89      2268
           1       0.87      0.90      0.88      2162

    accuracy                           0.88      4430
   macro avg       0.89      0.89      0.88      4430
weighted avg       0.89      0.88      0.88      4430


🧮 Confusion Matrix:
 [[1973  295]
 [ 215 1947]]


In [8]:
# 📌 Cell 6: Top Predictive Words
feature_names = vectorizer.get_feature_names_out()
coeffs = model.coef_[0]

# Top 10 words for each class
top_pos = np.argsort(coeffs)[-10:]
top_neg = np.argsort(coeffs)[:10]

print("🔝 Top Positive Words:")
print([feature_names[i] for i in top_pos])

print("\n🔻 Top Negative Words:")
print([feature_names[i] for i in top_neg])


🔝 Top Positive Words:
['superb', 'loved', 'fun', 'amazing', 'favorite', 'wonderful', 'perfect', 'best', 'excellent', 'great']

🔻 Top Negative Words:
['worst', 'bad', 'waste', 'awful', 'boring', 'poor', 'worse', 'terrible', 'dull', 'unfortunately']


In [9]:
test_data = load_files('aclImdb/test', categories=['pos', 'neg'], encoding='utf-8')
X_test = vectorizer.transform(test_data.data)
y_test = test_data.target

y_test_pred = model.predict(X_test)
print("Test Accuracy:", accuracy_score(y_test, y_test_pred))


Test Accuracy: 0.8731923312449951
