In [36]:
# Step 1: Import Required Libraries

import pandas as pd 
import numpy as np
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.feature_extraction.text import TfidfVectorizer 
from sklearn.model_selection import train_test_split 
from sklearn.linear_model import LogisticRegression 
from sklearn.metrics import accuracy_score, classification_report 

In [42]:
# Step 2: Load Data

import nltk
from nltk.corpus import movie_reviews

# Load the reviews into a list of dictionaries
reviews_data = []

# 'neg' (negative) and 'pos' (positive) categories
for category in movie_reviews.categories():
    for fileid in movie_reviews.fileids(category):
        # Extract the raw text and the category
        reviews_data.append({
            'review': movie_reviews.raw(fileid),
            'sentiment': category # 'neg' or 'pos'
        })

# Define the DataFrame 'df'
df = pd.DataFrame(reviews_data)

print(df.head())

                                              review sentiment
0  plot : two teen couples go to a church party ,...       neg
1  the happy bastard's quick movie review \ndamn ...       neg
2  it is movies like these that make a jaded movi...       neg
3   " quest for camelot " is warner bros . ' firs...       neg
4  synopsis : a mentally unstable man undergoing ...       neg


In [44]:
# Step 2: Preprocess Data

import re
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

def preprocess_text(text):
    # 1. Convert the text to lowercase
    text = text.lower()
    
    # Remove HTML tags 
    text = re.sub(r'<[^>]+>', '', text)
    
    # 2. Remove punctuation and characters that are NOT letters (a-z) or numbers (0-9)
    text = re.sub(r'[^a-z0-9\s]', ' ', text)
    
    # 3. Tokenize the text (split into words)
    tokens = word_tokenize(text)
    
    # 2. Remove stop words (Filter out common words)
    stop_words = set(stopwords.words('english'))
    tokens = [word for word in tokens if word not in stop_words]
    
    return " ".join(tokens)

# Apply preprocessing to dataset
df['processed_text'] = df['review'].apply(preprocess_text)

print("\nFirst 3 rows with original and processed text:")
print(df[['review', 'processed_text', 'sentiment']].head(3))


First 3 rows with original and processed text:
                                              review  \
0  plot : two teen couples go to a church party ,...   
1  the happy bastard's quick movie review \ndamn ...   
2  it is movies like these that make a jaded movi...   

                                      processed_text sentiment  
0  plot two teen couples go church party drink dr...       neg  
1  happy bastard quick movie review damn y2k bug ...       neg  
2  movies like make jaded movie viewer thankful i...       neg  


In [45]:
# Step 3: Convert Text Data into Numerical Features

#1a. X will be the feature (processed text)
X = df['processed_text']

#1b. y will be the sentiment. Convert 'neg'/'pos' labels into 0/1 for the model
y = df['sentiment'].apply(lambda x: 1 if x == 'pos' else 0)

#2. Initialize the TF-IDF Vectorizer and set the maximum number of features to 5000 as specified
tfidf_vectorizer = TfidfVectorizer(max_features=5000)

#3. Fit and Transform the data
X_features = tfidf_vectorizer.fit_transform(X)

print(f"Shape of the feature matrix (X_features): {X_features.shape}")
print(f"This represents {X_features.shape[0]} documents and {X_features.shape[1]} features (words).")

Shape of the feature matrix (X_features): (2000, 5000)
This represents 2000 documents and 5000 features (words).


In [46]:
# Step 4: Split Data into Training and Testing Sets

# test_size=0.3 ensures a 70% train / 30% test split
# random_state=42 ensures the split is the same every time you run the code
X_train, X_test, y_train, y_test = train_test_split(X_features, y, test_size=0.3, random_state=42,stratify=y)

print(f"Training set size (70%): {X_train.shape[0]} samples")
print(f"Testing set size (30%): {X_test.shape[0]} samples")

Training set size (70%): 1400 samples
Testing set size (30%): 600 samples


In [47]:
# A dictionary to store the results
results = {}
models = {
    "Multinomial Naive Bayes (MNB)": MultinomialNB(),
    "Logistic Regression (LR)": LogisticRegression(solver='liblinear', random_state=42),
    "Support Vector Machine (SVM)": SVC(kernel='linear', random_state=42), 
    "K-Nearest Neighbors (KNN)": KNeighborsClassifier(n_neighbors=5) 
}

print("--- Model Training and Evaluation ---")

for name, model in models.items():
    print(f"\nTraining and Evaluating: {name}")
    
    # 1. Train the model
    model.fit(X_train, y_train)
    
    # 2. Predict on the test set
    y_pred = model.predict(X_test)
    
    # 3. Compute and record accuracy
    accuracy = accuracy_score(y_test, y_pred)
    
    # 4. Generate classification report
    report = classification_report(y_test, y_pred, target_names=['Negative (0)', 'Positive (1)'])
    
    # Store results
    results[name] = {
        'Accuracy': accuracy,
        'Report': report
    }
    
    # Print immediate results
    print(f"Accuracy: {accuracy:.4f}")
    print("Classification Report:")
    print(report)

    print("--- Summary of Model Accuracies ---")
accuracy_summary = {name: data['Accuracy'] for name, data in results.items()}
summary_df = pd.DataFrame(accuracy_summary.items(), columns=['Model', 'Accuracy'])
summary_df['Accuracy'] = summary_df['Accuracy'].map('{:.4f}'.format)
print(summary_df)

--- Model Training and Evaluation ---

Training and Evaluating: Multinomial Naive Bayes (MNB)
Accuracy: 0.8250
Classification Report:
              precision    recall  f1-score   support

Negative (0)       0.81      0.85      0.83       300
Positive (1)       0.84      0.80      0.82       300

    accuracy                           0.82       600
   macro avg       0.83      0.82      0.82       600
weighted avg       0.83      0.82      0.82       600

--- Summary of Model Accuracies ---

Training and Evaluating: Logistic Regression (LR)
Accuracy: 0.8350
Classification Report:
              precision    recall  f1-score   support

Negative (0)       0.85      0.81      0.83       300
Positive (1)       0.82      0.86      0.84       300

    accuracy                           0.83       600
   macro avg       0.84      0.83      0.83       600
weighted avg       0.84      0.83      0.83       600

--- Summary of Model Accuracies ---

Training and Evaluating: Support Vector Machine 