In [3]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [6]:
df = pd.read_csv('IMDB Dataset.csv')
print(df)

                                                  review sentiment
0      One of the other reviewers has mentioned that ...  positive
1      A wonderful little production. <br /><br />The...  positive
2      I thought this was a wonderful way to spend ti...  positive
3      Basically there's a family where a little boy ...  negative
4      Petter Mattei's "Love in the Time of Money" is...  positive
...                                                  ...       ...
49995  I thought this movie did a down right good job...  positive
49996  Bad plot, bad dialogue, bad acting, idiotic di...  negative
49997  I am a Catholic taught in parochial elementary...  negative
49998  I'm going to have to disagree with the previou...  negative
49999  No one expects the Star Trek movies to be high...  negative

[50000 rows x 2 columns]


In [7]:
print(df.head())

                                              review sentiment
0  One of the other reviewers has mentioned that ...  positive
1  A wonderful little production. <br /><br />The...  positive
2  I thought this was a wonderful way to spend ti...  positive
3  Basically there's a family where a little boy ...  negative
4  Petter Mattei's "Love in the Time of Money" is...  positive


In [None]:
%pip install nltk



In [8]:
import nltk
import string
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

In [9]:
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\User\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\stopwords.zip.
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\User\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\User\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [10]:
lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words('english'))

In [11]:
def preprocess_text(text):
    # Remove punctuation and lowercase
    import re
    text = re.sub(r'[^\w\s]', '', text).lower()
    # Tokenize and remove stop words
    words = [lemmatizer.lemmatize(word) for word in text.split() if word not in stop_words]
    return ' '.join(words)

# Apply preprocessing
df['cleaned_review'] = df['review'].apply(preprocess_text)

In [12]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Initialize TF-IDF Vectorizer
tfidf = TfidfVectorizer(max_features=5000)

# Fit and transform the data
X = tfidf.fit_transform(df['cleaned_review']).toarray()

# Encode the labels
y = df['sentiment'].apply(lambda x: 1 if x == 'positive' else 0).values

In [13]:
from sklearn.model_selection import train_test_split

# Split the data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [14]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# Initialize and train the model
model = LogisticRegression()
model.fit(X_train, y_train)

# Evaluate the model
y_pred = model.predict(X_test)
print(f"Accuracy: {accuracy_score(y_test, y_pred)}")
print(f"Precision: {precision_score(y_test, y_pred)}")
print(f"Recall: {recall_score(y_test, y_pred)}")
print(f"F1 Score: {f1_score(y_test, y_pred)}")

Accuracy: 0.8873
Precision: 0.8780440664862775
Recall: 0.901567771383211
F1 Score: 0.8896504455106237


## Feature Engineering for Naive Bayes Sentiment Analysis
#### Objectives
- Implement text vectorization for Naive Bayes
- Calculate and interpret cosine similarity
- Prepare features for model training
- Document findings for technical report

In [1]:
# Set Up All necessary libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path
import json
import pickle

# Scikit-learn imports
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.model_selection import train_test_split

# For sparse matrix operations
import scipy.sparse as sp

# Set random seed for reproducibility
np.random.seed(42)

In [15]:
# Verify we have the preprocessed data
print("üìä Checking preprocessed data...")
print(f"DataFrame shape: {df.shape}")
print(f"Columns: {list(df.columns)}")
print(f"First cleaned review: {df['cleaned_review'].iloc[0][:100]}...")

# Check sentiment distribution
print(f"\nSentiment distribution:")
print(df['sentiment'].value_counts())

üìä Checking preprocessed data...
DataFrame shape: (50000, 3)
Columns: ['review', 'sentiment', 'cleaned_review']
First cleaned review: one reviewer mentioned watching 1 oz episode youll hooked right exactly happened mebr br first thing...

Sentiment distribution:
sentiment
positive    25000
negative    25000
Name: count, dtype: int64


In [16]:
print("üéØ Splitting data into 80% train, 20% test...")

# Split features and target
X = df['cleaned_review']
y = df['sentiment'].map({'positive': 1, 'negative': 0})

# Perform stratified split
X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.2,
    random_state=42,
    stratify=y,
    shuffle=True
)

print(f"‚úÖ Split complete!")
print(f"Training samples: {len(X_train)}")
print(f"Testing samples: {len(X_test)}")

# Save the splits
train_df = pd.DataFrame({'text': X_train, 'sentiment': y_train})
test_df = pd.DataFrame({'text': X_test, 'sentiment': y_test})
train_df.to_csv('train_split.csv', index=False)
test_df.to_csv('test_split.csv', index=False)

üéØ Splitting data into 80% train, 20% test...
‚úÖ Split complete!
Training samples: 40000
Testing samples: 10000


In [17]:
print("üîß Creating Bag-of-Words features...")

bow_vectorizer = CountVectorizer(
    max_features=5000,
    min_df=2,
    max_df=0.8,
    stop_words='english',
    ngram_range=(1, 1)
)

X_train_bow = bow_vectorizer.fit_transform(X_train)
X_test_bow = bow_vectorizer.transform(X_test)

print(f"‚úÖ BOW features created")
print(f"   Vocabulary size: {len(bow_vectorizer.get_feature_names_out())}")

üîß Creating Bag-of-Words features...
‚úÖ BOW features created
   Vocabulary size: 5000


In [18]:
print("üìê Calculating cosine similarity...")

bow_vocab = bow_vectorizer.get_feature_names_out()
X_train_bow_matrix = X_train_bow

# Select sentiment words
sentiment_words = ['good', 'bad', 'great', 'terrible', 'excellent', 'awful', 'love', 'hate']
available_words = [word for word in sentiment_words if word in bow_vocab]

print(f"Found {len(available_words)} sentiment words in vocabulary")

if len(available_words) >= 2:
    # Get indices and vectors
    word_indices = [np.where(bow_vocab == word)[0][0] for word in available_words]
    word_vectors = X_train_bow_matrix[:, word_indices].T.toarray()
    
    # Calculate cosine similarity
    similarity_matrix = cosine_similarity(word_vectors)
    
    # Save results
    similarity_df = pd.DataFrame(
        similarity_matrix,
        index=available_words,
        columns=available_words
    )
    similarity_df.to_csv('cosine_similarity_results.csv')
    print(f"‚úÖ Saved results to cosine_similarity_results.csv")
else:
    print("‚ö†Ô∏è Not enough sentiment words found")

üìê Calculating cosine similarity...
Found 8 sentiment words in vocabulary
‚úÖ Saved results to cosine_similarity_results.csv


In [19]:
print("üíæ Saving features...")

output_dir = Path('feature_engineering_output')
output_dir.mkdir(exist_ok=True)

# Save BOW features
sp.save_npz(output_dir / 'bow_X_train.npz', X_train_bow)
sp.save_npz(output_dir / 'bow_X_test.npz', X_test_bow)

# Save vectorizer
with open(output_dir / 'bow_vectorizer.pkl', 'wb') as f:
    pickle.dump(bow_vectorizer, f)

# Save labels
np.save(output_dir / 'y_train.npy', y_train.values)
np.save(output_dir / 'y_test.npy', y_test.values)

print("‚úÖ All features saved!")

üíæ Saving features...
‚úÖ All features saved!
