In [1]:
import pandas as pd 
import numpy as np 
import re
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.feature_selection import SelectKBest, chi2
from sklearn.naive_bayes import GaussianNB
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report
from imblearn.over_sampling import SMOTE
from joblib import dump, load

In [2]:
df = pd.read_csv("IMDB Dataset.csv")

In [3]:
df

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive
...,...,...
49995,I thought this movie did a down right good job...,positive
49996,"Bad plot, bad dialogue, bad acting, idiotic di...",negative
49997,I am a Catholic taught in parochial elementary...,negative
49998,I'm going to have to disagree with the previou...,negative


In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50000 entries, 0 to 49999
Data columns (total 2 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   review     50000 non-null  object
 1   sentiment  50000 non-null  object
dtypes: object(2)
memory usage: 781.4+ KB


In [5]:
# Set random seed for reproducibility
np.random.seed(42)
random_rows = df.sample(n=10)
random_rows

Unnamed: 0,review,sentiment
33553,I really liked this Summerslam due to the look...,positive
9427,Not many television shows appeal to quite as m...,positive
199,The film quickly gets to a major chase scene w...,negative
12447,Jane Austen would definitely approve of this o...,positive
39489,Expectations were somewhat high for me when I ...,negative
42724,I've watched this movie on a fairly regular ba...,positive
10822,For once a story of hope highlighted over the ...,positive
49498,"Okay, I didn't get the Purgatory thing the fir...",positive
4144,I was very disappointed with this series. It h...,negative
36958,The first 30 minutes of Tinseltown had my fing...,negative


In [6]:
df['sentiment']=df['sentiment'].apply(lambda x:1 if x=='positive' else 0)

## Removing < br/> tags

In [7]:
# Remove the "\n" characters from the text column
df['review'] = df['review'].str.replace('<br /><br />', ' ')

In [8]:
# Initialize WordNet lemmatizer and stopwords
lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words('english'))

In [9]:
def preprocess_text(text):
    # Remove special characters and numbers
    text = re.sub(r'[^a-zA-Z\s]', '', text)
    # Tokenization
    tokens = word_tokenize(text)
    # Lowercasing and removing stopwords
    tokens = [token.lower() for token in tokens if token.lower() not in stop_words]
    # Lemmatization
    tokens = [lemmatizer.lemmatize(token) for token in tokens]
    # Join tokens back into a string
    preprocessed_text = ' '.join(tokens)
    return preprocessed_text

In [10]:
df['review'] = df['review'].apply(lambda text : preprocess_text(text))

## CountVectorizer VS TfidfVectorizer

### CountVectorizer

In [11]:
# Assuming 'X' contains the text data and 'y' contains the labels (positive/negative)
X_train, X_test, y_train, y_test = train_test_split(df['review'], df['sentiment'], test_size=0.3, random_state=42)

In [12]:
# Convert text data into a matrix of token counts
vectorizer = CountVectorizer()
X_train_counts = vectorizer.fit_transform(X_train)

In [13]:
# Select the top k features based on chi-square test
k = 5000  # Number of top features to select
selector = SelectKBest(score_func=chi2, k=k)
X_train_selected = selector.fit_transform(X_train_counts, y_train)

In [14]:
# Scale the selected features using StandardScaler
scaler = StandardScaler(with_mean=False)
X_train_scaled = scaler.fit_transform(X_train_selected)

In [15]:
X_train_scaled.todense()

matrix([[0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        ...,
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.]])

In [16]:
# Transform the test data using the same vectorizer, selector, and scaler
X_test_counts = vectorizer.transform(X_test)
X_test_selected = selector.transform(X_test_counts)
X_test_scaled = scaler.transform(X_test_selected)

In [17]:
X_test_scaled.todense()

matrix([[0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        ...,
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.]])

In [18]:
# Convert sparse matrix to dense array
X_train_dense = X_train_scaled.toarray()

In [19]:
sm = SMOTE(random_state=2)

print("\nClass 1 before Over Sampling --> ", sum(y_train == 1))
print("\nClass 0 before Over Sampling --> ", sum(y_train == 0))

X_train_dense, y_train = sm.fit_resample(X_train_dense, y_train)

print("\nThe shape of X after Over Sampling -->", X_train_dense.shape)
print("\nThe shape of Y after Over Sampling -->", y_train.shape)

print("\nClass 1 after Over Sampling --> ", sum(y_train == 1))
print("\nClass 0 after Over Sampling --> ", sum(y_train == 0))
print("\n")


Class 1 before Over Sampling -->  17411

Class 0 before Over Sampling -->  17589

The shape of X after Over Sampling --> (35178, 5000)

The shape of Y after Over Sampling --> (35178,)

Class 1 after Over Sampling -->  17589

Class 0 after Over Sampling -->  17589




In [20]:
# Train Naive Bayes classifier
nb_classifier = GaussianNB()
nb_classifier.fit(X_train_dense, y_train)

In [21]:
# Convert sparse matrix to dense array for X_test
X_test_dense = X_test_selected.toarray()

# Make predictions using Naive Bayes classifier
y_pred = nb_classifier.predict(X_test_dense)

In [22]:
report = classification_report(y_test, y_pred)
print("Classification Report:\n", report)

Classification Report:
               precision    recall  f1-score   support

           0       0.52      0.98      0.68      7411
           1       0.86      0.10      0.19      7589

    accuracy                           0.54     15000
   macro avg       0.69      0.54      0.43     15000
weighted avg       0.69      0.54      0.43     15000



## TfidfVectorizer

In [23]:
# Assuming 'X' contains the text data and 'y' contains the labels (positive/negative)
X_train, X_test, y_train, y_test = train_test_split(df['review'], df['sentiment'], test_size=0.3, random_state=42)

In [24]:
# Convert text data into a matrix of token counts
vectorizer = TfidfVectorizer()
X_train_counts = vectorizer.fit_transform(X_train)

In [25]:
# Select the top k features based on chi-square test
k = 5000  # Number of top features to select
selector = SelectKBest(score_func=chi2, k=k)
X_train_selected = selector.fit_transform(X_train_counts, y_train)

In [26]:
# Scale the selected features using StandardScaler
scaler = StandardScaler(with_mean=False)
X_train_scaled = scaler.fit_transform(X_train_selected)

In [27]:
X_train_scaled.todense()

matrix([[0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        ...,
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.]])

In [28]:
# Transform the test data using the same vectorizer, selector, and scaler
X_test_counts = vectorizer.transform(X_test)
X_test_selected = selector.transform(X_test_counts)
X_test_scaled = scaler.transform(X_test_selected)

In [32]:
# Convert sparse matrix to dense array
X_train_dense = X_train_scaled.toarray()

In [33]:
sm = SMOTE(random_state=2)

print("\nClass 1 before Over Sampling --> ", sum(y_train == 1))
print("\nClass 0 before Over Sampling --> ", sum(y_train == 0))

X_train_dense, y_train = sm.fit_resample(X_train_dense, y_train)

print("\nThe shape of X after Over Sampling -->", X_train_dense.shape)
print("\nThe shape of Y after Over Sampling -->", y_train.shape)

print("\nClass 1 after Over Sampling --> ", sum(y_train == 1))
print("\nClass 0 after Over Sampling --> ", sum(y_train == 0))
print("\n")


Class 1 before Over Sampling -->  17411

Class 0 before Over Sampling -->  17589

The shape of X after Over Sampling --> (35178, 5000)

The shape of Y after Over Sampling --> (35178,)

Class 1 after Over Sampling -->  17589

Class 0 after Over Sampling -->  17589




In [34]:
# Train Naive Bayes classifier
nb_classifier = GaussianNB()
nb_classifier.fit(X_train_dense, y_train)

In [35]:
# Convert sparse matrix to dense array for X_test
X_test_dense = X_test_selected.toarray()

# Make predictions using Naive Bayes classifier
y_pred = nb_classifier.predict(X_test_dense)

In [36]:
report = classification_report(y_test, y_pred)
print("Classification Report:\n", report)

Classification Report:
               precision    recall  f1-score   support

           0       0.52      0.98      0.68      7411
           1       0.84      0.12      0.20      7589

    accuracy                           0.54     15000
   macro avg       0.68      0.55      0.44     15000
weighted avg       0.68      0.54      0.44     15000



### The last model performs less worse. so I will save it 

In [37]:
# Save the trained model to a file
dump(nb_classifier, 'naive_bayes_model.joblib')

['naive_bayes_model.joblib']

## Creating a function that takes the comment and returns the sentiment 

In [46]:
def sentimentAnalysis(text ,vectorizer,selector, scaler): 
    text = preprocess_text(text)
    # Convert text data into a matrix of token counts
    text = vectorizer.transform([text])
    text = selector.transform(text)
    # Scale the selected features using StandardScaler
    text = scaler.transform(text)
    # Convert sparse matrix to dense array
    text = text.toarray()
    # Load the saved model from file
    model = load('naive_bayes_model.joblib')
    return model.predict(text)
    

### now let's use it 

In [51]:
text = "This movie was trash. I didn't like it at all"
sentimentAnalysis(text, vectorizer,selector,scaler )

array([0], dtype=int64)

## The task is done 