In [1]:
import numpy as np  # Import numpy for linear algebra operations
import pandas as pd  # Import pandas for data processing and CSV file I/O (e.g., pd.read_csv)
import re  # Import regex module for regular expressions

from nltk.corpus import stopwords  # Import stopwords from nltk.corpus to remove common words
from nltk.tokenize import word_tokenize  # Import word_tokenize from nltk.tokenize for tokenizing words
from nltk.stem import SnowballStemmer  # Import SnowballStemmer from nltk.stem for stemming words

from sklearn.feature_extraction.text import CountVectorizer  # Import CountVectorizer from sklearn.feature_extraction.text to convert text data into a matrix of token counts
from sklearn.model_selection import train_test_split  # Import train_test_split from sklearn.model_selection for splitting the dataset into training and testing sets
from sklearn.naive_bayes import GaussianNB, MultinomialNB, BernoulliNB  # Import Naive Bayes classifiers from sklearn.naive_bayes
from sklearn.metrics import accuracy_score  # Import accuracy_score from sklearn.metrics to evaluate the performance of the model

import pickle  # Import pickle for serializing and de-serializing Python objects


In [2]:
data = pd.read_csv('Dataset.csv')
data.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


In [3]:
print(data.shape)

(50000, 2)


In [4]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50000 entries, 0 to 49999
Data columns (total 2 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   review     50000 non-null  object
 1   sentiment  50000 non-null  object
dtypes: object(2)
memory usage: 781.4+ KB


## No null values, Label encode sentiment to 1(positive) and 0(negative)


In [5]:
data.sentiment.value_counts()

positive    25000
negative    25000
Name: sentiment, dtype: int64

In [6]:
data.sentiment.replace('positive', 1, inplace=True)
data.sentiment.replace('negative', 0, inplace=True)
data.head(10)

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,1
1,A wonderful little production. <br /><br />The...,1
2,I thought this was a wonderful way to spend ti...,1
3,Basically there's a family where a little boy ...,0
4,"Petter Mattei's ""Love in the Time of Money"" is...",1
5,"Probably my all-time favorite movie, a story o...",1
6,I sure would like to see a resurrection of a u...,1
7,"This show was an amazing, fresh & innovative i...",0
8,Encouraged by the positive comments about this...,0
9,If you like original gut wrenching laughter yo...,1


In [7]:
data.review[0]

"One of the other reviewers has mentioned that after watching just 1 Oz episode you'll be hooked. They are right, as this is exactly what happened with me.<br /><br />The first thing that struck me about Oz was its brutality and unflinching scenes of violence, which set in right from the word GO. Trust me, this is not a show for the faint hearted or timid. This show pulls no punches with regards to drugs, sex or violence. Its is hardcore, in the classic use of the word.<br /><br />It is called OZ as that is the nickname given to the Oswald Maximum Security State Penitentary. It focuses mainly on Emerald City, an experimental section of the prison where all the cells have glass fronts and face inwards, so privacy is not high on the agenda. Em City is home to many..Aryans, Muslims, gangstas, Latinos, Christians, Italians, Irish and more....so scuffles, death stares, dodgy dealings and shady agreements are never far away.<br /><br />I would say the main appeal of the show is due to the fa

## STEPS TO CLEAN THE REVIEWS 

## 1. Remove HTML tags


In [8]:
# Regex rule : ‘<.*?>’

def clean(text):
    # Compile a regular expression pattern to match HTML tags
    cleaned = re.compile(r'<.*?>')
    # Substitute the matched HTML tags with an empty string
    return re.sub(cleaned, '', text)

# Apply the clean function to the 'review' column of the data DataFrame
data.review = data.review.apply(clean)
# Display the first cleaned review
data.review[0]

"One of the other reviewers has mentioned that after watching just 1 Oz episode you'll be hooked. They are right, as this is exactly what happened with me.The first thing that struck me about Oz was its brutality and unflinching scenes of violence, which set in right from the word GO. Trust me, this is not a show for the faint hearted or timid. This show pulls no punches with regards to drugs, sex or violence. Its is hardcore, in the classic use of the word.It is called OZ as that is the nickname given to the Oswald Maximum Security State Penitentary. It focuses mainly on Emerald City, an experimental section of the prison where all the cells have glass fronts and face inwards, so privacy is not high on the agenda. Em City is home to many..Aryans, Muslims, gangstas, Latinos, Christians, Italians, Irish and more....so scuffles, death stares, dodgy dealings and shady agreements are never far away.I would say the main appeal of the show is due to the fact that it goes where other shows wo

## 2. Remove special characters

In [9]:
def is_special(text):
    # Initialize an empty string to store the cleaned text
    rem = ''
    # Iterate through each character in the input text
    for i in text:
        # Check if the character is alphanumeric
        if i.isalnum():
            rem = rem + i  # If yes, append it to the rem string
        else:
            rem = rem + ' '  # If no, replace it with a space and append to the rem string
    return rem  # Return the cleaned text

# Apply the is_special function to the 'review' column of the data DataFrame
data.review = data.review.apply(is_special)
# Display the first cleaned review
data.review[0]


'One of the other reviewers has mentioned that after watching just 1 Oz episode you ll be hooked  They are right  as this is exactly what happened with me The first thing that struck me about Oz was its brutality and unflinching scenes of violence  which set in right from the word GO  Trust me  this is not a show for the faint hearted or timid  This show pulls no punches with regards to drugs  sex or violence  Its is hardcore  in the classic use of the word It is called OZ as that is the nickname given to the Oswald Maximum Security State Penitentary  It focuses mainly on Emerald City  an experimental section of the prison where all the cells have glass fronts and face inwards  so privacy is not high on the agenda  Em City is home to many  Aryans  Muslims  gangstas  Latinos  Christians  Italians  Irish and more    so scuffles  death stares  dodgy dealings and shady agreements are never far away I would say the main appeal of the show is due to the fact that it goes where other shows wo

## 3. Convert everything to lowercase


In [10]:
def to_lower(text):
    return text.lower()

data.review = data.review.apply(to_lower)
data.review[0]

'one of the other reviewers has mentioned that after watching just 1 oz episode you ll be hooked  they are right  as this is exactly what happened with me the first thing that struck me about oz was its brutality and unflinching scenes of violence  which set in right from the word go  trust me  this is not a show for the faint hearted or timid  this show pulls no punches with regards to drugs  sex or violence  its is hardcore  in the classic use of the word it is called oz as that is the nickname given to the oswald maximum security state penitentary  it focuses mainly on emerald city  an experimental section of the prison where all the cells have glass fronts and face inwards  so privacy is not high on the agenda  em city is home to many  aryans  muslims  gangstas  latinos  christians  italians  irish and more    so scuffles  death stares  dodgy dealings and shady agreements are never far away i would say the main appeal of the show is due to the fact that it goes where other shows wo

## 4. Remove stopwords


In [11]:
import nltk  # Import nltk to download necessary data
nltk.download('punkt')  # Download the punkt tokenizer data
nltk.download('stopwords')  # Download the stopwords data

def rem_stopwords(text):
    stop_words = set(stopwords.words('english'))  # Get the list of stopwords
    words = word_tokenize(text)  # Tokenize the input text into words
    return [w for w in words if w not in stop_words]  # Remove stopwords from the tokenized words

# Apply the rem_stopwords function to the 'review' column of the data DataFrame
data.review = data.review.apply(rem_stopwords)
# Display the first cleaned review without stopwords
data.review[0]


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\user\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\user\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


['one',
 'reviewers',
 'mentioned',
 'watching',
 '1',
 'oz',
 'episode',
 'hooked',
 'right',
 'exactly',
 'happened',
 'first',
 'thing',
 'struck',
 'oz',
 'brutality',
 'unflinching',
 'scenes',
 'violence',
 'set',
 'right',
 'word',
 'go',
 'trust',
 'show',
 'faint',
 'hearted',
 'timid',
 'show',
 'pulls',
 'punches',
 'regards',
 'drugs',
 'sex',
 'violence',
 'hardcore',
 'classic',
 'use',
 'word',
 'called',
 'oz',
 'nickname',
 'given',
 'oswald',
 'maximum',
 'security',
 'state',
 'penitentary',
 'focuses',
 'mainly',
 'emerald',
 'city',
 'experimental',
 'section',
 'prison',
 'cells',
 'glass',
 'fronts',
 'face',
 'inwards',
 'privacy',
 'high',
 'agenda',
 'em',
 'city',
 'home',
 'many',
 'aryans',
 'muslims',
 'gangstas',
 'latinos',
 'christians',
 'italians',
 'irish',
 'scuffles',
 'death',
 'stares',
 'dodgy',
 'dealings',
 'shady',
 'agreements',
 'never',
 'far',
 'away',
 'would',
 'say',
 'main',
 'appeal',
 'show',
 'due',
 'fact',
 'goes',
 'shows',
 'da

## 5. Stem the words

In [12]:
def stem_txt(text):
    # Initialize the Snowball Stemmer for English
    ss = SnowballStemmer('english')
    # Join the text list into a string and apply stemming to each word
    return " ".join([ss.stem(w) for w in text])

# Apply the stem_txt function to the 'review' column of the data DataFrame
data.review = data.review.apply(stem_txt)
# Display the first stemmed review
data.review[0]

'one review mention watch 1 oz episod hook right exact happen first thing struck oz brutal unflinch scene violenc set right word go trust show faint heart timid show pull punch regard drug sex violenc hardcor classic use word call oz nicknam given oswald maximum secur state penitentari focus main emerald citi experiment section prison cell glass front face inward privaci high agenda em citi home mani aryan muslim gangsta latino christian italian irish scuffl death stare dodgi deal shadi agreement never far away would say main appeal show due fact goe show dare forget pretti pictur paint mainstream audienc forget charm forget romanc oz mess around first episod ever saw struck nasti surreal say readi watch develop tast oz got accustom high level graphic violenc violenc injustic crook guard sold nickel inmat kill order get away well manner middl class inmat turn prison bitch due lack street skill prison experi watch oz may becom comfort uncomfort view that get touch darker side'

In [13]:
data.head()

Unnamed: 0,review,sentiment
0,one review mention watch 1 oz episod hook righ...,1
1,wonder littl product film techniqu unassum old...,1
2,thought wonder way spend time hot summer weeke...,1
3,basic famili littl boy jake think zombi closet...,0
4,petter mattei love time money visual stun film...,1


## CREATING THE MODEL


## 1. Creating Bag Of Words (BOW)


In [14]:
X = np.array(data.iloc[:,0].values) # Convert the first column of the data DataFrame to a numpy array
y = np.array(data.sentiment.values) # Convert the 'sentiment' column of the data DataFrame to a numpy array

cv = CountVectorizer(max_features=1000) # Initialize CountVectorizer to convert text data to a matrix of token counts
X = cv.fit_transform(data.review).toarray() # Fit the CountVectorizer to the 'review' column and transform it into an array

print("X.shape = ", X.shape) 
print("y.shape = ", y.shape)


X.shape =  (50000, 1000)
y.shape =  (50000,)


In [15]:
print(X)

[[0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 1 0]
 ...
 [0 0 0 ... 0 0 0]
 [0 0 1 ... 0 0 0]
 [0 0 0 ... 0 0 0]]


## 2. Train test split


In [16]:
trainx,testx,trainy,testy = train_test_split(X,y,test_size=0.3,random_state=9)

## 3. Defining the models and Training them


In [17]:
gnb = GaussianNB()
mnb = MultinomialNB(alpha=1.0,fit_prior=True)
bnb = BernoulliNB(alpha=1.0,fit_prior=True)
gnb.fit(trainx,trainy)
mnb.fit(trainx,trainy)
bnb.fit(trainx,trainy)

## 4. Prediction and accuracy metrics to choose best model


In [18]:
ypg = gnb.predict(testx) # Predict using the Gaussian Naive Bayes classifier
ypm = mnb.predict(testx) # Predict using the Multinomial Naive Bayes classifier
ypb = bnb.predict(testx) # Predict using the Bernoulli Naive Bayes classifier

print("Gaussian = ", accuracy_score(testy, ypg))
print("Multinomial = ", accuracy_score(testy, ypm))
print("Bernoulli = ", accuracy_score(testy, ypb))

Gaussian =  0.7837333333333333
Multinomial =  0.8292
Bernoulli =  0.8358666666666666
