### Problem Statement

In [3]:
# The goal is to estimate the sentiment many movie reviews from the Internet Movie Database (IMDb).

### Dataset Details

In [1]:
#  Based on the content of the reviews, dataset contains 50,000 movie reviews that have been pre-labeled with 'positive' and 
# 'negative' sentiment class labels."""

### Importing Libraries

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
import re                                                     # used to remove html tags
import nltk                                                   # natural language tool kit
from nltk.corpus import stopwords                             # used to remove stop words
from nltk.stem.porter import PorterStemmer                    # stemming(Remove the affixes,so that left with stem of that word)
from sklearn.feature_extraction.text import CountVectorizer
import warnings
warnings.filterwarnings("ignore")
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB,MultinomialNB,BernoulliNB
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score

### Importing the Data

In [3]:
df=pd.read_csv(".\Datasets\IMDB Dataset.csv")
df.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


In [7]:
df['review'][0]

"One of the other reviewers has mentioned that after watching just 1 Oz episode you'll be hooked. They are right, as this is exactly what happened with me.<br /><br />The first thing that struck me about Oz was its brutality and unflinching scenes of violence, which set in right from the word GO. Trust me, this is not a show for the faint hearted or timid. This show pulls no punches with regards to drugs, sex or violence. Its is hardcore, in the classic use of the word.<br /><br />It is called OZ as that is the nickname given to the Oswald Maximum Security State Penitentary. It focuses mainly on Emerald City, an experimental section of the prison where all the cells have glass fronts and face inwards, so privacy is not high on the agenda. Em City is home to many..Aryans, Muslims, gangstas, Latinos, Christians, Italians, Irish and more....so scuffles, death stares, dodgy dealings and shady agreements are never far away.<br /><br />I would say the main appeal of the show is due to the fa

In [5]:
# Sample 10000 rows
df=df.sample(10000)

In [6]:
df.shape

(10000, 2)

In [7]:
df['sentiment'].replace({'positive':1,'negative':0},inplace=True)

In [8]:
df.head()

Unnamed: 0,review,sentiment
49136,I just got done watching The Edge of Love (by ...,0
5939,Something somewhere must have terribly gone wr...,0
2787,Geez! This is one of those movies that you thi...,0
14045,First of all I need to say that I'm Portuguese...,1
11325,"I've seen a lot of crap in my day, but goodnes...",0


In [9]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 10000 entries, 49136 to 18960
Data columns (total 2 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   review     10000 non-null  object
 1   sentiment  10000 non-null  int64 
dtypes: int64(1), object(1)
memory usage: 234.4+ KB


In [10]:
df.describe()

Unnamed: 0,sentiment
count,10000.0
mean,0.5038
std,0.500011
min,0.0
25%,0.0
50%,1.0
75%,1.0
max,1.0


### Text Preprocessing

In [14]:
df.iloc[2].review

"This movie is a great movie, however it is, as most movie highly predictable. The greatest highlight of the movie of course is the star character Amanda Bynes, who is absolutely gorgeous and hilarious. She is one of very few people in this world who can use all 53 muscles in her face to make the most strangest and gut-busting faces ever made. It's good for the kids, and contains upper male nudity and suggestive nudity towards the end. All in all, they did a good job updating an old classic, and deserves to rest on the movie stand along with O and 10 Things I Hate About You. The other actors also do a swell job, in many of their first time debuts."

In [15]:
clean=re.compile('<.*?>')
re.sub(clean,'',df.iloc[2].review)

"This movie is a great movie, however it is, as most movie highly predictable. The greatest highlight of the movie of course is the star character Amanda Bynes, who is absolutely gorgeous and hilarious. She is one of very few people in this world who can use all 53 muscles in her face to make the most strangest and gut-busting faces ever made. It's good for the kids, and contains upper male nudity and suggestive nudity towards the end. All in all, they did a good job updating an old classic, and deserves to rest on the movie stand along with O and 10 Things I Hate About You. The other actors also do a swell job, in many of their first time debuts."

In [16]:
# Function to remove html tags

def clean_html(text):
    clean=re.compile('<.*?>')
    return re.sub(clean,'',text)

In [17]:
df['review']=df['review'].apply(clean_html)

In [18]:
# Remove special characters

def remove_special(text):
    review=''
    for i in text:
        if i.isalnum():
            review=review+i
        else:
            review=review+' '
    return review

In [19]:
remove_special('This is one of my f@vourite books and I remember watching this series and loving it immensely.')

'This is one of my f vourite books and I remember watching this series and loving it immensely '

In [20]:
df['review']=df['review'].apply(remove_special)

In [21]:
# Converting everything to lower case

def convert_lower(text):
    return text.lower()

In [22]:
df['review']=df['review'].apply(convert_lower)

In [23]:
# Removing Stop Words

def remove_stopwords(text):
    x=[]
    for i in text.split():
        if i not in stopwords.words('english'):
            x.append(i)
    y=x[:]
    x.clear()
    return y

In [24]:
stopwords.words('english')

['i',
 'me',
 'my',
 'myself',
 'we',
 'our',
 'ours',
 'ourselves',
 'you',
 "you're",
 "you've",
 "you'll",
 "you'd",
 'your',
 'yours',
 'yourself',
 'yourselves',
 'he',
 'him',
 'his',
 'himself',
 'she',
 "she's",
 'her',
 'hers',
 'herself',
 'it',
 "it's",
 'its',
 'itself',
 'they',
 'them',
 'their',
 'theirs',
 'themselves',
 'what',
 'which',
 'who',
 'whom',
 'this',
 'that',
 "that'll",
 'these',
 'those',
 'am',
 'is',
 'are',
 'was',
 'were',
 'be',
 'been',
 'being',
 'have',
 'has',
 'had',
 'having',
 'do',
 'does',
 'did',
 'doing',
 'a',
 'an',
 'the',
 'and',
 'but',
 'if',
 'or',
 'because',
 'as',
 'until',
 'while',
 'of',
 'at',
 'by',
 'for',
 'with',
 'about',
 'against',
 'between',
 'into',
 'through',
 'during',
 'before',
 'after',
 'above',
 'below',
 'to',
 'from',
 'up',
 'down',
 'in',
 'out',
 'on',
 'off',
 'over',
 'under',
 'again',
 'further',
 'then',
 'once',
 'here',
 'there',
 'when',
 'where',
 'why',
 'how',
 'all',
 'any',
 'both',
 'each

In [25]:
len(stopwords.words('english'))

179

In [26]:
df['review']=df['review'].apply(remove_stopwords)

In [27]:
df

Unnamed: 0,review,sentiment
36715,"[film, fails, capture, mystery, intrigue, book...",0
2849,"[days, writers, directors, producers, relying,...",0
35502,"[movie, great, movie, however, movie, highly, ...",1
28952,"[see, laurel, hardy, purists, might, offended,...",0
28588,"[room, mate, ordered, one, web, back, finally,...",1
...,...,...
8555,"[grew, time, music, movie, popular, wonderful,...",1
18282,"[born, around, time, movie, finished, liberal,...",1
9183,"[watched, film, times, never, really, liked, f...",0
19540,"[acting, movie, sometimes, hard, imagine, cast...",1


In [28]:
# Stemming
ps=PorterStemmer()                  # Algorithm based on a set of heuristics that are used to remove common suffixes from words.

In [29]:
y=[]
def stem_words(text):
    for i in text:
        y.append(ps.stem(i))
    z=y[:]
    y.clear()
    return z

In [30]:
stem_words(['played','playing','plays'])

['play', 'play', 'play']

In [31]:
df['review']=df['review'].apply(stem_words)

In [32]:
df

Unnamed: 0,review,sentiment
36715,"[film, fail, captur, mysteri, intrigu, book, o...",0
2849,"[day, writer, director, produc, reli, surpris,...",0
35502,"[movi, great, movi, howev, movi, highli, predi...",1
28952,"[see, laurel, hardi, purist, might, offend, ra...",0
28588,"[room, mate, order, one, web, back, final, got...",1
...,...,...
8555,"[grew, time, music, movi, popular, wonder, tim...",1
18282,"[born, around, time, movi, finish, liber, open...",1
9183,"[watch, film, time, never, realli, like, fan, ...",0
19540,"[act, movi, sometim, hard, imagin, cast, portr...",1


In [33]:
# Join back

def join_back(list_input):
    return ' '.join(list_input)

In [34]:
df['review']=df['review'].apply(join_back)

In [35]:
df

Unnamed: 0,review,sentiment
36715,film fail captur mysteri intrigu book offer ma...,0
2849,day writer director produc reli surpris end ol...,0
35502,movi great movi howev movi highli predict grea...,1
28952,see laurel hardi purist might offend rather ge...,0
28588,room mate order one web back final got around ...,1
...,...,...
8555,grew time music movi popular wonder time music...,1
18282,born around time movi finish liber open mind h...,1
9183,watch film time never realli like fan term nu ...,0
19540,act movi sometim hard imagin cast portray unbr...,1


### Bags of Word Model

In [36]:
cv=CountVectorizer()

In [37]:
X=cv.fit_transform(df['review']).toarray()

In [38]:
X.shape                                # Having 10000 reviews and 36525 unique words.

(10000, 36294)

In [39]:
X[0].max()                             # Some word has occured 13 times in first review.

7

In [40]:
y=df.iloc[:,-1].values

In [41]:
y

array([0, 0, 1, ..., 0, 1, 0], dtype=int64)

In [42]:
y.shape

(10000,)

### Splitting the Data

In [43]:
X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.2,random_state=42)

In [44]:
print("X_train.shape:",X_train.shape)
print("X_test.shape:",X_test.shape)
print("y_train.shape:",y_train.shape)
print("y_test.shape:",y_test.shape)

X_train.shape: (8000, 36294)
X_test.shape: (2000, 36294)
y_train.shape: (8000,)
y_test.shape: (2000,)


## Building a Machine Learning Model

### Naive Bayes

In [45]:
# GaussianNB

# Model Creation
model1=GaussianNB()

# Model Training
model1.fit(X_train,y_train)

# Predict the result for Test Data(X_test)
y_pred1=model1.predict(X_test)

# Calculate Model Accuracy
accuracy1=accuracy_score(y_test,y_pred1)
print("Accuracy of GaussianNB:",accuracy1)

Accuracy of GaussianNB: 0.626


In [46]:
# MultinomialNB

# Model Creation
model2=MultinomialNB()

# Model Training
model2.fit(X_train,y_train)

# Predict the result for Test Data(X_test)
y_pred2=model2.predict(X_test)

# Calculate Model Accuracy
accuracy2=accuracy_score(y_test,y_pred2)
print("Accuracy of MultinomialNB:",accuracy2)

Accuracy of MultinomialNB: 0.8455


In [47]:
# BernoulliNB

# Model Creation
model3=BernoulliNB()

# Model Training
model3.fit(X_train,y_train)

# Predict the result for Test Data(X_test)
y_pred3=model3.predict(X_test)

# Calculate Model Accuracy
accuracy3=accuracy_score(y_test,y_pred3)
print("Accuracy of BernoulliNB:",accuracy3)

Accuracy of BernoulliNB: 0.8365


In [48]:
# Bernoulli Naive Bayes is best among above naive bayes methods, because it provides highest accuracy.

### Logistic Regression

In [53]:
# Model Creation
model_LR=LogisticRegression()

# Model Training
model_LR.fit(X_train,y_train)

# Predict the result for Test Data(X_test)
y_pred_LR=model_LR.predict(X_test)

# Calculate Model Accuracy
accuracy_LR=accuracy_score(y_test,y_pred_LR)
print("Accuracy of Logistic Regression:",accuracy_LR)

Accuracy of Logistic Regression: 0.8465


### KNN

In [54]:
for k in range(1,16,2):
    # Model Creation
    model=KNeighborsClassifier(n_neighbors=k)

    # Model Training
    model.fit(X_train,y_train)

    # Predict the result for Test Data(X_test)
    y_pred=model.predict(X_test)

    # Calculate Model Accuracy
    accuracy=accuracy_score(y_test,y_pred)
    print("For k={}, Accuracy:{}".format(k,accuracy))

For k=1, Accuracy:0.579
For k=3, Accuracy:0.613
For k=5, Accuracy:0.6225
For k=7, Accuracy:0.631
For k=9, Accuracy:0.632
For k=11, Accuracy:0.649
For k=13, Accuracy:0.6665
For k=15, Accuracy:0.659


### Results

In [None]:
# Logistic Regression model seemed to have best performance across all feature representations with classification 
# accuracy around 85%.
# General Order of the performance for Model:
# Logistic Regression > Naive Bayes > KNN