# IMDB Movie Reviews Sentiment Analysis

## Imports

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns

In [2]:
import re
from nltk.stem import PorterStemmer

In [3]:
import spacy
nlp=spacy.load("en_core_web_sm")
from spacy.lang.en.stop_words import STOP_WORDS as stopwords 

In [4]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

In [5]:
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import MultinomialNB

In [6]:
import pickle

## Loading dataset

In [7]:
df=pd.read_csv("IMDB-Dataset.csv")

In [8]:
df.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


## Balanced Dataset

In [9]:
df.sentiment.value_counts()

negative    25000
positive    25000
Name: sentiment, dtype: int64

## Cleaning Dataset

### Renaming columns

In [10]:
df=df.rename(columns={"sentiment":"sent","review":"rev"})

###  contractions to expansions list

In [11]:
contractions = { 
"ain't": "am not",
"aren't": "are not",
"can't": "cannot",
"can't've": "cannot have",
"'cause": "because",
"could've": "could have",
"couldn't": "could not",
"couldn't've": "could not have",
"didn't": "did not",
"doesn't": "does not",
"don't": "do not",
"hadn't": "had not",
"hadn't've": "had not have",
"hasn't": "has not",
"haven't": "have not",
"he'd": "he would",
"he'd've": "he would have",
"he'll": "he will",
"he'll've": "he will have",
"he's": "he is",
"how'd": "how did",
"how'd'y": "how do you",
"how'll": "how will",
"how's": "how does",
"i'd": "i would",
"i'd've": "i would have",
"i'll": "i will",
"i'll've": "i will have",
"i'm": "i am",
"i've": "i have",
"isn't": "is not",
"it'd": "it would",
"it'd've": "it would have",
"it'll": "it will",
"it'll've": "it will have",
"it's": "it is",
"let's": "let us",
"ma'am": "madam",
"mayn't": "may not",
"might've": "might have",
"mightn't": "might not",
"mightn't've": "might not have",
"must've": "must have",
"mustn't": "must not",
"mustn't've": "must not have",
"needn't": "need not",
"needn't've": "need not have",
"o'clock": "of the clock",
"oughtn't": "ought not",
"oughtn't've": "ought not have",
"shan't": "shall not",
"sha'n't": "shall not",
"shan't've": "shall not have",
"she'd": "she would",
"she'd've": "she would have",
"she'll": "she will",
"she'll've": "she will have",
"she's": "she is",
"should've": "should have",
"shouldn't": "should not",
"shouldn't've": "should not have",
"so've": "so have",
"so's": "so is",
"that'd": "that would",
"that'd've": "that would have",
"that's": "that is",
"there'd": "there would",
"there'd've": "there would have",
"there's": "there is",
"they'd": "they would",
"they'd've": "they would have",
"they'll": "they will",
"they'll've": "they will have",
"they're": "they are",
"they've": "they have",
"to've": "to have",
"wasn't": "was not",
" u ": " you ",
" ur ": " your ",
" n ": " and ",
"won't": "would not",
'dis': 'this',
'bak': 'back',
'brng': 'bring'}

### Functions

#### Stemming Function

In [12]:
def porter(x):
    return PorterStemmer().stem(x)

#### lementising Function

In [13]:
def lemmer(x):
    val=[]
    doc=nlp(x)
    for token in doc:
        if (token.lemma_=="-PRON-") or (token.lemma_==" be "):
            val.append(token.text)
        else:
            val.append(token.lemma_)
    return " ".join(val)

#### Special word remover, Stop word removal, lower casing, html tag removal Function

In [14]:
def worder(x):
    val="!@#$%^&*()1234'\"5<>:;?/67890.,-_"
    x=x.lower()
    x=re.sub(r"<.*?>"," ",x)
    val1=[]
    for char in x:
        if char not in val:
            val1.append(char)
    x= "".join(val1)
    x= " ".join([t for t in x.split() if t not in stopwords ])
    return x

#### Converting contractions to expansions Function

In [15]:
def cont_exp(x):
    for keys in contractions:
        val=contractions[keys]
        x=x.replace(keys,val)
    return x

#### Functions applied on the dataset

In [16]:
df["clean"]=df.rev.apply(lambda x: cont_exp(x))

In [17]:
df["clean"]=df.clean.apply(lambda x: worder(x))

In [18]:
##df.clean=df.clean.apply(lambda x: lemmer(x))

In [19]:
df.head()

Unnamed: 0,rev,sent,clean
0,One of the other reviewers has mentioned that ...,positive,reviewers mentioned watching oz episode youll ...
1,A wonderful little production. <br /><br />The...,positive,wonderful little production filming technique ...
2,I thought this was a wonderful way to spend ti...,positive,thought wonderful way spend time hot summer we...
3,Basically there's a family where a little boy ...,negative,basically family little boy jake thinks zombie...
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive,petter matteis love time money visually stunni...


## Creating a bag of words

In [20]:
#creating bag of words
y=np.array(df.sent.values)
cv=CountVectorizer(max_features=1000)
X=cv.fit_transform(df.clean).toarray()

#### Shape of X and y 

In [21]:
print(X.shape,y.shape)

(50000, 1000) (50000,)


## Train-Test split

In [22]:
xtrain, xtest, ytrain, ytest=train_test_split(X, y, test_size=0.3, random_state=123)

#### Shape of xtrain, xtest, ytrain, ytest

In [23]:
print(xtrain.shape,xtest.shape, ytrain.shape,ytest.shape)

(35000, 1000) (15000, 1000) (35000,) (15000,)


## Model creation

In [24]:
sv=SVC()
rf=RandomForestClassifier()
mn=MultinomialNB(alpha=1.0, fit_prior=True)


In [26]:
mn.fit(xtrain, ytrain)

MultinomialNB()

In [27]:
rf.fit(xtrain, ytrain)

RandomForestClassifier()

## Predictiion and accurcay 

In [31]:
predict_rf=rf.predict(xtest)


In [32]:
predict_mn=mn.predict(xtest)

In [45]:
def acc(y,pred):
    print("--Accuracy-->", accuracy_score(y,pred))
    print("--ConfuMat-->\n","\n",confusion_matrix(y,pred))
    print("\n--ClassRep-->\n","\n", classification_report(y,pred))

In [55]:
acc(ytest,predict_rf)

In [56]:
acc(ytest,predict_mn)

In [49]:
shawshankRedemtion="I have never seen such an amazing film since I saw The Shawshank Redemption. Shawshank encompasses friendships, hardships, hopes, and dreams. And what is so great about the movie is that it moves you, it gives you hope. Even though the circumstances between the characters and the viewers are quite different, you don't feel that far removed from what the characters are going through.It is a simple film, yet it has an everlasting message. Frank Darabont didn't need to put any kind of outlandish special effects to get us to love this film, the narration and the acting does that for him. Why this movie didn't win all seven Oscars is beyond me, but don't let that sway you to not see this film, let its ranking on the IMDb's top 250 list sway you, let your friends recommendation about the movie sway you.Set aside a little over two hours tonight and rent this movie. You will finally understand what everyone is talking about and you will understand why this is my all time favorite movie."

In [50]:
shawshankRedemtion

"I have never seen such an amazing film since I saw The Shawshank Redemption. Shawshank encompasses friendships, hardships, hopes, and dreams. And what is so great about the movie is that it moves you, it gives you hope. Even though the circumstances between the characters and the viewers are quite different, you don't feel that far removed from what the characters are going through.It is a simple film, yet it has an everlasting message. Frank Darabont didn't need to put any kind of outlandish special effects to get us to love this film, the narration and the acting does that for him. Why this movie didn't win all seven Oscars is beyond me, but don't let that sway you to not see this film, let its ranking on the IMDb's top 250 list sway you, let your friends recommendation about the movie sway you.Set aside a little over two hours tonight and rent this movie. You will finally understand what everyone is talking about and you will understand why this is my all time favorite movie."

In [51]:
f1=cont_exp(shawshankRedemtion)

In [52]:
f2=lemmer(f1)

In [53]:
f3=worder(f2)

In [59]:
f4=cv.transform([f3])

In [60]:
f4.shape

(1, 1000)

In [61]:
rf.predict(f4)

array(['positive'], dtype=object)

In [65]:
coolie="Sense less comedy, poor acting by Sara Ali Khan and Varun Dhawan. Old cooli no1 is very good.. But this one crossed limit in Bakwass.. My Suggestion is don't watch this movie.. Waste of 2 hours"

In [66]:
f1=cont_exp(coolie)

In [67]:
f2=lemmer(f1)

In [68]:
f3=worder(f2)

In [69]:
f4=cv.transform([f3])

In [71]:
rf.predict(f4)

array(['negative'], dtype=object)