# Sentiment Analysis - CampusX

### libraries

In [174]:
import numpy as np
import pandas as pd
import os
import re
import nltk #natural language tool kit
from sklearn.feature_extraction.text import CountVectorizer
from nltk.stem.porter import PorterStemmer
from nltk.corpus import stopwords

### importing the datasets

In [175]:
dataset = pd.read_csv(r'D:\copy of htdocs\practice\Python\200days\Day121 simple project #2\archive\IMDB Dataset.csv')

In [176]:
dataset.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


### one review

In [177]:
dataset['review'][0]

"One of the other reviewers has mentioned that after watching just 1 Oz episode you'll be hooked. They are right, as this is exactly what happened with me.<br /><br />The first thing that struck me about Oz was its brutality and unflinching scenes of violence, which set in right from the word GO. Trust me, this is not a show for the faint hearted or timid. This show pulls no punches with regards to drugs, sex or violence. Its is hardcore, in the classic use of the word.<br /><br />It is called OZ as that is the nickname given to the Oswald Maximum Security State Penitentary. It focuses mainly on Emerald City, an experimental section of the prison where all the cells have glass fronts and face inwards, so privacy is not high on the agenda. Em City is home to many..Aryans, Muslims, gangstas, Latinos, Christians, Italians, Irish and more....so scuffles, death stares, dodgy dealings and shady agreements are never far away.<br /><br />I would say the main appeal of the show is due to the fa

### text cleaning
1. Sample 10000 words
2. Remove html tags
3. Remove special characters
4. to lowercase
5. removing conjuctions
6. stemming

In [178]:
dataset = dataset.sample(10000)

In [179]:
dataset.shape

(10000, 2)

In [180]:
dataset.info()

<class 'pandas.core.frame.DataFrame'>
Index: 10000 entries, 37367 to 43047
Data columns (total 2 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   review     10000 non-null  object
 1   sentiment  10000 non-null  object
dtypes: object(2)
memory usage: 234.4+ KB


### replacing the labels

In [181]:
dataset['sentiment'].replace({'positive':1,'negative':0},inplace=True)

In [182]:
dataset.head

<bound method NDFrame.head of                                                   review  sentiment
37367  'Five Days' is billed as something special, a ...          1
3205   I was so excited and hyped up about watching t...          0
24059  For the viewer who comes upon it long after it...          1
47261  (Spoilers Ahead!) This same exact plot from th...          0
46895  As usual, on IMDb, going by the majority vote ...          0
...                                                  ...        ...
42224  This flick is worse than awful! It took a good...          0
38411  This was a modest attempt at a film, though it...          0
5033   The various Law & Order and CSI franchises had...          1
6788   Those individuals familiar with Asian cinema, ...          1
43047  I'd just like to say that i've seen this film ...          1

[10000 rows x 2 columns]>

### for removing html tags

In [183]:
clean = re.compile('<.*?>')
re.sub(clean, '',dataset.iloc[2].review)

'For the viewer who comes upon it long after its making, "Winchester \'73" has something in common with "Casablanca." While you watch it, you get this feeling that you\'re looking at a string of clichés encountered so often in the genre; then you realise that the clichés became clichés only after being copied from this particular film, and that they were so widely copied because this film was so great. In other words, it\'s a seminal work."Winchester \'73" is a joy to watch. The broad lines of the plot are somewhat predictable, but mostly because you\'ve seen them copied so many times in later movies, and nevertheless it still contains a number of twists which surprise you. The dialogue, the pacing and Mann\'s direction are excellent. Stewart shines in particular, and if you\'re a fan this is a "must-see," but he is not alone in delivering a good performance. Remarkably, many of the most thoughtful and/or witty lines go to minor characters. Because this makes these characters (much) mo

### using function to clean html tags


In [184]:
def clean_html(text):
    clean = re.compile('<.*?>')
    return re.sub(clean, '',text)

In [185]:
dataset['review']=dataset['review'].apply(clean_html)

### to lower

In [186]:
def con_lower(text):
    return text.lower()


In [187]:
dataset['review']=dataset['review'].apply(con_lower)

### to remove special characters

In [188]:
def rem_special(text):
    x=''
    for i in text:
        if i.isalnum():
            x+=i
        else:
            x=x+' '
    return x

In [189]:
rem_special('Do not @wa#tch this movie, go see something else ... I was very disappointed, I cannot rate this movie any better than 3.')

'Do not  wa tch this movie  go see something else     I was very disappointed  I cannot rate this movie any better than 3 '

In [190]:
dataset['review'] = dataset['review'].apply(rem_special)

### remove the stop words (conjuctions)

In [191]:
dataset

Unnamed: 0,review,sentiment
37367,five days is billed as something special a ...,1
3205,i was so excited and hyped up about watching t...,0
24059,for the viewer who comes upon it long after it...,1
47261,spoilers ahead this same exact plot from th...,0
46895,as usual on imdb going by the majority vote ...,0
...,...,...
42224,this flick is worse than awful it took a good...,0
38411,this was a modest attempt at a film though it...,0
5033,the various law order and csi franchises had...,1
6788,those individuals familiar with asian cinema ...,1


In [192]:
stopwords.words('english')

['i',
 'me',
 'my',
 'myself',
 'we',
 'our',
 'ours',
 'ourselves',
 'you',
 "you're",
 "you've",
 "you'll",
 "you'd",
 'your',
 'yours',
 'yourself',
 'yourselves',
 'he',
 'him',
 'his',
 'himself',
 'she',
 "she's",
 'her',
 'hers',
 'herself',
 'it',
 "it's",
 'its',
 'itself',
 'they',
 'them',
 'their',
 'theirs',
 'themselves',
 'what',
 'which',
 'who',
 'whom',
 'this',
 'that',
 "that'll",
 'these',
 'those',
 'am',
 'is',
 'are',
 'was',
 'were',
 'be',
 'been',
 'being',
 'have',
 'has',
 'had',
 'having',
 'do',
 'does',
 'did',
 'doing',
 'a',
 'an',
 'the',
 'and',
 'but',
 'if',
 'or',
 'because',
 'as',
 'until',
 'while',
 'of',
 'at',
 'by',
 'for',
 'with',
 'about',
 'against',
 'between',
 'into',
 'through',
 'during',
 'before',
 'after',
 'above',
 'below',
 'to',
 'from',
 'up',
 'down',
 'in',
 'out',
 'on',
 'off',
 'over',
 'under',
 'again',
 'further',
 'then',
 'once',
 'here',
 'there',
 'when',
 'where',
 'why',
 'how',
 'all',
 'any',
 'both',
 'each

### function to remove stopwords

In [193]:
def rem_stopwords(text):
    x=[]
    for i in text.split():
        if i not in  stopwords.words('english'):
            x.append(i)
    y=x[:]
    x.clear()
    return y

In [194]:
dataset['review']=dataset['review'].apply(rem_stopwords)

In [195]:
dataset

Unnamed: 0,review,sentiment
37367,"[five, days, billed, something, special, crime...",1
3205,"[excited, hyped, watching, film, promos, first...",0
24059,"[viewer, comes, upon, long, making, winchester...",1
47261,"[spoilers, ahead, exact, plot, movie, done, do...",0
46895,"[usual, imdb, going, majority, vote, instead, ...",0
...,...,...
42224,"[flick, worse, awful, took, good, story, plot,...",0
38411,"[modest, attempt, film, though, appeared, like...",0
5033,"[various, law, order, csi, franchises, better,...",1
6788,"[individuals, familiar, asian, cinema, whole, ...",1


### stemming

In [196]:

ps = PorterStemmer()

### function for stemming

In [197]:
y = []
def stem_words(text):
    for i in text:
        y.append(ps.stem(i))
    z=y[:]
    y.clear()
    return z

In [198]:
stem_words(['I','loved','loving','it'])

['i', 'love', 'love', 'it']

In [199]:
dataset['review'] = dataset['review'].apply(stem_words)
dataset

Unnamed: 0,review,sentiment
37367,"[five, day, bill, someth, special, crime, dram...",1
3205,"[excit, hype, watch, film, promo, first, came,...",0
24059,"[viewer, come, upon, long, make, winchest, 73,...",1
47261,"[spoiler, ahead, exact, plot, movi, done, done...",0
46895,"[usual, imdb, go, major, vote, instead, weight...",0
...,...,...
42224,"[flick, wors, aw, took, good, stori, plot, tur...",0
38411,"[modest, attempt, film, though, appear, like, ...",0
5033,"[variou, law, order, csi, franchis, better, gl...",1
6788,"[individu, familiar, asian, cinema, whole, awa...",1


### joining back

In [200]:
def join_back(list_input):
    return " ".join(list_input)

In [201]:
dataset['review']=dataset['review'].apply(join_back)

In [202]:
dataset['review']

37367    five day bill someth special crime drama consi...
3205     excit hype watch film promo first came novemb ...
24059    viewer come upon long make winchest 73 someth ...
47261    spoiler ahead exact plot movi done done ferri ...
46895    usual imdb go major vote instead weight averag...
                               ...                        
42224    flick wors aw took good stori plot turn schizo...
38411    modest attempt film though appear like tv pilo...
5033     variou law order csi franchis better glad dolo...
6788     individu familiar asian cinema whole awar japa...
43047    like say seen film twice love act great even t...
Name: review, Length: 10000, dtype: object

In [203]:
x=dataset.iloc[:,0:1].values
x.shape

(10000, 1)

### vectorizing the data

In [204]:

cv = CountVectorizer(max_features=1000)         #for used 1000 words

In [205]:
x=cv.fit_transform(dataset['review']).toarray()
x.shape

(10000, 1000)

In [206]:
x[0].max()
x[0].mean()

0.1

In [207]:
y=dataset.iloc[:,-1].values
y.shape

(10000,)

### splitting the datas

In [208]:
from sklearn.model_selection import train_test_split


In [209]:
X_train,X_test,y_train,y_test = train_test_split(x,y,test_size=0.2)

In [210]:
X_train.shape

(8000, 1000)

In [211]:
y_test.shape

(2000,)

### trainig the model with different methods

In [212]:
from sklearn.naive_bayes import GaussianNB,MultinomialNB,BernoulliNB


In [213]:
clf1 = GaussianNB()
clf2 = MultinomialNB()
clf3 = BernoulliNB()

### fitting the model

In [214]:
print(clf1.fit(X_train,y_train))
print(clf2.fit(X_train,y_train))
print(clf3.fit(X_train,y_train))

GaussianNB()
MultinomialNB()
BernoulliNB()


### predictions

In [215]:
y_pred1 = clf1.predict(X_test)
y_pred2 = clf2.predict(X_test)
y_pred3 = clf3.predict(X_test)

In [216]:
y_test.shape


(2000,)

In [217]:
y_pred1.shape

(2000,)

### accuracy

In [218]:
from sklearn.metrics import accuracy_score

In [219]:
print("Guassian : ",accuracy_score(y_test,y_pred1))
print("Multinomial : ",accuracy_score(y_test,y_pred2))
print("Bernouli : ",accuracy_score(y_test,y_pred3))

Guassian :  0.7885
Multinomial :  0.84
Bernouli :  0.846
