In [1]:
import pandas as pd
import numpy as np
import re
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.pipeline import make_pipeline
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score
from sklearn.feature_extraction.text import TfidfVectorizer

In [2]:
data = pd.read_csv('../data/movie_reviews.zip')
data.head()

Unnamed: 0,review,sentimiento
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


## Corpus normalization

In [3]:
def clean_corpus(data, column):
    """Convert to lowercase and remove special characters"""
    filter_pattern = re.compile(r'[^a-z\s]+')
    data[column] = data[column].astype(str).str.lower()
    data[column] = data[column].str.replace(filter_pattern, '', regex=True)

    return data

In [4]:
for column in data.columns:
    data = clean_corpus(data, column)
data.head()

Unnamed: 0,review,sentimiento
0,one of the other reviewers has mentioned that ...,positive
1,a wonderful little production br br the filmin...,positive
2,i thought this was a wonderful way to spend ti...,positive
3,basically theres a family where a little boy j...,negative
4,petter matteis love in the time of money is a ...,positive


In [5]:
X_train, X_test, Y_train, Y_test = train_test_split(data['review'], data['sentimiento'], test_size = .30)

## 1.- Multinomial NB without Stop words

**a) With make_pipeline**

In [6]:
model_1a = make_pipeline(CountVectorizer(stop_words=None), MultinomialNB()) 
model_1a.fit(X_train, Y_train)

0,1,2
,steps,"[('countvectorizer', ...), ('multinomialnb', ...)]"
,transform_input,
,memory,
,verbose,False

0,1,2
,input,'content'
,encoding,'utf-8'
,decode_error,'strict'
,strip_accents,
,lowercase,True
,preprocessor,
,tokenizer,
,stop_words,
,token_pattern,'(?u)\\b\\w\\w+\\b'
,ngram_range,"(1, ...)"

0,1,2
,alpha,1.0
,force_alpha,True
,fit_prior,True
,class_prior,


In [7]:
Yprediction_1a = model_1a.predict(X_test) 

In [8]:
accuracy_score(Y_test, Yprediction_1a)

0.8463333333333334

**b) Without make_pipeline**

In [9]:
vectorizer1b = CountVectorizer(stop_words=None)
X_train_vec1b = vectorizer1b.fit_transform(X_train)
X_test_vec1b = vectorizer1b.transform(X_test)

In [10]:
model_1b = MultinomialNB()
model_1b.fit(X_train_vec1b, Y_train)

0,1,2
,alpha,1.0
,force_alpha,True
,fit_prior,True
,class_prior,


In [11]:
Yprediction_1b = model_1b.predict(X_test_vec1b) 

In [12]:
accuracy_score(Y_test, Yprediction_1b)

0.8463333333333334

## 2.- Multinomial NB with Stop words

In [13]:
model_2 = make_pipeline(CountVectorizer(stop_words="english"), MultinomialNB())
model_2.fit(X_train, Y_train)

0,1,2
,steps,"[('countvectorizer', ...), ('multinomialnb', ...)]"
,transform_input,
,memory,
,verbose,False

0,1,2
,input,'content'
,encoding,'utf-8'
,decode_error,'strict'
,strip_accents,
,lowercase,True
,preprocessor,
,tokenizer,
,stop_words,'english'
,token_pattern,'(?u)\\b\\w\\w+\\b'
,ngram_range,"(1, ...)"

0,1,2
,alpha,1.0
,force_alpha,True
,fit_prior,True
,class_prior,


In [14]:
Yprediction_2 = model_2.predict(X_test)

In [15]:
accuracy_score(Y_test, Yprediction_2)

0.8550666666666666

## 3.- Multinomial NB using TF-IDF without Stop words

In [16]:
model_3 = make_pipeline(TfidfVectorizer(stop_words=None), MultinomialNB())
model_3.fit(X_train, Y_train)

0,1,2
,steps,"[('tfidfvectorizer', ...), ('multinomialnb', ...)]"
,transform_input,
,memory,
,verbose,False

0,1,2
,input,'content'
,encoding,'utf-8'
,decode_error,'strict'
,strip_accents,
,lowercase,True
,preprocessor,
,tokenizer,
,analyzer,'word'
,stop_words,
,token_pattern,'(?u)\\b\\w\\w+\\b'

0,1,2
,alpha,1.0
,force_alpha,True
,fit_prior,True
,class_prior,


In [17]:
Yprediction_3 = model_3.predict(X_test)

In [18]:
accuracy_score(Y_test, Yprediction_3)

0.8644666666666667

## 4.- Multinomial NB using TD-IDF and Stop words

In [19]:
model_4 = make_pipeline(TfidfVectorizer(stop_words="english"), MultinomialNB())
model_4.fit(X_train, Y_train)

0,1,2
,steps,"[('tfidfvectorizer', ...), ('multinomialnb', ...)]"
,transform_input,
,memory,
,verbose,False

0,1,2
,input,'content'
,encoding,'utf-8'
,decode_error,'strict'
,strip_accents,
,lowercase,True
,preprocessor,
,tokenizer,
,analyzer,'word'
,stop_words,'english'
,token_pattern,'(?u)\\b\\w\\w+\\b'

0,1,2
,alpha,1.0
,force_alpha,True
,fit_prior,True
,class_prior,


In [20]:
Yprediction_4 = model_4.predict(X_test)

In [21]:
accuracy_score(Y_test, Yprediction_4)

0.8678

## Example of three misclassified instances

In [22]:
# Multinomial NB Model with TD-IDF & Stop words
idx_error = []
for i in range(len(data)):
  if data.sentimiento[i] != Yprediction_4[i]:
    idx_error.append(i)
    if len(idx_error) == 3:
      break

for j in idx_error:
  print(data.review[j]+"\n",data.sentimiento[j], Yprediction_4[j]+"\n")

one of the other reviewers has mentioned that after watching just  oz episode youll be hooked they are right as this is exactly what happened with mebr br the first thing that struck me about oz was its brutality and unflinching scenes of violence which set in right from the word go trust me this is not a show for the faint hearted or timid this show pulls no punches with regards to drugs sex or violence its is hardcore in the classic use of the wordbr br it is called oz as that is the nickname given to the oswald maximum security state penitentary it focuses mainly on emerald city an experimental section of the prison where all the cells have glass fronts and face inwards so privacy is not high on the agenda em city is home to manyaryans muslims gangstas latinos christians italians irish and moreso scuffles death stares dodgy dealings and shady agreements are never far awaybr br i would say the main appeal of the show is due to the fact that it goes where other shows wouldnt dare forg