# Movie Reviews

In [47]:
import pandas as pd

data = pd.read_csv("reviews.csv")

data.head()

Unnamed: 0,target,reviews
0,neg,"plot : two teen couples go to a church party ,..."
1,neg,the happy bastard's quick movie review \ndamn ...
2,neg,it is movies like these that make a jaded movi...
3,neg,""" quest for camelot "" is warner bros . ' firs..."
4,neg,synopsis : a mentally unstable man undergoing ...


The dataset is made up of positive and negative movie reviews.

## Preprocessing

👇 Remove punctuation and lower case the text.

In [48]:
import re
def remove_punctuation(oldtext):
    newtext = re.sub(r'[^A-Za-z]+', ' ', oldtext)
    return newtext

data['clean'] = data['reviews'].apply(remove_punctuation)
data['clean'] = data['clean'].str.lower()
data

Unnamed: 0,target,reviews,clean
0,neg,"plot : two teen couples go to a church party ,...",plot two teen couples go to a church party dri...
1,neg,the happy bastard's quick movie review \ndamn ...,the happy bastard s quick movie review damn th...
2,neg,it is movies like these that make a jaded movi...,it is movies like these that make a jaded movi...
3,neg,""" quest for camelot "" is warner bros . ' firs...",quest for camelot is warner bros first featur...
4,neg,synopsis : a mentally unstable man undergoing ...,synopsis a mentally unstable man undergoing ps...
...,...,...,...
1995,pos,wow ! what a movie . \nit's everything a movie...,wow what a movie it s everything a movie can b...
1996,pos,"richard gere can be a commanding actor , but h...",richard gere can be a commanding actor but he ...
1997,pos,"glory--starring matthew broderick , denzel was...",glory starring matthew broderick denzel washin...
1998,pos,steven spielberg's second epic film on world w...,steven spielberg s second epic film on world w...


In [49]:
import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\fuetu\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

## Bag-of-Words modelling

👇 Using `cross_validate`, score a Multinomial Naive Bayes model trained on a Bag-of-Word representation of the texts.

In [50]:
def filter_stopword(string:str):
    temp = string.split(' ')
    out=[]
    for w in temp:
        if w not in nltk.corpus.stopwords.words('english'):
            #print(w)
            out.append(w)
    return ' '.join(out)

In [51]:
data['clean'] = data['clean'].apply(filter_stopword)

## N-gram modelling

👇 Using `cross_validate`, score a Multinomial Naive Bayes model trained on a 2-gram Bag-of-Word representation of the texts.

In [33]:
data['clean'].apply(nltk.bigrams)

0       <generator object bigrams at 0x000001460A68BD60>
1       <generator object bigrams at 0x000001460A6280B0>
2       <generator object bigrams at 0x000001460A628190>
3       <generator object bigrams at 0x000001460A628200>
4       <generator object bigrams at 0x000001460A628270>
                              ...                       
1995    <generator object bigrams at 0x000001460A7D86D0>
1996    <generator object bigrams at 0x000001460A7D8740>
1997    <generator object bigrams at 0x000001460A7D87B0>
1998    <generator object bigrams at 0x000001460A7D8820>
1999    <generator object bigrams at 0x000001460A7D8890>
Name: clean, Length: 2000, dtype: object

In [28]:
list(nltk.bigrams('To be or not to be'.split()))

[('To', 'be'), ('be', 'or'), ('or', 'not'), ('not', 'to'), ('to', 'be')]

In [79]:
# def make_bigram(string=str):
#     return list(nltk.bigrams(string.split()))
# data['clean'] = data['clean'].apply(make_bigram)
# data

In [52]:
data

Unnamed: 0,target,reviews,clean
0,neg,"plot : two teen couples go to a church party ,...",plot two teen couples go church party drink dr...
1,neg,the happy bastard's quick movie review \ndamn ...,happy bastard quick movie review damn k bug go...
2,neg,it is movies like these that make a jaded movi...,movies like make jaded movie viewer thankful i...
3,neg,""" quest for camelot "" is warner bros . ' firs...",quest camelot warner bros first feature lengt...
4,neg,synopsis : a mentally unstable man undergoing ...,synopsis mentally unstable man undergoing psyc...
...,...,...,...
1995,pos,wow ! what a movie . \nit's everything a movie...,wow movie everything movie funny dramatic inte...
1996,pos,"richard gere can be a commanding actor , but h...",richard gere commanding actor always great fil...
1997,pos,"glory--starring matthew broderick , denzel was...",glory starring matthew broderick denzel washin...
1998,pos,steven spielberg's second epic film on world w...,steven spielberg second epic film world war ii...


In [110]:
from sklearn.feature_extraction.text import CountVectorizer
matrix = CountVectorizer(ngram_range=(2,2)
 
# to use bigrams ngram_range=(2,2)
                           ,stop_words='english'
                           )


In [112]:
X_bi = matrix.fit_transform(data['clean'])

In [113]:
data.loc[data['target'] == 'pos', "num_target"] = int(1)
data.loc[data['target'] == 'neg', "num_target"] = int(0)
data["num_target"] = data["num_target"].astype(int)

In [114]:
y_bow = data["num_target"]

In [115]:
X_bi.shape

(2000, 484721)

⚠️ Please push the exercise once you are done 🙃

## 🏁 

In [121]:
matrix.get_feature_names()

['aa meetings',
 'aaa minor',
 'aaa team',
 'aaaaaaaaah critique',
 'aaaaaaaaah day',
 'aaaaaaaahhhh plot',
 'aaaaaah film',
 'aaaahhhs wows',
 'aahs pixar',
 'aaliyah impressive',
 'aaliyah shabby',
 'aaliyah trish',
 'aalyah enchanting',
 'aalyah mafia',
 'aalyah magical',
 'aamir khan',
 'aardman animation',
 'aaron chuck',
 'aaron convincing',
 'aaron determined',
 'aaron eckhart',
 'aaron edward',
 'aaron gets',
 'aaron moor',
 'aaron pearl',
 'aaron sandra',
 'aaron schneider',
 'aaron spelling',
 'aaron steve',
 'aaron williams',
 'aatish director',
 'ab using',
 'aback honesty',
 'aback unusual',
 'abandon alien',
 'abandon conviction',
 'abandon creation',
 'abandon depth',
 'abandon despite',
 'abandon educating',
 'abandon farm',
 'abandon figured',
 'abandon film',
 'abandon final',
 'abandon friends',
 'abandon goal',
 'abandon left',
 'abandon moment',
 'abandon naturally',
 'abandon noise',
 'abandon preview',
 'abandon principle',
 'abandon rejecting',
 'abandon rescue'

In [162]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test= train_test_split(X_bi, y_bow, test_size= .2)
from sklearn.metrics import accuracy_score
from sklearn.naive_bayes import MultinomialNB
clf = MultinomialNB()
clf.fit(X_train, y_train)#predictions
y_train_pred = clf.predict(X_train)
y_test_pred = clf.predict(X_test)#accuracy score 
train_pred_score = accuracy_score(y_train, y_train_pred)
test_pred_score = accuracy_score(y_test, y_test_pred)
print('Training Set Accuracy Score: \n', (100 * round(train_pred_score,4)))
print('Testing Set Accuracy Score: \n', (100 * round(test_pred_score,4)))


Training Set Accuracy Score: 
 100.0
Testing Set Accuracy Score: 
 74.75


# On va maintenant essayer avec trigrammes et avec bigrammes + trigrammes

In [100]:
matrix3 = CountVectorizer(ngram_range=(3,3), 
# to use bigrams ngram_range=(2,2)
                           stop_words='english')

X_tri = matrix3.fit_transform(data['clean'])

In [101]:
matrix3.get_feature_names()

['aa meetings gwennie',
 'aa meetings pick',
 'aa meetings shandling',
 'aaa minor league',
 'aaa team buzz',
 'aaa team offers',
 'aaaaaaaaah critique completely',
 'aaaaaaaaah day sees',
 'aaaaaaaahhhh plot grown',
 'aaaaaah film savior',
 'aaaahhhs wows let',
 'aahs pixar work',
 'aaliyah impressive expect',
 'aaliyah shabby thought',
 'aaliyah trish day',
 'aalyah enchanting voice',
 'aalyah mafia intrigues',
 'aalyah magical voice',
 'aamir khan muslim',
 'aardman animation produced',
 'aaron chuck norris',
 'aaron convincing comes',
 'aaron determined free',
 'aaron eckhart amy',
 'aaron eckhart ben',
 'aaron eckhart jason',
 'aaron eckhart lauren',
 'aaron eckhart matt',
 'aaron eckhart playing',
 'aaron edward norton',
 'aaron gets weirder',
 'aaron moor tamora',
 'aaron pearl harrison',
 'aaron sandra bullock',
 'aaron schneider reminiscient',
 'aaron spelling couple',
 'aaron spelling produced',
 'aaron spelling territory',
 'aaron steve martin',
 'aaron williams secretly',
 

In [122]:
X_tri

<2000x600314 sparse matrix of type '<class 'numpy.int64'>'
	with 613942 stored elements in Compressed Sparse Row format>

In [126]:
from scipy.sparse import hstack
X_multi = hstack((X_bi, X_tri))

In [157]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test= train_test_split(X_tri, y_bow, test_size= .2)
from sklearn.metrics import accuracy_score
from sklearn.naive_bayes import MultinomialNB
clf = MultinomialNB()
clf.fit(X_train, y_train)#predictions
y_train_pred = clf.predict(X_train)
y_test_pred = clf.predict(X_test)#accuracy score 
train_pred_score = accuracy_score(y_train, y_train_pred)
test_pred_score = accuracy_score(y_test, y_test_pred)
print('Training Set Accuracy Score: \n', (100 * round(train_pred_score,4)))
print('Testing Set Accuracy Score: \n', (100 * round(test_pred_score,4)))


Training Set Accuracy Score: 
 100.0
Testing Set Accuracy Score: 
 52.75


Résultat : Bigramme etait mieux 

# On va essayer un  autre classifieur:

In [164]:
from sklearn import svm

#Create a svm Classifier
clf = svm.SVC(kernel='linear') # Linear Kernel

#Train the model using the training sets
clf.fit(X_train, y_train)

#Predict the response for test dataset
y_pred = clf.predict(X_test)

y_train_pred = clf.predict(X_train)
y_test_pred = clf.predict(X_test)#accuracy score 
train_pred_score = accuracy_score(y_train, y_train_pred)
test_pred_score = accuracy_score(y_test, y_test_pred)
print('Training Set Accuracy Score: \n', (100 * round(train_pred_score,4)))
print('Testing Set Accuracy Score: \n', (100 * round(test_pred_score,4)))

Training Set Accuracy Score: 
 100.0
Testing Set Accuracy Score: 
 75.25


In [167]:
from sklearn import svm

#Create a svm Classifier
clf = svm.SVC(kernel='rbf',gamma=0.5) # Linear Kernel

#Train the model using the training sets
clf.fit(X_train, y_train)

#Predict the response for test dataset
y_pred = clf.predict(X_test)

y_train_pred = clf.predict(X_train)
y_test_pred = clf.predict(X_test)#accuracy score 
train_pred_score = accuracy_score(y_train, y_train_pred)
test_pred_score = accuracy_score(y_test, y_test_pred)
print('Training Set Accuracy Score: \n', (100 * round(train_pred_score,4)))
print('Testing Set Accuracy Score: \n', (100 * round(test_pred_score,4)))

Training Set Accuracy Score: 
 100.0
Testing Set Accuracy Score: 
 47.75


In [168]:
from sklearn import svm

#Create a svm Classifier
clf = svm.SVC(kernel='poly') # Linear Kernel

#Train the model using the training sets
clf.fit(X_train, y_train)

#Predict the response for test dataset
y_pred = clf.predict(X_test)

y_train_pred = clf.predict(X_train)
y_test_pred = clf.predict(X_test)#accuracy score 
train_pred_score = accuracy_score(y_train, y_train_pred)
test_pred_score = accuracy_score(y_test, y_test_pred)
print('Training Set Accuracy Score: \n', (100 * round(train_pred_score,4)))
print('Testing Set Accuracy Score: \n', (100 * round(test_pred_score,4)))

Training Set Accuracy Score: 
 94.31
Testing Set Accuracy Score: 
 52.5
