# Movies Reviews Sentiment Analysis 03 --- Mario Ferreyra
---

### Zona de _imports_

In [1]:
from collections import Counter
from pprint import pprint

import numpy as np
import pandas as pd
from sklearn import metrics
from sklearn.base import clone
from sklearn.dummy import DummyClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.linear_model import LogisticRegression, LogisticRegressionCV
from sklearn.model_selection import ParameterGrid
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.svm import LinearSVC
from tqdm import tqdm  # https://github.com/tqdm/tqdm

from evaluator import Evaluator
from preprocessing import PreprocessingReview
from tokenizer import ReviewTokenizer
from utils import (load_datasets_unlabeled_test,
                   save_csv_results,
                   save_pickle_model, load_pickle_model,
                   get_best_params)

In [2]:
# Instantiate the project-local Evaluator (imported from the `evaluator` module).
# The cells below call its print_short_eval / print_eval methods to report
# metrics for a fitted pipeline on a given (X, y) pair.
evaluator = Evaluator()

### Cargamos el _dataset_

In [3]:
# Load the three splits with the project helper. Later cells index `train` and
# `dev` as [0] -> review texts and [1] -> targets, and use `test` directly as
# X_test (presumably unlabeled, going by the loader's name -- confirm in utils).
train, dev, test = load_datasets_unlabeled_test()

In [4]:
# Peek at the first two entries of train[0], i.e. two review texts
# (train[0] is what becomes the 'data' column; train[1] holds the targets).
pprint(train[0][0])
pprint(train[0][1])

(b'If ever a film needed English subtitles this is one . The accents and soft t'
 b"alking are great but hard to follow storyline as you ca n't understand what "
 b'they are saying and with no subtitles . Her songs were just beautiful and th'
 b'e story is great but a lot of it is lost on not catching what they are sayin'
 b'g . But is was a refreshing movie from most out there now . Fine acting and '
 b'story .')
(b"`` Just Married '' is a painfully cheesy movie that 's almost too lightheart"
 b'ed and cute . Ashton Kutcher and Brittany Murphy play a Romeo and Juliet-esq'
 b"e couple that has been repeated in so many movies , its sickening - he 's th"
 b"e classic dorky sports fan and child of middle-income parents , and she 's t"
 b'he daughter of some billionare whose profession we never learn . Her parents'
 b' would have rather married her to the snobby Peter Prentice , a dull antagon'
 b'ist who is more of a roadblock then an actual character . Anyway , the cute '
 b'couple get m

In [5]:
# Wrap each labeled split in a two-column frame: position 0 of the split goes
# to 'data', position 1 to 'target'.
df_train, df_dev = (
    pd.DataFrame({'data': split[0], 'target': split[1]})
    for split in (train, dev)
)

In [6]:
# Show a heading, the shape and the first 10 rows of each labeled frame.
for heading, underline, shape_caption, frame in (
    ("Train", "-----", "Shape 'Train' =", df_train),
    ("Dev", "---", "Shape 'Dev' =", df_dev),
):
    print(heading)
    print(underline)
    print(shape_caption, frame.shape)
    display(frame.head(10))

Train
-----
Shape 'Train' = (963, 2)


Unnamed: 0,data,target
0,"b""If ever a film needed English subtitles this...",1
1,"b""`` Just Married '' is a painfully cheesy mov...",0
2,"b""I may not be able to add much to the reviews...",1
3,"b""I 've been a 3D nut for many decades . I pre...",0
4,"b""When I found out this version of Lonesome Do...",0
5,b'This review is to point out that this versio...,0
6,"b""If you look at the fine print on this DVD , ...",0
7,"b""This movie is based on a true story , and I ...",0
8,"b""... but a terrible DVD . The sound is plain ...",0
9,"b""From time to time I 've revisited this movie...",1


Dev
---
Shape 'Dev' = (107, 2)


Unnamed: 0,data,target
0,"b""In this dvd you have , the Boston POPS orche...",1
1,b'This show is the best . I was slow to get in...,1
2,"b""I have a couple of the Lucy collections , an...",0
3,"b""A Better Way To Die is a action-packed , dar...",1
4,"b""This is an unusual Merchant/Ivory film that ...",1
5,"b""I 'll admit that I 've never seen films by M...",1
6,"b""Being a huge vampire/horror fan and a fan of...",0
7,"b'As many reviewers put it , this is definitel...",1
8,"b""While it 's true the once-brilliant `` Soap ...",0
9,"b""This movie barely touches on the holocaust ....",0


In [7]:
# Summary statistics for each labeled frame, with a separator line printed
# between the two sections (not before the first one).
describe_sections = (
    ("Train Describe", "--------------", df_train),
    ("Dev Describe", "------------", df_dev),
)
for position, (heading, underline, frame) in enumerate(describe_sections):
    if position > 0:
        print("=" * 15)
    print(heading)
    print(underline)
    display(frame.describe())

Train Describe
--------------


Unnamed: 0,target
count,963.0
mean,0.500519
std,0.50026
min,0.0
25%,0.0
50%,1.0
75%,1.0
max,1.0


Dev Describe
------------


Unnamed: 0,target
count,107.0
mean,0.495327
std,0.502331
min,0.0
25%,0.0
50%,0.0
75%,1.0
max,1.0


### Separamos los datos en _**X**_ e _**y**_

In [8]:
# Pull features ('data') and labels ('target') out of each frame as plain lists.
X_train, y_train = (df_train[column].tolist() for column in ('data', 'target'))
X_dev, y_dev = (df_dev[column].tolist() for column in ('data', 'target'))

# The test split is used as-is; no target list is derived for it here.
X_test = test

In [9]:
# Class balance check: count how many times each label occurs per split.
for caption, labels in (
    ("Counter 'y_train' =", y_train),
    ("Counter 'y_dev' =", y_dev),
):
    print(caption, Counter(labels))

Counter 'y_train' = Counter({1: 482, 0: 481})
Counter 'y_dev' = Counter({0: 54, 1: 53})


---
##  Evaluemos varios modelos sobre _**Train**_ y _**Dev**_

En particular, vamos a usar:

Vectorizadores:
* CountVectorizer
* TfidfVectorizer

Clasificadores:
* LinearSVC
* LogisticRegression
* LogisticRegressionCV
* RandomForestClassifier

In [10]:
# Candidate classifiers keyed by their class name, all seeded with
# random_state=0 so repeated runs give the same results.
clfs = {
    estimator_cls.__name__: estimator_cls(random_state=0)
    for estimator_cls in (
        LinearSVC,
        LogisticRegression,
        LogisticRegressionCV,
        RandomForestClassifier,
    )
}

In [11]:
# Binary bag-of-words template: with binary=True every non-zero term count is
# set to 1, so features record presence/absence rather than frequency.
vect = CountVectorizer(binary=True)

print("Vectorizador: CountVectorizer\n")
for name, clf in clfs.items():
    print(" -----> {} <-----".format(name))
    # Pipeline does not copy its steps: fitting it would refit the shared
    # `vect` and the shared `clfs[name]` instance in place, so every pipeline
    # built from them would silently point at the same mutated objects.
    # Fit unfitted clones instead; they carry identical parameters
    # (including random_state), so the reported scores are unchanged.
    pipeline = Pipeline([
        ('vect', clone(vect)),
        ('clf', clone(clf))
    ])
    pipeline.fit(X_train, y_train)
    # First report line is on Train, second on Dev.
    evaluator.print_short_eval(pipeline, X_train, y_train)
    evaluator.print_short_eval(pipeline, X_dev, y_dev)
    print("==============")

Vectorizador: CountVectorizer

 -----> LinearSVC <-----
Accuracy = 1.00  |  Macro F1 = 1.00
Accuracy = 0.83  |  Macro F1 = 0.83
 -----> LogisticRegression <-----
Accuracy = 1.00  |  Macro F1 = 1.00
Accuracy = 0.87  |  Macro F1 = 0.87
 -----> LogisticRegressionCV <-----
Accuracy = 1.00  |  Macro F1 = 1.00
Accuracy = 0.88  |  Macro F1 = 0.88
 -----> RandomForestClassifier <-----
Accuracy = 0.99  |  Macro F1 = 0.99
Accuracy = 0.77  |  Macro F1 = 0.76


In [12]:
# TF-IDF template with binary=True: the term-frequency part is clipped to 0/1
# before IDF weighting (the resulting features are still real-valued).
vect = TfidfVectorizer(binary=True)

print("Vectorizador: TfidfVectorizer\n")
for name, clf in clfs.items():
    print(" -----> {} <-----".format(name))
    # Pipeline does not copy its steps: without cloning, this loop would refit
    # the very estimator objects stored in `clfs` (already used by any earlier
    # experiment) and the shared `vect`, leaving hidden state behind.
    # Fit unfitted clones instead; they carry identical parameters
    # (including random_state), so the reported scores are unchanged.
    pipeline = Pipeline([
        ('vect', clone(vect)),
        ('clf', clone(clf))
    ])
    pipeline.fit(X_train, y_train)
    # First report line is on Train, second on Dev.
    evaluator.print_short_eval(pipeline, X_train, y_train)
    evaluator.print_short_eval(pipeline, X_dev, y_dev)
    print("==============")

Vectorizador: TfidfVectorizer

 -----> LinearSVC <-----
Accuracy = 1.00  |  Macro F1 = 1.00
Accuracy = 0.87  |  Macro F1 = 0.87
 -----> LogisticRegression <-----
Accuracy = 0.99  |  Macro F1 = 0.99
Accuracy = 0.88  |  Macro F1 = 0.88
 -----> LogisticRegressionCV <-----
Accuracy = 1.00  |  Macro F1 = 1.00
Accuracy = 0.88  |  Macro F1 = 0.88
 -----> RandomForestClassifier <-----
Accuracy = 0.99  |  Macro F1 = 0.99
Accuracy = 0.75  |  Macro F1 = 0.74


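Notar que usando "a secas" (con los parámetros por defecto):
* Vectorizador: TfidfVectorizer
* Clasificador: LogisticRegressionCV

obtenemos los mejores resultados sobre _Dev_ (Accuracy = 0.88, empatado con CountVectorizer + LogisticRegressionCV y con TfidfVectorizer + LogisticRegression). Por lo cual vamos a usar esta combinación.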

In [13]:
# Rebuild the selected combination from fresh objects: binary TF-IDF features
# feeding a cross-validated logistic regression with a fixed seed.
vect = TfidfVectorizer(binary=True)
clf = LogisticRegressionCV(random_state=0)

pipeline = Pipeline(steps=[('vect', vect), ('clf', clf)])
pipeline.fit(X_train, y_train)

# Report on the Dev split via the project Evaluator.
evaluator.print_eval(pipeline, X_dev, y_dev)

Accuracy = 0.88

             precision    recall  f1-score   support

        neg       0.90      0.85      0.88        54
        pos       0.86      0.91      0.88        53

avg / total       0.88      0.88      0.88       107

[[46  8]
 [ 5 48]]
