# Movies Reviews Sentiment Analysis 05 --- Mario Ferreyra

## Grid-Search: *CountVectorizer* + *LogisticRegressionCV*
---

### Zona de _imports_

In [1]:
import numpy as np
import pandas as pd

from sklearn import metrics
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.feature_extraction import stop_words
from sklearn.dummy import DummyClassifier
from sklearn.svm import LinearSVC
from sklearn.linear_model import LogisticRegression, LogisticRegressionCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import ParameterGrid

from pprint import pprint
from collections import Counter
from evaluator import Evaluator
from tqdm import tqdm  # https://github.com/tqdm/tqdm
from tokenizer import ReviewTokenizer
from preprocessing import PreprocessingReview
from utils import (load_datasets,
                   load_datasets_unlabeled_test,
                   save_csv_results,
                   save_pickle_model,
                   load_pickle_model,
                   get_best_params)

In [2]:
# Instantiate the project-local Evaluator; later cells call its
# `evaluate` and `print_eval` methods on fitted pipelines.
evaluator = Evaluator()

### Cargamos el _dataset_

In [3]:
# Load the three dataset splits. Later cells index `train` and `dev` as
# (texts, labels) and use `test` directly as the list of texts to predict on.
train, dev, test = load_datasets_unlabeled_test()

In [4]:
# Peek at the first two raw reviews of the training split.
for review_idx in (0, 1):
    pprint(train[0][review_idx])

(b'If ever a film needed English subtitles this is one . The accents and soft t'
 b"alking are great but hard to follow storyline as you ca n't understand what "
 b'they are saying and with no subtitles . Her songs were just beautiful and th'
 b'e story is great but a lot of it is lost on not catching what they are sayin'
 b'g . But is was a refreshing movie from most out there now . Fine acting and '
 b'story .')
(b"`` Just Married '' is a painfully cheesy movie that 's almost too lightheart"
 b'ed and cute . Ashton Kutcher and Brittany Murphy play a Romeo and Juliet-esq'
 b"e couple that has been repeated in so many movies , its sickening - he 's th"
 b"e classic dorky sports fan and child of middle-income parents , and she 's t"
 b'he daughter of some billionare whose profession we never learn . Her parents'
 b' would have rather married her to the snobby Peter Prentice , a dull antagon'
 b'ist who is more of a roadblock then an actual character . Anyway , the cute '
 b'couple get m

In [5]:
# Wrap each labelled split in a two-column frame:
# review text under 'data', its label under 'target'.
df_train = pd.DataFrame(dict(data=train[0], target=train[1]))
df_dev = pd.DataFrame(dict(data=dev[0], target=dev[1]))

In [6]:
# For each labelled split: heading, underline, shape and the first ten rows.
for heading, underline, shape_label, frame in (
    ("Train", "-----", "Shape 'Train' =", df_train),
    ("Dev", "---", "Shape 'Dev' =", df_dev),
):
    print(heading)
    print(underline)
    print(shape_label, frame.shape)
    display(frame.head(10))

Train
-----
Shape 'Train' = (963, 2)


Unnamed: 0,data,target
0,"b""If ever a film needed English subtitles this...",1
1,"b""`` Just Married '' is a painfully cheesy mov...",0
2,"b""I may not be able to add much to the reviews...",1
3,"b""I 've been a 3D nut for many decades . I pre...",0
4,"b""When I found out this version of Lonesome Do...",0
5,b'This review is to point out that this versio...,0
6,"b""If you look at the fine print on this DVD , ...",0
7,"b""This movie is based on a true story , and I ...",0
8,"b""... but a terrible DVD . The sound is plain ...",0
9,"b""From time to time I 've revisited this movie...",1


Dev
---
Shape 'Dev' = (107, 2)


Unnamed: 0,data,target
0,"b""In this dvd you have , the Boston POPS orche...",1
1,b'This show is the best . I was slow to get in...,1
2,"b""I have a couple of the Lucy collections , an...",0
3,"b""A Better Way To Die is a action-packed , dar...",1
4,"b""This is an unusual Merchant/Ivory film that ...",1
5,"b""I 'll admit that I 've never seen films by M...",1
6,"b""Being a huge vampire/horror fan and a fan of...",0
7,"b'As many reviewers put it , this is definitel...",1
8,"b""While it 's true the once-brilliant `` Soap ...",0
9,"b""This movie barely touches on the holocaust ....",0


In [7]:
# Summary statistics for both labelled splits, separated by a rule.
for position, (heading, underline, frame) in enumerate((
    ("Train Describe", "--------------", df_train),
    ("Dev Describe", "------------", df_dev),
)):
    if position:
        # Separator printed only between the two summaries.
        print("=" * 15)
    print(heading)
    print(underline)
    display(frame.describe())

Train Describe
--------------


Unnamed: 0,target
count,963.0
mean,0.500519
std,0.50026
min,0.0
25%,0.0
50%,1.0
75%,1.0
max,1.0


Dev Describe
------------


Unnamed: 0,target
count,107.0
mean,0.495327
std,0.502331
min,0.0
25%,0.0
50%,0.0
75%,1.0
max,1.0


### Separamos los datos en _**X**_ e _**y**_

In [8]:
# Pull texts and labels out of each frame as plain Python lists.
X_train, y_train = (df_train[column].tolist() for column in ('data', 'target'))
X_dev, y_dev = (df_dev[column].tolist() for column in ('data', 'target'))

# The test split is used unchanged as the inputs to predict on.
X_test = test

In [9]:
# Sanity check: container type of every split (one output line per split).
for split_parts in ((X_train, y_train), (X_dev, y_dev), (X_test,)):
    print(*map(type, split_parts))

<class 'list'> <class 'list'>
<class 'list'> <class 'list'>
<class 'list'>


In [10]:
# Size and class balance of the train and dev splits.
for position, (len_label, texts, counter_label, labels) in enumerate((
    ("Len(X_train) =", X_train, "Counter 'y_train' =", y_train),
    ("Len(X_dev) =", X_dev, "Counter 'y_dev' =", y_dev),
)):
    if position:
        # Blank line between the two reports.
        print("")
    print(len_label, len(texts))
    print(counter_label, Counter(labels))

Len(X_train) = 963
Counter 'y_train' = Counter({1: 482, 0: 481})

Len(X_dev) = 107
Counter 'y_dev' = Counter({0: 54, 1: 53})


### Cargamos otro Dataset (El visto en Clase)

In [11]:
# Second corpus: three (texts, labels) pairs, unpacked in a single statement.
train_2, dev_2, test_2 = load_datasets()
(X_train_2, y_train_2), (X_dev_2, y_dev_2), (X_test_2, y_test_2) = train_2, dev_2, test_2

In [12]:
# Sanity check: container types of texts and labels in the second corpus.
for texts, labels in ((X_train_2, y_train_2), (X_dev_2, y_dev_2), (X_test_2, y_test_2)):
    print(type(texts), type(labels))

<class 'list'> <class 'numpy.ndarray'>
<class 'list'> <class 'numpy.ndarray'>
<class 'list'> <class 'numpy.ndarray'>


In [13]:
# Convert the label arrays of the second corpus to plain Python lists so they
# can be concatenated with `+` further down.
# Going through np.asarray keeps this cell idempotent: on a first run the
# ndarray is passed through unchanged, and on a re-run (labels already lists)
# it no longer fails with "'list' object has no attribute 'tolist'".
y_train_2 = np.asarray(y_train_2).tolist()
y_dev_2 = np.asarray(y_dev_2).tolist()
y_test_2 = np.asarray(y_test_2).tolist()

print(type(y_train_2))
print(type(y_dev_2))
print(type(y_test_2))

<class 'list'>
<class 'list'>
<class 'list'>


In [14]:
# Size and class balance of each split of the second corpus.
for position, (len_label, texts, counter_label, labels) in enumerate((
    ("Len(X_train_2) =", X_train_2, "Counter 'y_train_2' =", y_train_2),
    ("Len(X_dev_2) =", X_dev_2, "Counter 'y_dev_2' =", y_dev_2),
    ("Len(X_test_2) =", X_test_2, "Counter 'y_test_2' =", y_test_2),
)):
    if position:
        # Blank line between consecutive reports.
        print("")
    print(len_label, len(texts))
    print(counter_label, Counter(labels))

Len(X_train_2) = 1200
Counter 'y_train_2' = Counter({1: 619, 0: 581})

Len(X_dev_2) = 300
Counter 'y_dev_2' = Counter({0: 162, 1: 138})

Len(X_test_2) = 500
Counter 'y_test_2' = Counter({0: 257, 1: 243})


In [15]:
# Fold all three splits of the second corpus (train, dev and test) into one
# extra pool of labelled training data.
# The `+` below is list concatenation; it relies on the labels having been
# converted from ndarrays to lists in the previous cell.
X_train_new = X_train_2 + X_dev_2 + X_test_2
y_train_new = y_train_2 + y_dev_2 + y_test_2

# Report the pooled size and its class balance.
print("Len(X_train_new) =", len(X_train_new))
print("Counter 'y_train_new' =", Counter(y_train_new))

Len(X_train_new) = 2000
Counter 'y_train_new' = Counter({1: 1000, 0: 1000})


---
## Hagamos un Grid-Search sobre el Development

In [16]:
# Search space for the CountVectorizer step ('vect__' prefix).
vect_grid = {
    'vect__strip_accents': [None, 'ascii'],
    'vect__stop_words': [None],
    'vect__binary': [True],
    'vect__lowercase': [True],
    'vect__analyzer': ['word'],
    'vect__tokenizer': [None],
    'vect__ngram_range': [(1, 2), (1, 3)],
    'vect__min_df': [3, 4],
    'vect__max_df': [0.3, 0.5, 0.7],
}

# Search space for the LogisticRegressionCV step ('clf__' prefix).
clf_grid = {
    'clf__random_state': [0],
    'clf__cv': [3],
    'clf__class_weight': [None, 'balanced'],
    'clf__scoring': ['accuracy'],
}

# Same keys, in the same order, as a single flat grid.
param_grid = {**vect_grid, **clf_grid}

# Materialise every combination so it can be counted and iterated with a progress bar.
param_list = list(ParameterGrid(param_grid))

In [17]:
# Number of parameter combinations the grid search below will fit.
len_param_list = len(param_list)
len_param_list

48

In [18]:
# Grid search scored on the development split: each parameter combination is
# fitted on both labelled corpora and evaluated on (X_dev, y_dev).
vect = CountVectorizer()
clf = LogisticRegressionCV()

pipeline = Pipeline([
    ('vect', vect),
    ('clf', clf),
])

# The fitting data is the same for every combination, so build the
# concatenated lists once instead of on every iteration.
X_fit = X_train + X_train_new
y_fit = y_train + y_train_new

results = []
for params in tqdm(param_list):
    # Every combination carries the full set of grid keys, so this call
    # reconfigures the shared pipeline completely before each fit.
    pipeline.set_params(**params)
    pipeline.fit(X_fit, y_fit)
    result = evaluator.evaluate(pipeline, X_dev, y_dev)
    # One flat record per combination: the evaluator's result merged with
    # the parameters that produced it.
    results.append({
        **result,
        **params,
    })

100%|██████████| 48/48 [18:13<00:00, 22.79s/it]


In [19]:
# Tabulate the grid-search records and show the ten best rows,
# ranked by accuracy and then by F1 (both descending).
results_df = pd.DataFrame(results)
results_df.sort_values(by=['acc', 'f1'], ascending=False).head(10)

Unnamed: 0,acc,clf__class_weight,clf__cv,clf__random_state,clf__scoring,f1,vect__analyzer,vect__binary,vect__lowercase,vect__max_df,vect__min_df,vect__ngram_range,vect__stop_words,vect__strip_accents,vect__tokenizer
10,0.897196,,3,0,accuracy,0.896872,word,True,True,0.5,3,"(1, 3)",,,
11,0.897196,,3,0,accuracy,0.896872,word,True,True,0.5,3,"(1, 3)",,ascii,
18,0.897196,,3,0,accuracy,0.896872,word,True,True,0.7,3,"(1, 3)",,,
19,0.897196,,3,0,accuracy,0.896872,word,True,True,0.7,3,"(1, 3)",,ascii,
22,0.897196,,3,0,accuracy,0.896872,word,True,True,0.7,4,"(1, 3)",,,
23,0.897196,,3,0,accuracy,0.896872,word,True,True,0.7,4,"(1, 3)",,ascii,
34,0.897196,balanced,3,0,accuracy,0.896872,word,True,True,0.5,3,"(1, 3)",,,
35,0.897196,balanced,3,0,accuracy,0.896872,word,True,True,0.5,3,"(1, 3)",,ascii,
42,0.897196,balanced,3,0,accuracy,0.896872,word,True,True,0.7,3,"(1, 3)",,,
43,0.897196,balanced,3,0,accuracy,0.896872,word,True,True,0.7,3,"(1, 3)",,ascii,


In [20]:
# Ask the project helper for the two best (vectorizer params, classifier params) pairs.
best_params_vect_clf = get_best_params(results_df, length=2)

# Vectorizer side of each selected pair.
print()
print("Mejores parametros Vectorizador", "-------------------------------", sep="\n")
for best_params_vect, _ in best_params_vect_clf:
    pprint(best_params_vect)

print("\n===============================\n")

# Classifier side of each selected pair.
print("Mejores parametros Clasificador", "-------------------------------", sep="\n")
for _, best_params_clf in best_params_vect_clf:
    pprint(best_params_clf)
print()


Mejores parametros Vectorizador
-------------------------------
{'analyzer': 'word',
 'binary': True,
 'lowercase': True,
 'max_df': 0.5,
 'min_df': 3,
 'ngram_range': (1, 3),
 'stop_words': None,
 'strip_accents': None,
 'tokenizer': None}
{'analyzer': 'word',
 'binary': True,
 'lowercase': True,
 'max_df': 0.5,
 'min_df': 3,
 'ngram_range': (1, 3),
 'stop_words': None,
 'strip_accents': 'ascii',
 'tokenizer': None}


Mejores parametros Clasificador
-------------------------------
{'class_weight': None, 'cv': 3, 'random_state': 0, 'scoring': 'accuracy'}
{'class_weight': None, 'cv': 3, 'random_state': 0, 'scoring': 'accuracy'}



In [21]:
# Refit each selected configuration from scratch on both labelled corpora and
# print its evaluation on the development split.
# NOTE(review): the next cell re-initialises `my_preds`, so the predictions
# collected here are not the ones that end up written to CSV.
my_preds = []
for vect_kwargs, clf_kwargs in best_params_vect_clf:
    for chosen_params in (vect_kwargs, clf_kwargs):
        pprint(chosen_params)
    print("\n--------------\n")

    pipeline = Pipeline(steps=[
        ('vect', CountVectorizer(**vect_kwargs)),
        ('clf', LogisticRegressionCV(**clf_kwargs)),
    ])
    pipeline.fit(X_train + X_train_new,
                 y_train + y_train_new)

    evaluator.print_eval(pipeline, X_dev, y_dev)

    my_pred = pipeline.predict(X_test)

    # Header only; the predictions themselves are not printed.
    print("Mi Prediccion", "-------------", sep="\n")
    my_preds.append(my_pred)
    print("=" * 79)

{'analyzer': 'word',
 'binary': True,
 'lowercase': True,
 'max_df': 0.5,
 'min_df': 3,
 'ngram_range': (1, 3),
 'stop_words': None,
 'strip_accents': None,
 'tokenizer': None}
{'class_weight': None, 'cv': 3, 'random_state': 0, 'scoring': 'accuracy'}

--------------

Accuracy = 0.90

             precision    recall  f1-score   support

        neg       0.86      0.94      0.90        54
        pos       0.94      0.85      0.89        53

avg / total       0.90      0.90      0.90       107

[[51  3]
 [ 8 45]]
Mi Prediccion
-------------
{'analyzer': 'word',
 'binary': True,
 'lowercase': True,
 'max_df': 0.5,
 'min_df': 3,
 'ngram_range': (1, 3),
 'stop_words': None,
 'strip_accents': 'ascii',
 'tokenizer': None}
{'class_weight': None, 'cv': 3, 'random_state': 0, 'scoring': 'accuracy'}

--------------

Accuracy = 0.90

             precision    recall  f1-score   support

        neg       0.86      0.94      0.90        54
        pos       0.94      0.85      0.89        53

avg 

In [22]:
# Final models: the same selected configurations, now fitted on every labelled
# example (development split included), then used to predict the test inputs.
# No evaluation is printed here; the dev split is part of the fitting data.
my_preds = []
for vect_kwargs, clf_kwargs in best_params_vect_clf:
    pprint(vect_kwargs)
    pprint(clf_kwargs)
    print("\n--------------\n")

    final_steps = [
        ('vect', CountVectorizer(**vect_kwargs)),
        ('clf', LogisticRegressionCV(**clf_kwargs)),
    ]
    pipeline = Pipeline(final_steps)
    pipeline.fit(X_train + X_train_new + X_dev,
                 y_train + y_train_new + y_dev)

    # Keep one prediction vector per configuration for the export cell.
    my_pred = pipeline.predict(X_test)
    my_preds.append(my_pred)
    print("=" * 79)

{'analyzer': 'word',
 'binary': True,
 'lowercase': True,
 'max_df': 0.5,
 'min_df': 3,
 'ngram_range': (1, 3),
 'stop_words': None,
 'strip_accents': None,
 'tokenizer': None}
{'class_weight': None, 'cv': 3, 'random_state': 0, 'scoring': 'accuracy'}

--------------

{'analyzer': 'word',
 'binary': True,
 'lowercase': True,
 'max_df': 0.5,
 'min_df': 3,
 'ngram_range': (1, 3),
 'stop_words': None,
 'strip_accents': 'ascii',
 'tokenizer': None}
{'class_weight': None, 'cv': 3, 'random_state': 0, 'scoring': 'accuracy'}

--------------



In [23]:
# Write one results file per prediction vector, numbered 01, 02, ...
for rank, predictions in enumerate(my_preds, 1):
    two_digit_rank = format(rank, "02d")
    save_csv_results("results-countLRCV-gs-{}.csv".format(two_digit_rank), predictions)

In [24]:
# Optional (left disabled): persist the most recently fitted pipeline.
# NOTE(review): the file name says 'TF' although this notebook builds its
# pipelines with CountVectorizer — confirm the intended name before enabling.
#save_pickle_model(pipeline, 'TFLogCV-GS.pickle')