# Sentiment Analyisis: Ajuste de Parámetros

In [17]:
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [18]:
from util import load_datasets
train, dev, test = load_datasets()
X_train, y_train = train
X_dev, y_dev = dev
X_test, y_test = test

## Estado del Arte Actual

In [19]:
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from util import print_eval

pipeline = Pipeline([
    ('vect', CountVectorizer(binary=True)),
    ('clf', LogisticRegression(random_state=0)),
])
pipeline.fit(X_train, y_train)
print_eval(pipeline, X_dev, y_dev)

accuracy	0.85

             precision    recall  f1-score   support

        neg       0.85      0.88      0.86       162
        pos       0.85      0.82      0.83       138

avg / total       0.85      0.85      0.85       300

[[142  20]
 [ 25 113]]


## Vectorizador

Primero hagamos un estudio superficial para ver qué parámetros vale la pena analizar.

### Rango de n-gramas

In [20]:
pipeline = Pipeline([
    ('vect', CountVectorizer(
        binary=True,
        ngram_range=(1, 2),
    )),
    ('clf', LogisticRegression(random_state=0)),
])
pipeline.fit(X_train, y_train)
print_eval(pipeline, X_dev, y_dev)

accuracy	0.87

             precision    recall  f1-score   support

        neg       0.87      0.88      0.88       162
        pos       0.86      0.85      0.85       138

avg / total       0.87      0.87      0.87       300

[[143  19]
 [ 21 117]]


### Min Frequency

In [21]:
pipeline = Pipeline([
    ('vect', CountVectorizer(
        binary=True,
        #min_df=5,
    )),
    ('clf', LogisticRegression(random_state=0)),
])
pipeline.fit(X_train, y_train)
print_eval(pipeline, X_dev, y_dev)

accuracy	0.85

             precision    recall  f1-score   support

        neg       0.85      0.88      0.86       162
        pos       0.85      0.82      0.83       138

avg / total       0.85      0.85      0.85       300

[[142  20]
 [ 25 113]]


### Max Frequency

In [22]:
pipeline = Pipeline([
    ('vect', CountVectorizer(
        binary=True,
        max_df=0.7,
    )),
    ('clf', LogisticRegression(random_state=0)),
])
pipeline.fit(X_train, y_train)
print_eval(pipeline, X_dev, y_dev)

accuracy	0.85

             precision    recall  f1-score   support

        neg       0.85      0.86      0.86       162
        pos       0.84      0.83      0.83       138

avg / total       0.85      0.85      0.85       300

[[140  22]
 [ 24 114]]


### Stop words

In [23]:
pipeline = Pipeline([
    ('vect', CountVectorizer(
        binary=True,
        stop_words='english',
    )),
    ('clf', LogisticRegression(random_state=0)),
])
pipeline.fit(X_train, y_train)
print_eval(pipeline, X_dev, y_dev)

accuracy	0.84

             precision    recall  f1-score   support

        neg       0.85      0.86      0.86       162
        pos       0.83      0.83      0.83       138

avg / total       0.84      0.84      0.84       300

[[139  23]
 [ 24 114]]


### Grid-Search en Development

Probemos muchas las combinaciones posibles de valores.

In [24]:
from sklearn.model_selection import ParameterGrid

param_grid = {
    'vect__binary': [True],
    'vect__ngram_range': [(1, 1), (1, 2), (1, 3), (1, 4), (1, 5)],
    'vect__min_df': [1, 3, 5, 7],
    'vect__max_df': [0.95, 0.9, 0.7],
    'clf__random_state': [0],
}

params_list = list(ParameterGrid(param_grid))

In [25]:
params_list[10]
len(params_list)

60

In [None]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.pipeline import Pipeline
from sklearn import metrics
from util import eval

pipeline = Pipeline([
    ('vect', CountVectorizer()),
    ('clf', LogisticRegression()),
])

results = []
for params in params_list:
    # TODO: add progress bar!
    pipeline.set_params(**params)
    pipeline.fit(X_train, y_train)
    result = eval(pipeline, X_dev, y_dev)
    
    results.append({
        **result,
        **params,
    })

In [None]:
import pandas as pd

results_df = pd.DataFrame(results)
results_df.sort_values(['acc', 'f1'], ascending=False)[:10]

¡Excelente!

Conclusiones:
1. Tenemos dos mejores configuraciones.
2. Para próximas búsquedas podemos descartar algunos valores.

Elegimos la siguiente configuración:

In [None]:
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from util import print_eval

pipeline = Pipeline([
    ('vect', CountVectorizer(
        binary=True,
        min_df=3,
        max_df=0.90,
        ngram_range=(1, 5),
    )),
    ('clf', LogisticRegression(random_state=0)),
])
pipeline.fit(X_train, y_train)
print_eval(pipeline, X_dev, y_dev)

Evaluamos en test y guardamos el modelo:

In [None]:
print_eval(pipeline, X_test, y_test)
from util import save_model
save_model(pipeline, '2018-07-27_count_logreg')

### Grid-Search con Cross Validation

**¡Ejercicio!**

En lugar de hacer la búsqueda sobre dev, hacer 5-fold cross validation sobre la unión de train y dev.