### Laboratorio 2 - Aprendizaje Supervisado

In [1]:
# Load the three dataset splits through the project helper. `train` and `dev`
# are indexed below as [0] (texts) and [1] (labels); `test` is passed straight
# to predict() later on.
from utils import load_datasets_unlabeled_test
train, dev, test = load_datasets_unlabeled_test()

In [2]:
import pandas as pd

# Wrap each labeled split in a two-column frame: raw text and its target label.
data_train, data_dev = (
    pd.DataFrame({'data': split[0], 'target': split[1]})
    for split in (train, dev)
)

In [3]:
# Summary statistics of the training labels (quick class-balance check).
data_train.describe()

Unnamed: 0,target
count,963.0
mean,0.500519
std,0.50026
min,0.0
25%,0.0
50%,1.0
75%,1.0
max,1.0


In [4]:
# Summary statistics of the dev labels (quick class-balance check).
data_dev.describe()

Unnamed: 0,target
count,107.0
mean,0.495327
std,0.502331
min,0.0
25%,0.0
50%,0.0
75%,1.0
max,1.0


## División: Train, Dev y Test

Hay 1070 instancias etiquetadas, de las cuales se utilizarán:
  - Train: 90% (963)
  - Dev: 10% (107)
  
Hay 500 instancias no etiquetadas que son de test    
  - Test: (500)
  
Notar que la cantidad de datos etiquetados (de train) es relativamente pequeña respecto a la de los no etiquetados (de test). Probablemente haya que intentar aumentar los datos de train.

In [5]:
# Plain Python lists of documents and labels for each labeled split.
X_train = data_train['data'].tolist()
y_train = data_train['target'].tolist()
X_dev = data_dev['data'].tolist()
y_dev = data_dev['target'].tolist()

## CountVectorizer +  LinearSVC:

In [6]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.pipeline import Pipeline
from sklearn.svm import LinearSVC

# The vectorizer's input must be a list, so convert the frame columns.
X_train = data_train['data'].tolist()
y_train = data_train['target'].tolist()
X_dev = data_dev['data'].tolist()
y_dev = data_dev['target'].tolist()

# Baseline model: raw token counts fed into a linear SVM.
pipeline = Pipeline(steps=[
    ('vect', CountVectorizer()),
    ('clf', LinearSVC(random_state=0)),
])

pipeline.fit(X_train, y_train)

Pipeline(memory=None,
     steps=[('vect', CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), preprocessor=None, stop_words=None,
        strip..., max_iter=1000,
     multi_class='ovr', penalty='l2', random_state=0, tol=0.0001,
     verbose=0))])

In [7]:
# Score the fitted baseline on the data it was trained on.
from utils import print_short_eval
print_short_eval(pipeline, X_train, y_train)

accuracy	1.00	macro f1	1.00


In [8]:
# Score the fitted baseline on the held-out dev split.
print_short_eval(pipeline, X_dev, y_dev)

accuracy	0.79	macro f1	0.79


### Experimento: Binarizar Conteos

In [9]:
# Same baseline, but the vectorizer is created with binary=True.
pipeline = Pipeline(steps=[
    ('vect', CountVectorizer(binary=True)),
    ('clf', LinearSVC(random_state=0)),
]).fit(X_train, y_train)

pipeline

Pipeline(memory=None,
     steps=[('vect', CountVectorizer(analyzer='word', binary=True, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), preprocessor=None, stop_words=None,
        strip_..., max_iter=1000,
     multi_class='ovr', penalty='l2', random_state=0, tol=0.0001,
     verbose=0))])

In [10]:
# Training-set score of the binary-count pipeline.
print_short_eval(pipeline, X_train, y_train)

accuracy	1.00	macro f1	1.00


In [11]:
# Dev-set score of the binary-count pipeline.
print_short_eval(pipeline, X_dev, y_dev)

accuracy	0.83	macro f1	0.83


#### Conclusión: Binarizar los conteos es una buena elección ya que aumenta notablemente la accuracy.

## Distintos Modelos de Clasificación

Probamos distintos modelos de clasificación usando los valores por defecto.

Evaluamos en train (bias) y en dev (variance).

In [12]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import LinearSVC, SVC
from sklearn.tree import DecisionTreeClassifier

# Candidate classifiers with default hyperparameters. The two constructed
# without random_state come first; the rest are seeded with random_state=0.
clfs = [KNeighborsClassifier(), MultinomialNB()] + [
    model_cls(random_state=0)
    for model_cls in (
        DecisionTreeClassifier,
        LogisticRegression,
        LinearSVC,
        SVC,
        RandomForestClassifier,
    )
]

In [13]:
# Compare every candidate classifier on binary bag-of-words features.
vect = CountVectorizer(binary=True)

for clf in clfs:
    print(type(clf))
    pipeline = Pipeline(steps=[('vect', vect), ('clf', clf)]).fit(X_train, y_train)
    print_short_eval(pipeline, X_train, y_train)  # first line printed: train score
    print_short_eval(pipeline, X_dev, y_dev)      # second line printed: dev score

<class 'sklearn.neighbors.classification.KNeighborsClassifier'>
accuracy	0.73	macro f1	0.72
accuracy	0.56	macro f1	0.51
<class 'sklearn.naive_bayes.MultinomialNB'>
accuracy	0.97	macro f1	0.97
accuracy	0.87	macro f1	0.87
<class 'sklearn.tree.tree.DecisionTreeClassifier'>
accuracy	1.00	macro f1	1.00
accuracy	0.67	macro f1	0.67
<class 'sklearn.linear_model.logistic.LogisticRegression'>
accuracy	1.00	macro f1	1.00
accuracy	0.87	macro f1	0.87
<class 'sklearn.svm.classes.LinearSVC'>
accuracy	1.00	macro f1	1.00
accuracy	0.83	macro f1	0.83
<class 'sklearn.svm.classes.SVC'>


  'precision', 'predicted', average, warn_for)


accuracy	0.50	macro f1	0.33
accuracy	0.50	macro f1	0.33
<class 'sklearn.ensemble.forest.RandomForestClassifier'>
accuracy	0.99	macro f1	0.99
accuracy	0.77	macro f1	0.76


## Ahora probamos con el Vectorizador TF-IDF

In [14]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Repeat the classifier comparison, this time on TF-IDF features.
vect = TfidfVectorizer(binary=True)
for clf in clfs:
    print(type(clf))
    pipeline = Pipeline(steps=[('vect', vect), ('clf', clf)]).fit(X_train, y_train)
    print_short_eval(pipeline, X_train, y_train)  # first line printed: train score
    print_short_eval(pipeline, X_dev, y_dev)      # second line printed: dev score

<class 'sklearn.neighbors.classification.KNeighborsClassifier'>
accuracy	0.86	macro f1	0.86
accuracy	0.83	macro f1	0.83
<class 'sklearn.naive_bayes.MultinomialNB'>
accuracy	0.99	macro f1	0.99
accuracy	0.84	macro f1	0.84
<class 'sklearn.tree.tree.DecisionTreeClassifier'>
accuracy	1.00	macro f1	1.00
accuracy	0.70	macro f1	0.70
<class 'sklearn.linear_model.logistic.LogisticRegression'>
accuracy	0.99	macro f1	0.99
accuracy	0.88	macro f1	0.88
<class 'sklearn.svm.classes.LinearSVC'>
accuracy	1.00	macro f1	1.00
accuracy	0.87	macro f1	0.87
<class 'sklearn.svm.classes.SVC'>


  'precision', 'predicted', average, warn_for)


accuracy	0.50	macro f1	0.33
accuracy	0.50	macro f1	0.33
<class 'sklearn.ensemble.forest.RandomForestClassifier'>
accuracy	0.99	macro f1	0.99
accuracy	0.75	macro f1	0.74


### Conclusiones:

- Utilizando el Vectorizador TF-IDF el modelo KNeighbors mejora considerablemente. (acc = 83%)
- Para MultinomialNB conviene utilizar CountVectorizer. (acc = 87%)
- LogisticRegression y LinearSVC mejoran con el uso de TF-IDF (acc = 88% y 87% respectivamente).
- Es conveniente descartar los modelos restantes.

### IDEA: 
Ajustar hiperparámetros en los modelos más prometedores y luego utilizar VotingClassifier utilizando la mejor versión de cada uno de ellos.

Primero probemos VotingClassifier sin ajustar hiperparámetros

In [15]:
from sklearn.ensemble import VotingClassifier

# The four candidate models with untuned hyperparameters.
# Each pipeline owns its vectorizer: sharing one mutable TfidfVectorizer object
# between pipelines means fitting any of them silently refits the others' vectorizer.
clf1 = Pipeline([
    ('vect', CountVectorizer(binary=True)),
    ('clf', MultinomialNB()),
])
clf2 = Pipeline([
    ('vect', TfidfVectorizer(binary=True)),
    ('clf', KNeighborsClassifier()),
])
clf3 = Pipeline([
    ('vect', TfidfVectorizer(binary=True)),
    ('clf', LogisticRegression(random_state=0)),
])
clf4 = Pipeline([
    ('vect', TfidfVectorizer(binary=True)),
    ('clf', LinearSVC(random_state=0)),
])

# Hard (majority) voting over MultinomialNB, KNeighbors and LinearSVC;
# LogisticRegression (clf3) is left out of this variant.
eclf1 = VotingClassifier(estimators=[('mnb', clf1), ('knc', clf2), ('lsvc', clf4)],
                         voting='hard')
eclf1 = eclf1.fit(X_train, y_train)
print_short_eval(eclf1, X_train, y_train)
print_short_eval(eclf1, X_dev, y_dev)

accuracy	0.98	macro f1	0.98
accuracy	0.91	macro f1	0.91


  if diff:
  if diff:


In [16]:
# The four candidate models with untuned hyperparameters.
# Each pipeline owns its vectorizer: sharing one mutable TfidfVectorizer object
# between pipelines means fitting any of them silently refits the others' vectorizer.
clf1 = Pipeline([
    ('vect', CountVectorizer(binary=True)),
    ('clf', MultinomialNB()),
])
clf2 = Pipeline([
    ('vect', TfidfVectorizer(binary=True)),
    ('clf', KNeighborsClassifier()),
])
clf3 = Pipeline([
    ('vect', TfidfVectorizer(binary=True)),
    ('clf', LogisticRegression(random_state=0)),
])
clf4 = Pipeline([
    ('vect', TfidfVectorizer(binary=True)),
    ('clf', LinearSVC(random_state=0)),
])

# Hard (majority) voting over MultinomialNB, KNeighbors and LogisticRegression;
# LinearSVC (clf4) is left out of this variant.
eclf1 = VotingClassifier(estimators=[('mnb', clf1), ('knc', clf2), ('lr', clf3)],
                         voting='hard')
eclf1 = eclf1.fit(X_train, y_train)
print_short_eval(eclf1, X_train, y_train)
print_short_eval(eclf1, X_dev, y_dev)

accuracy	0.98	macro f1	0.98
accuracy	0.91	macro f1	0.91


  if diff:
  if diff:


# Resultados hasta el momento:

La mejor accuracy se consigue con VotingClassifier utilizando, de los 4 modelos candidatos, todos menos LogisticRegression o bien todos menos LinearSVC.

# Pasos a seguir

Ajustar hiperparametros de los 4 modelos lo mejor posible y volver a intentar VotingClassifier.

Primero evaluamos test con el mejor modelo actual.

In [17]:
# Predict labels for the test split with the current voting ensemble.
y_pred = eclf1.predict(test)

  if diff:


In [18]:
from utils import save_results

# Convert the predictions to a plain list and write them to disk.
predictions = y_pred.tolist()
save_results('results.csv', predictions)

Obtuvimos un accuracy del 86.66% sobre los datos de test.

### Intentemos mejorar LogisticRegression (acc dev actual = 88%)

In [19]:
# Reference point before tuning: TF-IDF + LogisticRegression with default settings.
pipeline = Pipeline(steps=[
    ('vect', TfidfVectorizer(binary=True)),
    ('clf', LogisticRegression(random_state=0)),
]).fit(X_train, y_train)
print_short_eval(pipeline, X_train, y_train)
print_short_eval(pipeline, X_dev, y_dev)

accuracy	0.99	macro f1	0.99
accuracy	0.88	macro f1	0.88


Nota: Si se utiliza stop_words en el vectorizador el modelo obtiene peor performance.
(en este caso particular, no es una regla general).

### Grid-Search en Dev

In [20]:
from sklearn.model_selection import ParameterGrid

# Vectorizer-only search space; the classifier is just pinned to a fixed seed.
param_grid = dict(
    vect__binary=[True],
    vect__ngram_range=[(1, n) for n in range(1, 6)],
    vect__min_df=[1, 3, 5, 7],
    vect__max_df=[0.95, 0.9, 0.7],
    clf__random_state=[0],
)

params_list = list(ParameterGrid(param_grid))
len(params_list)

60

In [21]:
# NOTE: this import shadows the builtin eval() for the rest of the notebook.
from utils import eval

pipeline = Pipeline(steps=[
    ('vect', TfidfVectorizer()),
    ('clf', LogisticRegression()),
])

# Fit on train with every configuration and store its dev metrics next to the parameters.
results = []
for params in params_list:
    pipeline.set_params(**params).fit(X_train, y_train)
    results.append({**eval(pipeline, X_dev, y_dev), **params})

In [22]:
# Ten best configurations, ranked by dev accuracy and then by F1.
results_df = pd.DataFrame(results)
results_df.sort_values(by=['acc', 'f1'], ascending=False).head(10)

Unnamed: 0,acc,clf__random_state,f1,vect__binary,vect__max_df,vect__min_df,vect__ngram_range
11,0.88785,0,0.887841,True,0.95,5,"(1, 2)"
31,0.88785,0,0.887841,True,0.9,5,"(1, 2)"
51,0.88785,0,0.887841,True,0.7,5,"(1, 2)"
17,0.88785,0,0.887762,True,0.95,7,"(1, 3)"
18,0.88785,0,0.887762,True,0.95,7,"(1, 4)"
19,0.88785,0,0.887762,True,0.95,7,"(1, 5)"
37,0.88785,0,0.887762,True,0.9,7,"(1, 3)"
38,0.88785,0,0.887762,True,0.9,7,"(1, 4)"
39,0.88785,0,0.887762,True,0.9,7,"(1, 5)"
57,0.88785,0,0.887762,True,0.7,7,"(1, 3)"


Fijamos algunos de los mejores hiperparametros del vectorizador y ahora ajustamos algunos hiperparámetros de LogisticRegression.

In [23]:
# Narrowed vectorizer options plus a sweep over the regularization parameter C.
param_grid = dict(
    vect__binary=[True],
    vect__ngram_range=[(1, 2), (1, 3)],
    vect__min_df=[5, 7],
    vect__max_df=[0.95, 0.9],
    clf__random_state=[0],
    # parameter for LogisticRegression (smaller values -> stronger regularization)
    clf__C=[0.001, 0.005, 0.01, 0.05, 0.1, 0.3, 0.5, 1.0],
)

params_list = list(ParameterGrid(param_grid))
len(params_list)

64

In [24]:
pipeline = Pipeline(steps=[
    ('vect', TfidfVectorizer()),
    ('clf', LogisticRegression()),
])

# Fit on train with every configuration and store its dev metrics next to the parameters.
results = []
for params in params_list:
    pipeline.set_params(**params).fit(X_train, y_train)
    results.append({**eval(pipeline, X_dev, y_dev), **params})

In [25]:
# Ten best configurations, ranked by dev accuracy and then by F1.
results_df = pd.DataFrame(results)
results_df.sort_values(by=['acc', 'f1'], ascending=False).head(10)

Unnamed: 0,acc,clf__C,clf__random_state,f1,vect__binary,vect__max_df,vect__min_df,vect__ngram_range
40,0.897196,0.3,0,0.89716,True,0.95,5,"(1, 2)"
41,0.897196,0.3,0,0.89716,True,0.95,5,"(1, 3)"
44,0.897196,0.3,0,0.89716,True,0.9,5,"(1, 2)"
45,0.897196,0.3,0,0.89716,True,0.9,5,"(1, 3)"
48,0.897196,0.5,0,0.89716,True,0.95,5,"(1, 2)"
52,0.897196,0.5,0,0.89716,True,0.9,5,"(1, 2)"
49,0.88785,0.5,0,0.887841,True,0.95,5,"(1, 3)"
53,0.88785,0.5,0,0.887841,True,0.9,5,"(1, 3)"
56,0.88785,1.0,0,0.887841,True,0.95,5,"(1, 2)"
60,0.88785,1.0,0,0.887841,True,0.9,5,"(1, 2)"


Entrenamos nuestro modelo con los mejores hiperparámetros obtenidos y luego evaluamos

In [26]:
# LogisticRegression refit with the top-ranked configuration from the grid search.
pipeline = Pipeline(steps=[
    ('vect', TfidfVectorizer(binary=True, max_df=0.95, min_df=5, ngram_range=(1, 2))),
    ('clf', LogisticRegression(random_state=0, C=0.3)),
]).fit(X_train, y_train)
print_short_eval(pipeline, X_train, y_train)
print_short_eval(pipeline, X_dev, y_dev)

accuracy	0.95	macro f1	0.95
accuracy	0.90	macro f1	0.90


### Se obtuvo una mejora de performance! (de 88% a 90% en dev)

In [27]:
from utils import save_results

# Predict the test split with the tuned LogisticRegression pipeline and save the labels.
y_pred = pipeline.predict(test)
predictions = y_pred.tolist()
save_results('results_LG.csv', predictions)

Se obtuvo un accuracy del 82.66% sobre los datos de test. (solo utilizando LogisticRegression).

## Intentemos mejorar MultinomialNB (acc dev actual = 87%)

In [28]:
# Reference point before tuning: binary counts + MultinomialNB with default settings.
pipeline = Pipeline(steps=[
    ('vect', CountVectorizer(binary=True)),
    ('clf', MultinomialNB()),
]).fit(X_train, y_train)
print_short_eval(pipeline, X_train, y_train)
print_short_eval(pipeline, X_dev, y_dev)

accuracy	0.97	macro f1	0.97
accuracy	0.87	macro f1	0.87


### Grid-Search en Dev

In [29]:
from sklearn.model_selection import ParameterGrid

# Vectorizer-only search space for the CountVectorizer + MultinomialNB pipeline.
param_grid = dict(
    vect__binary=[True],
    vect__ngram_range=[(1, n) for n in range(1, 6)],
    vect__min_df=[1, 3, 5, 7],
    vect__max_df=[0.95, 0.9, 0.7],
)

params_list = list(ParameterGrid(param_grid))
len(params_list)

60

In [30]:
pipeline = Pipeline(steps=[
    ('vect', CountVectorizer()),
    ('clf', MultinomialNB()),
])

# Fit on train with every configuration and store its dev metrics next to the parameters.
results = []
for params in params_list:
    pipeline.set_params(**params).fit(X_train, y_train)
    results.append({**eval(pipeline, X_dev, y_dev), **params})

In [31]:
# Ten best configurations, ranked by dev accuracy and then by F1.
results_df = pd.DataFrame(results)
results_df.sort_values(by=['acc', 'f1'], ascending=False).head(10)

Unnamed: 0,acc,f1,vect__binary,vect__max_df,vect__min_df,vect__ngram_range
45,0.88785,0.887841,True,0.7,3,"(1, 1)"
5,0.878505,0.878505,True,0.95,3,"(1, 1)"
25,0.878505,0.878505,True,0.9,3,"(1, 1)"
10,0.878505,0.878462,True,0.95,5,"(1, 1)"
30,0.878505,0.878462,True,0.9,5,"(1, 1)"
50,0.878505,0.878462,True,0.7,5,"(1, 1)"
0,0.869159,0.869147,True,0.95,1,"(1, 1)"
20,0.869159,0.869147,True,0.9,1,"(1, 1)"
40,0.869159,0.869147,True,0.7,1,"(1, 1)"
7,0.869159,0.868873,True,0.95,3,"(1, 3)"


Fijamos algunos de los mejores hiperparámetros del vectorizador y ahora ajustamos algunos hiperparámetros de MultinomialNB.

In [32]:
# Narrowed vectorizer options plus a sweep over MultinomialNB's smoothing parameter alpha.
param_grid = {
    'vect__binary': [True],
    'vect__ngram_range': [(1, 1)],
    'vect__min_df': [3, 5],
    'vect__max_df': [0.95, 0.9, 0.7],
    # The smallest value is 1e-10 rather than 0.0: with alpha=0.0 scikit-learn
    # warns on every fit and replaces it with 1e-10 anyway, so the floor is
    # spelled out here. The grid size is unchanged.
    'clf__alpha': [1e-10, 0.01, 0.03, 0.1, 0.5, 1.0, 5.0, 8.0, 10.0, 15.0, 20.0]
}

params_list = list(ParameterGrid(param_grid))
len(params_list)

66

In [33]:
pipeline = Pipeline(steps=[
    ('vect', CountVectorizer()),
    ('clf', MultinomialNB()),
])

# Fit on train with every configuration and store its dev metrics next to the parameters.
results = []
for params in params_list:
    pipeline.set_params(**params).fit(X_train, y_train)
    results.append({**eval(pipeline, X_dev, y_dev), **params})

  'setting alpha = %.1e' % _ALPHA_MIN)
  'setting alpha = %.1e' % _ALPHA_MIN)
  'setting alpha = %.1e' % _ALPHA_MIN)
  'setting alpha = %.1e' % _ALPHA_MIN)
  'setting alpha = %.1e' % _ALPHA_MIN)
  'setting alpha = %.1e' % _ALPHA_MIN)


In [34]:
# Ten best configurations, ranked by dev accuracy and then by F1.
results_df = pd.DataFrame(results)
results_df.sort_values(by=['acc', 'f1'], ascending=False).head(10)

Unnamed: 0,acc,clf__alpha,f1,vect__binary,vect__max_df,vect__min_df,vect__ngram_range
53,0.906542,10.0,0.906534,True,0.7,5,"(1, 1)"
45,0.897196,8.0,0.89716,True,0.9,5,"(1, 1)"
49,0.897196,10.0,0.89716,True,0.95,5,"(1, 1)"
51,0.897196,10.0,0.89716,True,0.9,5,"(1, 1)"
28,0.88785,0.5,0.887841,True,0.7,3,"(1, 1)"
34,0.88785,1.0,0.887841,True,0.7,3,"(1, 1)"
55,0.88785,15.0,0.887841,True,0.95,5,"(1, 1)"
57,0.88785,15.0,0.887841,True,0.9,5,"(1, 1)"
59,0.88785,15.0,0.887841,True,0.7,5,"(1, 1)"
37,0.88785,5.0,0.887762,True,0.95,5,"(1, 1)"


Entrenamos nuestro modelo con los mejores hiperparámetros obtenidos y luego evaluamos

In [35]:
# MultinomialNB refit with the top-ranked configuration from the grid search.
pipeline = Pipeline(steps=[
    ('vect', CountVectorizer(binary=True, max_df=0.70, min_df=5, ngram_range=(1, 1))),
    ('clf', MultinomialNB(alpha=10.0)),
]).fit(X_train, y_train)
print_short_eval(pipeline, X_train, y_train)
print_short_eval(pipeline, X_dev, y_dev)

accuracy	0.92	macro f1	0.92
accuracy	0.91	macro f1	0.91


### Se obtuvo una mejora de performance! (de 87% a 91% en dev)

In [36]:
# Tuned MultinomialNB.
clf1 = Pipeline(steps=[
    ('vect', CountVectorizer(binary=True, max_df=0.70, min_df=5, ngram_range=(1, 2 - 1))),
    ('clf', MultinomialNB(alpha=10.0)),
])

# KNeighbors with default neighbours, using the same vectorizer settings as clf3.
clf2 = Pipeline(steps=[
    ('vect', TfidfVectorizer(binary=True, max_df=0.95, min_df=5, ngram_range=(1, 2))),
    ('clf', KNeighborsClassifier()),
])

# Tuned LogisticRegression.
clf3 = Pipeline(steps=[
    ('vect', TfidfVectorizer(binary=True, max_df=0.95, min_df=5, ngram_range=(1, 2))),
    ('clf', LogisticRegression(random_state=0, C=0.3)),
])

# LinearSVC with default settings; defined here but not part of the ensemble below.
vect = TfidfVectorizer(binary=True)
clf4 = Pipeline(steps=[
    ('vect', vect),
    ('clf', LinearSVC(random_state=0)),
])

# Hard (majority) voting over MultinomialNB, KNeighbors and LogisticRegression.
eclf1 = VotingClassifier(
    estimators=[('mnb', clf1), ('knc', clf2), ('lr', clf3)],
    voting='hard',
).fit(X_train, y_train)
print_short_eval(eclf1, X_train, y_train)
print_short_eval(eclf1, X_dev, y_dev)

accuracy	0.94	macro f1	0.94
accuracy	0.93	macro f1	0.93


  if diff:
  if diff:


# TODO: probar evaluar test con este modelo

Al parecer obtenemos mejores resultados usando VotingClassifier con las versiones mejoradas de LG y MultinomialNB. Intentemos mejorar los otros dos modelos

## Intentemos mejorar KNeighborsClassifier (acc dev actual = 83%)

In [37]:
# Reference point before tuning: TF-IDF + KNeighborsClassifier with default settings.
pipeline = Pipeline(steps=[
    ('vect', TfidfVectorizer(binary=True)),
    ('clf', KNeighborsClassifier()),
]).fit(X_train, y_train)
print_short_eval(pipeline, X_train, y_train)
print_short_eval(pipeline, X_dev, y_dev)

accuracy	0.86	macro f1	0.86
accuracy	0.83	macro f1	0.83


### Grid-Search in Dev

In [38]:
from sklearn.model_selection import ParameterGrid

# Unigram TF-IDF options combined with a sweep over the number of neighbours.
param_grid = dict(
    vect__binary=[True],
    vect__ngram_range=[(1, 1)],
    vect__min_df=[1, 3, 5],
    vect__max_df=[0.95, 0.9, 0.7],
    clf__n_neighbors=[1, 2, 5, 8, 10, 20, 30, 50, 100],
)

params_list = list(ParameterGrid(param_grid))
len(params_list)

81

In [39]:
pipeline = Pipeline(steps=[
    ('vect', TfidfVectorizer()),
    ('clf', KNeighborsClassifier()),
])

# Fit on train with every configuration and store its dev metrics next to the parameters.
results = []
for params in params_list:
    pipeline.set_params(**params).fit(X_train, y_train)
    results.append({**eval(pipeline, X_dev, y_dev), **params})

In [40]:
# Ten best configurations, ranked by dev accuracy and then by F1.
results_df = pd.DataFrame(results)
results_df.sort_values(by=['acc', 'f1'], ascending=False).head(10)

Unnamed: 0,acc,clf__n_neighbors,f1,vect__binary,vect__max_df,vect__min_df,vect__ngram_range
59,0.915888,30,0.915623,True,0.9,5,"(1, 1)"
56,0.897196,30,0.896872,True,0.95,5,"(1, 1)"
67,0.88785,50,0.887605,True,0.9,3,"(1, 1)"
53,0.88785,20,0.887368,True,0.7,5,"(1, 1)"
80,0.878505,100,0.878505,True,0.7,5,"(1, 1)"
62,0.878505,30,0.878335,True,0.7,5,"(1, 1)"
64,0.878505,50,0.878335,True,0.95,3,"(1, 1)"
47,0.878505,20,0.878121,True,0.95,5,"(1, 1)"
50,0.878505,20,0.878121,True,0.9,5,"(1, 1)"
73,0.869159,100,0.869147,True,0.95,3,"(1, 1)"


Entrenamos nuestro modelo con los mejores hiperparámetros obtenidos y luego evaluamos

In [41]:
# KNeighborsClassifier refit with the top-ranked configuration from the grid search.
pipeline = Pipeline(steps=[
    ('vect', TfidfVectorizer(binary=True, max_df=0.90, min_df=5, ngram_range=(1, 1))),
    ('clf', KNeighborsClassifier(n_neighbors=30)),
]).fit(X_train, y_train)
print_short_eval(pipeline, X_train, y_train)
print_short_eval(pipeline, X_dev, y_dev)

accuracy	0.84	macro f1	0.84
accuracy	0.92	macro f1	0.92


### Conclusion: si bien mejora el accuracy de dev, el de train se mantiene casi un 10% por debajo, posiblemente convenga descartar este modelo.

## Intentemos mejorar LinearSVC (acc dev actual = 87%)

In [42]:
# Reference point before tuning: TF-IDF + LinearSVC with default settings.
pipeline = Pipeline(steps=[
    ('vect', TfidfVectorizer(binary=True)),
    ('clf', LinearSVC(random_state=0)),
]).fit(X_train, y_train)
print_short_eval(pipeline, X_train, y_train)
print_short_eval(pipeline, X_dev, y_dev)

accuracy	1.00	macro f1	1.00
accuracy	0.87	macro f1	0.87


### Grid-Search in Dev

In [43]:
# Vectorizer-only search space; the classifier is just pinned to a fixed seed.
param_grid = dict(
    vect__binary=[True],
    vect__ngram_range=[(1, n) for n in range(1, 6)],
    vect__min_df=[1, 3, 5, 7],
    vect__max_df=[0.95, 0.9, 0.7],
    clf__random_state=[0],
)

params_list = list(ParameterGrid(param_grid))
len(params_list)

60

In [44]:
pipeline = Pipeline(steps=[
    ('vect', TfidfVectorizer()),
    ('clf', LinearSVC()),
])

# Fit on train with every configuration and store its dev metrics next to the parameters.
results = []
for params in params_list:
    pipeline.set_params(**params).fit(X_train, y_train)
    results.append({**eval(pipeline, X_dev, y_dev), **params})

In [45]:
# Ten best configurations, ranked by dev accuracy and then by F1.
results_df = pd.DataFrame(results)
results_df.sort_values(by=['acc', 'f1'], ascending=False).head(10)

Unnamed: 0,acc,clf__random_state,f1,vect__binary,vect__max_df,vect__min_df,vect__ngram_range
12,0.915888,0,0.915888,True,0.95,5,"(1, 3)"
13,0.915888,0,0.915888,True,0.95,5,"(1, 4)"
14,0.915888,0,0.915888,True,0.95,5,"(1, 5)"
32,0.915888,0,0.915888,True,0.9,5,"(1, 3)"
33,0.915888,0,0.915888,True,0.9,5,"(1, 4)"
34,0.915888,0,0.915888,True,0.9,5,"(1, 5)"
52,0.915888,0,0.915888,True,0.7,5,"(1, 3)"
53,0.915888,0,0.915888,True,0.7,5,"(1, 4)"
54,0.915888,0,0.915888,True,0.7,5,"(1, 5)"
11,0.906542,0,0.906534,True,0.95,5,"(1, 2)"


Fijamos algunos de los mejores hiperparámetros del vectorizador y ahora ajustamos algunos hiperparámetros de LinearSVC.

In [46]:
# Vectorizer fixed to the best min_df/max_df found above; sweep the n-gram range and LinearSVC's C.
param_grid = dict(
    vect__binary=[True],
    vect__ngram_range=[(1, 3), (1, 4), (1, 5)],
    vect__min_df=[5],
    vect__max_df=[0.95],
    clf__random_state=[0],
    clf__C=[0.001, 0.01, 0.05, 0.1, 0.3, 0.5, 1.0, 2.0, 5.0, 10.0],
)

params_list = list(ParameterGrid(param_grid))
len(params_list)

30

In [47]:
pipeline = Pipeline(steps=[
    ('vect', TfidfVectorizer()),
    ('clf', LinearSVC()),
])

# Fit on train with every configuration and store its dev metrics next to the parameters.
results = []
for params in params_list:
    pipeline.set_params(**params).fit(X_train, y_train)
    results.append({**eval(pipeline, X_dev, y_dev), **params})

In [48]:
# Ten best configurations, ranked by dev accuracy and then by F1.
results_df = pd.DataFrame(results)
results_df.sort_values(by=['acc', 'f1'], ascending=False).head(10)

Unnamed: 0,acc,clf__C,clf__random_state,f1,vect__binary,vect__max_df,vect__min_df,vect__ngram_range
18,0.915888,1.0,0,0.915888,True,0.95,5,"(1, 3)"
19,0.915888,1.0,0,0.915888,True,0.95,5,"(1, 4)"
20,0.915888,1.0,0,0.915888,True,0.95,5,"(1, 5)"
21,0.915888,2.0,0,0.915888,True,0.95,5,"(1, 3)"
22,0.915888,2.0,0,0.915888,True,0.95,5,"(1, 4)"
23,0.915888,2.0,0,0.915888,True,0.95,5,"(1, 5)"
24,0.906542,5.0,0,0.906534,True,0.95,5,"(1, 3)"
25,0.906542,5.0,0,0.906534,True,0.95,5,"(1, 4)"
26,0.906542,5.0,0,0.906534,True,0.95,5,"(1, 5)"
12,0.897196,0.3,0,0.89716,True,0.95,5,"(1, 3)"


Entrenamos nuestro modelo con los mejores hiperparámetros obtenidos y luego evaluamos

In [49]:
# LinearSVC refit with the top-ranked configuration from the grid search.
pipeline = Pipeline(steps=[
    ('vect', TfidfVectorizer(binary=True, max_df=0.95, min_df=5, ngram_range=(1, 3))),
    ('clf', LinearSVC(random_state=0, C=1.0)),
]).fit(X_train, y_train)
print_short_eval(pipeline, X_train, y_train)
print_short_eval(pipeline, X_dev, y_dev)

accuracy	1.00	macro f1	1.00
accuracy	0.92	macro f1	0.92


### Se obtuvo una mejora de performance! (de 87% a 92% en dev)

In [50]:
# Tuned MultinomialNB.
clf1 = Pipeline(steps=[
    ('vect', CountVectorizer(binary=True, max_df=0.70, min_df=5, ngram_range=(1, 1))),
    ('clf', MultinomialNB(alpha=10.0)),
])

# Tuned KNeighbors: tuning found 30 neighbours to be better, but only on dev.
# Defined here but not part of the ensemble below.
clf2 = Pipeline(steps=[
    ('vect', TfidfVectorizer(binary=True, max_df=0.90, min_df=5, ngram_range=(1, 1))),
    ('clf', KNeighborsClassifier(n_neighbors=30)),
])

# Tuned LogisticRegression.
clf3 = Pipeline(steps=[
    ('vect', TfidfVectorizer(binary=True, max_df=0.95, min_df=5, ngram_range=(1, 2))),
    ('clf', LogisticRegression(random_state=0, C=0.3)),
])

# Tuned LinearSVC.
clf4 = Pipeline(steps=[
    ('vect', TfidfVectorizer(binary=True, max_df=0.95, min_df=5, ngram_range=(1, 3))),
    ('clf', LinearSVC(random_state=0, C=1.0)),
])

# Hard (majority) voting over MultinomialNB, LogisticRegression and LinearSVC.
eclf1 = VotingClassifier(
    estimators=[('mnb', clf1), ('lr', clf3), ('lsvc', clf4)],
    voting='hard',
).fit(X_train, y_train)
print_short_eval(eclf1, X_train, y_train)
print_short_eval(eclf1, X_dev, y_dev)

accuracy	0.96	macro f1	0.96
accuracy	0.91	macro f1	0.91


  if diff:
  if diff:


In [51]:
from utils import save_results

# Predict the test split with the three-model ensemble and save the labels.
y_pred = eclf1.predict(test)
predictions = y_pred.tolist()
save_results('final_results_usingOnlyTrain.csv', predictions)

  if diff:


Se obtuvo un accuracy del 85.33%

In [52]:
# Tuned MultinomialNB.
clf1 = Pipeline(steps=[
    ('vect', CountVectorizer(binary=True, max_df=0.70, min_df=5, ngram_range=(1, 1))),
    ('clf', MultinomialNB(alpha=10.0)),
])

# KNeighbors with 5 neighbours: tuning found 30 neighbours to be better, but only on dev.
clf2 = Pipeline(steps=[
    ('vect', TfidfVectorizer(binary=True, max_df=0.90, min_df=5, ngram_range=(1, 1))),
    ('clf', KNeighborsClassifier(n_neighbors=5)),
])

# Tuned LogisticRegression.
clf3 = Pipeline(steps=[
    ('vect', TfidfVectorizer(binary=True, max_df=0.95, min_df=5, ngram_range=(1, 2))),
    ('clf', LogisticRegression(random_state=0, C=0.3)),
])

# Tuned LinearSVC.
clf4 = Pipeline(steps=[
    ('vect', TfidfVectorizer(binary=True, max_df=0.95, min_df=5, ngram_range=(1, 3))),
    ('clf', LinearSVC(random_state=0, C=1.0)),
])

# Hard (majority) voting over all four models.
eclf1 = VotingClassifier(
    estimators=[('mnb', clf1), ('kn', clf2), ('lr', clf3), ('lsvc', clf4)],
    voting='hard',
).fit(X_train, y_train)
print_short_eval(eclf1, X_train, y_train)
print_short_eval(eclf1, X_dev, y_dev)

accuracy	0.95	macro f1	0.95
accuracy	0.93	macro f1	0.93


  if diff:
  if diff:


In [53]:
from utils import save_results

# Predict the test split with the four-model ensemble and save the labels.
y_pred = eclf1.predict(test)
predictions = y_pred.tolist()
save_results('final_results_usingOnlyTrain2.csv', predictions)

  if diff:


Se obtuvo un accuracy del 85.33%

Probemos entrenar el modelo de LinearSVC con todos los datos etiquetados y luego evaluar en test.

In [54]:
# CURRENT MODEL: tuned TF-IDF + LinearSVC, fit on the train split only.

pipeline = Pipeline(steps=[
    ('vect', TfidfVectorizer(binary=True, max_df=0.95, min_df=5, ngram_range=(1, 3))),
    ('clf', LinearSVC(random_state=0, C=1.0)),
]).fit(X_train, y_train)
print_short_eval(pipeline, X_train, y_train)
print_short_eval(pipeline, X_dev, y_dev)

accuracy	1.00	macro f1	1.00
accuracy	0.92	macro f1	0.92


In [55]:
# Merge train and dev so the final models can be fit on every labeled example.
# NOTE(review): this rebinds `train`, which until now held the raw split returned
# by load_datasets_unlabeled_test(). Earlier cells that read train[0] / train[1]
# will misbehave if they are re-run after this one.
train = X_train + X_dev
target = y_train + y_dev

In [56]:
# Tuned LinearSVC refit on train + dev. No held-out split remains, so only the
# training score can be reported.
pipeline = Pipeline(steps=[
    ('vect', TfidfVectorizer(binary=True, max_df=0.95, min_df=5, ngram_range=(1, 3))),
    ('clf', LinearSVC(random_state=0, C=1.0)),
]).fit(train, target)
print_short_eval(pipeline, train, target)

accuracy	1.00	macro f1	1.00


In [57]:
from utils import save_results

# Predict the test split with the LinearSVC trained on all labeled data and save the labels.
y_pred = pipeline.predict(test)
predictions = y_pred.tolist()
save_results('results_LinearSVC_usingAllLabeledDataset.csv', predictions)

Se obtuvo un accuracy del 88% (el mejor resultado hasta ahora)

In [70]:
# Tuned MultinomialNB.
clf1 = Pipeline(steps=[
    ('vect', CountVectorizer(binary=True, max_df=0.70, min_df=5, ngram_range=(1, 1))),
    ('clf', MultinomialNB(alpha=10.0)),
])

# Tuned LogisticRegression.
clf3 = Pipeline(steps=[
    ('vect', TfidfVectorizer(binary=True, max_df=0.95, min_df=5, ngram_range=(1, 2))),
    ('clf', LogisticRegression(random_state=0, C=0.3)),
])

# Tuned LinearSVC.
clf4 = Pipeline(steps=[
    ('vect', TfidfVectorizer(binary=True, max_df=0.95, min_df=5, ngram_range=(1, 3))),
    ('clf', LinearSVC(random_state=0, C=1.0)),
])

# Hard (majority) voting ensemble fit on train + dev; only the training score can be reported.
eclf1 = VotingClassifier(
    estimators=[('mnb', clf1), ('lr', clf3), ('lsvc', clf4)],
    voting='hard',
).fit(train, target)
print_short_eval(eclf1, train, target)

accuracy	0.96	macro f1	0.96


  if diff:


In [71]:
from utils import save_results

# Predict the test split with the ensemble trained on all labeled data and save the labels.
y_pred = eclf1.predict(test)
predictions = y_pred.tolist()
save_results('results_voting_usingAllLabeledDataset.csv', predictions)

  if diff:


Se obtuvo un accuracy del 85.33%

### Pasos a seguir: hacer experimentos pero con mas datos, intentar aumentar la cantidad de datos de entrenamiento artificialmente.

In [115]:
import pandas as pd
from googletrans import Translator

translator = Translator()
train_df = pd.DataFrame({'col': train})

# Take the first training document and decode it to str before translating.
text = train_df.iloc[0, 0].decode("utf-8")
text

"If ever a film needed English subtitles this is one . The accents and soft talking are great but hard to follow storyline as you ca n't understand what they are saying and with no subtitles . Her songs were just beautiful and the story is great but a lot of it is lost on not catching what they are saying . But is was a refreshing movie from most out there now . Fine acting and story ."

In [119]:
# First leg of the back-translation: English -> Spanish.
translation_es = translator.translate(text, src='en', dest='es')
translation_es.text

'Si alguna vez una película necesitó subtítulos en inglés, esta es una. Los acentos y las conversaciones suaves son geniales pero difíciles de seguir porque no puedes entender lo que dicen y no tienen subtítulos. Sus canciones eran simplemente bellas y la historia es genial, pero se pierde mucho al no captar lo que dicen. Pero fue una película refrescante de la mayoría ahora. Buena actuación e historia.'

In [120]:
# Second leg of the back-translation: Spanish -> English, giving a paraphrase of the original.
translation_en = translator.translate(translation_es.text, src='es', dest='en')
translation_en.text

'If ever a movie needed subtitles in English, this is one. The accents and the soft conversations are great but difficult to follow because you can not understand what they say and they do not have subtitles. Their songs were simply beautiful and the story is great, but you lose a lot by not grasping what they say. But it was a most refreshing movie now. Good acting and history.'

Notar que esta es una buena alternativa para generar nuevos datos de entrenamiento! :)
Probar con otros idiomas, no solo español (francés, portugués e italiano).