# Analizamos los datos
## *vemos features
## *significado de la clasificacion 1 o 0

In [388]:
from sklearn.datasets import load_files
import pandas as pd

dataset = load_files('review_polarity_competition/reviews_sentoken', shuffle=False)
dataset.keys()

dict_keys(['data', 'filenames', 'target_names', 'target', 'DESCR'])

In [389]:
dataset['target_names'][1] 

'pos'

In [390]:
data = pd.DataFrame({'data': dataset['data'], 'target': dataset['target']})
data.describe()

Unnamed: 0,target
count,1070.0
mean,0.5
std,0.500234
min,0.0
25%,0.0
50%,0.5
75%,1.0
max,1.0


In [391]:
data['data_len'] = data['data'].apply(lambda x: len(x))
data.describe()

Unnamed: 0,target,data_len
count,1070.0,1070.0
mean,0.5,1192.698131
std,0.500234,1158.66631
min,0.0,72.0
25%,0.0,450.25
50%,0.5,798.0
75%,1.0,1532.75
max,1.0,7577.0


In [392]:
data.groupby('target').describe()
# Vemos ditribucion de cantidad de 1 y 0

Unnamed: 0_level_0,data_len,data_len,data_len,data_len,data_len,data_len,data_len,data_len
Unnamed: 0_level_1,count,mean,std,min,25%,50%,75%,max
target,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2
0,535.0,1006.986916,1029.810276,72.0,388.5,680.0,1273.5,7577.0
1,535.0,1378.409346,1248.164095,81.0,538.0,952.0,1766.0,6897.0


In [393]:
[x for x in dataset['data'] if len(x) < 100]

[b"This `` Director 's Son 's '' cut has been edited down significantly . Why ? I have no idea .",
 b'very bad movie , one of the worst I have seen in a long time . Nuff said',
 b"This movie was so bad , I actually fell asleep . Do n't even waste your time renting this film .",
 b'A real tear-jerker , but a great movie . I had the movie within a week and a half',
 b"I 'm 75 so I did n't want anything too strenuous . This is perfect for the senior ."]

# División de datos

In [394]:
from sklearn.model_selection import train_test_split
docs, X_test, y, y_test = train_test_split(
    dataset.data,
    dataset.target,
    test_size=0.25,
    random_state=42
)



In [395]:
X_train, X_dev, y_train, y_dev = train_test_split(
    docs,
    y,
    test_size=0.2,
    random_state=42)

In [396]:
len(X_train), len(X_dev), len(X_test)
#Cantidad de datos que nos queda para entrenar, para validar y para testear

(641, 161, 268)

In [397]:
from collections import Counter
Counter(y_train), Counter(y_dev), Counter(y_test)

(Counter({0: 311, 1: 330}), Counter({1: 83, 0: 78}), Counter({1: 122, 0: 146}))

## Baselines

In [398]:
import numpy as np
from sklearn.dummy import DummyClassifier

In [399]:
X_train = np.reshape(X_train, (-1, 1))
X_dev = np.reshape(X_dev, (-1, 1))
X_test = np.reshape(X_test, (-1, 1))

Clasificar al azar, respetando la distribución de clases:

In [400]:
clf = DummyClassifier(strategy='stratified', random_state=0)
clf.fit(X_train, y_train)

DummyClassifier(constant=None, random_state=0, strategy='stratified')

## Persistencia
Guardar en disco un modelo:

In [401]:
import pickle
import pickle
filename = '2018-08-15_random_baseline'
f = open(filename, 'wb')
pickle.dump(clf, f)

Cargar un modelo guardado en disco:

In [402]:
filename = '2018-08-15_random_baseline'
f = open(filename, 'rb')
clf = pickle.load(f)

## Evaluación y Métricas

Calcularemos accuracy y macro F1.

En development:

In [403]:
y_pred = clf.predict(X_dev)

from sklearn import metrics
acc = metrics.accuracy_score(y_dev, y_pred)
print('accuracy\t{:2.2f}\n'.format(acc))
print(metrics.classification_report(y_dev, y_pred, target_names=['neg', 'pos']))


accuracy	0.51

             precision    recall  f1-score   support

        neg       0.49      0.55      0.52        78
        pos       0.53      0.47      0.50        83

avg / total       0.51      0.51      0.51       161



Matriz de confusión:

In [404]:
cm = metrics.confusion_matrix(y_dev, y_pred)
print(cm)


[[43 35]
 [44 39]]


Evaluación en test:

In [405]:
y_pred = clf.predict(X_test)
acc = metrics.accuracy_score(y_test, y_pred)
print('accuracy\t{:2.2f}\n'.format(acc))
print(metrics.classification_report(y_test, y_pred, target_names=['neg', 'pos']))

accuracy	0.49

             precision    recall  f1-score   support

        neg       0.53      0.50      0.51       146
        pos       0.44      0.47      0.45       122

avg / total       0.49      0.49      0.49       268



# Vectorizador Bag of Word

In [406]:
from sklearn.feature_extraction.text import CountVectorizer
from util import load_datasets
train, _, _ = load_datasets()
X_train, y_train = train

In [407]:
vect = CountVectorizer()
vect.fit(X_train)

CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), preprocessor=None, stop_words=None,
        strip_accents=None, token_pattern='(?u)\\b\\w\\w+\\b',
        tokenizer=None, vocabulary=None)

In [408]:
vect = CountVectorizer(min_df=5)
vect.fit(X_train)
x = vect.transform(X_train[:1])
x

<1x2391 sparse matrix of type '<class 'numpy.int64'>'
	with 21 stored elements in Compressed Sparse Row format>

In [409]:
[(features[i], x[0, i]) for i in range(x.shape[1]) if x[0, i]]

[('1968', 1),
 ('accent', 2),
 ('action scenes', 1),
 ('all around', 1),
 ('all start', 1),
 ('and she', 1),
 ('any', 1),
 ('apparently', 1),
 ('appears', 1),
 ('around', 1),
 ('art and', 1),
 ('at your own', 1),
 ('bbc', 1),
 ('between the two', 1),
 ('bought the dvd', 1),
 ('camera', 1),
 ('chapter', 1),
 ('comedic', 2),
 ('command', 1),
 ('compared', 3),
 ('credit', 1)]

# Prueba de clasificadores y vectorizador TFID

## probamos los clasificadores y vectorizadores para seleccionar aquellos que nos den una currancy mejor

In [410]:
from util import load_datasets
train, dev, test = load_datasets()
X_train, y_train = train
X_dev, y_dev = dev
X_test, y_test = test

In [411]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC, SVC
from sklearn.ensemble import RandomForestClassifier

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline
from util import print_short_eval

clfs = [
    KNeighborsClassifier(),
    MultinomialNB(),
    DecisionTreeClassifier(random_state=0),
    LogisticRegression(random_state=0),
    LinearSVC(random_state=0),
    SVC(random_state=0),
    RandomForestClassifier(random_state=0),
]

In [412]:
# Agregamos a nuestro vectorizador el min_df= 5 seleccionado anteriormente
vect = CountVectorizer(binary=True, min_df=5)

for clf in clfs:
    print(str(clf.__class__))
    pipeline = Pipeline([
        ('vect', vect),
        ('clf', clf),
    ])
    pipeline.fit(X_train, y_train)
    print_short_eval(pipeline, X_train, y_train)
    print_short_eval(pipeline, X_dev, y_dev)

<class 'sklearn.neighbors.classification.KNeighborsClassifier'>
accuracy	0.69	macro f1	0.68
accuracy	0.52	macro f1	0.47
<class 'sklearn.naive_bayes.MultinomialNB'>
accuracy	0.95	macro f1	0.95
accuracy	0.80	macro f1	0.79
<class 'sklearn.tree.tree.DecisionTreeClassifier'>
accuracy	1.00	macro f1	1.00
accuracy	0.66	macro f1	0.66
<class 'sklearn.linear_model.logistic.LogisticRegression'>
accuracy	1.00	macro f1	1.00
accuracy	0.84	macro f1	0.84
<class 'sklearn.svm.classes.LinearSVC'>
accuracy	1.00	macro f1	1.00
accuracy	0.82	macro f1	0.82
<class 'sklearn.svm.classes.SVC'>
accuracy	0.72	macro f1	0.69
accuracy	0.65	macro f1	0.61
<class 'sklearn.ensemble.forest.RandomForestClassifier'>
accuracy	0.99	macro f1	0.99
accuracy	0.77	macro f1	0.77


El mejor que obtuvimos es clasificador de regresion logística

In [413]:
# Ahora probamos los clasificadores pero con el vectorizador TFDF
vect = TfidfVectorizer(binary=True)

for clf in clfs:
    print(str(clf.__class__))
    pipeline = Pipeline([
        ('vect', vect),
        ('clf', clf),
    ])
    pipeline.fit(X_train, y_train)
    print_short_eval(pipeline, X_train, y_train)
    print_short_eval(pipeline, X_dev, y_dev)

<class 'sklearn.neighbors.classification.KNeighborsClassifier'>
accuracy	0.88	macro f1	0.88
accuracy	0.76	macro f1	0.76
<class 'sklearn.naive_bayes.MultinomialNB'>
accuracy	0.99	macro f1	0.99
accuracy	0.85	macro f1	0.85
<class 'sklearn.tree.tree.DecisionTreeClassifier'>
accuracy	1.00	macro f1	1.00
accuracy	0.65	macro f1	0.64
<class 'sklearn.linear_model.logistic.LogisticRegression'>
accuracy	0.99	macro f1	0.99
accuracy	0.84	macro f1	0.84
<class 'sklearn.svm.classes.LinearSVC'>
accuracy	1.00	macro f1	1.00
accuracy	0.88	macro f1	0.88
<class 'sklearn.svm.classes.SVC'>


  'precision', 'predicted', average, warn_for)


accuracy	0.51	macro f1	0.34
accuracy	0.52	macro f1	0.34
<class 'sklearn.ensemble.forest.RandomForestClassifier'>
accuracy	0.99	macro f1	0.99
accuracy	0.63	macro f1	0.62


* Obtenemos un mejor resultado con clasificador Linear SVC

In [414]:
from util import print_eval

pipeline = Pipeline([
    ('vect', TfidfVectorizer(binary=True)),
    ('clf', LinearSVC(random_state=0)),
])
pipeline.fit(X_train, y_train)
print_eval(pipeline, X_dev, y_dev)

accuracy	0.88

             precision    recall  f1-score   support

        neg       0.89      0.85      0.87        78
        pos       0.86      0.90      0.88        83

avg / total       0.88      0.88      0.88       161

[[66 12]
 [ 8 75]]


In [415]:
print_eval(pipeline, X_test, y_test)
from util import save_model
save_model(pipeline, '2018-08-20_count_regrelogis')

accuracy	0.82

             precision    recall  f1-score   support

        neg       0.87      0.79      0.83       146
        pos       0.77      0.86      0.81       122

avg / total       0.83      0.82      0.82       268

[[115  31]
 [ 17 105]]


In [416]:
from sklearn.model_selection import ParameterGrid
param_grid = {
    'vect__binary': [True],
    'vect__ngram_range': [(1, 1), (1, 2), (1, 3), (1, 4), (1, 5)],
    'vect__min_df': [1, 3, 5, 7],
    'vect__max_df': [0.95, 0.9, 0.7],
    'clf__random_state': [0],
}

params_list = list(ParameterGrid(param_grid))

In [417]:
pipeline = Pipeline([
    ('vect', TfidfVectorizer(binary=True)),
    ('clf', LinearSVC()),
])

results = []
for params in params_list:

    pipeline.set_params(**params)
    pipeline.fit(X_train, y_train)
    result = eval(pipeline, X_dev, y_dev)
    
    results.append({
        **result,
        **params,
    })

In [418]:
import pandas as pd

results_df = pd.DataFrame(results)
results_df.sort_values(['acc', 'f1'], ascending=False)[:10]

Unnamed: 0,acc,clf__random_state,f1,vect__binary,vect__max_df,vect__min_df,vect__ngram_range
47,0.913043,0,0.912879,True,0.7,3,"(1, 3)"
48,0.913043,0,0.912879,True,0.7,3,"(1, 4)"
49,0.913043,0,0.912879,True,0.7,3,"(1, 5)"
12,0.913043,0,0.912473,True,0.95,5,"(1, 3)"
32,0.913043,0,0.912473,True,0.9,5,"(1, 3)"
52,0.913043,0,0.912473,True,0.7,5,"(1, 3)"
46,0.906832,0,0.906602,True,0.7,3,"(1, 2)"
13,0.906832,0,0.906312,True,0.95,5,"(1, 4)"
33,0.906832,0,0.906312,True,0.9,5,"(1, 4)"
34,0.906832,0,0.906312,True,0.9,5,"(1, 5)"


In [420]:
pipeline = Pipeline([
    ('vect', TfidfVectorizer(
        binary=True,
        min_df=3,
        max_df=0.70,
        ngram_range=(1, 3)
    )),
    ('clf', LinearSVC(random_state=0)),
])
pipeline.fit(X_train, y_train)
print_eval(pipeline, X_dev, y_dev)

accuracy	0.91

             precision    recall  f1-score   support

        neg       0.92      0.90      0.91        78
        pos       0.91      0.93      0.92        83

avg / total       0.91      0.91      0.91       161

[[70  8]
 [ 6 77]]


## Evaluamos el modelo con el test

In [421]:
print_eval(pipeline, X_test, y_test)

accuracy	0.85

             precision    recall  f1-score   support

        neg       0.88      0.84      0.86       146
        pos       0.81      0.86      0.84       122

avg / total       0.85      0.85      0.85       268

[[122  24]
 [ 17 105]]


## veamos un poco los coef que obtenemos de nuestro modelo

In [422]:
from analysis import coef_df
df = coef_df(pipeline)

In [423]:
df[:10]

Unnamed: 0,name,coef
12099,worst,-0.911147
1180,bad,-0.902214
11294,version,-0.873802
10132,the worst,-0.812003
2570,disappointed,-0.780253
11554,waste,-0.769968
9241,terrible,-0.767895
7655,poor,-0.714365
11160,unfortunately,-0.692744
6337,money,-0.632748


In [424]:
df[-10:]

Unnamed: 0,name,coef
6582,must,0.614805
3981,great,0.616101
7507,perfect,0.630252
3018,especially,0.655831
3115,excellent,0.669917
3246,fantastic,0.682368
5803,life,0.695182
6848,now,0.716475
2366,day,0.784805
4422,his,0.796931


# probando con stop Word obtenemos un peor resultado

In [364]:
pipeline = Pipeline([
    ('vect', TfidfVectorizer(
        binary=True,
        min_df=3,
        max_df=0.70,
        ngram_range=(1, 3),
        stop_words='english'
    )),
    ('clf', LinearSVC(random_state=0)),
])
pipeline.fit(X_train, y_train)
print_eval(pipeline, X_dev, y_dev)

accuracy	0.85

             precision    recall  f1-score   support

        neg       0.88      0.81      0.84        78
        pos       0.83      0.89      0.86        83

avg / total       0.85      0.85      0.85       161

[[63 15]
 [ 9 74]]


# 85 vs 91 conviene continuar sin Stop Word

In [354]:
pipeline = Pipeline([
    ('vect', TfidfVectorizer(
        binary=True,
        min_df=3,
        max_df=0.70,
        ngram_range=(1, 3)
    )),
    ('clf', LinearSVC(random_state=0)),
])
pipeline.fit(X_train, y_train)
print_eval(pipeline, X_dev, y_dev)

accuracy	0.91

             precision    recall  f1-score   support

        neg       0.92      0.90      0.91        78
        pos       0.91      0.93      0.92        83

avg / total       0.91      0.91      0.91       161

[[70  8]
 [ 6 77]]


# Listado de errores

In [425]:
y_pred = pipeline.predict(X_dev)

In [441]:
errors = []
for i, (x, y1, y2, y2p) in enumerate(zip(X_dev, y_dev, y_pred, y_prob)):
    if y1 != y2:
        errors.append({
            'index': i,
            'item': x,
            'true': y1,
            'pred': y2,
            'diff': diff})

errdf = pd.DataFrame(errors)
errdf.sort_values('diff', inplace=True)

In [442]:
errdf

Unnamed: 0,diff,index,item,pred,true
0,-0.715889,5,"b""What can I say about this uninspired `` rete...",1,0
1,-0.715889,11,"b""This is a very good movie . Do n't expect an...",0,1
2,-0.715889,12,"b""I really enjoy Lost Highway and have been wa...",0,1
3,-0.715889,15,"b""This is a very 70s horror flick based on the...",1,0
4,-0.715889,17,"b""Tom Hanks is a great actor , but watching hi...",1,0
5,-0.715889,20,"b""It should have been called `` The Male Booty...",1,0
6,-0.715889,51,"b""Criterion 's release of Alfred Hitchcock 's ...",0,1
7,-0.715889,119,"b""This biopic , based on John Denver 's Take M...",1,0
8,-0.715889,120,"b""I am a senior citizen with bad knees from in...",0,1
9,-0.715889,129,"b""After the first installment in the Blade ser...",0,1


In [468]:
for error in errdf['index']:
    print(error)
    x = X_dev[error]
    print(x.decode('utf-8'))
    print('\n')




5
What can I say about this uninspired `` retelling '' of Bret 's first novel ? The film adaptation does n't contain the scenes and the tone I thought made the book so haunting and memorable . ( The dead boy in the alley scene , the snuff film with the underage girl ... ) This film really ca n't be claimed that it is based on the book - it 's more that it 's inspired by the novel . Despite this downfall , the film is worth watching for any Bret Easton Ellis fan - if only to satisfy one 's curiosity.The film contains neither the perfectly depressing one-liners or the delightfully disturbing minor characters that make the book so interesting and engaging ( Clay 's sisters , Spin ) The film made it seem as though Julian was the protagonist , while I always felt the novel was all about seeing the events unfold through Clay 's eyes . One of the more endearing parts of the novel are Clay 's 'flashbacks ' of a better time , before he left for school . The film only touches on this in the begi

## sitando algunos de los errores

Analizando alguno de los errores vemos que al modelo le esta costando identificar las ironías. Si la idea es clara como "this is a very good movie" no tiene problema. Pero luego las palabras siguiente tienen mayor peso aunque en la oracion no deberian tener mayor importancia.

In [478]:
pipeline.predict(["This is a very good movie . "
                  "Do n't expect any good guys here . "
                  "If you want a hero with a bit of soul watch the Silvester Stallone remake a movie I also like . "
                  "Both movies have a lot of action "])

array([0])

In [479]:
pipeline.predict(["This is a very good movie . "])

array([1])

In [480]:
pipeline.predict(["This is a very good movie . "
                  "Do n't expect any good guys here . "])

array([0])

In [481]:
pipeline.predict(["If you want a hero with a bit of soul watch the Silvester Stallone remake a movie I also like . "])

array([0])

In [482]:
pipeline.predict(["Both movies have a lot of action "])

array([1])