# Deep Learning - Practico 1

Integrantes:
* Mario Ferreyra
* Emiliano Kokic
* Francisco Crespo

---

### Ejercicio 1

In [36]:
import pandas as pd
import numpy as np
import pickle
import keras.backend as K

from keras.models import Sequential
from sklearn.datasets import load_files
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from keras import regularizers
from keras.layers import Dense, Dropout, Embedding, Lambda
from keras import optimizers
from keras import utils
from keras.wrappers.scikit_learn import KerasClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score, f1_score

In [36]:
# Lo usamos para la repetibilidad de resultados
np.random.seed(5)

In [37]:
# Carga de datos y conversion de estos en vectores.
def load_dataset():
    dataset = load_files('./dataset/review_polarity/txt_sentoken', shuffle=False)

    X_train, X_test, y_train, y_test = train_test_split(
        dataset.data, dataset.target, test_size=0.25, random_state=42)

    print('Training samples {}, test_samples {}'.format(
        len(X_train), len(X_test)))

    # TODO 1: Apply the Tfidf vectorizer to create input matrix
    vectorizer1 = TfidfVectorizer(max_features = 10000)
    vector1 = vectorizer1.fit_transform(X_train)
    
    vectorizer2 = TfidfVectorizer(max_features = 10000)
    vector2 = vectorizer2.fit_transform(X_test)

    return vector1, vector2, y_train, y_test

In [8]:
X_train, X_test, y_train, y_test = load_dataset()

Training samples 1500, test_samples 500


In [9]:
# Mostramos las 10000 features más importantes
data_example = pd.DataFrame(X_train.toarray())
data_example.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,9990,9991,9992,9993,9994,9995,9996,9997,9998,9999
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.023454,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.05202,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [10]:
# TODO 2: Convert the labels to categorical
from sklearn import preprocessing

le = preprocessing.LabelEncoder()
y_train_cat = le.fit_transform(y_train)
y_test_cat = le.fit_transform(y_test)

In [11]:
# Función que genera los modelos y que se usara en la grilla de validación cruzada.
def build_model(nodes1 = 100, nodes2 = 200, lr = 0.001, 
                l2 = 0.01, input_shape = 10000, drop = 0.1):
    
    model = Sequential()
    model.add(Dense(nodes1,
                    activation = 'relu',
                    kernel_initializer = 'random_normal', 
                    input_shape = (input_shape, ),
                    kernel_regularizer = regularizers.l2(l2)))
    model.add(Dropout(drop))
    
    if(nodes2 != 0):
        model.add(Dense(nodes2,
                        activation = 'relu',
                        kernel_initializer = 'random_normal',
                        kernel_regularizer = regularizers.l2(l2)))
        model.add(Dropout(drop))
        model.add(Dense(2, activation = 'softmax'))
        

    opt = optimizers.Adam(lr = lr)
    model.compile(loss = 'sparse_categorical_crossentropy',
                  optimizer = opt, 
                  metrics = ['accuracy'])

    return model

In [12]:
INPUT_SHAPE = 10000
EPOCHS = 15
BATCH_SIZE = 64

# Construcción del clasificador a partir de la función que crea el modelo
model = KerasClassifier(build_fn = build_model, epochs = EPOCHS,
                        batch_size = BATCH_SIZE, verbose = 0)

In [13]:
# Definición de los valores de la grilla para la cross validation
nodes1 = [32, 64, 128]
nodes2 = [32, 64, 128, 256]
lrs = [0.001, 0.002, 0.003]
l2s = [0.0001, 0.001, 0.01]
drops = [0.1, 0.2, 0.3]

param_grid = dict(l2 = l2s, drop = drops)
param_grid

{'l2': [0.0001, 0.001, 0.01], 'drop': [0.1, 0.2, 0.3]}

In [14]:
grid = GridSearchCV(estimator = model, param_grid = param_grid, 
                    cv = 3, n_jobs = 1, refit = True, verbose = 2)
grid_result = grid.fit(X_train, y_train_cat)

Fitting 3 folds for each of 9 candidates, totalling 27 fits
[CV] drop=0.1, l2=0.0001 .............................................
[CV] .............................. drop=0.1, l2=0.0001, total=   4.5s
[CV] drop=0.1, l2=0.0001 .............................................


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    4.6s remaining:    0.0s


[CV] .............................. drop=0.1, l2=0.0001, total=   4.2s
[CV] drop=0.1, l2=0.0001 .............................................
[CV] .............................. drop=0.1, l2=0.0001, total=   4.7s
[CV] drop=0.1, l2=0.001 ..............................................
[CV] ............................... drop=0.1, l2=0.001, total=   4.6s
[CV] drop=0.1, l2=0.001 ..............................................
[CV] ............................... drop=0.1, l2=0.001, total=   4.5s
[CV] drop=0.1, l2=0.001 ..............................................
[CV] ............................... drop=0.1, l2=0.001, total=   4.5s
[CV] drop=0.1, l2=0.01 ...............................................
[CV] ................................ drop=0.1, l2=0.01, total=   4.7s
[CV] drop=0.1, l2=0.01 ...............................................
[CV] ................................ drop=0.1, l2=0.01, total=   4.5s
[CV] drop=0.1, l2=0.01 ...............................................
[CV] .

[Parallel(n_jobs=1)]: Done  27 out of  27 | elapsed:  2.3min finished


In [15]:
# Mostrar mejor resultado
print("Mejores Precisión: %f con %s" % (grid_result.best_score_, grid_result.best_params_))
means = grid_result.cv_results_['mean_test_score']
stds = grid_result.cv_results_['std_test_score']
params = grid_result.cv_results_['params']
for mean, stdev, param in zip(means, stds, params):
    print("%f (%f) con: %r" % (mean, stdev, param))

Mejores Precisión: 0.854667 con {'drop': 0.3, 'l2': 0.0001}
0.848667 (0.006600) con: {'drop': 0.1, 'l2': 0.0001}
0.841333 (0.011813) con: {'drop': 0.1, 'l2': 0.001}
0.837333 (0.027047) con: {'drop': 0.1, 'l2': 0.01}
0.852667 (0.006799) con: {'drop': 0.2, 'l2': 0.0001}
0.838000 (0.013367) con: {'drop': 0.2, 'l2': 0.001}
0.835333 (0.019137) con: {'drop': 0.2, 'l2': 0.01}
0.854667 (0.009843) con: {'drop': 0.3, 'l2': 0.0001}
0.849333 (0.012472) con: {'drop': 0.3, 'l2': 0.001}
0.832667 (0.022291) con: {'drop': 0.3, 'l2': 0.01}


In [42]:
final_model = build_model(l2 = 0.0001, drop = 0.3)
final_model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_103 (Dense)            (None, 100)               1000100   
_________________________________________________________________
dropout_69 (Dropout)         (None, 100)               0         
_________________________________________________________________
dense_104 (Dense)            (None, 200)               20200     
_________________________________________________________________
dropout_70 (Dropout)         (None, 200)               0         
_________________________________________________________________
dense_105 (Dense)            (None, 2)                 402       
Total params: 1,020,702
Trainable params: 1,020,702
Non-trainable params: 0
_________________________________________________________________


In [43]:
EPOCHS = 20
BATCH_SIZE = 64
history = final_model.fit(x = X_train, y = y_train_cat, batch_size = BATCH_SIZE, epochs = EPOCHS)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


In [46]:
print('Accuracy en test:', final_model.evaluate(X_test, y_test_cat)[1])

Accuracy en test: 0.4780000002384186


Claramente estamos frente a un caso de overfitting.

Probemos aumentar el parámetro de regularización asi como también el porcentaje de Dropout

In [49]:
final_model = build_model(l2 = 0.01, drop = 0.5)
final_model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_109 (Dense)            (None, 100)               1000100   
_________________________________________________________________
dropout_73 (Dropout)         (None, 100)               0         
_________________________________________________________________
dense_110 (Dense)            (None, 200)               20200     
_________________________________________________________________
dropout_74 (Dropout)         (None, 200)               0         
_________________________________________________________________
dense_111 (Dense)            (None, 2)                 402       
Total params: 1,020,702
Trainable params: 1,020,702
Non-trainable params: 0
_________________________________________________________________


In [50]:
EPOCHS = 20
BATCH_SIZE = 64
history = final_model.fit(x = X_train, y = y_train_cat, batch_size = BATCH_SIZE, epochs = EPOCHS)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


In [51]:
print('Accuracy en test:', final_model.evaluate(X_test, y_test_cat)[1])

Accuracy en test: 0.5160000004768371


Al aumentar el parámetro de regularización y el porcentaje de dropouts el modelo generaliza un poco más.

### Ejercicio 2

In [27]:
def transform_input(instances, mapping):
    """
    Replaces the words in instances with their index in mapping.

    Args:
        instances: a list of text instances.
        mapping: an dictionary from words to indices.

    Returns:
        A matrix with shape (n_instances, max_text_length).
    """
    word_indices = []
    for instance in instances:
        word_indices.append([mapping[word.decode('utf-8')]
                             for word in instance.split()])
    # Check consistency
    assert len(instances[0].split()) == len(word_indices[0])

    # Pad the sequences to obtain a matrix instead of a list of lists.
    from keras.preprocessing.sequence import pad_sequences

    return pad_sequences(word_indices)

In [28]:
with open('./filteredFastText', 'rb') as model_file:
    filtered_fasttext = pickle.load(model_file)

In [29]:
def load_dataset2():
    dataset = load_files('./dataset/review_polarity/txt_sentoken',
                         shuffle=False)

    X_train, X_test, y_train, y_test = train_test_split(dataset.data,
                                                        dataset.target,
                                                        test_size=0.1, # antes estaba en 0.25
                                                        random_state=42)
    print('Training samples {}, test_samples {}'
          .format(len(X_train), len(X_test)))

    return X_train, X_test, y_train, y_test

In [30]:
X_train, X_test, y_train, y_test_original = load_dataset2()

Training samples 1800, test_samples 200


In [31]:
X_train = transform_input(X_train, filtered_fasttext.word2index)

In [32]:
# Función que genera los modelos y que se usara en la grilla de validación cruzada.
def build_model2(nodes1 = 100, nodes2 = 200, lr = 0.001, 
                l2 = 0.01, drop = 0.1):
    
    model = Sequential()  
    model.add(
        Embedding(
            filtered_fasttext.wv.shape[0],  # Vocabulary size
            filtered_fasttext.wv.shape[1],  # Embedding size
            weights=[filtered_fasttext.wv],  # Word vectors
            trainable=False  # This indicates the word vectors must not be
        )                    # changed during training.
    )
    model.add(
        Lambda(lambda xin: K.mean(xin, axis=1), name='embedding_average')
    #     Lambda(lambda xin: K.concatenate([K.min(xin, axis=1), K.max(xin, axis=1)]),
    #            name='embedding_min_max')
    )      
    model.add(Dense(nodes1,
                    activation = 'relu',
                    kernel_initializer = 'random_normal', 
                    kernel_regularizer = regularizers.l2(l2)
             )
    )      
    model.add(Dropout(drop))   
    if(nodes2 != 0):
        model.add(Dense(nodes2,
                        activation = 'relu',
                        kernel_initializer = 'random_normal',
                        kernel_regularizer = regularizers.l2(l2)))
        model.add(Dropout(drop))
        model.add(Dense(1, activation = 'sigmoid'))  

    opt = optimizers.Adadelta(lr = lr)
    model.compile(loss = 'binary_crossentropy',
                  optimizer = opt, 
                  metrics = ['accuracy'])

    return model

In [33]:
EPOCHS = 15
BATCH_SIZE = 64
# Construcción del clasificador a partir de la función que crea el modelo
model = KerasClassifier(build_fn = build_model2, epochs = EPOCHS,
                        batch_size = BATCH_SIZE, verbose = 0)

In [34]:
# Definición de los valores de la grilla para la cross validation
nodes1 = [32, 64, 128]
nodes2 = [32, 64, 128, 256]
lrs = [0.001, 0.002, 0.003]
l2s = [0.0001, 0.001, 0.01]
drops = [0.1, 0.2, 0.3]

param_grid = dict(l2 = l2s, drop = drops)
param_grid

{'l2': [0.0001, 0.001, 0.01], 'drop': [0.1, 0.2, 0.3]}

In [37]:
grid = GridSearchCV(estimator = model, param_grid = param_grid, 
                    cv = 3, n_jobs = 1, refit = True, verbose = 2)
grid_result = grid.fit(X_train, y_train)

Fitting 3 folds for each of 9 candidates, totalling 27 fits
[CV] drop=0.1, l2=0.0001 .............................................
[CV] .............................. drop=0.1, l2=0.0001, total=   6.9s
[CV] drop=0.1, l2=0.0001 .............................................


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    7.2s remaining:    0.0s


[CV] .............................. drop=0.1, l2=0.0001, total=   8.4s
[CV] drop=0.1, l2=0.0001 .............................................
[CV] .............................. drop=0.1, l2=0.0001, total=  10.3s
[CV] drop=0.1, l2=0.001 ..............................................
[CV] ............................... drop=0.1, l2=0.001, total=  12.6s
[CV] drop=0.1, l2=0.001 ..............................................
[CV] ............................... drop=0.1, l2=0.001, total=  13.3s
[CV] drop=0.1, l2=0.001 ..............................................
[CV] ............................... drop=0.1, l2=0.001, total=  13.3s
[CV] drop=0.1, l2=0.01 ...............................................
[CV] ................................ drop=0.1, l2=0.01, total=  13.2s
[CV] drop=0.1, l2=0.01 ...............................................
[CV] ................................ drop=0.1, l2=0.01, total=  13.0s
[CV] drop=0.1, l2=0.01 ...............................................
[CV] .

[Parallel(n_jobs=1)]: Done  27 out of  27 | elapsed:  6.3min finished


In [38]:
# Mostrar mejor resultado
print("Mejores Precisión: %f con %s" % (grid_result.best_score_, grid_result.best_params_))
means = grid_result.cv_results_['mean_test_score']
stds = grid_result.cv_results_['std_test_score']
params = grid_result.cv_results_['params']
for mean, stdev, param in zip(means, stds, params):
    print("%f (%f) con: %r" % (mean, stdev, param))

Mejores Precisión: 0.513889 con {'drop': 0.3, 'l2': 0.0001}
0.481667 (0.008498) con: {'drop': 0.1, 'l2': 0.0001}
0.486111 (0.014678) con: {'drop': 0.1, 'l2': 0.001}
0.481667 (0.008498) con: {'drop': 0.1, 'l2': 0.01}
0.485556 (0.005666) con: {'drop': 0.2, 'l2': 0.0001}
0.483333 (0.007201) con: {'drop': 0.2, 'l2': 0.001}
0.503889 (0.019830) con: {'drop': 0.2, 'l2': 0.01}
0.513889 (0.014678) con: {'drop': 0.3, 'l2': 0.0001}
0.462222 (0.033948) con: {'drop': 0.3, 'l2': 0.001}
0.498333 (0.018708) con: {'drop': 0.3, 'l2': 0.01}


In [40]:
final_model = build_model2(l2 = 0.0001, drop = 0.3)
final_model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_31 (Embedding)     (None, None, 300)         15276000  
_________________________________________________________________
embedding_average (Lambda)   (None, 300)               0         
_________________________________________________________________
dense_85 (Dense)             (None, 100)               30100     
_________________________________________________________________
dropout_57 (Dropout)         (None, 100)               0         
_________________________________________________________________
dense_86 (Dense)             (None, 200)               20200     
_________________________________________________________________
dropout_58 (Dropout)         (None, 200)               0         
_________________________________________________________________
dense_87 (Dense)             (None, 1)                 201       
Total para

In [42]:
EPOCHS = 20
BATCH_SIZE = 64
history = final_model.fit(x = X_train, y = y_train, batch_size = BATCH_SIZE, epochs = EPOCHS)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


In [49]:
X_test = transform_input(X_test, filtered_fasttext.word2index)

In [52]:
print('Accuracy en test:', final_model.evaluate(X_test, y_test_original)[1])

Accuracy en test: 0.495
