In [130]:
"""
Exercise 2
"""
import pandas
import pickle
import argparse

import keras.backend as K
# K is just another name for the keras backend: tensorflow (or theano,
# if you are using a different backend).
from keras.layers import Embedding, Average, Lambda, Dense, Dropout
from keras.models import Sequential
from keras import utils, optimizers, regularizers
from IPython.display import SVG
from keras.utils.vis_utils import model_to_dot

from sklearn.datasets import load_files
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score

from utils import FilteredFastText

In [89]:
def load_dataset(dataset_path='./dataset/review_polarity/txt_sentoken',
                 test_size=0.1, random_state=42):
    """
    Loads the review polarity dataset and splits it into train and test.

    Args:
        dataset_path: directory handed to sklearn's load_files. Defaults to
            the location this notebook has always read from.
        test_size: fraction of the instances held out for testing
            (default 0.1; it was previously 0.25).
        random_state: seed for the split, so the partition is reproducible.

    Returns:
        A tuple (X_train, X_test, y_train, y_test) exactly as returned by
        train_test_split: the X values hold the raw review texts and the y
        values the matching class labels.
    """
    # shuffle=False: the only shuffling is the seeded one done by
    # train_test_split below.
    dataset = load_files(dataset_path, shuffle=False)

    X_train, X_test, y_train, y_test = train_test_split(
        dataset.data,
        dataset.target,
        test_size=test_size,
        random_state=random_state)
    print('Training samples {}, test_samples {}'
          .format(len(X_train), len(X_test)))

    return X_train, X_test, y_train, y_test

In [90]:
def transform_input(instances, mapping):
    """
    Replaces the words in instances with their index in mapping.

    Each instance is split on whitespace and every resulting token is
    looked up in mapping. Tokens that are bytes are decoded as UTF-8 before
    the lookup; tokens that are already str are used as they are.

    Args:
        instances: a list of text instances (bytes or str).
        mapping: a dictionary from words to indices.

    Returns:
        A matrix with shape (n_instances, max_text_length), built with
        keras' pad_sequences using its default padding settings.

    Raises:
        KeyError: if a token is not a key of mapping.
    """
    # Imported up front so a missing keras installation fails before any
    # work is done.
    from keras.preprocessing.sequence import pad_sequences

    word_indices = []
    for instance in instances:
        # Split first and decode afterwards, so bytes instances are
        # tokenized exactly as before.
        word_indices.append([
            mapping[word.decode('utf-8') if isinstance(word, bytes) else word]
            for word in instance.split()
        ])

    # Check consistency: one index sequence per instance.
    assert len(word_indices) == len(instances)

    # Pad the sequences to obtain a matrix instead of a list of lists.
    # NOTE(review): pad_sequences fills with 0 by default; if mapping assigns
    # index 0 to a real word, padding is indistinguishable from that word --
    # TODO confirm against the vocabulary.
    return pad_sequences(word_indices)

### Pipeline

In [91]:
# Experiment configuration. It is hard-coded here instead of being parsed
# from the command line.
# args = read_args()
args = {'num_units': 100, 'dropout': 0.5, 'batch_size': 32, 'epochs': 10,
        'experiment_name': 'mlp_test', 'embeddings_filename': 'filteredFastText'}
X_train, X_test, y_train, y_test_original = load_dataset()
# TODO 1: Convert the labels to categorical -- DONE
# The conversion below is disabled, so the labels stay as integer class ids.
# The network defined later in this notebook ends in a single sigmoid unit
# trained with binary cross-entropy.
num_classes = 2 # POS and NEG
print(y_train[0])
# y_train = utils.to_categorical(y_train, num_classes)
# y_test_original = utils.to_categorical(y_test_original, num_classes)
# Load the filtered FastText word vectors, using only the vocabulary in
# the movie reviews dataset
# NOTE(review): pickle.load can execute arbitrary code stored in the file;
# only load an embeddings file that comes from a trusted source.
with open(args['embeddings_filename'], 'rb') as model_file:
    filtered_fasttext = pickle.load(model_file)

Training samples 1800, test_samples 200
1


In [92]:
# Sanity check: show the first two training labels and the length of the
# embedding vector returned for one word.
print(y_train[0])
print(y_train[1])
len(filtered_fasttext.get_vector('hello'))

1
0


300

In [93]:
# Peek at the first five whitespace-separated tokens of the first training
# review (the recorded output shows they are bytes, not str).
X_train[0].split()[:5]

[b'robert', b'redford', b'is', b'very', b'good']

In [94]:
# The next thing to do is to choose how we are going to represent our
# training matrix. Each review must be translated into a single vector.
# This means we have to combine, somehow, the word vectors of each
# word in the review. Some options are:
#  - Take the average of all vectors.
#  - Take the minimum and maximum value of each feature.
# All these operations are vector operations and are easier to compute on
# a GPU, so it is better to put them inside the Keras model.

# The Embedding layer will be quite handy in solving this problem for us.
# To use this layer, the input to the network has to be the indices of the
# words in the embedding matrix.
# NOTE: this rebinds X_train from the raw texts to the matrix of indices,
# so this cell cannot be run twice in a row; re-run the cell that calls
# load_dataset() first.
X_train = transform_input(X_train, filtered_fasttext.word2index)

In [117]:
# The input is ready, so assemble the network. The layers are handed to
# Sequential as a list and are stacked in the order they appear.
#
# TODO 2: Finish the Keras model -- all layers are listed below.
model = Sequential([
    # Lookup table initialised with the pre-trained word vectors.
    # trainable=False keeps the vectors fixed during training.
    # Output shape: (batch_size (?), words_in_reviews (?), embedding_size)
    Embedding(
        input_dim=filtered_fasttext.wv.shape[0],   # Vocabulary size
        output_dim=filtered_fasttext.wv.shape[1],  # Embedding size
        weights=[filtered_fasttext.wv],            # Word vectors
        trainable=False,
    ),

    # A Dense layer needs a 2-dimensional input, so every document has to be
    # collapsed into a single vector. This is done with a backend (K)
    # operation wrapped in a Lambda layer so that it becomes part of the
    # model: the mean of the word embeddings over the second dimension.
    # Output shape: (batch_size (?), embedding_size)
    #
    # Alternative representation (disabled): concatenate the per-feature
    # minimum and maximum instead of averaging.
    #     Lambda(lambda xin: K.concatenate([K.min(xin, axis=1), K.max(xin, axis=1)]),
    #            name='embedding_min_max')
    Lambda(lambda xin: K.mean(xin, axis=1), name='embedding_average'),

    # Two small hidden layers. A Dropout(0.5) layer between them was tried
    # and is currently disabled.
    Dense(units=10, activation='relu'),
    # Dropout(0.5),
    Dense(units=10, activation='relu'),

    # Single sigmoid output unit.
    Dense(units=1, activation='sigmoid'),
])

In [118]:
# Print the layer-by-layer summary (output shapes and parameter counts).
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_19 (Embedding)     (None, None, 300)         15276000  
_________________________________________________________________
embedding_average (Lambda)   (None, 300)               0         
_________________________________________________________________
dense_59 (Dense)             (None, 10)                3010      
_________________________________________________________________
dense_60 (Dense)             (None, 10)                110       
_________________________________________________________________
dense_61 (Dense)             (None, 1)                 11        
Total params: 15,279,131
Trainable params: 3,131
Non-trainable params: 15,276,000
_________________________________________________________________


In [132]:
# SVG(model_to_dot(model).create(prog='dot', format='svg'))

In [133]:
# Configure training: loss, optimizer and the metrics to report.
model.compile(loss='binary_crossentropy', # otherwise, try the categorical variant
              optimizer=optimizers.Adagrad(lr=0.001, decay=0.0001), 
              # The string "Adagrad" could be passed instead to use the default parameters
              metrics=['accuracy'])  # The metric is tracked in addition to the loss

In [134]:
# TODO 3: Fit the model -- DONE
# Batch size and number of epochs both come from the experiment
# configuration in `args`, so changing the configuration is enough to
# change the training run. A tenth of the training data is held out for
# validation.
history = model.fit(X_train, y_train,
                    batch_size=args['batch_size'],
                    epochs=args['epochs'],
                    validation_split=0.1,
                    verbose=1)

Train on 1620 samples, validate on 180 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [128]:
# history.model.get_weights()

[array([[-0.7438    , -0.1831    ,  0.2255    , ..., -0.0414    ,
          0.2202    ,  0.0802    ],
        [-0.0995    , -0.034     , -0.2414    , ...,  0.0221    ,
         -0.3485    , -0.0184    ],
        [ 0.02206548, -0.3473758 , -1.1992983 , ..., -0.0711825 ,
          0.36806315, -2.5473688 ],
        ...,
        [-0.1464    ,  0.0687    , -0.1861    , ..., -0.1806    ,
          0.1267    ,  0.1475    ],
        [ 0.2922    , -0.2796    , -0.6978    , ..., -0.0651    ,
          0.3773    ,  0.2411    ],
        [-0.3755    ,  0.189     ,  0.1702    , ...,  0.1788    ,
         -0.1416    , -0.1588    ]], dtype=float32),
 array([[ 0.02506906,  0.10300997,  0.09203143, ...,  0.03028385,
          0.00413218, -0.01831461],
        [ 0.04128695, -0.0692794 ,  0.08152859, ...,  0.04899672,
         -0.06285532, -0.1115238 ],
        [-0.07333675, -0.04321398,  0.07058871, ...,  0.04996576,
         -0.10683268,  0.01793701],
        ...,
        [ 0.0366157 ,  0.11417486,  0.0

In [None]:
# TODO 4: Evaluate the model, calculating the metrics. -- DONE

# The test reviews must go through the same word -> index transformation as
# the training reviews before they can be fed to the network.
X_test_indices = transform_input(X_test, filtered_fasttext.word2index)

# Option 1: Use the model.evaluate() method. For this, the model must be
#           already compiled with the metrics.
# performance = model.evaluate(X_test_indices, y_test_original)

# Option 2: Use the model.predict() method and calculate the metrics using
#           sklearn. We recommend this, because you can store
#           the predictions if you need more analysis later.
#           Also, if you calculate the metrics on a notebook,
#           then you can compare multiple classifiers.
#
# The network ends in a single sigmoid unit, so predict() returns one
# probability per review. sklearn's metrics need class labels, so the
# probabilities are flattened to one dimension and thresholded at 0.5.
probabilities = model.predict(X_test_indices).ravel()
predictions = (probabilities > 0.5).astype(int)

accuracy = accuracy_score(y_test_original, predictions)
# Stored under a different name so that the imported f1_score function is
# not overwritten and this cell can be re-run.
test_f1 = f1_score(y_test_original, predictions)

print('accuracy in test:', accuracy)
print('f1_score in test:', test_f1)

# TODO 5: Save the results.
# ...

# One way to store the predictions:
results = pandas.DataFrame(y_test_original, columns=['true_label'])
results['predicted'] = predictions
# args is a plain dict, so the experiment name is read by key.
results.to_csv('predictions_{}.csv'.format(args['experiment_name']),
               index=False)
