# Análisis del Archivo execise_2.py

In [12]:
import argparse
import keras.backend as K
# K is just another name for the keras backend: tensorflow (or theano,
# if you are using a different backend).
import pandas
import pickle

from keras.layers import Embedding, Average, Lambda
from keras.models import Sequential
from sklearn.datasets import load_files
from sklearn.model_selection import train_test_split
from utils import FilteredFastText

import numpy as np
import datetime
import os

from tensorflow.python.keras.utils import np_utils   # for tf 1.3.1
#from tensorflow.python.keras import utils as np_utils     # for tf 1.4.1

from keras.models import Sequential 
from keras.layers import Dense, Activation, Dropout
from keras import optimizers, regularizers


from itertools import repeat
from IPython.display import display, HTML

from printutils import print_message, print_new_process, print_end


In [13]:

def read_args(arguments=None):
    """Parse the options of the experiment.

    Args:
        arguments: optional list of argument strings, e.g.
            ``['--epochs=15', '--batch_size=100']``. When omitted, a
            module-level ``arguments`` list is used if one is defined
            (notebook usage); otherwise argparse reads ``sys.argv``.

    Returns:
        The ``argparse.Namespace`` holding the parsed options.

    Raises:
        SystemExit: raised by argparse for invalid options, including the
            case where ``--num_units`` and ``--dropout`` do not have the
            same number of values.
    """
    parser = argparse.ArgumentParser(description='Exercise 2')
    # Classifier parameters.
    parser.add_argument('--num_units', nargs='+', default=[100], type=int,
                        help='Number of hidden units of each hidden layer.')
    parser.add_argument('--dropout', nargs='+', default=[0.5], type=float,
                        help='Dropout ratio for every layer.')
    parser.add_argument('--batch_size', type=int, default=32,
                        help='Number of instances in each batch.')
    parser.add_argument('--experiment_name', type=str, default=None,
                        help='Name of the experiment, used in the filename '
                             'where the results are stored.')
    parser.add_argument('--embeddings_filename', type=str,
                        help='Name of the file with the embeddings.')

    # Training / run parameters.
    parser.add_argument('--model', type=int, default=10,
                        help='Number of model to run')
    parser.add_argument('--epochs', type=int, default=10,
                        help='Number of epochs')
    parser.add_argument('--shuffle', type=str, default='batch',
                        help='Shuffle value')
    parser.add_argument('--random_seed', type=int, default=10,
                        help='Random seed number')
    parser.add_argument('--verbose', type=int, default=1,
                        help='Verbose info on screen')

    if arguments is None:
        # Notebook usage: fall back to a module-level ``arguments`` list when
        # one exists. ``globals().get`` avoids the NameError the bare name
        # raises when the list was never defined (e.g. run as a script).
        arguments = globals().get('arguments')

    # ``parse_args(None)`` makes argparse read sys.argv[1:].
    args = parser.parse_args(arguments)

    # Every hidden layer needs its own dropout ratio. Reported through
    # argparse instead of ``assert`` so the check survives ``python -O``.
    if len(args.num_units) != len(args.dropout):
        parser.error('--num_units and --dropout must have the same number '
                     'of values.')
    return args



In [14]:
def load_dataset():
    """Load the review corpus and split it into train and test partitions.

    The files are read from ``dataset/txt_sentoken`` without shuffling and
    then split with a fixed ``random_state`` so the partition is the same on
    every run; 25% of the instances go to the test set.

    Returns:
        A tuple ``(X_train, X_test, y_train, y_test)``.
    """
    corpus = load_files('dataset/txt_sentoken', shuffle=False)

    split = train_test_split(corpus.data, corpus.target,
                             test_size=0.25, random_state=42)
    X_train, X_test, y_train, y_test = split

    summary = 'Training samples {}, test_samples {}'
    print(summary.format(len(X_train), len(X_test)))

    return X_train, X_test, y_train, y_test



In [15]:
def transform_input(instances, mapping):
    """Turn text instances into a padded matrix of word indices.

    Every instance is split on whitespace, each token is decoded as UTF-8
    and looked up in ``mapping``. A token missing from ``mapping`` raises
    ``KeyError``.

    Args:
        instances: a list of text instances whose tokens support
            ``.decode('utf-8')``.
        mapping: a dictionary from words to indices.

    Returns:
        A matrix with shape (n_instances, max_text_length).
    """
    indexed = [
        [mapping[token.decode('utf-8')] for token in text.split()]
        for text in instances
    ]

    # Sanity check: the first instance keeps one index per token.
    first_tokens = instances[0].split()
    assert len(first_tokens) == len(indexed[0])

    # Padding turns the ragged list of lists into a rectangular matrix.
    from keras.preprocessing.sequence import pad_sequences
    return pad_sequences(indexed)




In [16]:
def _build_model(embedding_matrix, num_units, dropout, num_classes):
    """Build and compile the averaged-embedding feed-forward classifier.

    Args:
        embedding_matrix: 2-D array of word vectors; its shape gives the
            vocabulary size and the embedding size.
        num_units: number of units of each hidden layer.
        dropout: dropout ratio applied after each hidden layer, paired
            element-wise with ``num_units``.
        num_classes: size of the softmax output layer.

    Returns:
        The compiled Keras ``Sequential`` model.
    """
    model = Sequential()
    # The network input is the matrix of word indices; the Embedding layer
    # maps every index to its (frozen) pre-trained word vector.
    model.add(Embedding(
        embedding_matrix.shape[0],  # Vocabulary size
        embedding_matrix.shape[1],  # Embedding size
        weights=[embedding_matrix],  # Word vectors
        trainable=False  # The word vectors must not change during training.
    ))
    # The Embedding output has shape (batch_size, words_in_review,
    # embedding_size). A Dense layer needs 2 dimensions, so every review is
    # collapsed into a single vector by averaging over the word axis.
    model.add(Lambda(lambda xin: K.mean(xin, axis=1),
                     name='embedding_average'))

    # One Dense + ReLU + Dropout group per (units, ratio) pair, so every
    # value given to --num_units / --dropout is actually used.
    for units, ratio in zip(num_units, dropout):
        model.add(Dense(units))
        model.add(Activation('relu'))
        model.add(Dropout(ratio))

    model.add(Dense(num_classes))
    model.add(Activation('softmax'))

    model.compile(loss='categorical_crossentropy',
                  optimizer=optimizers.Adagrad(),
                  metrics=['accuracy'])
    return model


def _save_results(args, experiment_number, accuracy, true_labels,
                  predictions):
    """Write the run parameters and the test predictions to CSV files.

    Args:
        args: parsed options; they are stored as Parameter/Value rows and
            also passed to the logging helpers.
        experiment_number: timestamp string identifying the run.
        accuracy: test accuracy, embedded in both filenames.
        true_labels: true class of every test instance.
        predictions: predicted class of every test instance.
    """
    parameters_path = 'results/parameters/'
    predictions_path = 'results/predictions/'
    for path in (parameters_path, predictions_path):
        if not os.path.exists(path):
            os.makedirs(path)

    suffix = '{0}_acc_{1:.6f}.csv'.format(experiment_number, accuracy)

    parameters_filename = os.path.join(parameters_path,
                                       'parameters_' + suffix)
    print_new_process('Saving parameters: {}'.format(parameters_filename),
                      args)
    # Build the frame in one go: DataFrame.append was removed in pandas 2.0
    # and re-allocated the whole frame on every row.
    rows = [(str(key), str(value))
            for key, value in sorted(vars(args).items())]
    parameters_df = pandas.DataFrame(rows, columns=['Parameter', 'Value'])
    parameters_df.to_csv(parameters_filename, index=False)

    predictions_filename = os.path.join(predictions_path,
                                        'predictions_' + suffix)
    print_new_process('Saving predictions: {}'.format(predictions_filename),
                      args)
    predictions_df = pandas.DataFrame(
        {'true_label': true_labels, 'predicted': predictions},
        columns=['true_label', 'predicted'])
    predictions_df.to_csv(predictions_filename, index=False)


def main():
    """Train, evaluate and store one run of the review classifier.

    Reads the options with ``read_args``, loads the dataset and the pickled
    embeddings, trains the averaged-embedding network, evaluates it on the
    test split and writes parameters and predictions under ``results/``.

    Raises:
        ValueError: if ``--embeddings_filename`` was not given.
    """
    # Load params
    args = read_args()
    if not args.embeddings_filename:
        # Fail early with a clear message instead of reaching open(None).
        raise ValueError('--embeddings_filename is required to load the '
                         'word vectors.')

    # Timestamp that identifies this experiment in the result filenames.
    experiment_number = datetime.datetime.now().strftime("%Y%m%d%H%M%S")

    # Seed numpy to make runs easier to reproduce.
    np.random.seed(args.random_seed)

    # Load dataset
    X_train, X_test, y_train, y_test_original = load_dataset()

    # Convert the labels to categorical (one-hot) vectors.
    num_classes = 2
    y_train_cat = np_utils.to_categorical(y_train, num_classes)
    y_test_cat = np_utils.to_categorical(y_test_original, num_classes)

    # Load the filtered FastText word vectors, restricted to the vocabulary
    # of the movie reviews dataset.
    # NOTE(security): pickle.load executes arbitrary code from the file; only
    # load embeddings files that come from a trusted source.
    with open(args.embeddings_filename, 'rb') as model_file:
        filtered_fasttext = pickle.load(model_file)

    # Each review is represented by the indices of its words in the
    # embedding matrix; the model itself combines the word vectors.
    X_train_vectorized = transform_input(X_train,
                                         filtered_fasttext.word2index)
    X_test_vectorized = transform_input(X_test,
                                        filtered_fasttext.word2index)

    model = _build_model(filtered_fasttext.wv, args.num_units, args.dropout,
                         num_classes)

    # Show model info on screen
    print_message('MODEL:', args)
    # NOTE(review): model.summary() prints the table itself and returns None,
    # so this presumably logs a "None" message - confirm against
    # print_message before changing it.
    print_message(model.summary(), args)

    # Fit the model
    print_new_process('Fit:', args)
    # NOTE(review): Keras documents shuffle='batch' as a special mode for
    # HDF5 data - confirm it is the intended default for in-memory arrays.
    model.fit(X_train_vectorized, y_train_cat,
              batch_size=args.batch_size,
              epochs=args.epochs,
              shuffle=args.shuffle,
              verbose=1)

    # Predict the class of every test instance. argmax over the softmax
    # output is what Sequential.predict_classes computed; predict_classes is
    # no longer available in recent Keras releases.
    print_new_process('Predictions:', args)
    probabilities = model.predict(X_test_vectorized, verbose=1)
    predictions = np.argmax(probabilities, axis=-1)

    if args.verbose == 1:
        display(str(list(predictions)))

    print_new_process('Test:', args)
    if args.verbose == 1:
        display(str(list(y_test_original)))

    print_new_process('Performance:', args)
    score, accuracy = model.evaluate(X_test_vectorized, y_test_cat)
    print_message('[score, accuracy]', args)
    print_message([score, accuracy], args)

    # Save the results.
    _save_results(args, experiment_number, accuracy, y_test_original,
                  predictions)

    print_message('Done.', args)
    print_end('STOP.', args)



#if __name__ == '__main__':
#    main()



In [17]:
# Test arguments for this notebook run. read_args() reads this module-level
# list instead of the command line.
#
# Alternative configuration kept for reference:
#     arguments = ['--num_units=512',
#                  '--dropout=0.5',
#                  '--batch_size=100',
#                  '--epochs=15',
#                  '--shuffle=batch',
#                  '--random_seed=10',
#                  '--verbose=1'
#                 ]
arguments = [
    '--num_units=300',
    '--dropout=0.02',
    '--batch_size=100',
    '--embeddings_filename=filtered_wiki-news-300d-50k.vec',
    '--epochs=15',
    '--shuffle=batch',
    '--random_seed=10',
    '--verbose=1',
]

# Run the main procedure.
main()

Training samples 1500, test_samples 500
2018-09-22 09:02:06 - MODEL:
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, None, 300)         15276000  
_________________________________________________________________
embedding_average (Lambda)   (None, 300)               0         
_________________________________________________________________
dense_1 (Dense)              (None, 300)               90300     
_________________________________________________________________
activation_1 (Activation)    (None, 300)               0         
_________________________________________________________________
dropout_1 (Dropout)          (None, 300)               0         
_________________________________________________________________
dense_2 (Dense)              (None, 2)                 602       
_________________________________________________________________
activat

'[1, 1, 1, 0, 0, 1, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 1, 1, 0, 0, 0, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 1, 1, 1, 0, 1, 0, 1, 0, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 1, 1, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 1, 1, 1, 1, 0, 0, 1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 0, 0, 1, 0, 1, 0, 1, 1, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 1, 1, 1, 0, 0, 1, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 1, 1, 1, 1, 0, 0, 1, 0, 0, 0, 1, 1, 0, 1, 0, 1, 0, 1, 0, 1, 1, 0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 1, 1, 1, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 1,

2018-09-22 09:02:19 ------------------------------------------------------------------------
2018-09-22 09:02:19 - Test:


'[1, 0, 1, 0, 1, 1, 0, 1, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 1, 1, 0, 1, 1, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 1, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 1, 1, 1, 1, 0, 1, 0, 1, 1, 1, 1, 0, 1, 0, 1, 1, 0, 1, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 0, 0, 1, 1, 0, 1, 1, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 1, 0, 0, 1, 1, 1, 1, 0, 0, 1, 0, 0, 1, 0, 1, 1, 0, 0, 0, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 1, 0, 0, 0, 1, 1, 1, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 0, 1, 0, 1, 1, 0, 1, 1, 0, 0, 1, 1, 1, 1, 0, 0, 1, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 1, 1, 0, 1, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0, 0, 1, 1, 1, 0, 0, 1, 0, 1, 0, 1, 1, 1, 0, 1, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 1, 1, 0, 1, 0, 1, 1, 0, 0, 1, 1, 0, 1, 0, 1, 1, 1, 0, 0, 0, 1, 1, 1, 0, 1, 1, 1, 0, 0, 1, 0, 1, 1, 0, 1, 1, 0, 0, 1, 0, 0, 1, 1, 1, 1, 0, 0, 0, 1, 1, 0, 0, 0, 1, 1, 0, 1, 1, 1, 0, 0, 1, 1, 1, 0, 1, 0, 1, 1, 1, 1, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 1,

2018-09-22 09:02:19 ------------------------------------------------------------------------
2018-09-22 09:02:19 - Performance:
2018-09-22 09:02:19 - [score, accuracy]
2018-09-22 09:02:19 - [0.6960780019760132, 0.5500000009536743]
2018-09-22 09:02:19 ------------------------------------------------------------------------
2018-09-22 09:02:19 - Saving parameters: results/parameters/parameters_20180922090205_acc_0.550000.csv
2018-09-22 09:02:19 ------------------------------------------------------------------------
2018-09-22 09:02:19 - Saving predictions: results/predictions/predictions_20180922090205_acc_0.550000.csv
2018-09-22 09:02:19 - Done.
2018-09-22 09:02:19 ------------------------------------------------------------------------
2018-09-22 09:02:19 - STOP.
2018-09-22 09:02:19 ------------------------------------------------------------------------
