# Análisis del archivo exercise_1.py:

Primera parte del archivo:

In [1]:
# Exercise 1
import argparse
import pandas
import numpy as np
import datetime
import os

from keras.models import Sequential
from tensorflow.python.keras.utils import np_utils   # for tf 1.3.1
#from tensorflow.python.keras import utils as np_utils     # for tf 1.4.1

from sklearn.datasets import load_files
from sklearn.model_selection import train_test_split
from itertools import repeat
from IPython.display import display, HTML

from printutils import print_message, print_new_process, print_end


  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


In [2]:
def read_args(arguments=None):
    """Parse the options of Exercise 1.

    Args:
        arguments: Optional list of argument strings to parse. When it is
            ``None``, a module-level ``arguments`` list is used if one is
            defined (this is how the notebook injects its test arguments);
            if there is none, argparse falls back to ``sys.argv[1:]``.

    Returns:
        The ``argparse.Namespace`` holding the parsed options.

    Raises:
        SystemExit: raised by argparse on a parsing error, or when
            ``--num_units`` and ``--dropout`` have different lengths.
    """
    parser = argparse.ArgumentParser(description='Exercise 1')
    # Here you have some examples of classifier parameters. You can add
    # more arguments or change these if you need to.
    parser.add_argument('--num_units', nargs='+', default=[100], type=int,
                        help='Number of hidden units of each hidden layer.')
    parser.add_argument('--dropout', nargs='+', default=[0.5], type=float,
                        help='Dropout ratio for every layer.')
    parser.add_argument('--batch_size', type=int, default=32,
                        help='Number of instances in each batch.')

    # New parameters:
    parser.add_argument('--model', type=int, default=10, help='Number of model to run')
    parser.add_argument('--max_features', type=int, default=2000, help='Max number of words used inTfidfVectorizer')
    parser.add_argument('--epochs', type=int, default=10, help='Number of epochs')
    parser.add_argument('--shuffle', type=str, default='batch', help='Shuffle value')
    parser.add_argument('--random_seed', type=int, default=10, help='Random seed number')
    parser.add_argument('--verbose', type=int, default=1, help='Verbose info on screen')

    # Keep the notebook workflow working: an explicit argument wins, then a
    # module-level ``arguments`` list, and finally the real command line
    # (parse_args(None) reads sys.argv[1:]).
    if arguments is None:
        arguments = globals().get('arguments')
    args = parser.parse_args(arguments)

    # Each hidden layer needs its own dropout ratio. Report the mismatch as a
    # usage error instead of an assert, which is stripped under ``python -O``.
    if len(args.num_units) != len(args.dropout):
        parser.error('--num_units and --dropout must have the same number '
                     'of values')
    return args

# Apply the Tfidf vectorizer to create input matrix
Creamos una función para vectorizar la entrada con TF-IDF:

In [3]:
 from sklearn.feature_extraction.text import TfidfVectorizer
    
# TODO 1: Apply the Tfidf vectorizer to create input matrix
def vectorize_input(x_train, x_test,args):
    """Convert the raw train and test documents into dense TF-IDF matrices.

    Args:
        x_train: Training documents; used to fit the vectorizer.
        x_test: Test documents; only transformed with the fitted vectorizer.
        args: Parsed options; ``args.max_features`` caps the vocabulary size
            and ``args`` is forwarded to ``print_message``.

    Returns:
        A tuple ``(x_train_vec, x_test_vec)`` of dense arrays.
    """
    vectorizer = TfidfVectorizer(analyzer='word', use_idf=True, max_features=args.max_features)

    # The vocabulary and IDF weights are learned from the training data only.
    x_train_vec = vectorizer.fit_transform(x_train).toarray()
    print_message('x_train_vec - type: {}, shape:{}'.format(type(x_train_vec),x_train_vec.shape),args)

    # Only transform here: refitting on the test set would build a different
    # vocabulary, so the test columns would no longer correspond to the
    # features the model is trained on.
    x_test_vec = vectorizer.transform(x_test).toarray()
    print_message('x_test_vec - type: {}, shape:{}'.format(type(x_test_vec),x_test_vec.shape),args)

    return x_train_vec, x_test_vec

#### Función: load_dataset()
La misma función del archivo original, que ahora llama a nuestra nueva rutina `vectorize_input`:

In [4]:
def load_dataset(args):
    """Load the review corpus, split it and vectorize both partitions.

    The files under ``dataset/txt_sentoken`` are read with ``load_files``,
    split 75/25 with a fixed ``random_state`` and passed to
    ``vectorize_input``.

    Args:
        args: Parsed options, forwarded to the logging helpers and to
            ``vectorize_input``.

    Returns:
        A tuple ``(x_train_vec, x_test_vec, y_train, y_test)``: the two
        vectorized inputs followed by the matching label splits.
    """
    print_new_process('Load and vectorize Data:', args)

    corpus = load_files('dataset/txt_sentoken', shuffle=False)
    docs_train, docs_test, labels_train, labels_test = train_test_split(
        corpus.data,
        corpus.target,
        test_size=0.25,
        random_state=42,
    )

    split_sizes = 'Training samples {}, test_samples {}'.format(
        len(docs_train), len(docs_test))
    print_message(split_sizes, args)

    # TODO 1: Apply the Tfidf vectorizer to create input matrix
    train_matrix, test_matrix = vectorize_input(docs_train, docs_test, args)

    return train_matrix, test_matrix, labels_train, labels_test

## Build the Keras models:

### Model 1: 

In [5]:
from keras.models import Sequential 
from keras.layers import Dense, Activation, Dropout
from keras import optimizers, regularizers

def build_keras_model_1( x_train_vec, args ):
    """Build and compile model 1: one hidden ReLU layer followed by dropout.

    Args:
        x_train_vec: Training matrix; its second dimension sets the input
            size of the first layer.
        args: Parsed options. Only the first entries of ``args.num_units``
            and ``args.dropout`` are used.

    Returns:
        The compiled ``Sequential`` model with a 2-unit softmax output.
    """
    input_size = x_train_vec.shape[1]

    model = Sequential()
    model.add(Dense( args.num_units[0], input_shape=(input_size,)))
    model.add(Activation('relu'))

    model.add(Dropout(args.dropout[0]))

    # Two output units, one per class of the one-hot encoded labels.
    model.add(Dense(2))
    model.add(Activation('softmax'))

    model.compile(loss='categorical_crossentropy',
                  optimizer=optimizers.Adagrad(),
                  metrics=['accuracy'])

    # Show model info on screen.
    # model.summary() prints by itself and returns None, so passing its
    # result to print_message would only log "None"; feed each summary line
    # to print_message instead.
    print_message('MODEL 1:', args )
    model.summary(print_fn=lambda line: print_message(line, args))

    return model

### Model 2: 

In [6]:
def build_keras_model_2( x_train_vec, args ):
    """Build and compile model 2 (placeholder: no layers are defined yet).

    Args:
        x_train_vec: Training matrix; its second dimension is the input size
            the layers are expected to use once they are added.
        args: Parsed options, forwarded to ``print_message``.

    Returns:
        The compiled ``Sequential`` model.
    """
    input_size = x_train_vec.shape[1]

    model = Sequential()

    # ...

    model.compile(loss='categorical_crossentropy',
                  optimizer=optimizers.Adagrad(),
                  metrics=['accuracy'])

    # Show model info on screen.
    # print_message is called with ``args`` like everywhere else in the file,
    # and the summary is routed through print_fn because model.summary()
    # itself returns None.
    print_message('MODEL 2:', args )
    model.summary(print_fn=lambda line: print_message(line, args))

    return model

### Model 3:

In [7]:
def build_keras_model_3( x_train_vec, args ):
    """Build and compile model 3 (placeholder: no layers are defined yet).

    Args:
        x_train_vec: Training matrix; its second dimension is the input size
            the layers are expected to use once they are added.
        args: Parsed options, forwarded to ``print_message``.

    Returns:
        The compiled ``Sequential`` model.
    """
    input_size = x_train_vec.shape[1]

    model = Sequential()

    # ...

    model.compile(loss='categorical_crossentropy',
                  optimizer=optimizers.Adagrad(),
                  metrics=['accuracy'])

    # Show model info on screen.
    # print_message is called with ``args`` like everywhere else in the file,
    # and the summary is routed through print_fn because model.summary()
    # itself returns None.
    print_message('MODEL 3:', args )
    model.summary(print_fn=lambda line: print_message(line, args))

    return model

## Main:

Ajustamos main para llamar a las rutinas anteriores:

In [8]:
def main():
    """Run one experiment: load data, build, train, evaluate and save.

    The options come from ``read_args()``. The parameters and the test
    predictions are written as CSV files under ``results/parameters/`` and
    ``results/predictions/``, tagged with a timestamp and the test accuracy.

    Raises:
        ValueError: if ``--model`` does not name one of the available
            model builders.
    """
    # Timestamp that identifies the output files of this run.
    experiment_number = datetime.datetime.now().strftime("%Y%m%d%H%M%S")

    args = read_args()

    print_new_process('START:', args )

    # Resolve the model builder before any expensive work so that an unknown
    # model number fails immediately with a clear message.
    model_builders = {
        1: build_keras_model_1,
        2: build_keras_model_2,
        3: build_keras_model_3,
    }
    model_builder = model_builders.get(args.model)
    if model_builder is None:
        raise ValueError('Unknown --model {}; available models: {}'.format(
            args.model, sorted(model_builders)))

    # Seed NumPy's random generator for reproducibility.
    np.random.seed(args.random_seed)

    # Load the dataset.
    x_train_vec, x_test_vec, y_train, y_test_orginal = load_dataset(args)

    # TODO 2: Convert the labels to categorical
    num_classes = 2
    y_train_cat = np_utils.to_categorical(y_train, num_classes)
    y_test_cat = np_utils.to_categorical(y_test_orginal, num_classes)

    # TODO 3: Build the Keras model
    print_new_process('Build Model:', args )
    model = model_builder( x_train_vec, args )

    # TODO 4: Fit the model
    print_new_process('Fit:', args)
    model.fit( x_train_vec, y_train_cat,
               batch_size=args.batch_size,
               epochs=args.epochs,
               shuffle=args.shuffle,
               verbose=1,
             )

    # TODO 5: Evaluate the model, calculating the metrics.
    # Option 1: Use the model.evaluate() method. For this, the model must be
    # already compiled with the metrics.

    print_new_process('Predictions:',args)
    # NOTE(review): predict_classes is not available in recent Keras
    # releases; confirm against the Keras version this project pins.
    predictions = model.predict_classes(x_test_vec, verbose=1)

    if args.verbose == 1:
        print( predictions )

    print_new_process('Test:',args)
    if args.verbose == 1:
        # NOTE(review): this prints the flattened one-hot matrix, not the
        # label vector, so it does not line up with `predictions` above.
        print( np.reshape(y_test_cat, -1).astype(int) )

    print_new_process('Performance:',args)
    score, accuracy = model.evaluate(x_test_vec, y_test_cat)
    print_message( '[score, accuracy]', args )
    print_message( [score, accuracy], args )

    # Option 2: Use the model.predict() method and calculate the metrics using
    # sklearn. We recommend this, because you can store the predictions if
    # you need more analysis later. Also, if you calculate the metrics on a
    # notebook, then you can compare multiple classifiers.
    # predictions = ...
    # performance = ...

    # TODO 6: Save the results.
    parameters_path = 'results/parameters/'
    predictions_path = 'results/predictions/'
    for output_path in (parameters_path, predictions_path):
        # exist_ok avoids the check-then-create race of os.path.exists().
        os.makedirs(output_path, exist_ok=True)

    parameters_filename = os.path.join(
        parameters_path,
        'parameters_{0}_acc_{1:.6f}.csv'.format( experiment_number, accuracy ))
    print_new_process('Saving parameters: {}'.format(parameters_filename),args)

    # Build the frame in one go: DataFrame.append copied the whole frame on
    # every call and no longer exists in pandas 2.x.
    parameter_rows = [(str(k), str(v)) for k, v in sorted(vars(args).items())]
    parameters_df = pandas.DataFrame(parameter_rows, columns=['Parameter', 'Value'])
    parameters_df.to_csv(parameters_filename, index=False )

    predictions_filename = os.path.join(
        predictions_path,
        'predictions_{0}_acc_{1:.6f}.csv'.format( experiment_number, accuracy ))
    print_new_process('Saving predictions: {}'.format(predictions_filename),args)
    predictions_df = pandas.DataFrame(y_test_orginal, columns=['true_label'])
    predictions_df.loc[:, 'predicted'] = predictions
    predictions_df.to_csv( predictions_filename, index=False )

    print_message('Done.',args)
    print_end('STOP.',args)

<br/>

# Test de Main:

Ejecutamos una prueba de `main` para verificar que el funcionamiento sea el adecuado:

In [9]:
# Test arguments; read_args() picks this list up through the module-level
# name ``arguments``.
arguments = [
    '--model=1',
    '--max_features=2000',
    '--num_units=512',
    '--dropout=0.5',
    '--batch_size=100',
    '--epochs=15',
    '--shuffle=batch',
    '--random_seed=10',
    '--verbose=1',
]

# Run the main procedure.
main()

2018-09-21 07:29:30 ------------------------------------------------------------------------
2018-09-21 07:29:30 - START:
2018-09-21 07:29:30 ------------------------------------------------------------------------
2018-09-21 07:29:30 - Load and vectorize Data:
2018-09-21 07:29:31 - Training samples 1500, test_samples 500
2018-09-21 07:29:34 - x_train_vec - type: <class 'numpy.ndarray'>, shape:(1500, 2000)
2018-09-21 07:29:35 - x_test_vec - type: <class 'numpy.ndarray'>, shape:(500, 2000)
2018-09-21 07:29:35 ------------------------------------------------------------------------
2018-09-21 07:29:35 - Build Model:
2018-09-21 07:29:35 - MODEL 1:
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_1 (Dense)              (None, 512)               1024512   
_________________________________________________________________
activation_1 (Activation)    (None, 512)               0         
_________________