# Setting Up

## Conexión a google drive

In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
%cd ./drive/MyDrive/Colab Notebooks

/content/drive/MyDrive/Colab Notebooks


In [3]:
%pwd

'/content/drive/MyDrive/Colab Notebooks'

## Importar los módulos *_utils

In [12]:
import sys
# Add the Drive notebook folder to the import path (presumably where the
# my_utils package lives — verify).
# NOTE(review): this path spells the Drive folder 'My Drive', while the %cd cell
# above uses 'MyDrive' — confirm both resolve to the same directory.
sys.path.append('/content/drive/My Drive/Colab Notebooks/')
from my_utils import dataset_utils
from my_utils import eval_utils
from my_utils import nn_utils

In [13]:
import pickle

# Local Utils

### fcNeuralNetworkModel()

In [14]:
from tensorflow import keras
from tensorflow.keras import layers

def fcNeuralNetworkModel(task, params, optimizer, verbose = False):
    """
    Defines and compiles a new Keras model for a Fully Connected Neural Network
    with 2 hidden layers and dropout regularization after each of them:

        INPUT(300) -> DL_1 -> DROPOUT_1 -> DL_2 -> DROPOUT_2 -> PREDICTOR

    inputs:
    task        - str, one of ['HS', 'TR', 'AG', 'HTA']
                  ('HS', 'TR', 'AG' build a 1-unit sigmoid predictor,
                  'HTA' builds a 5-unit softmax predictor)
    params      - dict, must contain the keys 'activation', 'L1_size',
                  'p_dropout_1', 'L2_size' and 'p_dropout_2'
    optimizer   - optimizer handed to model.compile() for every task
                  (None falls back to "rmsprop" for the 'HTA' task)
    verbose     - boolean, print model.summary() when True

    outputs:
    model       - compiled keras model

    raises:
    ValueError  - if task is not one of the supported options

    """
    BINARY_TASKS = ('HS', 'TR', 'AG')
    MULTICLASS_TASKS = ('HTA',)

    # Fail early with a clear message instead of an UnboundLocalError on `model`
    if task not in BINARY_TASKS + MULTICLASS_TASKS:
        raise ValueError(
            "Unknown task {!r}; expected one of {}".format(task, list(BINARY_TASKS + MULTICLASS_TASKS)))

    VEC_EMBEDDINGS_DIM = 300
    activation_f = params['activation']

    # shape must be a tuple: (300) is just the int 300, (300,) is a 1-tuple
    input_vector = keras.Input(name='INPUT', shape=(VEC_EMBEDDINGS_DIM,), dtype="float64")

    x = layers.Dense(units=params['L1_size'],
                     activation=activation_f,
                     name='DL_1')(input_vector)
    x = layers.Dropout(rate=params['p_dropout_1'], name='DROPOUT_1')(x)

    x = layers.Dense(units=params['L2_size'],
                     activation=activation_f,
                     name='DL_2')(x)
    x = layers.Dropout(rate=params['p_dropout_2'], name='DROPOUT_2')(x)

    if task in BINARY_TASKS:
        preds = layers.Dense(1, activation="sigmoid", name='PREDICTOR')(x)
        model = keras.Model(input_vector, preds)
        model.compile(loss="binary_crossentropy", optimizer=optimizer, metrics=["acc"])

    else:
        preds = layers.Dense(5, activation="softmax", name='PREDICTOR')(x)
        # Kept so the layer list of the model stays the same as before
        preds = layers.Reshape(target_shape=(5,))(preds)
        model = keras.Model(input_vector, preds)

        # Use the optimizer selected by the caller. It used to be hard-coded
        # to "rmsprop", which silently ignored the optimizer being searched.
        hta_optimizer = optimizer if optimizer is not None else "rmsprop"
        model.compile(loss="categorical_crossentropy", optimizer=hta_optimizer, metrics=["acc"])

    if verbose:
        model.summary()

    return model

# External Utils

### merged cells

In [None]:
# **************************   spaceScanner()   ************************** 

# adapted from Talos solution
# (check: https://autonomio.github.io/talos/#/)
import pickle
from itertools import product
from random import sample
from pandas import DataFrame
from numpy import random as np_random

def spaceScanner(X_train, Y_train, task, model_prototype, search_space, arch_params_keys,
                training_params_keys, fraction2eval, fitting_attemps, n_folds, stop_threshold=0.99,
                partial_CV=False, backup_file='testing_file.df', backup_freq=25,
                save_history_files=False, save_models_as_json=False):
  """
  Evaluates a random sample of the configurations defined in the search space.

  Every configuration is kept or skipped by an independent Bernoulli draw with
  probability fraction2eval, so the number of evaluated configurations is only
  approximately fraction2eval * len(search_space).

  inputs:
  X_train               - list[numpy_array], encoded folds with the train data
  Y_train               - DataFrame, labels (Y_train.columns = ['HS','TR','AG','HTA','kfold'])
  task                  - str, valid_task_options = ['HS','TR','AG','HTA']
  model_prototype       - function, model prototype definition (returns compiled keras model)
  search_space          - list[tuple], combinations of parameters to be evaluated
                          (Every entry in the search space is a two elements tuple,
                          the first of these elements is a list of architecture_params and the
                          second one is a list of training_params)
  arch_params_keys      - list[str], names of the architecture params (same order as the values)
  training_params_keys  - list[str], names of the training params (same order as the values)
  fraction2eval         - float, probability of evaluating each configuration
                          (any value >= 1 evaluates every configuration)
  fitting_attemps       - int, try fitting the model N times and keep the best result (N=fitting_attemps)
  n_folds               - int, number of folds for cross validation
  stop_threshold        - float, not used by this function (kept for backward compatibility)
  partial_CV            - boolean, partial_CV=True means that the model should only be evaluated
                          in half of the n_folds
  backup_file           - str, path of the pickle file where the final results are saved
                          (used as given; partial backups go to '<backup_file>.partial')
  backup_freq           - int, a partial backup is written every backup_freq evaluated
                          configurations (0 or None disables partial backups)
  save_history_files    - boolean, forwarded to modelCrossValidation
  save_models_as_json   - boolean, forwarded to modelCrossValidation

  outputs:
  results_df            - DataFrame, one row for every configuration evaluated

  """
  print(f'BackupFile: {backup_file}')
  print('SCANNING SEARCH SPACE\n')

  # numpy's binomial rejects p > 1, so any fraction2eval >= 1 is clamped to 1
  # ("evaluate every configuration")
  p_eval = min(fraction2eval, 1)

  # transform the parameters-tuples into parameters-dictionaries
  configurations = [(dict(zip(arch_params_keys, params_combo[0])),
                     dict(zip(training_params_keys, params_combo[1])))
                    for params_combo in search_space]

  results_list = []

  for arch_params, training_params in configurations:
    # Bernoulli draw: decide whether to test the current configuration or skip it
    if np_random.binomial(1, p_eval) == 0:
      continue

    # get a random conf_ID
    conf_ID = get_random_string(8)
    print(f'conf_ID: {conf_ID}   ', end = '')

    # evaluate the current config using cross validation
    validation_results = modelCrossValidation(X_train, Y_train,
                                             task,
                                             model_prototype,
                                             arch_params,
                                             training_params,
                                             n_folds,
                                             fitting_attemps,
                                             partial_CV,
                                             conf_ID,
                                             save_history_files,
                                             save_models_as_json)

    results_list.append(validation_results)

    # display the results of this configuration
    train_acc_A_str = 'train_acc_A = {},'.format(round(validation_results['train_acc_A'],3)).ljust(20,' ')
    val_acc_A_str   = 'val_acc_A = {}'.format(round(validation_results['val_acc_A'],3)).ljust(20,' ')
    train_acc_B_str = 'train_acc_B = {},'.format(round(validation_results['train_acc_B'],3)).ljust(20,' ')
    val_acc_B_str   = 'val_acc_B = {}'.format(round(validation_results['val_acc_B'],3)).ljust(20,' ')

    print('--  {}  {} - {}  {}'.format(train_acc_A_str,
                                val_acc_A_str,
                                train_acc_B_str,
                                val_acc_B_str))

    # save results every backup_freq configurations (skipped when backup_freq is 0/None,
    # which would otherwise raise ZeroDivisionError/TypeError)
    if backup_freq and len(results_list) % backup_freq == 0:
      partial_backup_file = f'{backup_file}.partial'
      with open(partial_backup_file, 'wb') as file_handler:
        pickle.dump(DataFrame(results_list), file_handler)

  # final backup
  results_df = DataFrame(results_list)

  with open(backup_file, 'wb') as file_handler:
    pickle.dump(results_df, file_handler)

  print(f'\nDONE IS COMPLETE. {len(results_list)} CONFIGURATIONS WERE SUCCESFULLY EVALUATED.')

  return results_df


# **************************   modelCrossValidation()   ************************** 

def modelCrossValidation(X_train,Y_train,task,model_prototype,arch_params,training_params,
                         n_folds,fitting_attemps,partial_CV,configuration_ID,save_history_files,
                         save_model_as_json):

    """
    Evaluates a model configuration using cross validation.

    inputs:
    X_train               - list[numpy_array], encoded folds with the train data
    Y_train               - DataFrame, labels (Y_train.columns = ['HS','TR','AG','HTA','kfold'])
    task                  - str, valid_task_options = ['HS','TR','AG','HTA']
    model_prototype       - function obj, model prototype definition (returns compiled keras model)
    arch_params           - dict, architecture parameters forwarded to model_prototype
    training_params       - dict, must contain the keys 'optimizer' (a key of optimizers_list),
                            'max_epochs' and 'batch_size'
    n_folds               - int, number of folds for cross validation
    fitting_attemps       - int >= 1, the model is fitted N=fitting_attemps times for every fold
                            and the attempt with the highest val_acc is kept
    partial_CV            - boolean, partial_CV=True means that only the even-indexed
                            folds are evaluated
    configuration_ID      - str, ID used in the results and in the names of the saved files
    save_history_files    - boolean, pickle the kept histories to ./history_files/<ID>.dict
    save_model_as_json    - boolean, write the model architecture to ./models_json_files/<ID>.json

    outputs:
    validation_results_dict   - dict, conf_ID + arch_params + training_params + metrics summary

    raises:
    ValueError                - if fitting_attemps < 1

    """
    import os   # local import so this definition does not depend on another cell

    if fitting_attemps < 1:
        raise ValueError("fitting_attemps must be >= 1, got {!r}".format(fitting_attemps))

    # unpack the training parameters
    optimizer_id = training_params['optimizer']
    max_epochs = training_params['max_epochs']
    batch_size = training_params['batch_size']

    optimizer_template = optimizers_list[optimizer_id]

    def new_optimizer():
        # Every model gets its own optimizer built from the template's config.
        # Sharing one optimizer instance between models carries its internal
        # state from one fit to the next (and is rejected by recent Keras versions).
        if hasattr(optimizer_template, 'get_config') and hasattr(type(optimizer_template), 'from_config'):
            return type(optimizer_template).from_config(optimizer_template.get_config())
        # e.g. a plain string identifier such as 'rmsprop'
        return optimizer_template

    if save_model_as_json:
        os.makedirs('./models_json_files', exist_ok=True)
        model = model_prototype(task, arch_params, new_optimizer())
        with open(f"./models_json_files/{configuration_ID}.json", "w") as json_file:
            json_file.write(model.to_json())

    # The main work ------------------------------------------------------------
    histories_list = list()   # list[dict]

    for fold_idx, data_fold in enumerate(getDataSplitsGenerator(X_train, Y_train, n_folds)):

        # When partial_CV==True just evaluate the model in half of the n-folds
        if partial_CV and fold_idx % 2 == 1:
            continue

        # Run N attempts to fit the model (N=fitting_attemps) and keep the
        # history of the attempt with the best val_acc metric
        best_try_history = None
        max_acc = None
        for _ in range(fitting_attemps):
            model = model_prototype(task, arch_params, new_optimizer())

            acc, model_history = fitNeuralNetworkModel(model, task, data_fold, batch_size,
                                                       max_epochs)

            # The first attempt is always kept, so an accuracy of exactly 0.0
            # no longer leaves the fold without a history
            if best_try_history is None or acc > max_acc:
                max_acc = acc
                best_try_history = model_history

        histories_list.append(best_try_history)

        # print a hint at the end of every fold validation
        print('*', end='')

    # compute a summary of the metrics
    metrics_summary = getMetricsSummary(histories_list)

    # form a dictionary to report the validation results
    validation_results_dict = {'conf_ID':configuration_ID}
    validation_results_dict.update(arch_params)
    validation_results_dict.update(training_params)
    validation_results_dict.update(metrics_summary)

    # save the list of histories for future reference
    if save_history_files:
        os.makedirs('./history_files', exist_ok=True)
        with open(f'./history_files/{configuration_ID}.dict', 'wb') as file_handler:
            pickle.dump(histories_list, file_handler)

    print('  ', end='')

    return validation_results_dict


# **************************   fitNeuralNetworkModel()   **************************

def fitNeuralNetworkModel(model, task, data, batch_size, max_epochs, verbose=0):
  """
  Fits a precompiled keras model to the given data.

  inputs:
  model           - compiled keras model (must report the 'val_acc' metric)
  task            - str, ['HS','TR', 'AG', 'HTA']
  data            - tuple, (x_train, y_train, x_val, y_val)
  batch_size      - int
  max_epochs      - int, number of epochs to train for
  verbose         - int, verbosity level forwarded to model.fit()

  --
  x_train, x_val  - features, passed to model.fit() unchanged
  y_train, y_val  - labels indexed by task name (y[task] selects the labels of
                    the task; the caller in this file passes pandas DataFrames)

  outputs:
  max_acc         - float, max val_acc registered in the training history
  model_history   - dict, the history dictionary of the fit (metric name -> list per epoch)

  """

  x_train, y_train, x_val, y_val = data

  # Extract the labels that correspond to the given task
  y_train = y_train[task]
  y_val   = y_val[task]

  # For multiclass classification we need to transform
  # the labels to a one-hot-encoding representation
  if task == 'HTA':
    # Imported here because no cell of this notebook binds the name `tf`
    from tensorflow.keras.utils import to_categorical
    y_train = to_categorical(y_train, num_classes=5)
    y_val = to_categorical(y_val, num_classes=5)

  # No callbacks are used: training always runs for max_epochs epochs
  fit_history = model.fit(x=x_train,
                          y=y_train,
                          validation_data=(x_val, y_val),
                          epochs=max_epochs,
                          batch_size=batch_size,
                          verbose=verbose)

  # fit() returns the History object of this run
  history_dict = fit_history.history
  max_acc = max(history_dict['val_acc'])

  return max_acc, history_dict


# **************************   getMetricsSummary()   ************************** 
from statistics import mean

def getMetricsSummary(hist_list):
    """
    Builds the metrics summary of a cross validation run.

    inputs:
    hist_list       - list[dict], one training history per evaluated fold

    output:
    metrics_summary - dict with the keys (in this order) best_epochh, train_acc_A,
                      val_acc_A, min2best, max2best, train_acc_B, val_acc_B

    The summary combines the results of two evaluation methods:

    Method A (method_A_Metrics):
        1) Compute the mean val-acc over the folds for every epoch
        2) Let E be the epoch with the highest mean val-acc
        3) Report E plus the mean val-acc and mean train-acc at epoch E

    Method B (method_B_Metrics):
        1) Let E(k) be the epoch with the highest val-acc recorded for fold k
        2) Take the val-acc and train-acc of fold k at epoch E(k)
        3) Report the mean of those values over the folds, together with the
           smallest and the largest E(k)

    """
    epoch_A, train_A, val_A = method_A_Metrics(hist_list)
    earliest_best, latest_best, train_B, val_B = method_B_Metrics(hist_list)

    # Insertion order is kept on purpose: it fixes the order of the reported keys
    metrics_summary = dict()
    metrics_summary['best_epochh'] = epoch_A
    metrics_summary['train_acc_A'] = train_A
    metrics_summary['val_acc_A'] = val_A
    metrics_summary['min2best'] = earliest_best
    metrics_summary['max2best'] = latest_best
    metrics_summary['train_acc_B'] = train_B
    metrics_summary['val_acc_B'] = val_B

    return metrics_summary


# **************************   method_A_Metrics() / method_B_Metrics()   **************************

def method_A_Metrics(histories_list):
    """
    Method A: picks the single epoch with the highest mean val-acc over the folds.

    inputs:
    histories_list  - list[dict], every dict holds the per-epoch lists 'acc' and 'val_acc'

    outputs:
    (best_epoch_idx, mean_train_acc, mean_val_acc) - index of the chosen epoch and
    the fold-averaged train / validation accuracy at that epoch. On ties the
    earliest epoch wins.
    """
    # Regroup the metrics: one tuple per epoch holding the value of every fold
    train_by_epoch = list(zip(*[fold['acc'] for fold in histories_list]))
    val_by_epoch = list(zip(*[fold['val_acc'] for fold in histories_list]))

    train_means = []
    val_means = []
    top_val_mean = 0
    top_epoch = 0

    for epoch, train_values in enumerate(train_by_epoch):
        train_means.append(mean(train_values))

        epoch_val_mean = mean(val_by_epoch[epoch])
        val_means.append(epoch_val_mean)

        # strict comparison keeps the first epoch that reaches the maximum
        if epoch_val_mean > top_val_mean:
            top_val_mean, top_epoch = epoch_val_mean, epoch

    return top_epoch, train_means[top_epoch], val_means[top_epoch]

def method_B_Metrics(histories_list):
    """
    Method B: picks, for every fold, the epoch where that fold reached its best val-acc.

    inputs:
    histories_list  - list[dict], every dict holds the per-epoch lists 'acc' and 'val_acc'

    outputs:
    (min2best, max2best, mean_train_acc, mean_val_acc) - smallest and largest
    best-epoch index over the folds, and the fold-averaged train / validation
    accuracy measured at each fold's own best epoch.
    """
    # one (best_epoch, train_acc, val_acc) record per fold
    fold_records = []
    for fold in histories_list:
        val_curve = fold['val_acc']
        train_curve = fold['acc']

        peak_val = max(val_curve)
        peak_epoch = val_curve.index(peak_val)   # first epoch that reaches the peak

        fold_records.append((peak_epoch, train_curve[peak_epoch], peak_val))

    peak_epochs = [record[0] for record in fold_records]
    earliest = min(peak_epochs)
    latest = max(peak_epochs)

    train_at_peak = [record[1] for record in fold_records]
    val_at_peak = [record[2] for record in fold_records]

    return earliest, latest, mean(train_at_peak), mean(val_at_peak)


# **************************   getDataSplitsGenerator()   **************************
from numpy import concatenate as np_concatenate

def getDataSplitsGenerator(X_train, Y_train, n_folds):
  '''
  Splits train data into K folds for cross validation. (K=n_folds)

  inputs:
  X_train   - list[numpy_array], encoded folds
  Y_train   - DataFrame, containing the labels for the training instances
              (the 'kfold' column holds the fold index of every row)
  n_folds   - int, number of folds for cross validation

  -------
  inputs preconditions:
  len(X_train) == n_folds
  the rows of X_train[i] follow the same order as the rows of Y_train with kfold == i

  outputs:
  (x_train, y_train, x_val, y_val) - generator with K different splits of the train data.
  The rows of y_train are ordered fold by fold, matching the rows of x_train.

  '''
  fold_ids = Y_train['kfold']

  for k in range(n_folds):
    train_folds = [i for i in range(n_folds) if i != k]

    # DATA (training folds concatenated in ascending fold order)
    x_train = np_concatenate([X_train[i] for i in train_folds], axis=0)
    x_val   = X_train[k]

    # LABELS
    # x_train is built fold by fold, so the labels must follow the same order.
    # A stable sort by fold id keeps the within-fold row order and is a no-op
    # when Y_train is already grouped by fold.
    y_train = Y_train.loc[fold_ids.isin(train_folds), :].sort_values('kfold', kind='mergesort')
    y_val   = Y_train.loc[fold_ids == k, :]

    yield (x_train, y_train, x_val, y_val)


# **************************   get_random_string()   **************************
from random import choice
import string

def get_random_string(length):
    """Returns a random string made of `length` ASCII letters (lowercase and uppercase)."""
    # ascii_letters is ascii_lowercase followed by ascii_uppercase
    alphabet = string.ascii_letters

    picked = []
    for _ in range(length):
        picked.append(choice(alphabet))

    return ''.join(picked)

### Miscellaneous

In [None]:
# **************************   getDataSplitsGenerator()   **************************
from numpy import concatenate as np_concatenate

def getDataSplitsGenerator(X_train, Y_train, n_folds):
  '''
  Splits train data into K folds for cross validation. (K=n_folds)

  inputs:
  X_train   - list[numpy_array], encoded folds
  Y_train   - DataFrame, containing the labels for the training instances
              (the 'kfold' column holds the fold index of every row)
  n_folds   - int, number of folds for cross validation

  -------
  inputs preconditions:
  len(X_train) == n_folds
  the rows of X_train[i] follow the same order as the rows of Y_train with kfold == i

  outputs:
  (x_train, y_train, x_val, y_val) - generator with K different splits of the train data.
  The rows of y_train are ordered fold by fold, matching the rows of x_train.

  '''
  fold_ids = Y_train['kfold']

  for k in range(n_folds):
    train_folds = [i for i in range(n_folds) if i != k]

    # DATA (training folds concatenated in ascending fold order)
    x_train = np_concatenate([X_train[i] for i in train_folds], axis=0)
    x_val   = X_train[k]

    # LABELS
    # x_train is built fold by fold, so the labels must follow the same order.
    # A stable sort by fold id keeps the within-fold row order and is a no-op
    # when Y_train is already grouped by fold.
    y_train = Y_train.loc[fold_ids.isin(train_folds), :].sort_values('kfold', kind='mergesort')
    y_val   = Y_train.loc[fold_ids == k, :]

    yield (x_train, y_train, x_val, y_val)


# **************************   get_random_string()   **************************
from random import choice
import string

def get_random_string(length):
    """Returns a random string made of `length` ASCII letters (lowercase and uppercase)."""
    # ascii_letters is ascii_lowercase followed by ascii_uppercase
    alphabet = string.ascii_letters

    picked = []
    for _ in range(length):
        picked.append(choice(alphabet))

    return ''.join(picked)

### optimizers_list

In [None]:
import tensorflow.keras.optimizers as keras_optimizers

# Lookup table of the optimizers available to the search, keyed by a readable ID
# of the form '<algorithm>-<learning_rate>[-mu<momentum>]'.
# The values are optimizer instances created once, when this cell runs;
# modelCrossValidation() looks them up with training_params['optimizer'].
optimizers_list = {'adam-1e-3':keras_optimizers.Adam(learning_rate=0.001),                          #adam_1
              'adam-7e-4':keras_optimizers.Adam(learning_rate=0.0007),
              'adam-5e-4':keras_optimizers.Adam(learning_rate=0.0005),
              'adam-3e-4':keras_optimizers.Adam(learning_rate=0.0003),
              'adam-1e-4':keras_optimizers.Adam(learning_rate=0.0001),
              'rmsprop-1e-3':keras_optimizers.RMSprop(learning_rate=0.001, momentum=0.0),           # rmsprop_1
              'rmsprop-7e-4':keras_optimizers.RMSprop(learning_rate=0.0007, momentum=0.0),
              'rmsprop-5e-4':keras_optimizers.RMSprop(learning_rate=0.0005, momentum=0.0),
              'rmsprop-3e-4':keras_optimizers.RMSprop(learning_rate=0.0003, momentum=0.0),
              'rmsprop-1e-4':keras_optimizers.RMSprop(learning_rate=0.0001, momentum=0.0),
              'rmsprop-7.5e-5':keras_optimizers.RMSprop(learning_rate=0.000075, momentum=0.0),      # rmsprop_7
              'rmsprop-5e-5':keras_optimizers.RMSprop(learning_rate=0.00005, momentum=0.0),
              'rmsprop-1e-3-mu0.9':keras_optimizers.RMSprop(learning_rate=0.001, momentum=0.9),
              'rmsprop-7e-4-mu0.9':keras_optimizers.RMSprop(learning_rate=0.0007, momentum=0.9),
              'rmsprop-5e-4-mu0.9':keras_optimizers.RMSprop(learning_rate=0.0005, momentum=0.9),
              'rmsprop-3e-4-mu0.9':keras_optimizers.RMSprop(learning_rate=0.0003, momentum=0.9),
              'rmsprop-1e-4-mu0.9':keras_optimizers.RMSprop(learning_rate=0.0001, momentum=0.9),
              'rmsprop-7.5e-5-mu0.9':keras_optimizers.RMSprop(learning_rate=0.000075, momentum=0.9), 
              'rmsprop-5e-5-mu0.9':keras_optimizers.RMSprop(learning_rate=0.00005, momentum=0.9)}    # rmsprop_7

### spaceScanner()

In [None]:
# **************************   spaceScanner()   ************************** 

# adapted from Talos solution
# (check: https://autonomio.github.io/talos/#/)
from itertools import product
from random import sample
from pandas import DataFrame
import pickle
from numpy import random as np_random

def spaceScanner(X_train, Y_train, task, model_prototype, search_space, arch_params_keys,
                training_params_keys, fraction2eval, fitting_attemps, n_folds, stop_threshold=0.99,
                partial_CV=False, backup_file='testing_file.df', backup_freq=25,
                save_history_files=False, save_models_as_json=False):
  """
  Evaluates a random sample of the configurations defined in the search space.

  Every configuration is kept or skipped by an independent Bernoulli draw with
  probability fraction2eval, so the number of evaluated configurations is only
  approximately fraction2eval * len(search_space).

  inputs:
  X_train               - list[numpy_array], encoded folds with the train data
  Y_train               - DataFrame, labels (Y_train.columns = ['HS','TR','AG','HTA','kfold'])
  task                  - str, valid_task_options = ['HS','TR','AG','HTA']
  model_prototype       - function, model prototype definition (returns compiled keras model)
  search_space          - list[tuple], combinations of parameters to be evaluated
                          (Every entry in the search space is a two elements tuple,
                          the first of these elements is a list of architecture_params and the
                          second one is a list of training_params)
  arch_params_keys      - list[str], names of the architecture params (same order as the values)
  training_params_keys  - list[str], names of the training params (same order as the values)
  fraction2eval         - float, probability of evaluating each configuration
                          (any value >= 1 evaluates every configuration)
  fitting_attemps       - int, try fitting the model N times and keep the best result (N=fitting_attemps)
  n_folds               - int, number of folds for cross validation
  stop_threshold        - float, not used by this function (kept for backward compatibility)
  partial_CV            - boolean, partial_CV=True means that the model should only be evaluated
                          in half of the n_folds
  backup_file           - str, path of the pickle file where the final results are saved
                          (used as given; partial backups go to '<backup_file>.partial')
  backup_freq           - int, a partial backup is written every backup_freq evaluated
                          configurations (0 or None disables partial backups)
  save_history_files    - boolean, forwarded to modelCrossValidation
  save_models_as_json   - boolean, forwarded to modelCrossValidation

  outputs:
  results_df            - DataFrame, one row for every configuration evaluated

  """
  print(f'BackupFile: {backup_file}')
  print('SCANNING SEARCH SPACE\n')

  # numpy's binomial rejects p > 1, so any fraction2eval >= 1 is clamped to 1
  # ("evaluate every configuration")
  p_eval = min(fraction2eval, 1)

  # transform the parameters-tuples into parameters-dictionaries
  configurations = [(dict(zip(arch_params_keys, params_combo[0])),
                     dict(zip(training_params_keys, params_combo[1])))
                    for params_combo in search_space]

  results_list = []

  for arch_params, training_params in configurations:
    # Bernoulli draw: decide whether to test the current configuration or skip it
    if np_random.binomial(1, p_eval) == 0:
      continue

    # get a random conf_ID
    conf_ID = get_random_string(8)
    print(f'conf_ID: {conf_ID}   ', end = '')

    # evaluate the current config using cross validation
    validation_results = modelCrossValidation(X_train, Y_train,
                                             task,
                                             model_prototype,
                                             arch_params,
                                             training_params,
                                             n_folds,
                                             fitting_attemps,
                                             partial_CV,
                                             conf_ID,
                                             save_history_files,
                                             save_models_as_json)

    results_list.append(validation_results)

    # display the results of this configuration
    train_acc_A_str = 'train_acc_A = {},'.format(round(validation_results['train_acc_A'],3)).ljust(20,' ')
    val_acc_A_str   = 'val_acc_A = {}'.format(round(validation_results['val_acc_A'],3)).ljust(20,' ')
    train_acc_B_str = 'train_acc_B = {},'.format(round(validation_results['train_acc_B'],3)).ljust(20,' ')
    val_acc_B_str   = 'val_acc_B = {}'.format(round(validation_results['val_acc_B'],3)).ljust(20,' ')

    print('--  {}  {} - {}  {}'.format(train_acc_A_str,
                                val_acc_A_str,
                                train_acc_B_str,
                                val_acc_B_str))

    # save results every backup_freq configurations (skipped when backup_freq is 0/None,
    # which would otherwise raise ZeroDivisionError/TypeError)
    if backup_freq and len(results_list) % backup_freq == 0:
      partial_backup_file = f'{backup_file}.partial'
      with open(partial_backup_file, 'wb') as file_handler:
        pickle.dump(DataFrame(results_list), file_handler)

  # final backup
  results_df = DataFrame(results_list)

  with open(backup_file, 'wb') as file_handler:
    pickle.dump(results_df, file_handler)

  print(f'\nDONE IS COMPLETE. {len(results_list)} CONFIGURATIONS WERE SUCCESFULLY EVALUATED.')

  return results_df

In [None]:
len('[aprox. 100.00% complete]')

25

### modelCrossValidation()

In [None]:
# **************************   modelCrossValidation()   ************************** 

import os

import tensorflow.keras.optimizers as keras_optimizers

def modelCrossValidation(X_train,Y_train,task,model_prototype,arch_params,training_params,
                         n_folds,fitting_attemps,partial_CV,configuration_ID,save_history_files,
                         save_model_as_json):

    """
    Evaluates a model configuration using cross validation.

    inputs:
    X_train               - list[numpy_array], encoded folds with the train data
    Y_train               - DataFrame, labels (Y_train.columns = ['HS','TR','AG','HTA','kfold'])
    task                  - str, valid_task_options = ['HS','TR','AG','HTA']
    model_prototype       - function obj, model prototype definition (returns compiled keras model)
    arch_params           - dict, architecture parameters forwarded to model_prototype
    training_params       - dict, must contain the keys 'optimizer' (a key of optimizers_list),
                            'max_epochs' and 'batch_size'
    n_folds               - int, number of folds for cross validation
    fitting_attemps       - int >= 1, the model is fitted N=fitting_attemps times for every fold
                            and the attempt with the highest val_acc is kept
    partial_CV            - boolean, partial_CV=True means that only the even-indexed
                            folds are evaluated
    configuration_ID      - str, ID used in the results and in the names of the saved files
    save_history_files    - boolean, pickle the kept histories to ./history_files/<ID>.dict
    save_model_as_json    - boolean, write the model architecture to ./models_json_files/<ID>.json

    outputs:
    validation_results_dict   - dict, conf_ID + arch_params + training_params + metrics summary

    raises:
    ValueError                - if fitting_attemps < 1

    """
    import os   # local import so this definition does not depend on another cell

    if fitting_attemps < 1:
        raise ValueError("fitting_attemps must be >= 1, got {!r}".format(fitting_attemps))

    # unpack the training parameters
    optimizer_id = training_params['optimizer']
    max_epochs = training_params['max_epochs']
    batch_size = training_params['batch_size']

    optimizer_template = optimizers_list[optimizer_id]

    def new_optimizer():
        # Every model gets its own optimizer built from the template's config.
        # Sharing one optimizer instance between models carries its internal
        # state from one fit to the next (and is rejected by recent Keras versions).
        if hasattr(optimizer_template, 'get_config') and hasattr(type(optimizer_template), 'from_config'):
            return type(optimizer_template).from_config(optimizer_template.get_config())
        # e.g. a plain string identifier such as 'rmsprop'
        return optimizer_template

    if save_model_as_json:
        os.makedirs('./models_json_files', exist_ok=True)
        model = model_prototype(task, arch_params, new_optimizer())
        with open(f"./models_json_files/{configuration_ID}.json", "w") as json_file:
            json_file.write(model.to_json())

    # The main work ------------------------------------------------------------
    histories_list = list()   # list[dict]

    for fold_idx, data_fold in enumerate(getDataSplitsGenerator(X_train, Y_train, n_folds)):

        # When partial_CV==True just evaluate the model in half of the n-folds
        if partial_CV and fold_idx % 2 == 1:
            continue

        # Run N attempts to fit the model (N=fitting_attemps) and keep the
        # history of the attempt with the best val_acc metric
        best_try_history = None
        max_acc = None
        for _ in range(fitting_attemps):
            model = model_prototype(task, arch_params, new_optimizer())

            acc, model_history = fitNeuralNetworkModel(model, task, data_fold, batch_size,
                                                       max_epochs)

            # The first attempt is always kept, so an accuracy of exactly 0.0
            # no longer leaves the fold without a history
            if best_try_history is None or acc > max_acc:
                max_acc = acc
                best_try_history = model_history

        histories_list.append(best_try_history)

        # print a hint at the end of every fold validation
        print('*', end='')

    # compute a summary of the metrics
    metrics_summary = getMetricsSummary(histories_list)

    # form a dictionary to report the validation results
    validation_results_dict = {'conf_ID':configuration_ID}
    validation_results_dict.update(arch_params)
    validation_results_dict.update(training_params)
    validation_results_dict.update(metrics_summary)

    # save the list of histories for future reference
    if save_history_files:
        os.makedirs('./history_files', exist_ok=True)
        with open(f'./history_files/{configuration_ID}.dict', 'wb') as file_handler:
            pickle.dump(histories_list, file_handler)

    print('  ', end='')

    return validation_results_dict

### fitNeuralNetworkModel()

In [None]:
# **************************   fitNeuralNetworkModel()   **************************

def fitNeuralNetworkModel(model, task, data, batch_size, max_epochs, verbose=0):
  """
  Fits a precompiled keras model to the given data and reports the best
  validation accuracy reached during training.

  inputs:
  model           - compiled keras model; its training history must record
                    the 'val_acc' metric
  task            - str, ['HS','TR', 'AG', 'HTA']
  data            - tuple, (x_train, y_train, x_val, y_val)
  batch_size      - int
  max_epochs      - int, number of epochs passed to model.fit
  verbose         - int, verbosity level forwarded to model.fit (default 0)

  --
  x_train         - numpy_array [shape = N_t, ENCODING_DIM]
  y_train         - pandas.DataFrame, labels indexed by task name
  x_val           - numpy_array [shape = N_v, ENCODING_DIM]
  y_val           - pandas.DataFrame, labels indexed by task name

  outputs:
  max_acc         - float, highest 'val_acc' registered in the training history
  model_history   - dict, the `history` dict of the History object returned by fit

  raises:
  ValueError      - if the history holds no 'val_acc' entries (e.g. zero epochs)

  """

  x_train, y_train, x_val, y_val = data

  # Keep only the labels that correspond to the given task
  y_train = y_train[task]
  y_val   = y_val[task]

  # For multiclass classification ('HTA', 5 classes) the labels are
  # transformed to a one-hot-encoding representation
  if task == 'HTA':
    y_train = tf.keras.utils.to_categorical(y_train, num_classes=5)
    y_val = tf.keras.utils.to_categorical(y_val, num_classes=5)

  # NOTE: no callbacks are passed to fit (no early stopping / checkpointing).
  # Use the History object returned by fit instead of relying on the
  # `model.history` attribute being set as a side effect.
  fit_result = model.fit(x=x_train,
                         y=y_train,
                         validation_data=(x_val, y_val),
                         epochs=max_epochs,
                         batch_size=batch_size,
                         verbose=verbose)

  model_history = fit_result.history
  max_acc = max(model_history['val_acc'])

  return max_acc, model_history

### getMetricsSummary()

In [None]:
from statistics import mean

def getMetricsSummary(hist_list):
    """
    Builds the metrics report of one configuration from its list of histories.

    inputs:
    hist_list       - list[model.history.history], list of models histories

    output:
    metrics_summary - dict, keys = [best_epochh, train_acc_A, val_acc_A,
                                    min2best, max2best, train_acc_B, val_acc_B]

    Two methods are used to evaluate every configuration:

    Method A (method_A_Metrics):
        1) Compute the mean-val-acc for every epoch
        2) Let E be the epoch with the highest mean-val-acc
        3) Report both the mean-val-acc and mean-train-acc at epoch E

    Method B (method_B_Metrics):
        1) Let E_k be the epoch with the highest val-acc recorded for history k
        2) For each history extract the val_acc and train_acc at epoch E_k
        3) Compute the mean of the extracted metrics
    """
    epoch_A, train_A, val_A = method_A_Metrics(hist_list)
    earliest_best, latest_best, train_B, val_B = method_B_Metrics(hist_list)

    # The key order defines the order of the reported fields.
    # NOTE: 'best_epochh' (sic) is kept as is; results already stored use that key.
    report_keys = ('best_epochh', 'train_acc_A', 'val_acc_A',
                   'min2best', 'max2best', 'train_acc_B', 'val_acc_B')
    report_values = (epoch_A, train_A, val_A,
                     earliest_best, latest_best, train_B, val_B)

    return dict(zip(report_keys, report_values))

# **************************   method_A_Metrics() / method_B_Metrics()   **************************

def method_A_Metrics(histories_list):
    """
    Method A: select the single epoch whose validation accuracy, averaged over
    all the histories, is the highest.

    inputs:
    histories_list - list[dict], every dict holds the per-epoch lists
                     'acc' and 'val_acc'

    outputs (tuple):
    best_epoch_idx - int, 0-based index of the selected epoch; the first one
                     wins in case of ties, and 0 is returned when no mean
                     val-acc is greater than 0
    mean_train_acc - mean of 'acc' over the histories at the selected epoch
    mean_val_acc   - mean of 'val_acc' over the histories at the selected epoch
    """
    # Transpose the histories: one tuple per epoch with the value of every history
    train_by_epoch = list(zip(*(fold['acc'] for fold in histories_list)))
    val_by_epoch = list(zip(*(fold['val_acc'] for fold in histories_list)))

    # The number of epochs is given by the transposed 'acc' lists
    epochs = range(len(train_by_epoch))
    train_means = [mean(train_by_epoch[epoch]) for epoch in epochs]
    val_means = [mean(val_by_epoch[epoch]) for epoch in epochs]

    # Keep the first epoch that reaches the largest mean-val-acc
    winner, winner_val = 0, 0
    for epoch, epoch_val in enumerate(val_means):
        if epoch_val > winner_val:
            winner, winner_val = epoch, epoch_val

    # NOTE: a stdev-based tie-break among near-best epochs was previously
    # sketched here but was disabled; only the mean val-acc is used.
    return winner, train_means[winner], val_means[winner]

def method_B_Metrics(histories_list):
    """
    Method B: for every history take the epoch with its own highest val-acc
    and average the metrics found at those epochs.

    inputs:
    histories_list - list[dict], every dict holds the per-epoch lists
                     'acc' and 'val_acc'

    outputs (tuple):
    min2best       - int, smallest of the per-history best epochs (0-based)
    max2best       - int, largest of the per-history best epochs (0-based)
    train_acc      - mean of 'acc' taken at each history's best epoch
    val_acc        - mean of the highest 'val_acc' of each history
    """
    def best_point(fold_history):
        # (epoch, train-acc, val-acc) at the first epoch reaching the top val-acc
        val_curve = fold_history['val_acc']
        train_curve = fold_history['acc']
        peak = max(val_curve)
        epoch = val_curve.index(peak)
        return epoch, train_curve[epoch], peak

    best_points = [best_point(fold_history) for fold_history in histories_list]

    epochs = [epoch for epoch, _, _ in best_points]
    train_at_best = [train_acc for _, train_acc, _ in best_points]
    val_at_best = [val_acc for _, _, val_acc in best_points]

    return min(epochs), max(epochs), mean(train_at_best), mean(val_at_best)


# EXPERIMENTS

Ejecutar las siguientes operaciones para cada uno de los diferentes encodings ['FT1', 'FT2', 'FT3', 'W2V300', 'GloVe300']

## Dataset

In [20]:
# Load the train set encoded with the 'FT2' embeddings in 'SINGLE-VEC' format,
# requesting only the 'HS' labels. n_folds=7 is the same value passed to the
# scanner call in the "Escaneo" section.
X_train, Y_train = dataset_utils.loadEncodedTrainData(embedding_type='FT2',
                                                 encoding_format='SINGLE-VEC',
                                                 labels_to_return=['HS'],
                                                 n_folds=7)

FastText 2 - Esp. Wikipedia
Encoding Format: SINGLE-VEC

Process complete
5000 train instances retrieved

encodings_dim = (300,)


## Parameters dictionary

### Architecture params combos

In [16]:
from itertools import product

# Candidate values for every architecture hyper-parameter.
# NOTE: the key order matters, the combos below are positional tuples that
# follow it (L1_size, L2_size, activation, p_dropout_1, p_dropout_2).
arch_params_dict = {
    'L1_size': [300, 200],
    'L2_size': [150, 100, 75],
    'activation': ['relu'],
    'p_dropout_1': [0, 0.5],
    'p_dropout_2': [0, 0.5],
}

# Cartesian product of the candidate values, keeping only the combos whose
# last two entries (p_dropout_1 and p_dropout_2) are equal.
arch_params_combos = list(
    filter(lambda combo: combo[-1] == combo[-2],
           product(*arch_params_dict.values()))
)
print('{} arch_params_combos.'.format(len(arch_params_combos)))

12 arch_params_combos.


### Training params combinations

In [17]:
# Candidate values for every training hyper-parameter.
# NOTE: the `trainig_*` spelling is kept because other cells use these names.
trainig_params_dict = {
    'optimizer': ['adam-1e-3', 'adam-5e-4', 'adam-1e-4',
                  'rmsprop-1e-3', 'rmsprop-5e-4', 'rmsprop-1e-4'],
    'batch_size': [256, 512, 1024],
    'max_epochs': [75],
}

# Every (optimizer, batch_size, max_epochs) combination; the rightmost
# parameter varies fastest.
trainig_params_combos = [
    (optimizer_id, batch_size, max_epochs)
    for optimizer_id in trainig_params_dict['optimizer']
    for batch_size in trainig_params_dict['batch_size']
    for max_epochs in trainig_params_dict['max_epochs']
]

print('\n{} trainig_params_combos.'.format(len(trainig_params_combos)))


18 trainig_params_combos.


In [18]:
# Pair every architecture combo with every training combo
# (the training combo varies fastest).
search_space = [
    (arch_combo, training_combo)
    for arch_combo in arch_params_combos
    for training_combo in trainig_params_combos
]

print('\n{} params combinations in the search_space.'.format(len(search_space)))


216 params combinations in the search_space.


## Escaneo

In [21]:
# Scan the whole search space for the 'HS' task, using fcNeuralNetworkModel as
# the model builder. The key views of both params dictionaries are passed along
# with the positional combos.
# NOTE(review): presumably spaceScanner uses those keys to name the values of
# each combo, and writes a backup of the results to backup_file every
# backup_freq configurations; confirm in nn_utils.
results_df = nn_utils.spaceScanner(X_train, Y_train,
                          'HS',
                          fcNeuralNetworkModel, 
                          search_space[:],
                          arch_params_dict.keys(),
                          trainig_params_dict.keys(),
                          fraction2eval=1,
                          n_folds=7,
                          fitting_attemps=1,
                          stop_threshold=1,
                          partial_CV=True,
                          backup_file='./Results/SNN/FT2_embeddings_experiments.df',
                          backup_freq=25,
                          save_history_files=False,
                          save_models_as_json=False)

                          # CNN_HTA_EXT_DATA_NEW_EXPERIMENTS

BackupFile: ./Results/SNN/FT2_embeddings_experiments.df
SCANNING SEARCH SPACE

216 configurations will be evaluated.
1   - conf_ID: EoZPXDti   ****  --  train_acc_A = 0.761,  val_acc_A = 0.762    --  train_acc_B = 0.758,  val_acc_B = 0.771   
2   - conf_ID: mKramAEq   ****  --  train_acc_A = 0.754,  val_acc_A = 0.762    --  train_acc_B = 0.767,  val_acc_B = 0.769   
3   - conf_ID: wiHBUVLB   ****  --  train_acc_A = 0.728,  val_acc_A = 0.756    --  train_acc_B = 0.765,  val_acc_B = 0.764   
4   - conf_ID: uponZUmR   ****  --  train_acc_A = 0.745,  val_acc_A = 0.757    --  train_acc_B = 0.772,  val_acc_B = 0.767   
5   - conf_ID: rKTpphsx   ****  --  train_acc_A = 0.75,   val_acc_A = 0.76     --  train_acc_B = 0.754,  val_acc_B = 0.769   
6   - conf_ID: XcHWGfWG   ****  --  train_acc_A = 0.793,  val_acc_A = 0.757    --  train_acc_B = 0.773,  val_acc_B = 0.763   
7   - conf_ID: phgNeJPE   ****  --  train_acc_A = 0.823,  val_acc_A = 0.756    --  train_acc_B = 0.794,  val_acc_B = 0.761   
8

In [None]:
results_df

Unnamed: 0,conf_ID,L1_size,L2_size,activation,p_dropout_1,p_dropout_2,optimizer,batch_size,max_epochs,best_epochh,train_acc_A,val_acc_A,min2best,max2best,train_acc_B,val_acc_B
0,beYZusxb,300,75,relu,0.5,0.5,adam-5e-4,1024,75,69,0.77705,0.765843,42,60,0.739776,0.777043
1,GpuPeLFz,200,75,relu,0.0,0.0,rmsprop-5e-4,256,75,30,0.865485,0.759539,15,33,0.826923,0.772138


# NN TRAINING A GLOBAL CLASSIFIER

## Dataset

In [None]:
# Load the train set encoded with the 'FT3' embeddings in 'SINGLE-VEC' format.
# NOTE: this rebinds X_train / Y_train, replacing the data loaded in the
# EXPERIMENTS section.
X_train, Y_train = dataset_utils.loadEncodedTrainData(embedding_type='FT3',
                                                 encoding_format='SINGLE-VEC',
                                                 labels_to_return=['HS'],
                                                 n_folds=7)

FastText 3 - Spanish Unannotated Corpora
Encoding Format: SINGLE-VEC

Process complete
5000 train instances retrieved

encodings_dim = (300,)


In [None]:
import numpy as np

In [None]:
# Stack every fold of the train set into a single array (along axis 0).
# The number of folds is taken from X_train itself, so it cannot drift from
# the n_folds value used when the data was loaded.
X_train_merged = np.concatenate([X_train[i] for i in range(len(X_train))], axis=0)

In [None]:
# Labels of the 'HS' task, used as the fit targets of the global classifier.
Y_train_labels = Y_train['HS']

## GETTING THE KERAS MODEL

In [None]:
# Architecture of the global classifier: one value for each of the keys
# used in arch_params_dict.
arch_params= {
    'L1_size': 200,                    
    'L2_size': 75,                
    'activation':'relu',                                    
    'p_dropout_1': 0.5,
    'p_dropout_2': 0.5                                
}

# NOTE(review): `keras_optimizers` is not defined in the cells visible here;
# presumably it is an alias of keras.optimizers created elsewhere. Confirm it
# exists before running this cell on a fresh kernel.
optimizer = keras_optimizers.Adam(learning_rate=0.0005)

# Build and compile the model for the 'HS' task.
my_keras_model = fcNeuralNetworkModel('HS', arch_params, optimizer, verbose=True)

## FITTING THE KERAS MODEL

In [None]:
# Train on the merged train set. No validation data is passed to fit, so the
# call only has training data to report on.
my_keras_model.fit(x=X_train_merged, 
            y=Y_train_labels,
            epochs=50,
            batch_size=512,
            verbose=2)

## TESTING

In [None]:
def loadEncodedTestData(embedding_type, encoding_format, labels_to_return):
  """ 
  Loads the encoded test dataset and its labels from drive.
  
  Input:
  embedding_type   - str, valid_types = ['FT1', 'FT2', 'FT3', 'W2V100', 'W2V300', 'GloVe100', 'GloVe300']
  encoding_format  - str, valid_formats = ['SINGLE-VEC', 'EMB-SEQ']
  labels_to_return - list[str], label columns to select from the test dataset (e.g. ['HS'])

  Output:
  (X_test, Y_test) -- tuple containing the encoded dataset; (None, None) is
                      returned (after printing a hint) when embedding_type or
                      encoding_format is not a valid option

  --
  X_test    - encoded test-set exactly as unpickled from the data file; it
              must support len() and expose .shape
  Y_test    - DataFrame, test set labels restricted to labels_to_return

  """
  valid_embedding_types = ['FT1', 'FT2', 'FT3', 'W2V100', 'W2V300', 'GloVe100', 'GloVe300']
  valid_encoding_format_options = ['SINGLE-VEC', 'EMB-SEQ']

  # The hints are built from the lists above, so the printed options cannot
  # drift from the options that are actually accepted.
  if embedding_type not in valid_embedding_types:
    print('Invalid embedding_type option. No data was returned.\n')
    print('...')
    print('Valid embedding types: {}'.format(valid_embedding_types))
    return (None, None)

  if encoding_format not in valid_encoding_format_options:
    print('Invalid format option. No data was returned.\n')
    print('...')
    print('Valid formats: {}'.format(valid_encoding_format_options))
    return (None, None)

  EMBEDDINGS_INFO = {'FT1':'FastText 1 - Common Crawl + Wikipedia',
                    'FT2':'FastText 2 - Esp. Wikipedia',
                    'FT3':'FastText 3 - Spanish Unannotated Corpora',
                    'W2V300':'W2V 300d - Spanish Unannotated Corpora',
                    'W2V100':'W2V 100d - Spanish CoNLL',
                    'GloVe300':'GloVe 300d - Spanish Billion Word Corpus',
                    'GloVe100':'GloVe 100d - Spanish Billion Word Corpus'}

  print(EMBEDDINGS_INFO[embedding_type])
  print('Encoding Format: {}'.format(encoding_format))

  # pandas is only needed once the arguments have been validated
  import pandas as pd
  prep_format = 2

  # DATA
  # NOTE: unpickling can execute arbitrary code; only load trusted dataset files.
  file_name = '{}_TEST_P{}.data'.format(embedding_type, prep_format)
  with open('./dataset_files/Encoded/{}/{}'.format(encoding_format,file_name), 'rb') as filehandle:
    encoded_test_data = pickle.load(filehandle)

  # LABELS
  # compression=None: the file is read as a plain (uncompressed) pickle
  test_dataset_df = pd.read_pickle('./dataset_files/preprocessed_test_dataset.data',
                                   compression=None)
  test_labels = test_dataset_df.loc[:,labels_to_return]

  print('\nProcess complete')
  print('{} test instances retrieved'.format(len(encoded_test_data)))

  # Report the dimensions of a single encoding (the leading axis is dropped)
  encodings_dim = encoded_test_data.shape

  print('\nencodings_dim = {}'.format(encodings_dim[1:]))

  return (encoded_test_data, test_labels)

In [None]:
# Load the test set with the same embedding type and encoding format used
# for the train data of the global classifier ('FT3', 'SINGLE-VEC').
X_test, Y_test = loadEncodedTestData(embedding_type='FT3',
                                    encoding_format='SINGLE-VEC',
                                    labels_to_return=['HS'])

FastText 3 - Spanish Unannotated Corpora
Encoding Format: SINGLE-VEC

Process complete
1600 test instances retrieved

encodings_dim = (300,)


In [None]:
X_test.shape

(1600, 300)

In [None]:
# Ground-truth 'HS' labels of the test set
target = Y_test['HS']
# Threshold the model outputs at 0.5 to obtain 0/1 predictions
predicted  = np.where(my_keras_model.predict(X_test) > 0.5, 1, 0)

In [None]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# Accuracy and macro-averaged F1 of the thresholded predictions on the test set
accuracy_s = accuracy_score(target, predicted)
A_f1 = f1_score(target, predicted, average="macro")

In [None]:
accuracy_s, A_f1

(0.71375, 0.6901002945062158)

# Resources

* Mini batch size selection: [1](https://datascience.stackexchange.com/questions/18414/are-there-any-rules-for-choosing-the-size-of-a-mini-batch), [2](https://www.quora.com/In-deep-learning-why-dont-we-use-the-whole-training-set-to-compute-the-gradient), [3](https://stats.stackexchange.com/questions/164876/what-is-the-trade-off-between-batch-size-and-number-of-iterations-to-train-a-neu)
* Optimizers: [1](https://ai.stackexchange.com/questions/18206/what-kind-of-optimizer-is-suggested-to-use-for-binary-classification-of-similar)
* RMSprop: [1](https://towardsdatascience.com/understanding-rmsprop-faster-neural-network-learning-62e116fcf29a)
* Number of Hidden Layers: [1](https://stats.stackexchange.com/questions/181/how-to-choose-the-number-of-hidden-layers-and-nodes-in-a-feedforward-neural-netw)

    
