# Ajustes iniciales

## Conexión a google drive

In [1]:
# Mount Google Drive at /content/drive so the notebook can read and write files stored there.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
%cd ./drive/MyDrive/Colab Notebooks/

/content/drive/MyDrive/Colab Notebooks


In [3]:
# Show the current working directory to confirm the directory change took effect.
%pwd

'/content/drive/MyDrive/Colab Notebooks'

## Importar módulos `*_utils`


In [4]:
import sys
sys.path.append('/content/drive/My Drive/Colab Notebooks/')

from my_utils import eval_utils
from my_utils import nn_utils
from my_utils import dataset_utils
import numpy as np

# Local Utils

## trainFullDatasetClassifiers

In [5]:
import numpy as np
import tensorflow as tf
import pickle
from pandas import DataFrame
from keras.models import model_from_json


def trainFullDatasetClassifiers(task, search_results_path, models_architecture):
  """
  Retrains the selected search configurations on the full training set.

  For every configuration id returned by getConfigIds(), the stored model
  architecture is loaded and compiled, fitted on the whole training set, and
  its training history is pickled to disk. Model snapshots are written by
  fitModelOnFullDataset() into ./trained_models/<conf_id>.

  inputs:
  task                 - str, task label passed to the data loader and to the
                         model compilation (e.g. 'HS', 'TR', 'AG', 'HTA')
  search_results_path  - str, path to the pickled search results; the object is
                         used as a pandas DataFrame with (at least) the columns
                         'conf_ID', 'best_epochh', 'median2best', 'optimizer',
                         'max_epochs' and 'batch_size'
  models_architecture  - str, 'SNN' selects the SINGLE-VEC encoding, any other
                         value selects EMB-SEQ (see getTrainData)

  outputs:
  None. Side effects: prints progress, writes model snapshots under
  ./trained_models/<conf_id> and one history file per configuration under
  ./history_files/.
  """
  import os  # local import so the function does not depend on imports from later cells

  ## get the search_results
  search_results = loadSearchResults(search_results_path)

  ## get the conf_ids for the top results
  conf_ids = getConfigIds(search_results)

  # index by configuration id so each configuration row can be fetched with .loc
  search_results = search_results.set_index('conf_ID')

  ## get X_train & Y_train data
  print('Loading train data...')
  X_train, Y_train = getTrainData(task, models_architecture)
  print()

  # Create the history directory up front: a missing directory would otherwise
  # only be detected after the first (expensive) training run has finished.
  history_dir = './history_files'
  os.makedirs(history_dir, exist_ok=True)

  ## Train the classifiers
  for conf_id in conf_ids:
    config_data = search_results.loc[conf_id]

    # unpack the configuration data
    # NOTE(review): 'best_epochh' (double "h") is kept as-is because it must
    # match the column name stored in the results file — confirm upstream.
    best_epoch = config_data['best_epochh']
    median2best = config_data['median2best']

    optimizer_id = config_data['optimizer']
    max_epochs  = config_data['max_epochs']
    batch_size = config_data['batch_size']

    ## load and compile the neural network model
    model = loadAndCompileModel(conf_id, optimizer_id, task)

    print('Training classifier for configuration ', conf_id)
    weights_dir = f'./trained_models/{conf_id}'

    # The watched epochs are capped at max_epochs-1 so they can still be
    # reached within a run of max_epochs epochs.
    acc, model_history = fitModelOnFullDataset(model,
                                            task,
                                            X_train, Y_train,
                                            batch_size,
                                            max_epochs,
                                            save_weights=True,
                                            weights_dir=weights_dir,
                                            epochs2watch = [min(best_epoch,max_epochs-1), min(median2best,max_epochs-1)],
                                            verbose=0)

    print(acc, '\n')

    # save the training history of the full-dataset run (stored as a one-element list)
    with open(f'{history_dir}/{conf_id}-full-dataset-model.dict', 'wb') as file_handler:
      pickle.dump([model_history], file_handler)

### trainClassifiers utils

In [6]:
def getTrainData(task, architecture):
  """
  Loads the FT3-encoded training folds and merges them into a single array.

  inputs:
  task          - str, label set requested from the loader
  architecture  - str, 'SNN' selects the 'SINGLE-VEC' encoding; any other
                  value selects 'EMB-SEQ'

  outputs:
  X_train       - the 7 folds returned by the loader, concatenated on axis 0
  Y_train       - the loader's label entry for `task`
                  NOTE(review): returned exactly as the loader provides it;
                  presumably already aligned with the concatenated folds — confirm.
  """
  n_folds = 7
  encoding_format = 'SINGLE-VEC' if architecture == 'SNN' else 'EMB-SEQ'

  folds, labels = dataset_utils.loadEncodedTrainData(embedding_type='FT3',
                                                     encoding_format=encoding_format,
                                                     labels_to_return=[task],
                                                     n_folds=n_folds)

  # collect the folds in order before stacking them
  fold_arrays = []
  for fold_idx in range(n_folds):
    fold_arrays.append(folds[fold_idx])

  return np.concatenate(fold_arrays, axis=0), labels[task]
  
def loadSearchResults(search_results_path):
  """
  Reads and returns the pickled object stored at `search_results_path`.

  NOTE: pickle.load can execute arbitrary code; only use it on files
  produced by trusted experiments.
  """
  with open(search_results_path, 'rb') as results_file:
    return pickle.load(results_file)

def getConfigIds(search_results, top_k=5):
  """
  Returns the ids of the best configurations according to both accuracy columns.

  The `top_k` rows with the highest 'val_acc_A' and the `top_k` rows with the
  highest 'val_acc_B' are selected; ids picked by both rankings appear once.

  inputs:
  search_results  - pandas DataFrame with the columns 'conf_ID', 'val_acc_A'
                    and 'val_acc_B'; it is NOT modified
  top_k           - int, number of configurations taken from each ranking

  outputs:
  conf_ids        - list of ids in a deterministic order: first the
                    'val_acc_A' ranking, then the ids only found in the
                    'val_acc_B' ranking
  """
  selected = []
  for metric in ('val_acc_A', 'val_acc_B'):
    # sort a copy: the caller's DataFrame keeps its original row order
    ranked = search_results.sort_values(by=metric, ascending=False)
    selected.extend(ranked['conf_ID'].iloc[:top_k].to_list())

  # dict.fromkeys drops duplicates while keeping first-seen order, so the
  # training order no longer depends on set/hash ordering between runs
  return list(dict.fromkeys(selected))

def loadAndCompileModel(conf_id, optimizer_id, task):
  """
  Rebuilds the architecture stored for `conf_id` and compiles it.

  inputs:
  conf_id       - str, configuration id; the architecture is read from
                  ./models_json_files/<conf_id>.json
  optimizer_id  - str, key of `optimizers_list` selecting the optimizer settings
  task          - str, 'HTA' compiles with categorical cross-entropy, any
                  other value with binary cross-entropy

  outputs:
  model         - compiled keras model tracking the "acc" metric
  """
  # load json and create model
  # we use the models saved during the search experiments
  with open(f'./models_json_files/{conf_id}.json', 'r') as json_file:
    model_config = json_file.read()
  model = model_from_json(model_config)

  # Build a fresh optimizer from the template's configuration instead of
  # reusing the shared instance: an optimizer object carries training state
  # (e.g. its iteration counter), which must not leak from one model to the next.
  optimizer_template = optimizers_list[optimizer_id]
  optimizer = optimizer_template.__class__.from_config(optimizer_template.get_config())

  loss = "categorical_crossentropy" if task == 'HTA' else "binary_crossentropy"
  model.compile(loss=loss, optimizer=optimizer, metrics=["acc"])

  return model


### fitModelOnFullDataset()

In [7]:
# **************************   fitModelOnFullDataset()   **************************

from os import path, makedirs
from  tensorflow.keras.utils import to_categorical

def fitModelOnFullDataset(model, task, X_train, Y_train, batch_size, max_epochs, 
                      save_weights=False, weights_dir='', epochs2watch=[], verbose=0):
  """
  Trains a precompiled keras model on the given data, optionally saving
  snapshots through a CustomSaver callback.

  inputs:
  model           - precompiled keras model
  task            - str, e.g. 'HS', 'TR', 'AG', 'HTA'; for 'HTA' the labels
                    are one-hot encoded into 5 classes before fitting
  X_train         - training inputs, passed to model.fit as `x`
  Y_train         - training labels, passed to model.fit as `y`
  batch_size      - int
  max_epochs      - int, number of epochs to train
  save_weights    - boolean, when True a CustomSaver callback is attached
  weights_dir     - str, directory for the snapshots (created if missing)
  epochs2watch    - list, epochs forwarded to CustomSaver
  verbose         - verbosity flag forwarded to model.fit

  outputs:
  max_acc         - highest value of the 'acc' entry in the training history
  model_history   - dict, model.history.history
  """

  # Callbacks: only needed when snapshots are requested
  callbacks_list = None
  if save_weights:
    # the callback writes into weights_dir, so it has to exist beforehand
    if not path.exists(weights_dir):
      makedirs(weights_dir)

    callbacks_list = [CustomSaver(weights_dir_path=weights_dir,
                                  epochs2watch=epochs2watch)]

  # The 5-class task needs one-hot encoded labels; the other tasks use the labels as given
  labels = to_categorical(Y_train, num_classes=5) if task == 'HTA' else Y_train

  model.fit(x=X_train,
            y=labels,
            epochs=max_epochs,
            batch_size=batch_size,
            verbose=verbose,
            callbacks=callbacks_list)

  history = model.history.history

  return max(history['acc']), history


### CustomSaver

In [8]:
# **************************   CustomSaver()   **************************
from keras.callbacks import Callback

class CustomSaver(Callback):
  """
  Keras callback that saves the model (without optimizer state) at selected epochs.

  - epochs2watch non-empty: the model is saved as 'global_model_A.hdf5' at the
    end of epoch epochs2watch[0] and as 'global_model_B.hdf5' at the end of
    epoch epochs2watch[1]. A one-element list only produces the method-A file;
    entries beyond the second are ignored.
  - epochs2watch empty: the model is saved after every epoch as 'e<epoch>.hdf5'.

  inputs:
  weights_dir_path  - str, directory where the files are written
  epochs2watch      - list, epochs (as passed by Keras to on_epoch_end) to save at
  """

  # filenames paired, in order, with the entries of epochs2watch
  _WATCHED_FILENAMES = ('global_model_A.hdf5', 'global_model_B.hdf5')

  def __init__(self, weights_dir_path, epochs2watch):
    super(CustomSaver, self).__init__()
    self.weights_dir=weights_dir_path
    self.epochs2watch=epochs2watch

  def _save_snapshot(self, filename):
    """Saves the current model, without optimizer state, as weights_dir/filename."""
    filepath = f'{self.weights_dir}/{filename}'
    self.model.save(filepath,overwrite=True,include_optimizer=False)

  def on_epoch_end(self, epoch, logs=None):
    if not self.epochs2watch:
      # no epochs to watch: keep a snapshot for every epoch
      self._save_snapshot(f'e{epoch}.hdf5')
      return

    # zip stops at the shorter sequence, so a single watched epoch no longer
    # raises IndexError; if both watched epochs coincide, both files are written
    for watched_epoch, filename in zip(self.epochs2watch, self._WATCHED_FILENAMES):
      if epoch == watched_epoch:
        self._save_snapshot(filename)



### optimizadores


In [9]:
import tensorflow.keras.optimizers as keras_optimizers

# (key suffix, learning rate) pairs; RMSprop is explored on two extra, smaller rates
_ADAM_RATES = [('1e-3', 0.001), ('7e-4', 0.0007), ('5e-4', 0.0005),
               ('3e-4', 0.0003), ('1e-4', 0.0001)]
_RMSPROP_RATES = _ADAM_RATES + [('7.5e-5', 0.000075), ('5e-5', 0.00005)]

# Optimizers keyed by the ids used in the search results:
#   'adam-<lr>', 'rmsprop-<lr>' (momentum 0.0) and 'rmsprop-<lr>-mu0.9' (momentum 0.9)
optimizers_list = {
    **{f'adam-{tag}': keras_optimizers.Adam(learning_rate=rate)
       for tag, rate in _ADAM_RATES},
    **{f'rmsprop-{tag}{suffix}': keras_optimizers.RMSprop(learning_rate=rate, momentum=momentum)
       for suffix, momentum in (('', 0.0), ('-mu0.9', 0.9))
       for tag, rate in _RMSPROP_RATES},
}

# SNN 

### HS

In [None]:
# Retrain the selected SNN configurations for task 'HS' on the full training set.
trainFullDatasetClassifiers('HS', 
                            search_results_path = './Results/SNN/HS/experiments_4.df',
                            models_architecture = 'SNN')

### TR

In [None]:
# Retrain the selected SNN configurations for task 'TR' on the full training set.
trainFullDatasetClassifiers('TR', 
                            search_results_path = './Results/SNN/TR/experiments_4.df',
                            models_architecture = 'SNN')

Loading train data...
FastText 3 - Spanish Unannotated Corpora
Encoding Format: SINGLE-VEC

Process complete
5000 train instances retrieved

encodings_dim = (300,)

Training classifier for configuration  oWZBcQgF
0.9340000152587891 

Training classifier for configuration  wppcsrma
0.9366000294685364 

Training classifier for configuration  nypkmrjW
0.9082000255584717 

Training classifier for configuration  jawgqxcm
0.9192000031471252 

Training classifier for configuration  jsiehraH
0.8876000046730042 

Training classifier for configuration  BgILeXhk
0.975600004196167 

Training classifier for configuration  JVEIHntQ
0.8960000276565552 



### AG

In [None]:
# Retrain the selected SNN configurations for task 'AG' on the full training set.
trainFullDatasetClassifiers('AG', 
                            search_results_path = './Results/SNN/AG/experiments_4.df',
                            models_architecture = 'SNN')

Loading train data...
FastText 3 - Spanish Unannotated Corpora
Encoding Format: SINGLE-VEC

Process complete
5000 train instances retrieved

encodings_dim = (300,)

Training classifier for configuration  ZBIvVzOI
0.9314000010490417 

Training classifier for configuration  FljeuYPX
0.9121999740600586 

Training classifier for configuration  KjzkpEMA
0.9330000281333923 

Training classifier for configuration  kpsolZbm
0.9031999707221985 

Training classifier for configuration  tNmCbkjv
0.9473999738693237 

Training classifier for configuration  WSUJpwQP
0.9002000093460083 

Training classifier for configuration  yCKbOmtc
0.9498000144958496 

Training classifier for configuration  MAIRzcUI
0.9431999921798706 

Training classifier for configuration  ctvTiuoq
0.9603999853134155 



### HTA

In [None]:
# Retrain the selected SNN configurations for task 'HTA' on the full training set.
trainFullDatasetClassifiers('HTA', 
                            search_results_path = './Results/SNN/HTA/experiments_4f.df',
                            models_architecture = 'SNN')

Loading train data...
FastText 3 - Spanish Unannotated Corpora
Encoding Format: SINGLE-VEC

Process complete
5000 train instances retrieved

encodings_dim = (300,)

Training classifier for configuration  wScrhgQv
0.8417999744415283 

Training classifier for configuration  KQeFiZqG
0.7387999892234802 

Training classifier for configuration  EJYjTOAq
0.6561999917030334 

Training classifier for configuration  pCqsruJR
0.7585999965667725 

Training classifier for configuration  JzHKJxKW
0.7544000148773193 

Training classifier for configuration  gKzBjYBj
0.7767999768257141 

Training classifier for configuration  pJEUGKCU
0.7947999835014343 

Training classifier for configuration  MYHWkrFV
0.7080000042915344 

Training classifier for configuration  nVhBMnAy
0.7188000082969666 



# CNN 

### HS

In [None]:
# Retrain the selected CNN configurations for task 'HS' on the full training set.
trainFullDatasetClassifiers('HS', 
                            search_results_path = './Results/CNN/HS/experiments_6.df',
                            models_architecture = 'CNN')

Loading train data...
FastText 3 - Spanish Unannotated Corpora
Encoding Format: EMB-SEQ

Process complete
5000 train instances retrieved

encodings_dim = (55, 300)

Training classifier for configuration  ktGXNLsV
0.9908000230789185 

Training classifier for configuration  nFehdAwZ
0.979200005531311 

Training classifier for configuration  lPHcKwYL
0.991599977016449 

Training classifier for configuration  LjUTRRKG
0.9941999912261963 

Training classifier for configuration  DmScKBFk
0.9768000245094299 

Training classifier for configuration  eeitJyKy
0.9911999702453613 

Training classifier for configuration  jLggJPxo
0.9861999750137329 

Training classifier for configuration  MUQKiuDm
0.925000011920929 

Training classifier for configuration  zKWmVuqz
0.9258000254631042 



### TR

In [None]:
# Retrain the selected CNN configurations for task 'TR' on the full training set.
trainFullDatasetClassifiers('TR', 
                            search_results_path = './Results/CNN/TR/experiments_6.df',
                            models_architecture = 'CNN')

Loading train data...
FastText 3 - Spanish Unannotated Corpora
Encoding Format: EMB-SEQ

Process complete
5000 train instances retrieved

encodings_dim = (55, 300)

Training classifier for configuration  XKWyJjCp
0.8238000273704529 

Training classifier for configuration  wrfidhVN
0.9613999724388123 

Training classifier for configuration  oRGmNoZx
0.993399977684021 

Training classifier for configuration  TfFFXlkd
0.9265999794006348 

Training classifier for configuration  pTXwsqCt
0.9944000244140625 

Training classifier for configuration  sWquCUHY
0.9606000185012817 

Training classifier for configuration  oziyYwlg
0.9919999837875366 



### AG

In [None]:
# Retrain the selected CNN configurations for task 'AG' on the full training set.
trainFullDatasetClassifiers('AG', 
                            search_results_path = './Results/CNN/AG/experiments_6.df',
                            models_architecture = 'CNN')

Loading train data...
FastText 3 - Spanish Unannotated Corpora
Encoding Format: EMB-SEQ

Process complete
5000 train instances retrieved

encodings_dim = (55, 300)

Training classifier for configuration  heguIhkD
0.824999988079071 

Training classifier for configuration  AafqTJRJ
0.9945999979972839 

Training classifier for configuration  lYkXiQZd
0.993399977684021 

Training classifier for configuration  lUQlOGae
0.9887999892234802 

Training classifier for configuration  YAsdTTEC
0.9941999912261963 

Training classifier for configuration  QGsgmPVe
0.9837999939918518 

Training classifier for configuration  DztMWiJK


### HTA

In [None]:
# NOTE(review): this cell sits under the CNN section, but it passes the SNN
# results file and models_architecture='SNN' (the same arguments as the SNN/HTA
# cell). Confirm whether the CNN/HTA run was intended and which results file it needs.
trainFullDatasetClassifiers('HTA', 
                            search_results_path = './Results/SNN/HTA/experiments_4f.df',
                            models_architecture = 'SNN')

Loading train data...
FastText 3 - Spanish Unannotated Corpora
Encoding Format: SINGLE-VEC

Process complete
5000 train instances retrieved

encodings_dim = (300,)

Training classifier for configuration  wScrhgQv
0.8417999744415283 

Training classifier for configuration  KQeFiZqG
0.7387999892234802 

Training classifier for configuration  EJYjTOAq
0.6561999917030334 

Training classifier for configuration  pCqsruJR
0.7585999965667725 

Training classifier for configuration  JzHKJxKW
0.7544000148773193 

Training classifier for configuration  gKzBjYBj
0.7767999768257141 

Training classifier for configuration  pJEUGKCU
0.7947999835014343 

Training classifier for configuration  MYHWkrFV
0.7080000042915344 

Training classifier for configuration  nVhBMnAy
0.7188000082969666 



# BiLSTM

### HS

In [None]:
# NOTE(review): models_architecture is 'BiLSTM', but search_results_path points
# to the CNN results (./Results/CNN/HS/experiments_6.df); the other BiLSTM cells
# read ./Results/BiLSTM/<task>/experiments_4f.df. Confirm the intended file.
trainFullDatasetClassifiers('HS', 
                            search_results_path = './Results/CNN/HS/experiments_6.df',
                            models_architecture = 'BiLSTM')

### AG

In [12]:
# Retrain the selected BiLSTM configurations for task 'AG' on the full training set.
trainFullDatasetClassifiers('AG', 
                            search_results_path = './Results/BiLSTM/AG/experiments_4f.df',
                            models_architecture = 'BiLSTM')

Loading train data...
FastText 3 - Spanish Unannotated Corpora
Encoding Format: EMB-SEQ

Process complete
5000 train instances retrieved

encodings_dim = (55, 300)

Training classifier for configuration  VXXZXjPY
0.9138000011444092 

Training classifier for configuration  Afxwglds
0.954800009727478 

Training classifier for configuration  tOYuufHH
0.9154000282287598 

Training classifier for configuration  haUlGPxu
0.9204000234603882 

Training classifier for configuration  BUTpSihx
0.9593999981880188 



### TR

In [13]:
# Retrain the selected BiLSTM configurations for task 'TR' on the full training set.
trainFullDatasetClassifiers('TR', 
                            search_results_path = './Results/BiLSTM/TR/experiments_4f.df',
                            models_architecture = 'BiLSTM')

Loading train data...
FastText 3 - Spanish Unannotated Corpora
Encoding Format: EMB-SEQ

Process complete
5000 train instances retrieved

encodings_dim = (55, 300)

Training classifier for configuration  jEHBKuZv
0.9476000070571899 

Training classifier for configuration  oAsiDtqb
0.9476000070571899 

Training classifier for configuration  zDXbNIXN
0.9277999997138977 

Training classifier for configuration  nwPtJmRD
0.9693999886512756 

Training classifier for configuration  UceeXbhf
0.9611999988555908 

Training classifier for configuration  qmWWutWH
0.9678000211715698 

Training classifier for configuration  QXXxwbcD
0.9376000165939331 

Training classifier for configuration  qDdrGDRN
0.9538000226020813 



### HTA

In [14]:
# Retrain the selected BiLSTM configurations for task 'HTA' on the full training set.
trainFullDatasetClassifiers('HTA', 
                            search_results_path = './Results/BiLSTM/HTA/experiments_4f.df',
                            models_architecture = 'BiLSTM')

Loading train data...
FastText 3 - Spanish Unannotated Corpora
Encoding Format: EMB-SEQ

Process complete
5000 train instances retrieved

encodings_dim = (55, 300)

Training classifier for configuration  TilpLJKw
0.826200008392334 

Training classifier for configuration  DYyuwEUQ
0.6840000152587891 

Training classifier for configuration  eWdUlyIt
0.878000020980835 

Training classifier for configuration  dxkpOxAe
0.8199999928474426 

Training classifier for configuration  VILAZqpK
0.8047999739646912 

Training classifier for configuration  DBbUMtLW
0.7839999794960022 



# ConvBiLSTM

### HS

In [None]:
# NOTE(review): this cell sits under the ConvBiLSTM section, but it reuses the
# CNN results file and models_architecture='CNN' (the same arguments as the
# CNN/HS cell); the other cells in this section read ./Results/ConvLSTM/<task>/
# experiments_5f.df with 'ConvLSTM'. Confirm the intended arguments.
trainFullDatasetClassifiers('HS', 
                            search_results_path = './Results/CNN/HS/experiments_6.df',
                            models_architecture = 'CNN')

Loading train data...
FastText 3 - Spanish Unannotated Corpora
Encoding Format: EMB-SEQ

Process complete
5000 train instances retrieved

encodings_dim = (55, 300)

Training classifier for configuration  ktGXNLsV
0.9908000230789185 

Training classifier for configuration  nFehdAwZ
0.979200005531311 

Training classifier for configuration  lPHcKwYL
0.991599977016449 

Training classifier for configuration  LjUTRRKG
0.9941999912261963 

Training classifier for configuration  DmScKBFk
0.9768000245094299 

Training classifier for configuration  eeitJyKy
0.9911999702453613 

Training classifier for configuration  jLggJPxo
0.9861999750137329 

Training classifier for configuration  MUQKiuDm
0.925000011920929 

Training classifier for configuration  zKWmVuqz
0.9258000254631042 



### AG

In [10]:
# Retrain the selected ConvLSTM configurations for task 'AG' on the full training set.
trainFullDatasetClassifiers('AG', 
                            search_results_path = './Results/ConvLSTM/AG/experiments_5f.df',
                            models_architecture = 'ConvLSTM')

Loading train data...
FastText 3 - Spanish Unannotated Corpora
Encoding Format: EMB-SEQ

Process complete
5000 train instances retrieved

encodings_dim = (55, 300)

Training classifier for configuration  KNwuXynu
0.9556000232696533 

Training classifier for configuration  PiKgmstT
0.967199981212616 

Training classifier for configuration  XGYrXoWQ
0.9674000144004822 

Training classifier for configuration  gcVkhcdU
0.965399980545044 

Training classifier for configuration  tRZxWsll
0.9746000170707703 

Training classifier for configuration  UOKoIhha
0.9735999703407288 

Training classifier for configuration  VNZvaMik
0.9771999716758728 



### TR

In [11]:
# Retrain the selected ConvLSTM configurations for task 'TR' on the full training set.
trainFullDatasetClassifiers('TR', 
                            search_results_path = './Results/ConvLSTM/TR/experiments_5f.df',
                            models_architecture = 'ConvLSTM')

Loading train data...
FastText 3 - Spanish Unannotated Corpora
Encoding Format: EMB-SEQ

Process complete
5000 train instances retrieved

encodings_dim = (55, 300)

Training classifier for configuration  cEozDdue
0.9067999720573425 

Training classifier for configuration  QYWMRwAW
0.9412000179290771 

Training classifier for configuration  dSsSKnVs
0.9297999739646912 

Training classifier for configuration  NIQKIlIr
0.9071999788284302 

Training classifier for configuration  eatwzMbh
0.9348000288009644 

Training classifier for configuration  iFfHteVR
0.9089999794960022 

Training classifier for configuration  iamRRqSI
0.9394000172615051 



### HTA

In [None]:
# Retrain the selected ConvLSTM configurations for task 'HTA' on the full training set.
trainFullDatasetClassifiers('HTA', 
                            search_results_path = './Results/ConvLSTM/HTA/experiments_5f.df',
                            models_architecture = 'ConvLSTM')