# Ajustes iniciales

## Conexión a google drive

In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
%cd ./drive/MyDrive/Colab Notebooks/

/content/drive/MyDrive/Colab Notebooks


In [3]:
%pwd

'/content/drive/MyDrive/Colab Notebooks'

## Importar modulos *_utils


In [4]:
import sys
sys.path.append('/content/drive/My Drive/Colab Notebooks/')

from my_utils import eval_utils
from my_utils import nn_utils
from my_utils import dataset_utils
from my_utils import results_utils
import numpy as np

# SANDBOX

In [69]:
def getBestModelResults(task, architecture, type_list=['G']):
  if task=='HTA':
    sorting_metric='EMR'
  else:
    sorting_metric='f1-macro'

  results_list = [f'./Results/final/{architecture}-1_{task}_TEST.df',
                  f'./Results/final/{architecture}-2_{task}_TEST.df']

  results_df = results_utils.mergeDataFrames(results_list, '.delete_this.df') 
  type_mask = results_df.model_type.isin(type_list)

  return results_df.loc[type_mask]

In [48]:
getBestModelResults('HS', 'CNN', ['G'])

Unnamed: 0,conf_id,model_type,architecture,acc,prec,recall,f1-macro
7,zKWmVuqz,G,CNN-1,0.74875,0.741587,0.745326,0.742967
17,DmScKBFk,G,CNN-1,0.7325,0.724132,0.724726,0.724418
27,ktGXNLsV,G,CNN-1,0.665,0.718876,0.700451,0.662842
37,MUQKiuDm,G,CNN-1,0.73,0.721446,0.721244,0.721344
47,jLggJPxo,G,CNN-1,0.73375,0.726597,0.730529,0.727931
57,LjUTRRKG,G,CNN-2,0.64625,0.719733,0.688104,0.64119
67,eeitJyKy,G,CNN-2,0.72125,0.713651,0.716957,0.714834
77,ktGXNLsV,G,CNN-2,0.67375,0.721611,0.706995,0.672312
87,nFehdAwZ,G,CNN-2,0.679375,0.784912,0.614523,0.584358
97,lPHcKwYL,G,CNN-2,0.74,0.733484,0.73833,0.734896


In [5]:
## Load search results
import pickle

search_results_path = './Results/SNN/HS/experiments_4.df'
with open(search_results_path, 'rb') as file_handler:
  search_results = pickle.load(file_handler)

## evaluate for method-A pondering (global mean)
search_results.sort_values(by='val_acc_A', ascending=False, inplace=True)
config_ids = search_results.conf_ID[:7].to_list()

In [6]:
config_ids

['IMsHTOqQ',
 'vNIrvuRV',
 'ItNgrhRR',
 'IYwHpkvM',
 'gobIDbBH',
 'WDIHfXgz',
 'hqkZZOBk']

In [7]:
search_results.head()

Unnamed: 0,conf_ID,L1_size,L2_size,activation,p_dropout_1,p_dropout_2,optimizer,batch_size,max_epochs,best_epochh,train_acc_A,val_acc_A,min2best,max2best,median2best,train_acc_B,val_acc_B
45,IMsHTOqQ,450,100,relu,0.5,0.5,adam-5e-4,512,75,47,0.779468,0.780797,43,59,47,0.789803,0.788196
0,vNIrvuRV,550,125,relu,0.25,0.25,adam-5e-4,256,75,19,0.779702,0.780598,9,37,18,0.771436,0.789796
28,ItNgrhRR,450,125,relu,0.5,0.25,adam-5e-4,256,75,22,0.7557,0.780597,19,45,39,0.792767,0.791998
8,IYwHpkvM,550,125,relu,0.5,0.5,adam-5e-4,256,75,41,0.8004,0.780593,21,56,29,0.779701,0.788795
44,gobIDbBH,450,100,relu,0.5,0.5,adam-5e-4,256,75,46,0.805334,0.780393,28,53,35,0.775035,0.789393


# Local Utils

## automaticEnsembleEvaluation()

In [28]:
import pickle
from pandas import DataFrame

def automaticEnsembleEvaluation(task, search_results_path, architecture, n_classifiers):
  ## Load test data
  print('Loading test data...')
  X_test, Y_test = loadTestData(task, architecture)
  print()

  ## get the search_results
  search_results = loadSearchResults(search_results_path)
  #search_results.set_index('conf_ID', inplace=True)

  ## evaluate for method-A pondering (global mean)
  print('Evaluating ensembles A')
  
  search_results.sort_values(by='val_acc_A', ascending=False, inplace=True)
  config_ids = search_results.conf_ID[:n_classifiers].to_list()

  results_A = evaluateClassifiersEnsemble(X_test, Y_test, task, config_ids, 'A', 
                                    architecture, n_classifiers, verbose=True)

  results_file = f'./Results/final/ensembles/{architecture}-1_{task}.df'

  with open(results_file, 'wb') as file_handler:
    pickle.dump(results_A, file_handler)

  print()

  ## evaluate for method-B pondering (per-fold-analysys)
  print('Evaluating ensembles B')

  search_results.sort_values(by='val_acc_B', ascending=False, inplace=True)
  config_ids = search_results.conf_ID[:n_classifiers].to_list()

  results_B = evaluateClassifiersEnsemble(X_test, Y_test, task, config_ids, 'B', 
                                    architecture, n_classifiers, verbose=True) 

  results_file = f'./Results/final/ensembles/{architecture}-2_{task}.df'

  with open(results_file, 'wb') as file_handler:
    pickle.dump(results_B, file_handler)

  print()

  return results_A, results_B


In [29]:
def loadTestData(task, architecture):
  if architecture == 'SNN':
    encoding_format = 'SINGLE-VEC'
  else:
    encoding_format = 'EMB-SEQ'

  X_test, Y_test = dataset_utils.loadEncodedTestData(embedding_type='FT3',
                                                 encoding_format=encoding_format,
                                                 labels_to_return = [task])
  
  return X_test, Y_test

In [30]:
def loadSearchResults(search_results_path):
  with open(search_results_path, 'rb') as file_handler:
    search_results = pickle.load(file_handler)

  return search_results
  
def getConfigIds(search_results):
  search_results.sort_values(by='val_acc_A', ascending=False, inplace=True)
  conf_ids = search_results.conf_ID[:5].to_list()
  search_results.sort_values(by='val_acc_B', ascending=False, inplace=True)
  conf_ids.extend(search_results.conf_ID[:5].to_list())
  conf_ids = list(set(conf_ids))

  return conf_ids

### evaluateClassifiersEnsemble()

In [31]:

from pandas import DataFrame
from  tensorflow.keras.utils import to_categorical

def evaluateClassifiersEnsemble(X_test, Y_test, task, config_ids, eval_method, 
                                    arch_label, n_classifiers, verbose=False):                                   

  evaluations_record = list()

  if task=='HTA':
    n_classes = 5
  else:
    n_classes = 2

  # we'll use classes_probs_sum and classes_votes_sum 
  # to generate two different ensembles 
  if task=='HTA':
    classes_probs_sum = np.zeros((len(Y_test),n_classes))
  else:
    classes_probs_sum = np.zeros((len(Y_test),1))

  classes_votes_sum = np.zeros((len(Y_test),n_classes))

  # process samples with the different classiffiers
  for config_ID in config_ids:
    weights_file = f'global_model_{eval_method}.hdf5'
    trained_model = loadPretrainedModel(config_ID, weights_file)

    # make predictions on X_test samples
    classes_probs = trained_model.predict(X_test)

    # turn the classes_probs into classes predictions
    labels_predictions_array = getClassesPredictions(classes_probs, task)
    
    classes_probs_sum += classes_probs
    classes_votes_sum += to_categorical(labels_predictions_array, num_classes=n_classes)
    
  # EVALUATE THE ENSEMBLES
  if verbose:
    print('\nEvaluating MEAN PROBABILITIES ENSEMBLE')

  # turn the accumulated probabilities into classes predictions
  labels_predictions_array = getClassesPredictions(classes_probs_sum/n_classifiers, task)

  # evaluate the preditions
  evaluation = evaluatePredictions(task, Y_test[task], labels_predictions_array)
  
  model_results_dict = {'model_type': 'MEAN-PROB',
                        'n_classifiers':n_classifiers,
                        'architecture': arch_label,
                        **evaluation}

  evaluations_record.append(model_results_dict) 

  if verbose:
    print('\nEvaluating MAJORITY VOTING ENSEMBLE')

  # turn the classes votes into an array of classes predictions
  labels_predictions_array = np.array([classes_votes.argmax() for classes_votes in classes_votes_sum]).reshape(-1,1)

  # evaluate the preditions
  evaluation = evaluatePredictions(task, Y_test[task], labels_predictions_array)
  
  model_results_dict = {'model_type': 'VOTING',
                        'n_classifiers':n_classifiers,
                        'architecture': arch_label,
                        **evaluation}

  evaluations_record.append(model_results_dict) 

  evaluations_results_df = DataFrame(evaluations_record)

  return evaluations_results_df

### loadPretrainedModel()

In [32]:
#*************************     loadPretrainedModel()    ************************  
from keras.models import model_from_json

def loadPretrainedModel(config_ID, weights_file):
  # load model configuration from json file
  json_file = open(f'./models_json_files/{config_ID}.json', 'r')
  model_config = json_file.read()
  json_file.close()
  trained_model = model_from_json(model_config)

  # load pretrained weights into the model
  trained_model.load_weights(f'./trained_models/{config_ID}/{weights_file}')

  return trained_model


### getClassesPredictions()

In [33]:
import numpy as np

def class_pred(true_prob):
  if true_prob>=0.5:
    return 1
  else:
    return 0

def getClassesPredictions(classes_probs, task):
  if task=='HTA': 
    return np.array([probs.argmax() for probs in classes_probs]).reshape(-1,1)
  else:
    return np.apply_along_axis(class_pred, 1, classes_probs).reshape(-1,1)

### evaluatePredictions()

In [34]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

def evaluatePredictions(task, val_labels, pred_labels, verbose=False):

  if task=='HTA':

    # get the correspondig predicted and val (target) labels for each task
    pred_HS_labels, pred_TR_labels, pred_AG_labels = dataset_utils.getLabelsPerTask(pred_labels)
    val_HS_labels, val_TR_labels, val_AG_labels = dataset_utils.getLabelsPerTask(val_labels)

    # compute the different metrics
    HS_acc = accuracy_score(val_HS_labels, pred_HS_labels)
    HS_prec = precision_score(val_HS_labels, val_HS_labels, average="macro")
    HS_recall = recall_score(val_HS_labels, val_HS_labels, average="macro")
    HS_f1 = f1_score(val_HS_labels, pred_HS_labels, average="macro")

    AG_acc = accuracy_score(val_AG_labels, pred_AG_labels)
    AG_prec = precision_score(val_AG_labels, pred_AG_labels, average="macro")
    AG_recall = recall_score(val_AG_labels, pred_AG_labels, average="macro")
    AG_f1 = f1_score(val_AG_labels, pred_AG_labels, average="macro")

    TR_acc = accuracy_score(val_TR_labels, pred_TR_labels)
    TR_prec = precision_score(val_TR_labels, pred_TR_labels, average="macro")
    TR_recall = recall_score(val_TR_labels, pred_TR_labels, average="macro")
    TR_f1 = f1_score(val_TR_labels, pred_TR_labels, average="macro")

    F1_multi = (HS_f1+ AG_f1 + TR_f1)/3

    EMR = computeEMR(list(zip(val_HS_labels, val_TR_labels, val_AG_labels)),
                    list(zip(pred_HS_labels, pred_TR_labels, pred_AG_labels)))

    results_dict = {'HS_acc':HS_acc,
                    'HS_prec':HS_prec,
                    'HS_recall':HS_recall,
                    'HS_f1':HS_f1,
                    'AG_acc':AG_acc,
                    'AG_prec':AG_prec,
                    'AG_recall':AG_recall,
                    'AG_f1':AG_f1,
                    'TR_acc':TR_acc,
                    'TR_prec':TR_prec,
                    'TR_recall':TR_recall,
                    'TR_f1':TR_f1,
                    'F1_multi':F1_multi,
                    'EMR':EMR}

    if verbose:
      print('EMR = ', EMR)
      print('F1_multi = ', F1_multi)
      print()

    return results_dict

  if task in ['HS', 'TR', 'AG']:
    # compute the different metrics
    acc = accuracy_score(val_labels, pred_labels)
    prec = precision_score(val_labels, pred_labels, average="macro")
    recall = recall_score(val_labels, pred_labels, average="macro")
    f1_macro = f1_score(val_labels, pred_labels, average="macro")

    results_dict = {'acc':acc,
                    'prec':prec,
                    'recall':recall,
                    'f1-macro':f1_macro}

    if verbose:
      print('Acc = ', acc)
      print('F1_macro = ', f1_macro)
      print()

    return results_dict

def computeEMR(test_labels, pred_labels):
  total_instances = len(test_labels)
  exact_match_count= 0
  for gold, pred in zip(test_labels, pred_labels):
    #print(gold, pred)
    if gold == pred:
      exact_match_count += 1

  return exact_match_count/total_instances

def compute_metrics(target, predicted):
    from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

    accuracy = accuracy_score(target, predicted)
    precision = precision_score(target, predicted, average="macro")
    recall = recall_score(target, predicted, average="macro")
    f1 = f1_score(val_labels, pred_labels, average="macro")

    results = {'acc':accuracy_s, 
              'prec' : precision_pos,
              'recall' : precision_neg,  
              'f1': recall_pos,
              'recall_neg' : recall_neg,
              'f1_pos': f1_pos,
              'f1_neg': f1_neg}
    
    return results

In [35]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

def evaluatePredictions(task, val_labels, pred_labels, verbose=False):

  if task=='HTA':

    # get the correspondig predicted and val (target) labels for each task
    pred_HS_labels, pred_TR_labels, pred_AG_labels = dataset_utils.getLabelsPerTask(pred_labels)
    val_HS_labels, val_TR_labels, val_AG_labels = dataset_utils.getLabelsPerTask(val_labels)

    # compute ACC, PREC, RECALL and F1 metrics
    HS_acc, HS_prec, HS_recall, HS_f1 = compute_metrics(val_HS_labels, pred_HS_labels)
    AG_acc, AG_prec, AG_recall, AG_f1 = compute_metrics(val_AG_labels, pred_AG_labels)   
    TR_acc, TR_prec, TR_recall, TR_f1 = compute_metrics(val_TR_labels, pred_TR_labels)

    F1_multi = (HS_f1+ AG_f1 + TR_f1)/3

    EMR = computeEMR(list(zip(val_HS_labels, val_TR_labels, val_AG_labels)),
                    list(zip(pred_HS_labels, pred_TR_labels, pred_AG_labels)))

    results_dict = {'HS_acc':HS_acc,
                    'HS_prec':HS_prec,
                    'HS_recall':HS_recall,
                    'HS_f1':HS_f1,
                    'AG_acc':AG_acc,
                    'AG_prec':AG_prec,
                    'AG_recall':AG_recall,
                    'AG_f1':AG_f1,
                    'TR_acc':TR_acc,
                    'TR_prec':TR_prec,
                    'TR_recall':TR_recall,
                    'TR_f1':TR_f1,
                    'F1_multi':F1_multi,
                    'EMR':EMR}

    if verbose:
      print('EMR = ', EMR)
      print('F1_multi = ', F1_multi)
      print()

    return results_dict

  if task in ['HS', 'TR', 'AG']:
    # compute ACC, PREC, RECALL and F1 metrics
    acc, prec, recall, f1_macro = compute_metrics(val_labels, pred_labels)

    results_dict = {'acc':acc,
                    'prec':prec,
                    'recall':recall,
                    'f1-macro':f1_macro}

    if verbose:
      print('Acc = ', acc)
      print('F1_macro = ', f1_macro)
      print()

    return results_dict

def compute_metrics(target, predicted):
    accuracy = accuracy_score(target, predicted)
    precision = precision_score(target, predicted, average="macro")
    recall = recall_score(target, predicted, average="macro")
    f1 = f1_score(target, predicted, average="macro")
    
    return accuracy, precision, recall, f1    

def computeEMR(test_labels, pred_labels):
  total_instances = len(test_labels)
  exact_match_count= 0
  for gold, pred in zip(test_labels, pred_labels):
    #print(gold, pred)
    if gold == pred:
      exact_match_count += 1

  return exact_match_count/total_instances

### labels_utils

In [36]:
#**************************     getLabelsPerTask()    **************************
def getLabelsPerTask(HTA_labels):

    HS_labels = list()
    TR_labels = list()
    AG_labels = list()

    for HTA_label in HTA_labels:
        HS_label, TR_label, AG_label = mapTo3DimsFormat(HTA_label)

        HS_labels.append(HS_label)
        TR_labels.append(TR_label)
        AG_labels.append(AG_label)

    HS_labels = np.array(HS_labels).reshape(-1,1)
    TR_labels = np.array(TR_labels).reshape(-1,1)
    AG_labels = np.array(AG_labels).reshape(-1,1)

    return (HS_labels, TR_labels, AG_labels)

#**************************     mapTo3DimsFormat()    ************************** 
def mapTo3DimsFormat(AB_label):
  '''
  Maps label in five_classes_format to 3 dims labeling.

    0 -> (0,0,0)  [HT = 0, TR = 0, AG = 0]
    1 -> (1,0,0)  [HT = 1, TR = 0, AG = 0]
    2 -> (1,0,1)  [HT = 1, TR = 0, AG = 1]
    3 -> (1,1,0)  [HT = 1, TR = 1, AG = 0]
    4 -> (1,1,1)  [HT = 1, TR = 1, AG = 1]

  inpunt:
  label    - int, label in five_classes_format

  output:
  (H,T,A)  - ints tuple, labeling in 3 dims format

  '''
  if AB_label == 0:
    return(0,0,0)

  elif AB_label == 1:
    return(1,0,0)

  elif AB_label == 2:
    return(1,0,1)

  elif AB_label == 3:
    return(1,1,0)

  elif AB_label == 4:
    return(1,1,1)


# SNN (COMPLETE)

## HS

In [91]:
res_A, res_B = automaticEnsembleEvaluation('HS', 
                            './Results/SNN/HS/experiments_4.df', 
                            'SNN', 
                            n_classifiers=7)

Loading test data...
FastText 3 - Spanish Unannotated Corpora
Encoding Format: SINGLE-VEC

Process complete
1600 test instances retrieved

encodings_dim = (300,)

Evaluating ensembles A

Evaluating MEAN PROBABILITIES ENSEMBLE

Evaluating MAJORITY VOTING ENSEMBLE

Evaluating ensembles B

Evaluating MEAN PROBABILITIES ENSEMBLE

Evaluating MAJORITY VOTING ENSEMBLE



## AG

In [31]:
res_A, res_B = automaticEnsembleEvaluation('AG', 
                            './Results/SNN/AG/experiments_4.df', 
                            'SNN', 
                            n_classifiers=7)

Loading test data...
FastText 3 - Spanish Unannotated Corpora
Encoding Format: SINGLE-VEC

Process complete
1600 test instances retrieved

encodings_dim = (300,)

Evaluating ensembles A

Evaluating MEAN PROBABILITIES ENSEMBLE

Evaluating MAJORITY VOTING ENSEMBLE

Evaluating ensembles B

Evaluating MEAN PROBABILITIES ENSEMBLE

Evaluating MAJORITY VOTING ENSEMBLE



## TR

In [32]:
res_A, res_B = automaticEnsembleEvaluation('TR', 
                            './Results/SNN/AG/experiments_4.df', 
                            'SNN', 
                            n_classifiers=7)

Loading test data...
FastText 3 - Spanish Unannotated Corpora
Encoding Format: SINGLE-VEC

Process complete
1600 test instances retrieved

encodings_dim = (300,)

Evaluating ensembles A

Evaluating MEAN PROBABILITIES ENSEMBLE

Evaluating MAJORITY VOTING ENSEMBLE

Evaluating ensembles B

Evaluating MEAN PROBABILITIES ENSEMBLE

Evaluating MAJORITY VOTING ENSEMBLE



## HTA

In [33]:
res_A, res_B = automaticEnsembleEvaluation('HTA', 
                            './Results/SNN/HTA/experiments_4f.df', 
                            'SNN', 
                            n_classifiers=7)

Loading test data...
FastText 3 - Spanish Unannotated Corpora
Encoding Format: SINGLE-VEC

Process complete
1600 test instances retrieved

encodings_dim = (300,)

Evaluating ensembles A

Evaluating MEAN PROBABILITIES ENSEMBLE

Evaluating MAJORITY VOTING ENSEMBLE

Evaluating ensembles B

Evaluating MEAN PROBABILITIES ENSEMBLE

Evaluating MAJORITY VOTING ENSEMBLE



# CNN (COMPLETE)

## HS

In [27]:
res_A, res_B = automaticEnsembleEvaluation('HS', 
                            './Results/CNN/HS/experiments_7f.df', 
                            'CNN', 
                            n_classifiers=7)

Loading test data...
FastText 3 - Spanish Unannotated Corpora
Encoding Format: EMB-SEQ

Process complete
1600 test instances retrieved

encodings_dim = (55, 300)

Evaluating ensembles A

Evaluating MEAN PROBABILITIES ENSEMBLE

Evaluating MAJORITY VOTING ENSEMBLE

Evaluating ensembles B

Evaluating MEAN PROBABILITIES ENSEMBLE

Evaluating MAJORITY VOTING ENSEMBLE



## AG

In [28]:
res_A, res_B = automaticEnsembleEvaluation('AG', 
                            './Results/CNN/AG/experiments_6f.df', 
                            'CNN', 
                            n_classifiers=7)

Loading test data...
FastText 3 - Spanish Unannotated Corpora
Encoding Format: EMB-SEQ

Process complete
1600 test instances retrieved

encodings_dim = (55, 300)

Evaluating ensembles A

Evaluating MEAN PROBABILITIES ENSEMBLE

Evaluating MAJORITY VOTING ENSEMBLE

Evaluating ensembles B

Evaluating MEAN PROBABILITIES ENSEMBLE

Evaluating MAJORITY VOTING ENSEMBLE



## TR

In [29]:
res_A, res_B = automaticEnsembleEvaluation('TR', 
                            './Results/CNN/TR/experiments_6f.df', 
                            'CNN', 
                            n_classifiers=7)

Loading test data...
FastText 3 - Spanish Unannotated Corpora
Encoding Format: EMB-SEQ

Process complete
1600 test instances retrieved

encodings_dim = (55, 300)

Evaluating ensembles A

Evaluating MEAN PROBABILITIES ENSEMBLE

Evaluating MAJORITY VOTING ENSEMBLE

Evaluating ensembles B

Evaluating MEAN PROBABILITIES ENSEMBLE

Evaluating MAJORITY VOTING ENSEMBLE



## HTA

In [30]:
res_A, res_B = automaticEnsembleEvaluation('HTA', 
                            './Results/CNN/HTA/experiments_6f.df', 
                            'CNN', 
                            n_classifiers=7)

Loading test data...
FastText 3 - Spanish Unannotated Corpora
Encoding Format: EMB-SEQ

Process complete
1600 test instances retrieved

encodings_dim = (55, 300)

Evaluating ensembles A

Evaluating MEAN PROBABILITIES ENSEMBLE

Evaluating MAJORITY VOTING ENSEMBLE

Evaluating ensembles B

Evaluating MEAN PROBABILITIES ENSEMBLE

Evaluating MAJORITY VOTING ENSEMBLE



# BiLSTM (COMPLETE)

## HS

In [36]:
res_A, res_B = automaticEnsembleEvaluation('HS', 
                            './Results/BiLSTM/HS/experiments_4f.df', 
                            'BiLSTM', 
                            n_classifiers=7)

Loading test data...
FastText 3 - Spanish Unannotated Corpora
Encoding Format: EMB-SEQ

Process complete
1600 test instances retrieved

encodings_dim = (55, 300)

Evaluating ensembles A

Evaluating MEAN PROBABILITIES ENSEMBLE

Evaluating MAJORITY VOTING ENSEMBLE

Evaluating ensembles B

Evaluating MEAN PROBABILITIES ENSEMBLE

Evaluating MAJORITY VOTING ENSEMBLE



## AG

In [37]:
res_A, res_B = automaticEnsembleEvaluation('AG', 
                            './Results/BiLSTM/AG/experiments_4f.df', 
                            'BiLSTM', 
                            n_classifiers=7)

Loading test data...
FastText 3 - Spanish Unannotated Corpora
Encoding Format: EMB-SEQ

Process complete
1600 test instances retrieved

encodings_dim = (55, 300)

Evaluating ensembles A

Evaluating MEAN PROBABILITIES ENSEMBLE

Evaluating MAJORITY VOTING ENSEMBLE

Evaluating ensembles B

Evaluating MEAN PROBABILITIES ENSEMBLE

Evaluating MAJORITY VOTING ENSEMBLE



## TR

In [38]:
res_A, res_B = automaticEnsembleEvaluation('TR', 
                            './Results/BiLSTM/TR/experiments_4f.df', 
                            'BiLSTM', 
                            n_classifiers=7)

Loading test data...
FastText 3 - Spanish Unannotated Corpora
Encoding Format: EMB-SEQ

Process complete
1600 test instances retrieved

encodings_dim = (55, 300)

Evaluating ensembles A

Evaluating MEAN PROBABILITIES ENSEMBLE

Evaluating MAJORITY VOTING ENSEMBLE

Evaluating ensembles B

Evaluating MEAN PROBABILITIES ENSEMBLE

Evaluating MAJORITY VOTING ENSEMBLE



## HTA

In [39]:
res_A, res_B = automaticEnsembleEvaluation('HTA', 
                            './Results/BiLSTM/HTA/experiments_4f.df', 
                            'BiLSTM', 
                            n_classifiers=7)

Loading test data...
FastText 3 - Spanish Unannotated Corpora
Encoding Format: EMB-SEQ

Process complete
1600 test instances retrieved

encodings_dim = (55, 300)

Evaluating ensembles A

Evaluating MEAN PROBABILITIES ENSEMBLE

Evaluating MAJORITY VOTING ENSEMBLE

Evaluating ensembles B

Evaluating MEAN PROBABILITIES ENSEMBLE

Evaluating MAJORITY VOTING ENSEMBLE



# ConvLSTM

## HS

In [41]:
res_A, res_B = automaticEnsembleEvaluation('HS', 
                            './Results/ConvLSTM/HS/experiments_5f.df',
                            'ConvLSTM', 
                            n_classifiers=7)

Loading test data...
FastText 3 - Spanish Unannotated Corpora
Encoding Format: EMB-SEQ

Process complete
1600 test instances retrieved

encodings_dim = (55, 300)

Evaluating ensembles A

Evaluating MEAN PROBABILITIES ENSEMBLE

Evaluating MAJORITY VOTING ENSEMBLE

Evaluating ensembles B

Evaluating MEAN PROBABILITIES ENSEMBLE

Evaluating MAJORITY VOTING ENSEMBLE



## AG

In [42]:
res_A, res_B = automaticEnsembleEvaluation('AG', 
                            './Results/ConvLSTM/AG/experiments_5f.df',
                            'ConvLSTM', 
                            n_classifiers=7)

Loading test data...
FastText 3 - Spanish Unannotated Corpora
Encoding Format: EMB-SEQ

Process complete
1600 test instances retrieved

encodings_dim = (55, 300)

Evaluating ensembles A

Evaluating MEAN PROBABILITIES ENSEMBLE

Evaluating MAJORITY VOTING ENSEMBLE

Evaluating ensembles B

Evaluating MEAN PROBABILITIES ENSEMBLE

Evaluating MAJORITY VOTING ENSEMBLE



## TR

In [43]:
res_A, res_B = automaticEnsembleEvaluation('TR', 
                            './Results/ConvLSTM/TR/experiments_5f.df',
                            'ConvLSTM', 
                            n_classifiers=7)

Loading test data...
FastText 3 - Spanish Unannotated Corpora
Encoding Format: EMB-SEQ

Process complete
1600 test instances retrieved

encodings_dim = (55, 300)

Evaluating ensembles A

Evaluating MEAN PROBABILITIES ENSEMBLE

Evaluating MAJORITY VOTING ENSEMBLE

Evaluating ensembles B

Evaluating MEAN PROBABILITIES ENSEMBLE

Evaluating MAJORITY VOTING ENSEMBLE



## HTA

In [44]:
res_A, res_B = automaticEnsembleEvaluation('HTA', 
                            './Results/ConvLSTM/HTA/experiments_5f.df',
                            'ConvLSTM', 
                            n_classifiers=7)

Loading test data...
FastText 3 - Spanish Unannotated Corpora
Encoding Format: EMB-SEQ

Process complete
1600 test instances retrieved

encodings_dim = (55, 300)

Evaluating ensembles A

Evaluating MEAN PROBABILITIES ENSEMBLE

Evaluating MAJORITY VOTING ENSEMBLE

Evaluating ensembles B

Evaluating MEAN PROBABILITIES ENSEMBLE

Evaluating MAJORITY VOTING ENSEMBLE



In [83]:
res_A

Unnamed: 0,model_type,n_classifiers,architecture,HS_acc,HS_prec,HS_recall,HS_f1,AG_acc,AG_prec,AG_recall,AG_f1,TR_acc,TR_prec,TR_recall,TR_f1,F1_multi,EMR
0,MEAN-PROB,5,ConvLSTM,0.760625,0.753751,0.748662,0.750716,0.753125,0.71792,0.747639,0.725327,0.860625,0.825313,0.808343,0.816142,0.764062,0.68625
1,VOTING,5,ConvLSTM,0.759375,0.752143,0.748501,0.750053,0.753125,0.718474,0.748861,0.72581,0.861875,0.828422,0.806921,0.816595,0.764153,0.685


In [84]:
res_B

Unnamed: 0,model_type,n_classifiers,architecture,HS_acc,HS_prec,HS_recall,HS_f1,AG_acc,AG_prec,AG_recall,AG_f1,TR_acc,TR_prec,TR_recall,TR_f1,F1_multi,EMR
0,MEAN-PROB,5,ConvLSTM,0.77,0.764788,0.755738,0.758988,0.780625,0.741967,0.765956,0.750372,0.87,0.84199,0.813201,0.825792,0.778384,0.70125
1,VOTING,5,ConvLSTM,0.765625,0.758443,0.756077,0.757144,0.77375,0.737667,0.767179,0.746335,0.865,0.829741,0.817374,0.823197,0.775558,0.693125


In [90]:
getBestModelResults('HTA', 'ConvLSTM', ['G'])

Unnamed: 0,conf_id,model_type,architecture,HS_acc,HS_prec,HS_recall,HS_f1,AG_acc,AG_prec,AG_recall,AG_f1,TR_acc,TR_prec,TR_recall,TR_f1,F1_multi,EMR
7,OMbpCcoK,G,ConvLSTM-1,0.738125,0.729957,0.726128,0.727704,0.7375,0.69956,0.724933,0.706232,0.86125,0.843211,0.783022,0.805611,0.746516,0.664375
17,plRnsaUy,G,ConvLSTM-1,0.751875,0.744665,0.748211,0.746017,0.73,0.704761,0.740371,0.707693,0.86,0.821942,0.813976,0.817803,0.757171,0.669375
27,VSHFapYo,G,ConvLSTM-1,0.74625,0.738947,0.732592,0.734976,0.75375,0.712981,0.734646,0.720206,0.844375,0.803798,0.78594,0.794036,0.749739,0.671875
37,SgTMMCRk,G,ConvLSTM-1,0.74875,0.740917,0.742166,0.741497,0.738125,0.705127,0.73576,0.711144,0.845625,0.807612,0.782246,0.793303,0.748648,0.663125
47,elVbjZow,G,ConvLSTM-1,0.746875,0.741868,0.748243,0.742979,0.7275,0.709627,0.748979,0.708943,0.8375,0.789834,0.819128,0.801678,0.7512,0.65875
57,plRnsaUy,G,ConvLSTM-2,0.755625,0.750409,0.756818,0.751707,0.76,0.730184,0.766572,0.73691,0.853125,0.809122,0.822176,0.815177,0.767931,0.67125
67,VSHFapYo,G,ConvLSTM-2,0.750625,0.743281,0.738121,0.740167,0.764375,0.722514,0.740973,0.729431,0.851875,0.815664,0.791794,0.802335,0.757311,0.675625
77,OMbpCcoK,G,ConvLSTM-2,0.75375,0.746277,0.742585,0.744145,0.753125,0.718474,0.748861,0.72581,0.866875,0.835699,0.812591,0.822942,0.764299,0.683125
87,elVbjZow,G,ConvLSTM-2,0.740625,0.737766,0.744955,0.737699,0.72875,0.707677,0.745591,0.708676,0.8425,0.795984,0.808896,0.801948,0.749441,0.648125
97,ExiNtXSD,G,ConvLSTM-2,0.7625,0.754994,0.755448,0.755216,0.76625,0.729197,0.756964,0.73744,0.855,0.816117,0.805277,0.810402,0.767686,0.6825


# Resultados

## UTILS

In [70]:
from pandas import DataFrame

In [72]:
def getEnsemblesResultsSummary(task, ensemble_type):
  merged_results = DataFrame()
    
  for architecture in ['SNN','CNN','BiLSTM', 'ConvLSTM']:
    architecture_result = getEnsemblesResults(task, architecture, ensemble_type)
    merged_results=merged_results.append(architecture_result, ignore_index=True)

  return merged_results

In [73]:
def getEnsemblesResults(task, architecture, ensemble_type):
  if task=='HTA':
    sorting_metric='EMR'
  else:
    sorting_metric='f1-macro'

  if ensemble_type==1:
    results_list = [f'./Results/final/{architecture}-1_{task}_TEST.df',
                    f'./Results/final/{architecture}-2_{task}_TEST.df']

    results_df = results_utils.mergeDataFrames(results_list, '.delete_this.df')
    type_mask = results_df.model_type.isin(['E1','E2'])
    results_subset = results_df.loc[type_mask]

    return results_subset.sort_values(by=sorting_metric, ascending=False).iloc[0:1]

  if ensemble_type==2:

    results_list = [f'./Results/final/ensembles/{architecture}-1_{task}.df',
                    f'./Results/final/ensembles/{architecture}-2_{task}.df']

    results_df = results_utils.mergeDataFrames(results_list, '.delete_this.df') 

    return results_df.sort_values(by=sorting_metric, ascending=False).iloc[0:1]

## Ensambles de configuración única

In [75]:
getEnsemblesResultsSummary('HS',1)

Unnamed: 0,conf_id,model_type,architecture,acc,prec,recall,f1-macro
0,IMsHTOqQ,E2,SNN-1,0.7325,0.724161,0.719536,0.721354
1,DmScKBFk,E2,CNN-1,0.74125,0.736283,0.742553,0.737308
2,cwHZKRDu,E1,BiLSTM-2,0.744375,0.736278,0.735735,0.735999
3,gWDhCaka,E1,ConvLSTM-2,0.744375,0.73664,0.738894,0.737605


In [77]:
getEnsemblesResultsSummary('AG',1)

Unnamed: 0,conf_id,model_type,architecture,acc,prec,recall,f1-macro
0,yCKbOmtc,E2,SNN-2,0.789375,0.749492,0.726973,0.736079
1,lYkXiQZd,E2,CNN-2,0.826875,0.800489,0.768886,0.78152
2,Afxwglds,E1,BiLSTM-2,0.799375,0.764286,0.734078,0.745714
3,VNZvaMik,E2,ConvLSTM-2,0.8,0.760613,0.774224,0.766505


In [78]:
getEnsemblesResultsSummary('TR',1)

Unnamed: 0,conf_id,model_type,architecture,acc,prec,recall,f1-macro
0,wppcsrma,E2,SNN-2,0.84875,0.849021,0.74348,0.774527
1,oRGmNoZx,E2,CNN-2,0.86875,0.848793,0.799478,0.819137
2,QXXxwbcD,E1,BiLSTM-2,0.863125,0.865674,0.768395,0.799565
3,cEozDdue,E2,ConvLSTM-2,0.869375,0.840886,0.812776,0.825102


In [87]:
getEnsemblesResultsSummary('HTA',1).iloc[:,:3]

Unnamed: 0,conf_id,model_type,architecture
0,MYHWkrFV,E1,SNN-2
1,QXhDykIQ,E1,CNN-1
2,TilpLJKw,E1,BiLSTM-1
3,OMbpCcoK,E1,ConvLSTM-2


In [85]:
getEnsemblesResultsSummary('HTA',1).iloc[:,-2:]

Unnamed: 0,F1_multi,EMR
0,0.744945,0.69875
1,0.770453,0.705625
2,0.756923,0.7025
3,0.777656,0.715625


## Ensambles multiconfiguración

In [88]:
getEnsemblesResultsSummary('HS',2)

Unnamed: 0,model_type,n_classifiers,architecture,acc,prec,recall,f1-macro
0,VOTING,7,SNN,0.72625,0.71749,0.716473,0.716951
1,VOTING,7,CNN,0.74125,0.73322,0.734429,0.733781
2,VOTING,7,BiLSTM,0.721875,0.712946,0.711847,0.71236
3,VOTING,7,ConvLSTM,0.738125,0.738235,0.745761,0.736134


In [90]:
getEnsemblesResultsSummary('AG', 2)

Unnamed: 0,model_type,n_classifiers,architecture,acc,prec,recall,f1-macro
0,VOTING,7,SNN,0.7925,0.761951,0.710258,0.726496
1,MEAN-PROB,7,CNN,0.814375,0.789848,0.742903,0.759365
2,VOTING,7,BiLSTM,0.786875,0.750248,0.710538,0.724142
3,MEAN-PROB,7,ConvLSTM,0.800625,0.76085,0.766728,0.763628


In [91]:
getEnsemblesResultsSummary('TR', 2)

Unnamed: 0,model_type,n_classifiers,architecture,acc,prec,recall,f1-macro
0,VOTING,7,SNN,0.790625,0.731769,0.699429,0.71174
1,MEAN-PROB,7,CNN,0.868125,0.858177,0.787695,0.813306
2,VOTING,7,BiLSTM,0.85125,0.850371,0.748965,0.779662
3,VOTING,7,ConvLSTM,0.865,0.836782,0.803744,0.817856


In [92]:
getEnsemblesResultsSummary('HTA', 2)

Unnamed: 0,model_type,n_classifiers,architecture,HS_acc,HS_prec,HS_recall,HS_f1,AG_acc,AG_prec,AG_recall,AG_f1,TR_acc,TR_prec,TR_recall,TR_f1,F1_multi,EMR
0,VOTING,7,SNN,0.721875,0.764201,0.674839,0.67292,0.778125,0.743729,0.686608,0.702423,0.836875,0.867884,0.705119,0.739254,0.704866,0.689375
1,MEAN-PROB,7,CNN,0.75625,0.791858,0.716505,0.721665,0.81125,0.784799,0.740072,0.75588,0.855625,0.854558,0.757239,0.787705,0.755084,0.718125
2,MEAN-PROB,7,BiLSTM,0.743125,0.753963,0.710977,0.716179,0.778125,0.73395,0.720202,0.726158,0.856875,0.856773,0.758846,0.789543,0.74396,0.694375
3,MEAN-PROB,7,ConvLSTM,0.764375,0.758504,0.7505,0.753437,0.765625,0.727504,0.753466,0.735622,0.864375,0.834751,0.804834,0.817792,0.76895,0.6925


In [93]:
getEnsemblesResultsSummary('HTA',2).iloc[:,:3]

Unnamed: 0,model_type,n_classifiers,architecture
0,VOTING,7,SNN
1,MEAN-PROB,7,CNN
2,MEAN-PROB,7,BiLSTM
3,MEAN-PROB,7,ConvLSTM


In [94]:
getEnsemblesResultsSummary('HTA',2).iloc[:,-2:]

Unnamed: 0,F1_multi,EMR
0,0.704866,0.689375
1,0.755084,0.718125
2,0.74396,0.694375
3,0.76895,0.6925


# Ensambles de arquitectura mixta

In [38]:
import pickle
from pandas import DataFrame

def automaticMixEnsembleEvaluation(task):
  ## Load test data
  print('Loading test data...')
  X_test, Y_test = loadTestData(task, 'CNN')
  print()

  ## get configurations info
  configIds, evalMethods = getConfigsInfo(task)

  print('Evaluating mix architecture ensembles')
  results = evaluateMixClassifiersEnsemble(X_test, Y_test, task, configIds, evalMethods, 
                                verbose=False)  

  results_file = f'./Results/final/ensembles/mix-architecture-{task}.df'

  with open(results_file, 'wb') as file_handler:
    pickle.dump(results, file_handler)

  print()

  return results

### evaluateClassifiersEnsemble()

In [22]:

from pandas import DataFrame
from  tensorflow.keras.utils import to_categorical

def evaluateMixClassifiersEnsemble(X_test, Y_test, task, config_ids, eval_methods_list, 
                                verbose=False):  

  n_classifiers = len(config_ids)                                                                 

  evaluations_record = list()

  if task=='HTA':
    n_classes = 5
  else:
    n_classes = 2

  # we'll use classes_probs_sum and classes_votes_sum 
  # to generate two different ensembles 
  if task=='HTA':
    classes_probs_sum = np.zeros((len(Y_test),n_classes))
  else:
    classes_probs_sum = np.zeros((len(Y_test),1))

  classes_votes_sum = np.zeros((len(Y_test),n_classes))

  # process samples with the different classiffiers
  for config_ID, eval_method in zip(config_ids, eval_methods_list):
    weights_file = f'global_model_{eval_method}.hdf5'
    trained_model = loadPretrainedModel(config_ID, weights_file)

    # make predictions on X_test samples
    classes_probs = trained_model.predict(X_test)

    # turn the classes_probs into classes predictions
    labels_predictions_array = getClassesPredictions(classes_probs, task)
    
    classes_probs_sum += classes_probs
    classes_votes_sum += to_categorical(labels_predictions_array, num_classes=n_classes)
    
  # EVALUATE THE ENSEMBLES
  if verbose:
    print('\nEvaluating MEAN PROBABILITIES ENSEMBLE')

  # turn the accumulated probabilities into classes predictions
  labels_predictions_array = getClassesPredictions(classes_probs_sum/n_classifiers, task)

  # evaluate the preditions
  evaluation = evaluatePredictions(task, Y_test[task], labels_predictions_array)
  
  model_results_dict = {'model_type': 'MEAN-PROB',
                        'n_classifiers':n_classifiers,
                        'task': task,
                        **evaluation}

  evaluations_record.append(model_results_dict) 

  if verbose:
    print('\nEvaluating MAJORITY VOTING ENSEMBLE')

  # turn the classes votes into an array of classes predictions
  labels_predictions_array = np.array([classes_votes.argmax() for classes_votes in classes_votes_sum]).reshape(-1,1)

  # evaluate the preditions
  evaluation = evaluatePredictions(task, Y_test[task], labels_predictions_array)
  
  model_results_dict = {'model_type': 'VOTING',
                        'n_classifiers':n_classifiers,
                        'task': task,
                        **evaluation}

  evaluations_record.append(model_results_dict) 

  evaluations_results_df = DataFrame(evaluations_record)

  return evaluations_results_df

## utils

In [23]:
def getConfigsInfo(task):
  configIds = list()
  evalMethods = list()

  for architecture in ['CNN','BiLSTM','ConvLSTM']:
    config_ids, eval_methods = getBestModelsInfo(task, architecture, type_list=['G'])
    configIds.extend(config_ids)
    evalMethods.extend(eval_methods)
  
  return configIds, evalMethods

In [24]:
def getBestModelsInfo(task, architecture, type_list=['G']):
  if task=='HTA':
    sorting_metric='EMR'
  else:
    sorting_metric='f1-macro'

  results_list = [f'./Results/final/{architecture}-1_{task}_TEST.df',
                  f'./Results/final/{architecture}-2_{task}_TEST.df']

  results_df = results_utils.mergeDataFrames(results_list, '.delete_this.df') 

  type_mask = results_df.model_type.isin(type_list)
  best_models_info = results_df.loc[type_mask].sort_values(by=sorting_metric, ascending=False).iloc[:2]

  conf_ids = best_models_info.conf_id.to_list()
  architectures_list = best_models_info.architecture.to_list()
  select_methods_list = [mapToEvalMethod(arch) for arch in architectures_list]
  
  return conf_ids, select_methods_list

In [25]:
def mapToEvalMethod(architecture):
  if architecture[-1]=='1':
    return 'A'
  elif architecture[-1]=='2':
    return 'B'
  

In [39]:
automaticMixEnsembleEvaluation('HS')

Loading test data...
FastText 3 - Spanish Unannotated Corpora
Encoding Format: EMB-SEQ

Process complete
1600 test instances retrieved

encodings_dim = (55, 300)

Evaluating mix architecture ensembles



Unnamed: 0,model_type,n_classifiers,task,acc,prec,recall,f1-macro
0,MEAN-PROB,6,HS,0.760625,0.753183,0.755206,0.754081
1,VOTING,6,HS,0.75625,0.749054,0.744487,0.746356


In [13]:
getBestModelResults('HS','SNN')

(     conf_id model_type architecture       acc      prec    recall  f1-macro
 37  IYwHpkvM          G        SNN-1  0.728125  0.720030  0.712653  0.715188
 67  WTzJBbAA          G        SNN-2  0.719375  0.711455  0.714233  0.712524,
 ['IYwHpkvM', 'WTzJBbAA'],
 ['A', 'B'])