# Ajustes iniciales

## Conexión a google drive

In [1]:
# Mount Google Drive at /content/drive so the files stored there are reachable from this runtime.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
# Work from the notebooks folder: the relative ./text_models, ./trained_models and ./Results paths used below are resolved from here.
%cd ./drive/MyDrive/Colab Notebooks/

/content/drive/MyDrive/Colab Notebooks


In [3]:
# Show the current working directory to confirm the directory change took effect.
%pwd

'/content/drive/MyDrive/Colab Notebooks'

## Instalar microTC

In [4]:
# Install microtc from the felipeRmBr fork (svc_utils depends on it).
# NOTE(review): no tag or commit is pinned, so a later run may install a different version than the 2.2.8 recorded in the output below.
%pip install git+https://github.com/felipeRmBr/microtc.git#egg=microtc

Collecting microtc
  Cloning https://github.com/felipeRmBr/microtc.git to /tmp/pip-install-xehaahae/microtc
  Running command git clone -q https://github.com/felipeRmBr/microtc.git /tmp/pip-install-xehaahae/microtc
Building wheels for collected packages: microtc
  Building wheel for microtc (setup.py) ... done
  Created wheel for microtc: filename=microtc-2.2.8-cp37-none-any.whl size=60798 sha256=64d2cabfe411c88c52a7ba01fa2e41c4782955f4d9ed562e1d1acb7ac5e7299d
  Stored in directory: /tmp/pip-ephem-wheel-cache-q1qicgpg/wheels/73/00/4e/a4e2fc519599a41145f1740a75c80f390d4bcef72fed066f56
Successfully built microtc
Installing collected packages: microtc
Successfully installed microtc-2.2.8


## Importar módulos *_utils


In [5]:
import sys
# Extend the import path with the Drive notebooks folder (presumably where the my_utils package lives -- confirm).
sys.path.append('/content/drive/My Drive/Colab Notebooks/')
from my_utils import dataset_utils
from my_utils import eval_utils
from my_utils import svc_utils #(depends on microtc)

In [6]:
import pickle

# Utils

## evaluateClassifiersOnTest()

In [19]:
def evaluateClassifiersOnTest(task, conf_ids, tm_ids, kernel, n_folds=7):
  """Evaluate every (configuration, text model) pair on the test data and pickle the merged results.

  input
  task      - str, label column of Y_test used as target (e.g. 'HS', 'TR', 'AG', 'HTA')
  conf_ids  - iterable of str, ids of the trained-classifier folders
  tm_ids    - iterable of str, ids of the text-model folders; paired with conf_ids by position
  kernel    - str, kernel label used in the architecture name and in the results file name
  n_folds   - int, number of fold models stored per configuration (default 7)

  output
  None. The merged results DataFrame is written to
  ./Results/final/SVC-{kernel}_{task}_TEST.df (the folder is created when missing).
  """
  # Local imports keep the function usable no matter which other cells were executed first.
  from os import makedirs, path
  from pandas import DataFrame, concat

  print('Loading test data')
  X_test, Y_test = dataset_utils.importTestDataForSVM()
  X_test = X_test['text'].to_list()

  print('Evaluating classifiers...')
  print()
  per_config_results = []
  for config_ID, tm_ID in zip(conf_ids, tm_ids):
    print(f'config_ID: {config_ID},  tm_ID: {tm_ID}')

    per_config_results.append(
        evaluateTrainedModels(X_test, Y_test, task, config_ID, tm_ID, f'SVC-{kernel}', n_folds))

  # DataFrame.append was removed in pandas 2.0 and copied the accumulated frame on
  # every iteration; collect the partial frames and concatenate them once instead.
  if per_config_results:
    merged_results = concat(per_config_results, ignore_index=True)
  else:
    merged_results = DataFrame()

  results_file = f'./Results/final/SVC-{kernel}_{task}_TEST.df'
  # Avoid losing a full evaluation run just because the output folder does not exist yet.
  makedirs(path.dirname(results_file), exist_ok=True)
  with open(results_file, 'wb') as file_handler:
    pickle.dump(merged_results, file_handler)

  print()
  print(f'Results saved to: {results_file}')

### evaluateTrainedModels()

In [15]:

from pandas import DataFrame
import numpy as np
from os import path, makedirs
from  tensorflow.keras.utils import to_categorical

def _loadPickle(file_path):
  """Return the object stored in the pickle file at file_path."""
  # NOTE: unpickling can execute arbitrary code; only point this at files produced by this project.
  with open(file_path, 'rb') as file_handler:
    return pickle.load(file_handler)


def _buildResultRow(config_ID, tm_ID, model_type, arch_label, evaluation):
  """Merge the identifying fields of a model with its metrics into one results row."""
  return {'conf_id': config_ID,
          'tm_ID': tm_ID,
          'model_type': model_type,
          'architecture': arch_label,
          **evaluation}


def evaluateTrainedModels(X_test, Y_test, task, 
                          config_ID, tm_ID, arch_label, n_folds,
                          verbose=False):
  """Evaluate the fold models, the full-dataset model and their voting ensemble on the test data.

  input
  X_test      - test messages, passed unchanged to text_model.transform
  Y_test      - test labels; Y_test[task] is used as target and len(Y_test) as sample count
  task        - str, 'HTA' is treated as a 5-class problem, any other task as 2-class
  config_ID   - str, folder under ./trained_models holding F{fold}.svc and G.svc
  tm_ID       - str, folder under ./text_models holding F{fold}.tm and G.tm
  arch_label  - str, stored in the 'architecture' column of every row
  n_folds     - int, number of fold models to load (F0 .. F{n_folds-1})
  verbose     - bool, print progress messages

  output
  pandas.DataFrame with one row per fold model (model_type 'F'), one row for the
  full-dataset model ('G') and one row for the majority-voting ensemble of the
  fold models ('E1'), in that order.
  """
  n_classes = 5 if task == 'HTA' else 2
  target_labels = Y_test[task]

  evaluations_record = []

  # classes_votes_sum[i, c] counts how many fold classifiers predicted class c for sample i
  classes_votes_sum = np.zeros((len(Y_test), n_classes))
  sample_positions = np.arange(len(Y_test))

  # EVALUATE THE FOLDS CLASSIFIERS ---------------------------------------------
  for fold_idx in range(n_folds):
    if verbose:
      print('\nEvaluating data-fold {}'.format(fold_idx))

    text_model = _loadPickle(f'./text_models/{tm_ID}/F{fold_idx}.tm')
    classifier = _loadPickle(f'./trained_models/{config_ID}/F{fold_idx}.svc')

    # transform X_test using the fitted text_model and predict
    if verbose:
      print("Transforming the test messages")
    labels_predictions = classifier.predict(text_model.transform(X_test))

    # One vote per sample for the predicted class. This is the same one-hot sum
    # keras' to_categorical produced, without needing tensorflow for it.
    predicted_classes = np.asarray(labels_predictions, dtype=int).reshape(-1)
    classes_votes_sum[sample_positions, predicted_classes] += 1

    evaluation = evaluatePredictions(task, target_labels, labels_predictions)
    evaluations_record.append(
        _buildResultRow(config_ID, tm_ID, 'F', arch_label, evaluation))

  # EVALUATE THE FULL-DATASET MODEL --------------------------------------------
  if verbose:
    # The original message called .format(fold_idx) here, which raised a NameError
    # when n_folds was 0; the message has no placeholder, so no formatting is needed.
    print('\nEvaluating full-dataset model')

  text_model = _loadPickle(f'./text_models/{tm_ID}/G.tm')
  classifier = _loadPickle(f'./trained_models/{config_ID}/G.svc')

  if verbose:
    print("Transforming the test messages")
  labels_predictions = classifier.predict(text_model.transform(X_test))

  evaluation = evaluatePredictions(task, target_labels, labels_predictions)
  evaluations_record.append(
      _buildResultRow(config_ID, tm_ID, 'G', arch_label, evaluation))

  # EVALUATE THE ENSEMBLE ------------------------------------------------------
  if verbose:
    print('\nEvaluating MAJORITY VOTING ENSEMBLE')

  # Most voted class per sample (ties resolve to the lowest class index), kept as
  # a column vector like the original implementation.
  class_pred_array = classes_votes_sum.argmax(axis=1).reshape(-1, 1)

  evaluation = evaluatePredictions(task, target_labels, class_pred_array)
  evaluations_record.append(
      _buildResultRow(config_ID, tm_ID, 'E1', arch_label, evaluation))

  return DataFrame(evaluations_record)

### evaluatePredictions()

In [16]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

def evaluatePredictions(task, val_labels, pred_labels, verbose=False):
  """Compute the evaluation metrics for one set of predictions.

  input
  task         - str, 'HTA' for the joint five-class labels or one of 'HS', 'TR', 'AG'
  val_labels   - target labels
  pred_labels  - predicted labels
  verbose      - bool, print a short summary of the computed metrics

  output
  dict with the metrics. For 'HTA' the five-class labels are first split into the
  HS/TR/AG dimensions and the dict holds '<DIM>_acc', '<DIM>_prec', '<DIM>_recall',
  '<DIM>_f1' for HS, AG and TR plus 'F1_multi' (mean of the three F1 values) and
  'EMR'. For 'HS', 'TR' and 'AG' the keys are 'acc', 'prec', 'recall', 'f1-macro'.

  raises
  ValueError when task is not one of the supported values.
  """
  if task=='HTA':

    # get the corresponding predicted and val (target) labels for each dimension
    pred_HS_labels, pred_TR_labels, pred_AG_labels = dataset_utils.getLabelsPerTask(pred_labels)
    val_HS_labels, val_TR_labels, val_AG_labels = dataset_utils.getLabelsPerTask(val_labels)

    # compute ACC, PREC, RECALL and F1 metrics
    HS_acc, HS_prec, HS_recall, HS_f1 = compute_metrics(val_HS_labels, pred_HS_labels)
    AG_acc, AG_prec, AG_recall, AG_f1 = compute_metrics(val_AG_labels, pred_AG_labels)   
    TR_acc, TR_prec, TR_recall, TR_f1 = compute_metrics(val_TR_labels, pred_TR_labels)

    F1_multi = (HS_f1+ AG_f1 + TR_f1)/3

    # a sample is an exact match only when all three dimensions are predicted correctly
    EMR = computeEMR(list(zip(val_HS_labels, val_TR_labels, val_AG_labels)),
                    list(zip(pred_HS_labels, pred_TR_labels, pred_AG_labels)))

    results_dict = {'HS_acc':HS_acc,
                    'HS_prec':HS_prec,
                    'HS_recall':HS_recall,
                    'HS_f1':HS_f1,
                    'AG_acc':AG_acc,
                    'AG_prec':AG_prec,
                    'AG_recall':AG_recall,
                    'AG_f1':AG_f1,
                    'TR_acc':TR_acc,
                    'TR_prec':TR_prec,
                    'TR_recall':TR_recall,
                    'TR_f1':TR_f1,
                    'F1_multi':F1_multi,
                    'EMR':EMR}

    if verbose:
      print('EMR = ', EMR)
      print('F1_multi = ', F1_multi)
      print()

    return results_dict

  if task in ['HS', 'TR', 'AG']:
    # compute ACC, PREC, RECALL and F1 metrics
    acc, prec, recall, f1_macro = compute_metrics(val_labels, pred_labels)

    results_dict = {'acc':acc,
                    'prec':prec,
                    'recall':recall,
                    'f1-macro':f1_macro}

    if verbose:
      print('Acc = ', acc)
      print('F1_macro = ', f1_macro)
      print()

    return results_dict

  # An unknown task used to fall through and return None, which only surfaced later
  # as an obscure TypeError when the caller expanded the result with **.
  raise ValueError(
      f"Unsupported task {task!r}; expected one of 'HTA', 'HS', 'TR', 'AG'")

def compute_metrics(target, predicted):
    """Return (accuracy, precision, recall, f1) for the given labels.

    Precision, recall and F1 are computed with average="macro".
    """
    accuracy = accuracy_score(target, predicted)

    # The three remaining scores share the same call signature; evaluate them in
    # the order precision -> recall -> f1.
    macro_scorers = (precision_score, recall_score, f1_score)
    precision, recall, f1 = (
        scorer(target, predicted, average="macro") for scorer in macro_scorers
    )

    return accuracy, precision, recall, f1

def computeEMR(test_labels, pred_labels):
  """Return the exact match ratio: matching (gold, pred) pairs divided by len(test_labels).

  The pairs are compared with ==, position by position.
  """
  total_instances = len(test_labels)
  exact_match_count = sum(1 for gold, pred in zip(test_labels, pred_labels)
                          if gold == pred)
  return exact_match_count/total_instances


### labels_utils

In [17]:
#**************************     getLabelsPerTask()    **************************
def getLabelsPerTask(HTA_labels):
    """Split five-class labels into separate HS, TR and AG label columns.

    Every label is converted with mapTo3DimsFormat; the result is a tuple
    (HS_labels, TR_labels, AG_labels) of numpy arrays reshaped to (-1, 1).
    """
    triples = [(hs, tr, ag) for hs, tr, ag in map(mapTo3DimsFormat, HTA_labels)]

    def _as_column(position):
        # gather one dimension of every triple into a column vector
        return np.array([triple[position] for triple in triples]).reshape(-1,1)

    return (_as_column(0), _as_column(1), _as_column(2))

#**************************     mapTo3DimsFormat()    ************************** 
def mapTo3DimsFormat(AB_label):
  '''
  Translate a five-class label into its (HS, TR, AG) triple.

    0 -> (0,0,0)
    1 -> (1,0,0)
    2 -> (1,0,1)
    3 -> (1,1,0)
    4 -> (1,1,1)

  input:
  AB_label - label in five-class format, compared with == against 0..4

  output:
  tuple of three ints, or None when the label matches none of the five classes
  '''
  # The position of each triple is the class code it stands for. The codes are
  # tested one by one with == (instead of a dict lookup) so any label that
  # supports == against an int keeps working.
  triples_by_code = ((0,0,0), (1,0,0), (1,0,1), (1,1,0), (1,1,1))
  for code, triple in enumerate(triples_by_code):
    if AB_label == code:
      return triple
  return None


# EVALUATE TRAINED MODELS ON TEST DATA

## HS - linear kernel

In [20]:
# HS task, linear kernel: each configuration id is evaluated with the text-model id at the same position.
configurations_ids = ['GQZgqx', 'hwmIFg', 'tgCmGg', 'fwZpdd', 'wavzCG']
textmodels_ids = ['CgOJHi', 'bVkhBU', 'wxNaSh', 'WHTRQA', 'iTLnGg']

evaluateClassifiersOnTest(task='HS',
                          conf_ids=configurations_ids,
                          tm_ids=textmodels_ids,
                          kernel='LINEAR')

Loading test data
Evaluating classifiers...

config_ID: GQZgqx,  tm_ID: CgOJHi
config_ID: hwmIFg,  tm_ID: bVkhBU
config_ID: tgCmGg,  tm_ID: wxNaSh
config_ID: fwZpdd,  tm_ID: WHTRQA
config_ID: wavzCG,  tm_ID: iTLnGg

Results saved to: ./Results/final/SVC-LINEAR_HS_TEST.df


## HS - sigmoid kernel

In [21]:
# HS task, sigmoid kernel: each configuration id is evaluated with the text-model id at the same position.
configurations_ids = ['bKgjxA', 'ddJgKe', 'qnVrDo', 'ZSDGIu', 'eJPzqU']
textmodels_ids = ['fabKkA', 'pRduGU', 'SoAVcr', 'bcYlZN', 'Nilquo']

evaluateClassifiersOnTest(task='HS',
                          conf_ids=configurations_ids,
                          tm_ids=textmodels_ids,
                          kernel='SIGMOID')

Loading test data
Evaluating classifiers...

config_ID: bKgjxA,  tm_ID: fabKkA
config_ID: ddJgKe,  tm_ID: pRduGU
config_ID: qnVrDo,  tm_ID: SoAVcr
config_ID: ZSDGIu,  tm_ID: bcYlZN
config_ID: eJPzqU,  tm_ID: Nilquo

Results saved to: ./Results/final/SVC-SIGMOID_HS_TEST.df


## TR - linear kernel

In [22]:
# TR task, linear kernel: each configuration id is evaluated with the text-model id at the same position.
configurations_ids = ['OFznBK', 'peKOTV', 'NnIIjU', 'PFWLCY', 'cqPNUg']
textmodels_ids = ['pNqujl', 'MWagAt', 'yhmmRG', 'aiCCPb', 'qRFfIc']

evaluateClassifiersOnTest(task='TR',
                          conf_ids=configurations_ids,
                          tm_ids=textmodels_ids,
                          kernel='LINEAR')

Loading test data
Evaluating classifiers...

config_ID: OFznBK,  tm_ID: pNqujl
config_ID: peKOTV,  tm_ID: MWagAt
config_ID: NnIIjU,  tm_ID: yhmmRG
config_ID: PFWLCY,  tm_ID: aiCCPb
config_ID: cqPNUg,  tm_ID: qRFfIc

Results saved to: ./Results/final/SVC-LINEAR_TR_TEST.df


## TR - sigmoid kernel

In [23]:
# TR task, sigmoid kernel: each configuration id is evaluated with the text-model id at the same position.
configurations_ids = ['ATGBKd', 'AjLonL', 'PDNKQY', 'ulFaFM', 'RVYzWK']
textmodels_ids = ['vKQhlC', 'tnRmpK', 'uIjYYW', 'xzUkgN', 'rdPtoi']

evaluateClassifiersOnTest(task='TR',
                          conf_ids=configurations_ids,
                          tm_ids=textmodels_ids,
                          kernel='SIGMOID')

Loading test data
Evaluating classifiers...

config_ID: ATGBKd,  tm_ID: vKQhlC
config_ID: AjLonL,  tm_ID: tnRmpK
config_ID: PDNKQY,  tm_ID: uIjYYW
config_ID: ulFaFM,  tm_ID: xzUkgN
config_ID: RVYzWK,  tm_ID: rdPtoi

Results saved to: ./Results/final/SVC-SIGMOID_TR_TEST.df


## AG - linear kernel

In [24]:
# AG task, linear kernel: each configuration id is evaluated with the text-model id at the same position.
configurations_ids = ['wJPAHW', 'ejjQUP', 'ahlXxo', 'wxHotu', 'naYSFx']
textmodels_ids = ['JopFMW', 'OqAhDb', 'NflZGL', 'lrdGCb', 'lQVifv']

evaluateClassifiersOnTest(task='AG',
                          conf_ids=configurations_ids,
                          tm_ids=textmodels_ids,
                          kernel='LINEAR')

Loading test data
Evaluating classifiers...

config_ID: wJPAHW,  tm_ID: JopFMW
config_ID: ejjQUP,  tm_ID: OqAhDb
config_ID: ahlXxo,  tm_ID: NflZGL
config_ID: wxHotu,  tm_ID: lrdGCb
config_ID: naYSFx,  tm_ID: lQVifv

Results saved to: ./Results/final/SVC-LINEAR_AG_TEST.df


## AG - sigmoid kernel

In [25]:
# AG task, sigmoid kernel: each configuration id is evaluated with the text-model id at the same position.
configurations_ids = ['cpagTE', 'jlugvH', 'GMUaEy', 'bwiAzB', 'kMJPac']
textmodels_ids = ['bOsLGf', 'TCbkrJ', 'OKbEvO', 'iIVaiv', 'sJCvyU']

evaluateClassifiersOnTest(task='AG',
                          conf_ids=configurations_ids,
                          tm_ids=textmodels_ids,
                          kernel='SIGMOID')

Loading test data
Evaluating classifiers...

config_ID: cpagTE,  tm_ID: bOsLGf
config_ID: jlugvH,  tm_ID: TCbkrJ
config_ID: GMUaEy,  tm_ID: OKbEvO
config_ID: bwiAzB,  tm_ID: iIVaiv
config_ID: kMJPac,  tm_ID: sJCvyU

Results saved to: ./Results/final/SVC-SIGMOID_AG_TEST.df


## HTA - linear kernel

In [26]:
# HTA task, linear kernel: each configuration id is evaluated with the text-model id at the same position.
configurations_ids = ['eKpEEc', 'lFupON', 'XulowM', 'JonAxF', 'Etgzct']
textmodels_ids = ['wKlLLQ', 'pTurDN', 'HsOAlr', 'CmFmfJ', 'YXTXdt']

evaluateClassifiersOnTest(task='HTA',
                          conf_ids=configurations_ids,
                          tm_ids=textmodels_ids,
                          kernel='LINEAR')

Loading test data
Evaluating classifiers...

config_ID: eKpEEc,  tm_ID: wKlLLQ


  _warn_prf(average, modifier, msg_start, len(result))


config_ID: lFupON,  tm_ID: pTurDN
config_ID: XulowM,  tm_ID: HsOAlr
config_ID: JonAxF,  tm_ID: CmFmfJ
config_ID: Etgzct,  tm_ID: YXTXdt

Results saved to: ./Results/final/SVC-LINEAR_HTA_TEST.df


## HTA - sigmoid kernel

In [27]:
# HTA task, sigmoid kernel: each configuration id is evaluated with the text-model id at the same position.
configurations_ids = ['KAcOYq', 'bmCbAB', 'pcPfNL', 'wNOXbH', 'hVEaUj']
textmodels_ids = ['djwYeF', 'TqHwoB', 'sUlSKy', 'CgdcXu', 'oDWZfJ']

evaluateClassifiersOnTest(task='HTA',
                          conf_ids=configurations_ids,
                          tm_ids=textmodels_ids,
                          kernel='SIGMOID')

Loading test data
Evaluating classifiers...

config_ID: KAcOYq,  tm_ID: djwYeF
config_ID: bmCbAB,  tm_ID: TqHwoB
config_ID: pcPfNL,  tm_ID: sUlSKy
config_ID: wNOXbH,  tm_ID: CgdcXu
config_ID: hVEaUj,  tm_ID: oDWZfJ

Results saved to: ./Results/final/SVC-SIGMOID_HTA_TEST.df


# EVALUATE TRAINED MODELS ON VALIDATION DATA

## Utils

### evaluateTrainedModelsOnValidationData()

In [None]:

from pandas import DataFrame
import numpy as np
from os import path, makedirs
from  tensorflow.keras.utils import to_categorical

def evaluateTrainedModelsOnValidationData(X_train, Y_train, task, 
                                          config_ID, tm_ID, arch_label, n_folds):
  """Evaluate every fold model on the validation part of its own data split.

  input
  X_train, Y_train - train data, forwarded to getDataSplits
  task             - str, task forwarded to getDataSplits and evaluatePredictions
  config_ID        - str, folder under ./trained_models holding F{fold}.svc
  tm_ID            - str, folder under ./text_models holding F{fold}.tm
  arch_label       - str, stored in the 'architecture' column of every row
  n_folds          - int, number of folds / fold models

  output
  pandas.DataFrame with one row (model_type 'F') per fold.
  """
  splits_per_fold = getDataSplits(X_train, Y_train, task, n_folds)

  def _unpickle(file_path):
    with open(file_path, 'rb') as file_handler:
      return pickle.load(file_handler)

  result_rows = []

  for fold_idx in range(n_folds):
    print('\nEvaluating data-fold {}'.format(fold_idx))

    # fitted text model and classifier of this fold
    text_model = _unpickle(f'./text_models/{tm_ID}/F{fold_idx}.tm')
    classifier = _unpickle(f'./trained_models/{config_ID}/F{fold_idx}.svc')

    print("Transforming the validation messages")
    # only the validation half of the split is needed here
    _, _, val_messages, val_targets = splits_per_fold[fold_idx]

    predicted_targets = classifier.predict(text_model.transform(val_messages))
    fold_metrics = evaluatePredictions(task, val_targets, predicted_targets)

    result_rows.append({'conf_id': config_ID,
                        'tm_ID': tm_ID,
                        'model_type': 'F',
                        'architecture': arch_label,
                        **fold_metrics})

  return DataFrame(result_rows)

### getDataSplits()

In [None]:
def getDataSplits(X_train, Y_train, task, n_folds):
  '''
  Build one (x_train, y_train, x_val, y_val) tuple of plain lists per fold.

  input
  X_train  - pd.DataFrame with (at least) the columns 'text' and 'kfold'
  Y_train  - pd.DataFrame with (at least) the column named by task and 'kfold'
  task     - str, name of the label column of Y_train to extract
  n_folds  - int, number of folds; fold ids 0 .. n_folds-1 are looked up in 'kfold'

  output
  list with n_folds tuples (x_train, y_train, x_val, y_val)

  NOTE(review): x_train/y_train hold the rows whose kfold EQUALS the fold id and
  x_val/y_val hold all remaining rows, so the "validation" part is the larger
  one. Confirm this matches how the fold models were trained.
  NOTE(review): the first mask comes from X_train.kfold and the second from
  Y_train.kfold, so both frames are presumably index-aligned -- verify.
  '''
  def _pick(frame, row_mask, column):
    # selected rows of one column as a plain Python list
    return frame.loc[row_mask, column].to_list()

  splits = []

  for fold_id in range(n_folds):
    in_fold = X_train.kfold == fold_id
    fold_texts = _pick(X_train, in_fold, 'text')
    fold_labels = _pick(Y_train, in_fold, task)

    out_of_fold = Y_train.kfold != fold_id
    splits.append((fold_texts,
                   fold_labels,
                   _pick(X_train, out_of_fold, 'text'),
                   _pick(Y_train, out_of_fold, task)))

  return splits


### evaluatePredictions()

In [None]:
from sklearn.metrics import accuracy_score, f1_score

def evaluatePredictions(task, val_labels, pred_labels, verbose=True):
  """Compute accuracy / macro-F1 metrics for one set of validation predictions.

  input
  task         - str, 'HTA' for the joint five-class labels or 'HS'
  val_labels   - target labels
  pred_labels  - predicted labels
  verbose      - bool, print a short summary (default True, the previous behaviour)

  output
  dict with the metrics. For 'HTA' the labels are first split into the HS/TR/AG
  dimensions and the keys are 'A_acc', 'B1_acc', 'B2_acc', 'A1_f1', 'B1_f1',
  'B2_f1', 'F1_multi' and 'EMR' (A = HS, B1 = TR, B2 = AG). For 'HS' the keys
  are 'A_acc' and 'A1_f1'.

  raises
  ValueError when task is neither 'HTA' nor 'HS'.
  """
  if task=='HTA':

    # get the corresponding predicted and val (target) labels for each dimension
    pred_HS_labels, pred_TR_labels, pred_AG_labels = dataset_utils.getLabelsPerTask(pred_labels)
    val_HS_labels, val_TR_labels, val_AG_labels = dataset_utils.getLabelsPerTask(val_labels)

    # compute the different metrics
    A_acc = accuracy_score(val_HS_labels, pred_HS_labels)
    B1_acc = accuracy_score(val_TR_labels, pred_TR_labels)
    B2_acc = accuracy_score(val_AG_labels, pred_AG_labels)

    A_f1 = f1_score(val_HS_labels, pred_HS_labels, average="macro")
    B1_f1 = f1_score(val_TR_labels, pred_TR_labels, average="macro")
    B2_f1 = f1_score(val_AG_labels, pred_AG_labels, average="macro")

    F1_multi = (A_f1 + B1_f1 + B2_f1)/3

    # a sample is an exact match only when all three dimensions are predicted correctly
    EMR = computeEMR(list(zip(val_HS_labels, val_TR_labels, val_AG_labels)),
                    list(zip(pred_HS_labels, pred_TR_labels, pred_AG_labels)))

    # NOTE: the key 'A1_f1' (not 'A_f1') is kept on purpose; saved result files use it.
    results_dict = {'A_acc':A_acc,
                    'B1_acc':B1_acc,
                    'B2_acc':B2_acc,
                    'A1_f1':A_f1,
                    'B1_f1':B1_f1,
                    'B2_f1':B2_f1,
                    'F1_multi':F1_multi,
                    'EMR':EMR}

    if verbose:
      print('EMR = ', EMR)
      print('F1_multi = ', F1_multi)
      print()

    return results_dict

  if task=='HS':

    # compute the different metrics
    A_acc = accuracy_score(val_labels, pred_labels)
    A_f1 = f1_score(val_labels, pred_labels, average="macro")

    results_dict = {'A_acc':A_acc,
                    'A1_f1':A_f1}

    if verbose:
      print('Acc = ', A_acc)
      print('F1_macro = ', A_f1)
      print()

    return results_dict

  # Any other task used to fall through and return None, which only surfaced later
  # as an obscure TypeError when the caller expanded the result with **.
  raise ValueError(f"Unsupported task {task!r}; expected 'HTA' or 'HS'")
    

def computeEMR(test_labels, pred_labels):
  """Exact match ratio: share of positions where the gold and predicted entries are equal.

  The denominator is len(test_labels); entries are compared with ==.
  """
  total_instances = len(test_labels)
  matches = [True for gold, pred in zip(test_labels, pred_labels) if gold == pred]
  return len(matches)/total_instances

### labels_utils

In [None]:
#**************************     getLabelsPerTask()    **************************
def getLabelsPerTask(HTA_labels):
    """Split five-class labels into separate HS, TR and AG label columns.

    Each label goes through mapTo3DimsFormat; the three dimensions are returned
    as a tuple (HS_labels, TR_labels, AG_labels) of numpy arrays reshaped to (-1, 1).
    """
    per_sample = [(hs, tr, ag) for hs, tr, ag in map(mapTo3DimsFormat, HTA_labels)]

    HS_labels, TR_labels, AG_labels = (
        np.array([sample[dimension] for sample in per_sample]).reshape(-1,1)
        for dimension in range(3)
    )

    return (HS_labels, TR_labels, AG_labels)

#**************************     mapTo3DimsFormat()    ************************** 
def mapTo3DimsFormat(AB_label):
  '''
  Translate a five-class label into its (HS, TR, AG) triple.

    0 -> (0,0,0)
    1 -> (1,0,0)
    2 -> (1,0,1)
    3 -> (1,1,0)
    4 -> (1,1,1)

  input:
  AB_label - label in five-class format, compared with == against 0..4

  output:
  tuple of three ints, or None when the label matches none of the five classes
  '''
  # (class code, triple) pairs, checked in ascending code order with == so that
  # any label supporting == against an int is accepted.
  code_to_triple = (
      (0, (0,0,0)),
      (1, (1,0,0)),
      (2, (1,0,1)),
      (3, (1,1,0)),
      (4, (1,1,1)),
  )
  for code, triple in code_to_triple:
    if AB_label == code:
      return triple
  return None


## HS - linear kernel

In [None]:
# Load the test data and keep only the 'text' column of X_test as a plain list.
# Y_test is left untouched because evaluateTrainedModels selects the task column itself (Y_test[task]).
X_test, Y_test = dataset_utils.importTestDataForSVM()
X_test = X_test['text'].to_list()
# Y_test = Y_test['HTA'].to_list()

In [None]:
# Each configuration id is evaluated with the text-model id at the same position.
configurations_ids = ['GQZgqx', 'hwmIFg', 'tgCmGg', 'fwZpdd', 'wavzCG']
textmodels_ids = ['CgOJHi', 'bVkhBU', 'wxNaSh', 'WHTRQA', 'iTLnGg']

# DataFrame.append was removed in pandas 2.0 and copied the accumulated frame on
# every iteration; collect the partial results and concatenate them once instead.
from pandas import concat

partial_results = []
for config_ID, tm_ID in zip(configurations_ids, textmodels_ids):
  print(f'config_ID: {config_ID},  tm_ID: {tm_ID}\n')

  partial_results.append(evaluateTrainedModels(X_test, Y_test, 'HS', config_ID, tm_ID, 'SVC-HS', 7))

merged_results = concat(partial_results, ignore_index=True)

with open('./Results/SVC_HS_batch_1.df', 'wb') as file_handler:
  pickle.dump(merged_results, file_handler)

config_ID: GQZgqx,  tm_ID: CgOJHi


Evaluating data-fold 0
Transforming the test messages
Acc =  0.689375
F1_macro =  0.6797771970920612


Evaluating data-fold 1
Transforming the test messages
Acc =  0.715
F1_macro =  0.6996240360055925


Evaluating data-fold 2
Transforming the test messages
Acc =  0.70875
F1_macro =  0.7031374422678771


Evaluating data-fold 3
Transforming the test messages
Acc =  0.711875
F1_macro =  0.6976542379439833


Evaluating data-fold 4
Transforming the test messages
Acc =  0.709375
F1_macro =  0.6941482419073639


Evaluating data-fold 5
Transforming the test messages
Acc =  0.725
F1_macro =  0.7116720400513747


Evaluating data-fold 6
Transforming the test messages
Acc =  0.70875
F1_macro =  0.6988569812997294


Evaluating full-dataset model
Transforming the test messages
Acc =  0.7225
F1_macro =  0.7207683273299572


Evaluating MAJORITY VOTING ENSEMBLE
Acc =  0.753125
F1_macro =  0.7401904420503413

config_ID: hwmIFg,  tm_ID: bVkhBU


Evaluating data-fold 0


## HS - sigmoid kernel

In [None]:
# Load the test data and keep only the 'text' column of X_test as a plain list.
# Y_test is left untouched because evaluateTrainedModels selects the task column itself (Y_test[task]).
X_test, Y_test = dataset_utils.importTestDataForSVM()
X_test = X_test['text'].to_list()
# Y_test = Y_test['HTA'].to_list()

In [None]:
# Each configuration id is evaluated with the text-model id at the same position.
configurations_ids = ['bKgjxA', 'ddJgKe', 'qnVrDo', 'ZSDGIu', 'eJPzqU']
textmodels_ids = ['fabKkA', 'pRduGU', 'SoAVcr', 'bcYlZN', 'Nilquo']

# DataFrame.append was removed in pandas 2.0 and copied the accumulated frame on
# every iteration; collect the partial results and concatenate them once instead.
from pandas import concat

partial_results = []
for config_ID, tm_ID in zip(configurations_ids, textmodels_ids):
  print(f'config_ID: {config_ID},  tm_ID: {tm_ID}\n')

  partial_results.append(evaluateTrainedModels(X_test, Y_test, 'HS', config_ID, tm_ID, 'SVC-HS', 7))

merged_results = concat(partial_results, ignore_index=True)

with open('./Results/SVC_HS_batch_2.df', 'wb') as file_handler:
  pickle.dump(merged_results, file_handler)

## HTA - linear kernel

In [None]:
# Load the train data; both frames are handed to getDataSplits, which reads their 'kfold' columns.
X_train, Y_train = dataset_utils.importTrainDataForSVM()

In [None]:
# Each configuration id is evaluated with the text-model id at the same position.
configurations_ids = ['eKpEEc', 'lFupON', 'XulowM', 'JonAxF', 'Etgzct']
textmodels_ids = ['wKlLLQ', 'pTurDN', 'HsOAlr', 'CmFmfJ', 'YXTXdt']

from pandas import DataFrame, concat

# DataFrame.append was removed in pandas 2.0 and copied the accumulated frame on
# every iteration; collect the partial results and concatenate them once instead.
partial_results = []
for config_ID, tm_ID in zip(configurations_ids, textmodels_ids):
  print(f'config_ID: {config_ID},  tm_ID: {tm_ID}\n')

  partial_results.append(evaluateTrainedModelsOnValidationData(X_train, Y_train,
                                                               'HTA',
                                                               config_ID,
                                                               tm_ID,
                                                               'SVC-LINEAR', 7))

merged_results = concat(partial_results, ignore_index=True)


In [None]:
# Persist the merged validation results of the linear-kernel HTA models as a pickle file.
with open('./Results/final/SVC-LINEAR_HTA_VALIDATION.df', 'wb') as file_handler:
  pickle.dump(merged_results, file_handler) 

In [None]:
# Preview the first rows of the merged results.
# NOTE(review): the saved output below lists conf_id KAcOYq, which belongs to the sigmoid-kernel id list,
# while the architecture column says SVC-LINEAR -- the output looks stale; re-run this section to confirm.
merged_results.head()

Unnamed: 0,conf_id,tm_ID,model_type,architecture,A_acc,B1_acc,B2_acc,A1_f1,B1_f1,B2_f1,F1_multi,EMR
0,KAcOYq,djwYeF,F,SVC-LINEAR,0.764527,0.878413,0.791599,0.738408,0.823373,0.750901,0.770894,0.733256
1,KAcOYq,djwYeF,F,SVC-LINEAR,0.755193,0.886348,0.801867,0.726217,0.840688,0.759528,0.775478,0.733489
2,KAcOYq,djwYeF,F,SVC-LINEAR,0.765049,0.878208,0.797247,0.741764,0.829786,0.760284,0.777278,0.736118
3,KAcOYq,djwYeF,F,SVC-LINEAR,0.764349,0.884974,0.800513,0.736022,0.836543,0.759775,0.777447,0.737517
4,KAcOYq,djwYeF,F,SVC-LINEAR,0.758049,0.872842,0.793047,0.728733,0.81704,0.749146,0.764973,0.731685


## HTA - sigmoid kernel

In [None]:
# Load the train data; both frames are handed to getDataSplits, which reads their 'kfold' columns.
X_train, Y_train = dataset_utils.importTrainDataForSVM()

In [None]:
# Each configuration id is evaluated with the text-model id at the same position.
configurations_ids = ['KAcOYq', 'bmCbAB', 'pcPfNL', 'wNOXbH', 'hVEaUj']
textmodels_ids = ['djwYeF', 'TqHwoB', 'sUlSKy', 'CgdcXu', 'oDWZfJ']

from pandas import DataFrame, concat

# DataFrame.append was removed in pandas 2.0 and copied the accumulated frame on
# every iteration; collect the partial results and concatenate them once instead.
partial_results = []
for config_ID, tm_ID in zip(configurations_ids, textmodels_ids):
  print(f'config_ID: {config_ID},  tm_ID: {tm_ID}\n')

  partial_results.append(evaluateTrainedModelsOnValidationData(X_train, Y_train,
                                                               'HTA',
                                                               config_ID,
                                                               tm_ID,
                                                               'SVC-SIGMOID', 7))

merged_results = concat(partial_results, ignore_index=True)


In [None]:
# Persist the merged validation results of the sigmoid-kernel HTA models as a pickle file.
with open('./Results/final/SVC-SIGMOID_HTA_VALIDATION.df', 'wb') as file_handler:
  pickle.dump(merged_results, file_handler) 

In [None]:
# Reload the pickled sigmoid-kernel validation results into merged_results.
# NOTE: pickle.load can execute arbitrary code; only load files written by this notebook.
with open('./Results/final/SVC-SIGMOID_HTA_VALIDATION.df', 'rb') as file_handler:
  merged_results = pickle.load(file_handler) 

In [None]:
# Average every metric over the rows of configuration KAcOYq.
# numeric_only=True is required because the frame also holds string columns (conf_id, tm_ID,
# model_type, architecture); pandas >= 2.0 raises on them instead of silently dropping them.
merged_results.loc[merged_results.conf_id=='KAcOYq'].mean(numeric_only=True)

A_acc       0.763633
B1_acc      0.881700
B2_acc      0.798767
A1_f1       0.737089
B1_f1       0.831993
B2_f1       0.758838
F1_multi    0.775973
EMR         0.736766
dtype: float64