In [None]:
# Mount Google Drive at /content/gdrive (Colab only).
# The dataset cells below read their CSV files through the relative path
# 'gdrive/My Drive/...', so they assume the working directory is /content.
from google.colab import drive
drive.mount('/content/gdrive')

In [None]:
# Install required libraries
!pip install --upgrade gensim
!pip install transformers

In [None]:
# Imports
import pandas as pd
import matplotlib.pyplot as plt
import ast
from collections import Counter
import re

import nltk
import numpy as np
from nltk.collocations import BigramCollocationFinder, BigramAssocMeasures
nltk.download('stopwords')
nltk.download('punkt')

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.dummy import DummyClassifier
from sklearn.multioutput import ClassifierChain
from sklearn.metrics import cohen_kappa_score, accuracy_score, f1_score, roc_auc_score, precision_score
from sklearn.preprocessing import OneHotEncoder
from sklearn.feature_extraction.text import CountVectorizer

from tabulate import tabulate

from gensim.models import Word2Vec

from transformers import BertTokenizer, BertModel, FeatureExtractionPipeline

## Cosas BETO

In [None]:
# Load the pretrained Spanish BERT checkpoint (uncased, whole-word-masking)
# published as "dccuchile/bert-base-spanish-wwm-uncased": tokenizer and encoder.
tokenizer = BertTokenizer.from_pretrained("dccuchile/bert-base-spanish-wwm-uncased")
model = BertModel.from_pretrained("dccuchile/bert-base-spanish-wwm-uncased")
# Put the model in evaluation (inference) mode.
model.eval()
# Feature-extraction pipeline built from the model and tokenizer above.
# NOTE(review): device=0 presumably pins the pipeline to the first GPU, so this
# cell would need a GPU runtime — confirm.
BETO_features = FeatureExtractionPipeline(model, tokenizer, device=0)

## Carga de datos

In [None]:
# Intervention (strategy) code -> label dictionary.
# Codes are not contiguous: 2, 5 and 6 have no entry.
categories_number_words = {
        1: "Apoyo Pedagógico en asignaturas",
        3: "Apoyo pedagógico personal",
        4: "Tutoría entre pares",
        7: "Hacer a la familia partícipe del proceso",
        8: "Apoyo psicóloga(o)",
        9: "Apoyo fonoaudióloga(o)",
        10: "Apoyo Educador(a) Diferencial",
        11: "Apoyo Kinesióloga(o)",
        12: "Apoyo Médico General",
        13: "Apoyo Terapeuta Ocupacional",
        14: "Control Neurólogo",
        15: "Apoyo Interdisciplinario",
        16: "Adecuación curricular de acceso",
        17: "Adecuación curricular de objetivos"
    }
# Inverse of the dictionary above: label -> intervention code.
# Its key order (the label order above) is what later cells use as the strategy order.
categories_words_number = {v: k for k, v in categories_number_words.items()}

# Diagnosis label -> integer code dictionary (codes 0-11).
diagnoses_codes = {
    "Trastorno específico del lenguaje": 0,
    "Trastorno por déficit atencional": 1,
    "Dificultad específica de aprendizaje": 2,
    "Discapacidad intelectual": 3,
    "Discapacidad visual": 4,
    "Trastorno del espectro autista": 5,
    "Discapacidad auditiva - Hipoacusia": 6,
    "Funcionamiento intelectual limítrofe": 7,
    "Síndrome de Down": 8,
    "Trastorno motor": 9,
    "Multidéficit": 10,
    "Retraso global del desarrollo": 11
}
# Spanish stop-word set from NLTK (requires the 'stopwords' corpus downloaded in the imports cell).
stopwords = set(nltk.corpus.stopwords.words('spanish'))

### Datasets preseparados

In [None]:
# Datasets built with Stratified iterative algorithm: pre-separated
# train / validation / test splits read from the mounted Drive.
# keep_default_na=False stops pandas from turning empty cells into NaN, so
# missing perception texts are read as empty strings.
train_dataset = pd.read_csv('gdrive/My Drive/magister/train_ds.csv', keep_default_na=False)
val_dataset = pd.read_csv('gdrive/My Drive/magister/val_ds.csv', keep_default_na=False)
test_dataset = pd.read_csv('gdrive/My Drive/magister/test_ds.csv', keep_default_na=False)

In [None]:
# Sanity check: (rows, columns) of the training split.
train_dataset.shape

(1836, 29)

In [None]:
# Preview the first rows of the training split.
train_dataset.head()

Unnamed: 0,Encoded Diagnosis,Diagnosis,All perceptions,Special Education Teacher Perceptions,Speech Therapist Perceptions,Psychologist Perceptions,Medical Perceptions,Amount of SET perceptions,Amount of ST perceptions,Amount of P perceptions,Amount of M perceptions,Has SET perceptions,Has ST perceptions,Has P perceptions,Has M perceptions,Apoyo Pedagógico en asignaturas,Apoyo pedagógico personal,Tutoría entre pares,Hacer a la familia partícipe del proceso,Apoyo psicóloga(o),Apoyo fonoaudióloga(o),Apoyo Educador(a) Diferencial,Apoyo Kinesióloga(o),Apoyo Médico General,Apoyo Terapeuta Ocupacional,Control Neurólogo,Apoyo Interdisciplinario,Adecuación curricular de acceso,Adecuación curricular de objetivos
0,3,Discapacidad intelectual,"En cuanto a Lenguaje y Comunicacion, [ESTUDIAN...","En cuanto a Lenguaje y Comunicacion, [ESTUDIAN...",,-Establece relaciones sociales principalmente ...,Estudiante con atenciones medicas debido a su ...,1,0,1,1,1,0,1,1,0,0,0,0,0,0,1,1,0,1,0,0,0,0
1,3,Discapacidad intelectual,"En cuanto a Lenguaje y Comunicacion, [ESTUDIAN...","En cuanto a Lenguaje y Comunicacion, [ESTUDIAN...",,-Establece relaciones sociales principalmente ...,-Estudiante con atenciones medicas debido a su...,1,0,1,1,1,0,1,1,0,0,0,1,0,0,1,1,0,1,0,0,0,0
2,3,Discapacidad intelectual,"Habilidades (Cognitivas, comunicativas, social...","Habilidades (Cognitivas, comunicativas, social...",,Comunica sus deseos y emociones de manera mas ...,Controles periodicos al dia. Equipo multidisci...,1,0,1,1,1,0,1,1,1,0,0,1,0,1,0,1,0,0,0,1,1,0
3,3,Discapacidad intelectual,"scar, no presenta dificultades en el desarroll...","scar, no presenta dificultades en el desarroll...",,"[ESTUDIANTE], es un niño entusiasta y colabora...","Estudiante con un estado sano de salud, pero c...",1,0,1,1,1,0,1,1,1,0,0,1,0,1,0,1,0,0,1,1,1,0
4,9,Trastorno motor,En [ESTUDIANTE] se evidencia preferencia por e...,En [ESTUDIANTE] se evidencia preferencia por e...,,Estudiante cariñoso y respetuoso con sus compa...,,1,0,2,0,1,0,1,0,1,0,0,0,0,0,1,1,0,1,0,1,0,0


### Datasets creados

In [None]:
# Full dataset, before any stratified splitting.
students_strats = pd.read_csv('gdrive/My Drive/magister/anonimized_dataset.csv')
columns = students_strats.columns

# Every column except the diagnosis label and the row index is parsed with
# ast.literal_eval, turning the CSV text back into Python objects.
plain_columns = ('Diagnoses', 'Index')
for var in columns:
    if var in plain_columns:
        continue
    students_strats[var] = students_strats[var].apply(ast.literal_eval)

students_strats.columns

Index(['Index', 'Diagnoses', 'Special Education Teacher Perceptions',
       'Psychological Perceptions', 'Medical Perceptions',
       'Speech Therapist Perceptions', 'Written Strategies',
       'Encoded Strategies'],
      dtype='object')

In [None]:
# Building features arrays for not stratified dataset

# Perception columns keyed by a short role tag, listed in the order in which
# their texts are concatenated into the "all perceptions" string.
perception_columns = {
    'set': 'Special Education Teacher Perceptions',
    'st': 'Speech Therapist Perceptions',
    'p': 'Psychological Perceptions',
    'm': 'Medical Perceptions',
}

joined_perceptions = []
joined_by_role = {role: [] for role in perception_columns}
amount_by_role = {role: [] for role in perception_columns}
has_by_role = {role: [] for role in perception_columns}

for index in students_strats['Index']:
  student_text = ""

  for role, column in perception_columns.items():
    # Every perception is followed by one space, including the last one.
    chunks = [perception + " " for perception in students_strats[column][index]]
    role_text = "".join(chunks)
    student_text += role_text

    joined_by_role[role].append(role_text)
    amount_by_role[role].append(len(chunks))
    has_by_role[role].append(1 if len(chunks) > 0 else 0)

  joined_perceptions.append(student_text)

# Expose the per-role lists under the names used by the rest of the notebook.
joined_set_perceptions = joined_by_role['set']
joined_st_perceptions = joined_by_role['st']
joined_p_perceptions = joined_by_role['p']
joined_m_perceptions = joined_by_role['m']

amount_set_perceptions = amount_by_role['set']
amount_st_perceptions = amount_by_role['st']
amount_p_perceptions = amount_by_role['p']
amount_m_perceptions = amount_by_role['m']

has_set = has_by_role['set']
has_st = has_by_role['st']
has_p = has_by_role['p']
has_m = has_by_role['m']

In [None]:
# Number of students with at least one speech-therapist perception
# (has_st holds 1 for those students and 0 otherwise).
sum(has_st)

525

In [None]:
# Diagnosis code of every student, following the order of the 'Index' column.
diag_codes = [
    diagnoses_codes[students_strats['Diagnoses'][index]]
    for index in students_strats['Index']
]

# One 0/1 indicator list per strategy label: 1 when the strategy code appears
# in the student's 'Encoded Strategies', 0 otherwise.
strat_present = {
    strat_label: [
        1 if strat_number in students_strats['Encoded Strategies'][index] else 0
        for index in students_strats['Index']
    ]
    for strat_number, strat_label in categories_number_words.items()
}

In [None]:
# Feature (X) columns of the exported dataset, in export order.
feature_columns = {
    'Encoded Diagnosis': diag_codes,
    'Diagnosis': students_strats['Diagnoses'],
    'All perceptions': joined_perceptions,
    'Special Education Teacher Perceptions': joined_set_perceptions,
    'Speech Therapist Perceptions': joined_st_perceptions,
    'Psychologist Perceptions': joined_p_perceptions,
    'Medical Perceptions': joined_m_perceptions,
    'Amount of SET perceptions': amount_set_perceptions,
    'Amount of ST perceptions': amount_st_perceptions,
    'Amount of P perceptions': amount_p_perceptions,
    'Amount of M perceptions': amount_m_perceptions,
    'Has SET perceptions': has_set,
    'Has ST perceptions': has_st,
    'Has P perceptions': has_p,
    'Has M perceptions': has_m,
}
x_keys = list(feature_columns)

# Full export: the feature columns followed by one 0/1 column per strategy.
new_dataset_to_export = {**feature_columns, **strat_present}

In [None]:
# Every column must have the same number of rows before building a DataFrame.
for column_name, column_values in new_dataset_to_export.items():
  print(column_name, len(column_values))

Encoded Diagnosis 3035
Diagnosis 3035
All perceptions 3035
Special Education Teacher Perceptions 3035
Speech Therapist Perceptions 3035
Psychologist Perceptions 3035
Medical Perceptions 3035
Amount of SET perceptions 3035
Amount of ST perceptions 3035
Amount of P perceptions 3035
Amount of M perceptions 3035
Has SET perceptions 3035
Has ST perceptions 3035
Has P perceptions 3035
Has M perceptions 3035
Apoyo Pedagógico en asignaturas 3035
Apoyo pedagógico personal 3035
Tutoría entre pares 3035
Hacer a la familia partícipe del proceso 3035
Apoyo psicóloga(o) 3035
Apoyo fonoaudióloga(o) 3035
Apoyo Educador(a) Diferencial 3035
Apoyo Kinesióloga(o) 3035
Apoyo Médico General 3035
Apoyo Terapeuta Ocupacional 3035
Control Neurólogo 3035
Apoyo Interdisciplinario 3035
Adecuación curricular de acceso 3035
Adecuación curricular de objetivos 3035


## Experimentos

In [None]:
# Target columns: one column per intervention strategy label.
y_keys = list(categories_words_number)

# Split each pre-separated dataset into features (X) and strategy targets (Y).
# Alternative kept for reference (non-stratified data):
#   df = pd.DataFrame(data=new_dataset_to_export); X = df; Y = df[y_keys]
X_train, Y_train = train_dataset.drop(columns=y_keys), train_dataset[y_keys]
X_val, Y_val = val_dataset.drop(columns=y_keys), val_dataset[y_keys]
X_test, Y_test = test_dataset.drop(columns=y_keys), test_dataset[y_keys]

# Hard-coded count per strategy label.
strats_amounts = {
    'Adecuación curricular de acceso': 2264,
    'Hacer a la familia partícipe del proceso': 2048,
    'Apoyo Interdisciplinario': 1441,
    'Apoyo Educador(a) Diferencial': 1311,
    'Apoyo pedagógico personal': 1240,
    'Apoyo fonoaudióloga(o)': 378,
    'Apoyo psicóloga(o)': 588,
    'Apoyo Terapeuta Ocupacional': 153,
    'Tutoría entre pares': 350,
    'Control Neurólogo': 63,
    'Apoyo Médico General': 64,
    'Apoyo Kinesióloga(o)': 32,
    'Adecuación curricular de objetivos': 281,
    'Apoyo Pedagógico en asignaturas': 1314,
}

# A strategy is "most unbalanced" when its count is below 15% or above 85% of
# the total number of rows across the three splits.
total_rows = len(X_train) + len(X_val) + len(X_test)
lower_bound = total_rows * 0.15
upper_bound = total_rows * 0.85
most_unbalanced_strategies = [
    strategy for strategy in y_keys
    if not (lower_bound <= strats_amounts[strategy] <= upper_bound)
]
less_unbalanced_strategies = [
    strategy for strategy in y_keys
    if strategy not in most_unbalanced_strategies
]

In [None]:
# Experiment catalogue.
#
# Every experiment is a (display name, feature descriptors) pair. A descriptor
# is a dict holding the dataset column ('name'), a 'type' tag and any extra
# options. Names and features are declared together so the two parallel lists
# derived at the end, `experiments` and `experiments_names`, cannot drift out
# of sync. The resulting lists have 50 entries each.

# Strategy labels used for the "single extra strategy" experiments, in the
# order those experiments are generated.
_single_label_strategies = [
    'Apoyo Pedagógico en asignaturas',
    'Apoyo pedagógico personal',
    'Tutoría entre pares',
    'Hacer a la familia partícipe del proceso',
    'Apoyo psicóloga(o)',
    'Apoyo fonoaudióloga(o)',
    'Apoyo Educador(a) Diferencial',
    'Apoyo Kinesióloga(o)',
    'Apoyo Médico General',
    'Apoyo Terapeuta Ocupacional',
    'Control Neurólogo',
    'Apoyo Interdisciplinario',
    'Adecuación curricular de acceso',
    'Adecuación curricular de objetivos',
]


def _feature(name, feature_type, **options):
    """Return a new descriptor dict with keys 'name', 'type' and then ``options``."""
    return {'name': name, 'type': feature_type, **options}


def _diagnosis():
    """Descriptor for the encoded diagnosis column."""
    return _feature('Encoded Diagnosis', 'categorical_diagnostic')


def _text(column):
    """Descriptor for ``column`` used as a plain 'string' attribute."""
    return _feature(column, 'string')


def _tokenized_perceptions():
    """Descriptors for the four perception columns, each with its own special token."""
    return [
        _feature('Special Education Teacher Perceptions', 'special_string', special_token='set'),
        _feature('Speech Therapist Perceptions', 'special_string', special_token='st'),
        _feature('Psychologist Perceptions', 'special_string', special_token='p'),
        _feature('Medical Perceptions', 'special_string', special_token='m'),
    ]


def _other_strategies():
    """Descriptor for the 'Other Perceptions' binary-labels feature."""
    return _feature('Other Perceptions', 'binary_labels')


def _single_strategy(strategy):
    """Descriptor for one strategy column used as a 'binary_single_label' feature."""
    return _feature(strategy, 'binary_single_label')


def _embedded_text():
    """Descriptor for all perceptions encoded as a 100-feature 'embed_string'."""
    return _feature('All perceptions', 'embed_string',
                    n_features=100, special_token='', transformation=0)


# Each helper call returns fresh objects, so no descriptor is shared between
# experiments (exactly as with the literal definitions).
_experiment_catalogue = [
    ("Most frequent", []),
    ("Only diagnosis (OHE)", [_diagnosis()]),
    ("Amount of perceptions (numeric attributes)",
     [_feature('Amount of SET perceptions', 'numeric'),
      _feature('Amount of ST perceptions', 'numeric'),
      _feature('Amount of P perceptions', 'numeric'),
      _feature('Amount of M perceptions', 'numeric')]),
    ("Has perceptions? (binary attributes)",
     [_feature('Has SET perceptions', 'binary'),
      _feature('Has ST perceptions', 'binary'),
      _feature('Has P perceptions', 'binary'),
      _feature('Has M perceptions', 'binary')]),
    ("All joined perceptions (string attribute)",
     [_text('All perceptions')]),
    ("Medical perceptions only (string attribute)",
     [_text('Medical Perceptions')]),
    ("Psychologist perceptions only (string attribute)",
     [_text('Psychologist Perceptions')]),
    ("Speech therapist perceptions only (string attribute)",
     [_text('Speech Therapist Perceptions')]),
    ("Special education teacher perceptions only (string attribute)",
     [_text('Special Education Teacher Perceptions')]),
    ("All joined perceptions (string attribute) and encoded diagnosis",
     [_text('All perceptions'), _diagnosis()]),
    ("Perceptions (string attribute) with special token to differentiate them",
     _tokenized_perceptions()),
    ("Perceptions (string attribute) with special token and encoded diagnosis",
     [_diagnosis()] + _tokenized_perceptions()),
    ("Medical perceptions + diagnosis",
     [_text('Medical Perceptions'), _diagnosis()]),
    ("Psychologist perceptions + diagnosis",
     [_text('Psychologist Perceptions'), _diagnosis()]),
    ("Speech therapist perceptions + diagnosis",
     [_text('Speech Therapist Perceptions'), _diagnosis()]),
    ("Special education teacher + diagnosis",
     [_text('Special Education Teacher Perceptions'), _diagnosis()]),
    ("All other strategies",
     [_other_strategies()]),
    ("All other strategies + diagnosis",
     [_other_strategies(), _diagnosis()]),
    ("All joined perceptions + diagnosis + all other strategies",
     [_text('All perceptions'), _diagnosis(), _other_strategies()]),
    ("Perceptions with tokens + diagnosis + all other strategies",
     [_diagnosis()] + _tokenized_perceptions() + [_other_strategies()]),
]

# Two experiments per strategy: joined text and tokenized text, each with the
# diagnosis and that single strategy as an extra feature.
for _strategy in _single_label_strategies:
    _experiment_catalogue.append((
        "All joined perceptions + diagnosis + " + _strategy,
        [_text('All perceptions'), _diagnosis(), _single_strategy(_strategy)],
    ))
    _experiment_catalogue.append((
        "Perceptions with tokens + diagnosis + " + _strategy,
        [_diagnosis()] + _tokenized_perceptions() + [_single_strategy(_strategy)],
    ))

_experiment_catalogue += [
    ("All perceptions with word embedding",
     [_embedded_text()]),
    ("All perceptions (word embedding) + diagnosis",
     [_embedded_text(), _diagnosis()]),
    # Disabled BETO experiments, kept for reference:
    # ("All perceptions with BETO (average)",
    #  [_feature('All perceptions', 'BETO_string', transformation=0)]),
    # ("All perceptions (BETO) + diagnosis (average)",
    #  [_feature('All perceptions', 'BETO_string', transformation=0), _diagnosis()]),
]

# Parallel lists consumed by the rest of the notebook.
experiments_names = [entry_name for entry_name, _entry_features in _experiment_catalogue]
experiments = [entry_features for _entry_name, entry_features in _experiment_catalogue]

# Spanish stop-word set from NLTK.
# NOTE(review): this repeats the `stopwords` assignment already made in the
# label-dictionaries cell earlier in the notebook; one of the two can be removed.
stopwords = set(nltk.corpus.stopwords.words('spanish'))

In [None]:
def custom_tokenizer(special_token="", use_clean=True):
  """Build a whitespace tokenizer, optionally cleaning and prefixing tokens.

  Args:
    special_token: when not empty, every token is returned as
      ``special_token + "_" + token``.
    use_clean: when True the input is converted with ``str()``, lower-cased,
      stripped, and every character that is not a word character, whitespace
      or a square bracket is removed before splitting. Square brackets are
      kept so placeholders such as "[ESTUDIANTE]" survive as a single token.
      When False the input is split as-is (it must then provide ``split()``).

  Returns:
    A function mapping a sentence to a list of tokens.
  """
  def tokenize(sentence):
    if use_clean:
      # The brackets must be escaped inside the character class: the previous
      # pattern r'[^\w\s[]]' closed the class at the first ']' and therefore
      # only matched "<punctuation>]" pairs, leaving punctuation in the tokens.
      sentence = re.sub(r'[^\w\s\[\]]', '', str(sentence).lower().strip())
    words = sentence.split()
    if special_token != "":
      return [special_token + "_" + word for word in words]
    return words
  return tokenize

In [None]:
def average_word_vectors(words, model, vocabulary, num_features):
    """Average the embeddings of the words that belong to the vocabulary.

    Args:
        words: Iterable of tokens for one document.
        model: Object indexable by word (`model[word]`) that yields the
            embedding of that word.
        vocabulary: Container used for the `word in vocabulary` filter.
        num_features: Length of the zero vector the sum starts from.

    Returns:
        The mean of the selected embeddings, accumulated in float64. When no
        word is in the vocabulary the untouched zero vector is returned.
    """
    total = np.zeros((num_features,), dtype="float64")
    known_words = [word for word in words if word in vocabulary]

    for word in known_words:
        total = total + model[word]

    if not known_words:
        # Nothing to average: keep the all-zero vector instead of dividing by 0.
        return total
    return total / float(len(known_words))

In [None]:
def normalize_word_vectors(words, model, vocabulary, num_features):
    """Sum the embeddings of the in-vocabulary words and scale to unit L2 norm.

    Args:
        words: Iterable of tokens for one document.
        model: Object indexable by word (`model[word]`) that yields the
            embedding of that word.
        vocabulary: Container used for the `word in vocabulary` filter.
        num_features: Length of the zero vector returned when no word of the
            document is in the vocabulary.

    Returns:
        The summed embedding divided by its Euclidean norm. A zero vector is
        returned when there is nothing to sum or when the sum has zero norm,
        mirroring what average_word_vectors does for unknown-only documents.
    """
    known_vectors = [model[word] for word in words if word in vocabulary]
    if not known_vectors:
        # Summing an empty array used to give the scalar 0.0, and 0/0 then
        # produced a NaN scalar instead of a feature vector.
        return np.zeros((num_features,), dtype="float64")

    cap_vec = np.sum(np.array(known_vectors), axis=0)
    norm = np.sqrt(cap_vec.dot(cap_vec))
    if norm == 0:
        # Embeddings cancelled out; dividing would fill the vector with NaN.
        return np.zeros_like(cap_vec)

    return cap_vec / norm

In [None]:
def average_feature_vectors(vectors):
  """Return the element-wise mean of a non-empty sequence of vectors.

  Args:
    vectors: Sequence of equal-length vectors (lists or arrays). The length
      of the first one fixes the size of the result.

  Returns:
    A float64 numpy array holding the mean of the input vectors.
  """
  count = len(vectors)
  running_sum = np.zeros((len(vectors[0]),), dtype="float64")

  for current in vectors:
    running_sum = running_sum + current

  return running_sum / count

In [None]:
def normalize_feature_vectors(vectors):
  """Sum a sequence of vectors element-wise and scale the sum to unit L2 norm.

  Args:
    vectors: Sequence of equal-length vectors (lists or arrays).

  Returns:
    The summed vector divided by its Euclidean norm.
  """
  summed = np.array(vectors).sum(axis=0)
  # NOTE(review): a sum with zero norm is divided by 0 here and yields NaN.
  length = np.sqrt(np.dot(summed, summed))

  return summed / length

In [None]:
def split_list(alist, wanted_parts=1):
    """Cut a sequence into `wanted_parts` consecutive slices of near-equal size.

    Slice boundaries are `k * len(alist) // wanted_parts`, so when the length
    is not a multiple of `wanted_parts` the later slices receive the extra
    items, and slices may be empty when there are more parts than items.

    Args:
        alist: Any sliceable sequence with a length.
        wanted_parts: Number of slices to produce.

    Returns:
        A list with `wanted_parts` slices of `alist`, in original order.
    """
    total = len(alist)
    chunks = []
    for part in range(wanted_parts):
        start = part * total // wanted_parts
        stop = (part + 1) * total // wanted_parts
        chunks.append(alist[start:stop])
    return chunks

In [None]:
# Sanity check for split_list: four items split into four parts should give
# one single-item list per part.
new_array = split_list([1,2,3,4], 4)
new_array

[[1], [2], [3], [4]]

In [None]:
def transform_perception_to_feature_vector(perception, mode):
  """Turn one perception text into a single BETO feature vector.

  The text is sent through the BETO_features pipeline. If the pipeline raises
  (presumably because the text exceeds the model's maximum input length --
  TODO confirm), the words are split into 2, 4, 8, ... consecutive chunks with
  split_list until every chunk is accepted. The vectors the pipeline returns
  for each chunk are pooled into one vector per chunk, and the chunk vectors
  are pooled again into the final vector.

  Args:
    perception: Text to encode; it is split on whitespace.
    mode: Pooling used at both levels. 0 pools with average_feature_vectors,
      1 pools with normalize_feature_vectors.

  Returns:
    The pooled feature vector for the whole perception.

  Raises:
    ValueError: If `mode` is neither 0 nor 1 (previously None was returned
      after running the model).
    Exception: Whatever BETO_features raised, once chunks cannot be made any
      smaller. The previous implementation looped forever in that situation.
  """
  if mode not in (0, 1):
    raise ValueError("mode must be 0 (average) or 1 (normalize), got " + repr(mode))
  pool = average_feature_vectors if mode == 0 else normalize_feature_vectors

  words = perception.split()
  divide_exponent = 0
  while True:
    wanted_parts = 2**divide_exponent
    chunks = [" ".join(chunk_words) for chunk_words in split_list(words, wanted_parts)]
    try:
      # Keep the features of the successful pass so that every chunk goes
      # through the model only once.
      chunk_features = [BETO_features(chunk)[0] for chunk in chunks]
      break
    except Exception:
      # With at least as many parts as words each chunk already holds at most
      # one word; splitting further cannot help, so surface the real error.
      if wanted_parts >= max(len(words), 1):
        raise
      divide_exponent += 1

  return pool([pool(features) for features in chunk_features])

In [None]:
def make_top_tables(features, coeffs, top_n=10):
  """Print the features with the largest coefficients, best first.

  Each printed line is "<rank> <feature> <coefficient>".

  Args:
    features: Feature names, aligned with `coeffs`.
    coeffs: One coefficient per feature.
    top_n: Maximum number of rows to print; fewer are printed when there are
      fewer features.
  """
  ranked = sorted(zip(coeffs, features), reverse=True)
  ordered_coeffs, ordered_features = zip(*ranked)
  rows_to_show = min(top_n, len(features))
  for position in range(rows_to_show):
    print(position + 1, ordered_features[position], ordered_coeffs[position])

In [None]:
def make_bottom_tables(features, coeffs, top_n=10):
  """Print the features with the smallest coefficients, lowest first.

  Each printed line is "<rank> <feature> <coefficient>".

  Args:
    features: Feature names, aligned with `coeffs`.
    coeffs: One coefficient per feature.
    top_n: Maximum number of rows to print; fewer are printed when there are
      fewer features.
  """
  ranked = sorted(zip(coeffs, features), reverse=True)
  ordered_coeffs, ordered_features = zip(*ranked)
  rows_to_show = min(top_n, len(features))
  for position in range(rows_to_show):
    # Walk the descending ranking from its end: -1 is the lowest coefficient.
    from_end = -(position + 1)
    print(position + 1, ordered_features[from_end], ordered_coeffs[from_end])

In [None]:
def execute_experiments(X_train, Y_train, X_val, Y_val, selected_strategies, experiments):
  """Fit and score one binary classifier per (strategy, experiment) pair.

  For every label in `selected_strategies` and every experiment configuration,
  a feature matrix is assembled from the inputs the experiment lists, a
  logistic regression is fitted on the training split and its predictions on
  the validation split are scored. An experiment with an empty configuration
  is evaluated with a most-frequent-class DummyClassifier instead.

  Args:
    X_train, X_val: Feature tables, indexed by the 'name' of each input.
    Y_train, Y_val: Label tables, indexed by strategy name.
    selected_strategies: Label columns to train a classifier for.
    experiments: Sequence of experiment configurations. Each one is a list of
      input descriptors (dicts) with a 'type' key and, for most types, a
      'name' key; 'special_token', 'n_features' and 'transformation' are read
      for the types that need them.

  Returns:
    A list with one dict per strategy, with keys 'name', 'accs', 'kappas',
    'f1s', 'aucs', 'coefs' and 'features'. The metric lists get one entry per
    experiment; 'coefs' and 'features' only get entries for experiments with
    a non-empty configuration.
  """
  all_results = []
  for strategy in selected_strategies:
    i = 0
    print('experimentando para estrategia: '+strategy)
    strat_acc = []
    strat_kappa = []
    strat_f1 = []
    strat_auc = []
    strat_coefs = []
    strat_features = []
    for experiment in experiments:
      print("\t"+str(i+1)+'° experimento')
      i += 1
      # y = df[strategy]
      y_train = Y_train[strategy]
      y_val = Y_val[strategy]
      # X_train, X_test, y_train, y_test = train_test_split(X,
      #                                                     y,
      #                                                     test_size=0.2,
      #                                                     random_state=1)
      # X_train, X_val, y_train, y_val = train_test_split(X_train,
      #                                                   y_train,
      #                                                   stratify=y_train,
      #                                                   test_size=0.25,
      #                                                   random_state=1)
      if len(experiment) > 0:
        classifier = LogisticRegression(penalty='l2', dual=True, solver='liblinear', max_iter=10000)
        # Feature blocks of every input are appended column-wise to these frames.
        X_train_transformed = pd.DataFrame()
        X_val_transformed = pd.DataFrame()
        # NOTE(review): `input` shadows the Python builtin inside this loop.
        for input in experiment:
          # Encoding diagnosis as categorical attribute
          if input['type'] == 'categorical_diagnostic':
            enc = OneHotEncoder(handle_unknown='ignore')
            # The encoder is fitted on train and validation values together.
            # NOTE(review): Series.append was removed in pandas 2.0; newer
            # environments need pd.concat here.
            enc.fit(np.asarray(X_train[input['name']].append(X_val[input['name']])).reshape(-1, 1))

            # NOTE(review): the column labels assume the encoder finds exactly
            # the categories of diagnoses_codes, in the same order -- confirm.
            train_arrays = enc.transform(np.asarray(X_train[input['name']]).reshape(-1,1)).toarray()
            temp_train_df = pd.DataFrame(train_arrays, columns=list(diagnoses_codes.keys()))

            val_arrays = enc.transform(np.asarray(X_val[input['name']]).reshape(-1,1)).toarray()
            temp_val_df = pd.DataFrame(val_arrays, columns=list(diagnoses_codes.keys()))

            X_train_transformed = pd.concat([X_train_transformed, temp_train_df], axis=1)
            X_val_transformed = pd.concat([X_val_transformed, temp_val_df], axis=1)
            del temp_train_df, temp_val_df

          # Copying numeric and binary attributes
          if input['type'] == 'numeric' or input['type'] == 'binary':
            X_train_transformed[input['name']] = X_train[input['name']].to_numpy()
            X_val_transformed[input['name']] = X_val[input['name']].to_numpy()

          # Encoding strings
          if input['type'] == 'string':
            # Bag of 1- to 3-grams; the vocabulary is learned on the training text only.
            # NOTE(review): get_feature_names() was removed in scikit-learn 1.2
            # (replaced by get_feature_names_out()); `stopwords` is a set here.
            vectorizer = CountVectorizer(strip_accents='unicode', lowercase=True, stop_words=stopwords, ngram_range=(1,3), max_df=0.8, min_df=0.05)
            vectorizer.fit(X_train[input['name']])
            print(vectorizer.get_feature_names())
            train_arrays = vectorizer.transform(X_train[input['name']]).toarray()
            temp_train_df = pd.DataFrame(train_arrays, columns=vectorizer.get_feature_names())

            val_arrays = vectorizer.transform(X_val[input['name']]).toarray()
            temp_val_df = pd.DataFrame(val_arrays, columns=vectorizer.get_feature_names())

            X_train_transformed = pd.concat([X_train_transformed, temp_train_df], axis=1)
            X_val_transformed = pd.concat([X_val_transformed, temp_val_df], axis=1)
            del temp_train_df, temp_val_df

          # Encoding strings with special tokens
          if input['type'] == 'special_string':
            # Tokens (and the stop words) carry the "<special_token>_" prefix so
            # the same word from different text columns gets separate features.
            tokenizer = custom_tokenizer(input['special_token'])
            vectorizer = CountVectorizer(strip_accents='unicode',
                                        tokenizer=tokenizer,
                                        lowercase=True,
                                        stop_words=list(map(lambda word: input['special_token']+ "_" + word, stopwords)),
                                        ngram_range=(1,3),
                                        max_df=0.8,
                                        min_df=0.05)
            vectorizer.fit(X_train[input['name']])
            train_arrays = vectorizer.transform(X_train[input['name']]).toarray()
            temp_train_df = pd.DataFrame(train_arrays, columns=vectorizer.get_feature_names())

            val_arrays = vectorizer.transform(X_val[input['name']]).toarray()
            temp_val_df = pd.DataFrame(val_arrays, columns=vectorizer.get_feature_names())

            X_train_transformed = pd.concat([X_train_transformed, temp_train_df], axis=1)
            X_val_transformed = pd.concat([X_val_transformed, temp_val_df], axis=1)
            del temp_train_df, temp_val_df

          # Use every other label column as an input feature.
          if input['type'] == "binary_labels":
            for other_label in list(Y_train.columns):
              if other_label != strategy:
                X_train_transformed[other_label] = Y_train[other_label].to_numpy()
                X_val_transformed[other_label] = Y_val[other_label].to_numpy()

          # Use one named label column as an input feature (skipped when it is the target).
          if input['type'] == "binary_single_label":
              if input['name'] != strategy:
                X_train_transformed[input['name']] = Y_train[input['name']].to_numpy()
                X_val_transformed[input['name']] = Y_val[input['name']].to_numpy()

          # Word2Vec embeddings trained on the training texts of this input.
          if input['type'] == "embed_string":
            c_tokenizer = custom_tokenizer(input['special_token'])
            # NOTE(review): `size=` and `wv.vocab` are the gensim 3.x API; gensim 4
            # renamed them to `vector_size=` and `wv.key_to_index`, and the helper
            # functions index the model directly (`model[word]`).
            embedding_model = Word2Vec(
                list(map(lambda doc: c_tokenizer(doc), X_train[input['name']])),
                min_count=1, 
                window=3, 
                sg=1, 
                size=input['n_features'])
            vocab = embedding_model.wv.vocab
            tokenized_train_perceptions = list(map(lambda doc: c_tokenizer(doc), X_train[input['name']]))
            tokenized_val_perceptions = list(map(lambda doc: c_tokenizer(doc), X_val[input['name']]))
            # NOTE(review): here transformation 0 = normalised sum and 1 = average,
            # whereas transform_perception_to_feature_vector uses 0 = average and
            # 1 = normalised sum -- confirm which meaning is intended.
            if input['transformation'] == 0:
              transformed_train_perceptions = list(map(
                  lambda tokenized_perception: normalize_word_vectors(tokenized_perception,
                                                                    embedding_model, 
                                                                    vocab, 
                                                                    input['n_features']), tokenized_train_perceptions))
              transformed_val_perceptions = list(map(
                  lambda tokenized_perception: normalize_word_vectors(tokenized_perception,
                                                                    embedding_model, 
                                                                    vocab, 
                                                                    input['n_features']), tokenized_val_perceptions))
            if input['transformation'] == 1:
              transformed_train_perceptions = list(map(
                  lambda tokenized_perception: average_word_vectors(tokenized_perception,
                                                                    embedding_model, 
                                                                    vocab, 
                                                                    input['n_features']), tokenized_train_perceptions))
              transformed_val_perceptions = list(map(
                  lambda tokenized_perception: average_word_vectors(tokenized_perception,
                                                                    embedding_model, 
                                                                    vocab, 
                                                                    input['n_features']), tokenized_val_perceptions))


            temp_train_df = pd.DataFrame(transformed_train_perceptions, columns=[str(i+1)+'_feature' for i in range(input['n_features'])])
            temp_val_df = pd.DataFrame(transformed_val_perceptions, columns=[str(i+1)+'_feature' for i in range(input['n_features'])])

            X_train_transformed = pd.concat([X_train_transformed, temp_train_df], axis=1)
            X_val_transformed = pd.concat([X_val_transformed, temp_val_df], axis=1)
            del temp_train_df, temp_val_df

          # Contextual features from the BETO pipeline, one vector per text.
          if input['type'] == "BETO_string":

            transformed_train_perceptions = list(map(
                lambda perception: transform_perception_to_feature_vector(perception, input['transformation']), X_train[input['name']]))
            transformed_val_perceptions = list(map(
                lambda perception: transform_perception_to_feature_vector(perception, input['transformation']), X_val[input['name']]))

            # NOTE(review): columns are always named "<k>_feature", so an experiment
            # with several embedding inputs ends up with repeated column names.
            temp_train_df = pd.DataFrame(transformed_train_perceptions, columns=[str(i+1)+'_feature' for i in range(len(transformed_train_perceptions[0]))])
            temp_val_df = pd.DataFrame(transformed_val_perceptions, columns=[str(i+1)+'_feature' for i in range(len(transformed_val_perceptions[0]))])

            X_train_transformed = pd.concat([X_train_transformed, temp_train_df], axis=1)
            X_val_transformed = pd.concat([X_val_transformed, temp_val_df], axis=1)
            del temp_train_df, temp_val_df


        classifier.fit(X_train_transformed, y_train)
        y_pred = classifier.predict(X_val_transformed)
        strat_acc.append(round(accuracy_score(y_val, y_pred), 2))
        strat_kappa.append(round(cohen_kappa_score(y_val, y_pred), 2))
        # F1 is the unweighted mean of the F1 of class 1 and the F1 of class 0.
        strat_f1.append(round((f1_score(y_val, y_pred, pos_label=1) + f1_score(y_val, y_pred, pos_label=0))/2, 2))
        # NOTE(review): AUC is computed from hard class predictions, not from
        # probabilities or decision scores.
        strat_auc.append(round(roc_auc_score(y_val, y_pred), 2))
        strat_coefs.append(classifier.coef_)
        strat_features.append(list(X_train_transformed.columns))
        del X_train_transformed, X_val_transformed
      else:
        # Empty configuration: majority-class baseline. No coefficients or
        # feature names are stored for it, so 'coefs'/'features' are shorter
        # than the metric lists by one entry per baseline experiment.
        classifier = DummyClassifier(strategy="most_frequent")
        classifier.fit(X_train['Diagnosis'], y_train)
        y_pred = classifier.predict(X_val['Diagnosis'])
        strat_acc.append(round(accuracy_score(y_val, y_pred), 2))
        strat_kappa.append(round(cohen_kappa_score(y_val, y_pred), 2))
        strat_f1.append(round((f1_score(y_val, y_pred, pos_label=1) + f1_score(y_val, y_pred, pos_label=0))/2, 2))
        strat_auc.append(round(roc_auc_score(y_val, y_pred), 2))

    all_results.append({'name': strategy, 'accs': strat_acc, 'kappas': strat_kappa, 'f1s': strat_f1, 'aucs': strat_auc, 'coefs': strat_coefs, 'features': strat_features})
  return all_results

In [None]:
# Demo of nesting tables with tabulate: a one-row "plain" table is rendered to
# a string and then used as a cell value of an outer table with headers.
mini_table = tabulate([["hola", "chao", "perro"]], tablefmt="plain")

table = tabulate([["label", mini_table, mini_table]], headers=["nombre", "1", "2"])
print(table)

nombre    1                  2
--------  -----------------  -----------------
label     hola  chao  perro  hola  chao  perro


In [None]:
def make_table_for_python(strats_results, difference_val=0.04):
  """Render experiment results as a text table with one column per strategy.

  Every row is an experiment (named after the global experiments_names) and
  every cell packs the accuracy, kappa, F1 and AUC of that experiment for one
  strategy. Within a strategy, the best score of each metric is tagged "(*)",
  or "(**)" when it beats the second-best distinct score by at least
  `difference_val`.

  Args:
    strats_results: Result dicts as returned by execute_experiments.
    difference_val: Margin over the runner-up needed for the "(**)" tag.

  Returns:
    The table as a string produced by tabulate.
  """
  metric_keys = ('accs', 'kappas', 'f1s', 'aucs')
  marks = ["(*)", "(**)"]

  def pick_mark(value, ranking):
    # `ranking` holds the distinct scores of one metric, best first.
    if value != ranking[0]:
      return ""
    if len(ranking) > 1 and value >= ranking[1] + difference_val:
      return marks[1]
    return marks[0]

  header_columns = ['Experiment names']
  experiments = [[name] for name in experiments_names]
  for strat_result in strats_results:
    rankings = {key: sorted(set(strat_result[key]), reverse=True) for key in metric_keys}
    for i in range(len(strat_result['accs'])):
      cells = []
      for key in metric_keys:
        value = strat_result[key][i]
        # NOTE(review): any score <= 0 is shown as "0.00", so a negative
        # kappa is displayed as zero in this table.
        shown = str(value) if value > 0 else "0.00"
        cells.append(shown + pick_mark(value, rankings[key]))
      # The four metrics of one strategy share a single outer cell.
      experiments[i].append(tabulate([cells], tablefmt="plain"))
    header_columns.append(strat_result['name'])
  return tabulate(experiments, headers=header_columns)

In [None]:
def make_table_for_latex(strats_results, difference_val=0.04):
  """Render experiment results as a table with four columns per strategy.

  Every row is an experiment (named after the global experiments_names). For
  each strategy the accuracy, kappa, F1 and AUC are added as separate columns,
  and four trailing columns hold the mean of each metric over all strategies.
  Within a strategy, the best score of each metric is tagged "(*)", or "(**)"
  when it beats the second-best distinct score by at least `difference_val`.

  Args:
    strats_results: Result dicts as returned by execute_experiments. The
      global strats_amounts is looked up by strategy name for the header.
    difference_val: Margin over the runner-up needed for the "(**)" tag.

  Returns:
    The table as a string produced by tabulate.
  """
  # NOTE(review): tabulate is called without a tablefmt, so despite the
  # function name the output uses tabulate's default format.
  metric_keys = ('accs', 'kappas', 'f1s', 'aucs')
  marks = ["(*)", "(**)"]

  def pick_mark(value, ranking):
    # `ranking` holds the distinct scores of one metric, best first.
    if value != ranking[0]:
      return ""
    if len(ranking) > 1 and value >= ranking[1] + difference_val:
      return marks[1]
    return marks[0]

  header_columns = ['Experiment names']
  experiments = [[name] for name in experiments_names]
  experiments_means = [[0, 0, 0, 0] for name in experiments_names]
  for strat_result in strats_results:
    rankings = {key: sorted(set(strat_result[key]), reverse=True) for key in metric_keys}
    for i in range(len(strat_result['accs'])):
      for position, key in enumerate(metric_keys):
        value = strat_result[key][i]
        experiments[i].append(str(value) + pick_mark(value, rankings[key]))
        # Running sum per experiment and metric, averaged after the loop.
        experiments_means[i][position] += value
    strat_name = strat_result['name']
    cases_label = " (" + str(strats_amounts[strat_name]) + " cases)"
    # The strategy name heads the accuracy column of its group.
    header_columns.extend([strat_name + cases_label, 'Kappa', 'F1', 'AUC'])
  for row, metric_sums in zip(experiments, experiments_means):
    for metric_sum in metric_sums:
      row.append(round(metric_sum / len(strats_results), 2))
  header_columns.extend(["Means", 'Kappa', 'F1', 'AUC'])
  return tabulate(experiments, headers=header_columns)

In [None]:
# Run every configuration in `experiments` for the labels listed in
# most_unbalanced_strategies.
most_unbalanced_results = execute_experiments(X_train, Y_train, X_val, Y_val, most_unbalanced_strategies, experiments)

In [None]:
# Inspect which fields the result dictionary of one strategy carries.
most_unbalanced_results[0].keys()

dict_keys(['name', 'accs', 'kappas', 'f1s', 'aucs', 'coefs', 'features'])

In [None]:
# Print, per strategy and experiment, the features with the largest coefficients.
for result in most_unbalanced_results:
  strat_name = result['name']
  print('--------- Resultados para ' + strat_name + ' -----------')
  for i, coef_matrix in enumerate(result['coefs']):
    # Names are offset by one: experiments with an empty configuration store
    # no coefficients (presumably only the first experiment -- verify).
    print('++++++++++ Experimento: ' + experiments_names[i+1] + '+++++++++++')
    make_top_tables(result['features'][i], coef_matrix[0], 10)
    print()

In [None]:
# Print, per strategy and experiment, the features with the smallest coefficients.
for result in most_unbalanced_results:
  strat_name = result['name']
  print('--------- Resultados para ' + strat_name + ' -----------')
  for i, coef_matrix in enumerate(result['coefs']):
    # Names are offset by one: experiments with an empty configuration store
    # no coefficients (presumably only the first experiment -- verify).
    print('++++++++++ Experimento: ' + experiments_names[i+1] + '+++++++++++')
    make_bottom_tables(result['features'][i], coef_matrix[0])
    print()

In [None]:
# Compact text table: one column per strategy, four metrics packed in each cell.
print(make_table_for_python(most_unbalanced_results))

In [None]:
# Report how many features each experiment produced for every strategy.
print('Number of features')
for strategy in most_unbalanced_results:
  print(strategy['name'])
  # The first experiment name is skipped and the feature lists are read with a
  # one-position offset (presumably the first experiment is the baseline,
  # which stores no feature list -- verify).
  for i, experiment_name in enumerate(experiments_names[1:], start=1):
    print(experiment_name, ':', len(strategy['features'][i-1]))
  print()

In [None]:
# Wide table: four metric columns per strategy plus the per-metric means.
print(make_table_for_latex(most_unbalanced_results))

In [None]:
# Same experiment set, now for the labels listed in less_unbalanced_strategies.
less_unbalanced_results = execute_experiments(X_train, Y_train, X_val, Y_val, less_unbalanced_strategies, experiments)

In [None]:
# Wide table: four metric columns per strategy plus the per-metric means.
print(make_table_for_latex(less_unbalanced_results))

In [None]:
# Same experiment set, now for every label listed in y_keys.
all_strats_results = execute_experiments(X_train, Y_train, X_val, Y_val, y_keys, experiments)

In [None]:
# Wide table: four metric columns per strategy plus the per-metric means.
print(make_table_for_latex(all_strats_results))

In [None]:
# Experiment configurations (presumably for the multi-label / classifier-chain
# runs -- confirm against the caller). Each entry is one experiment: a list of
# input descriptors whose feature blocks are combined column-wise. Descriptor
# keys used below:
#   'name'           column to read the input from
#   'type'           how the column is turned into features
#                    ('categorical_diagnostic', 'numeric', 'binary', 'string',
#                    'special_string', 'embed_string')
#   'special_token'  prefix added to every token ('special_string', 'embed_string')
#   'n_features'     embedding size ('embed_string')
#   'transformation' pooling code for the word vectors ('embed_string')
# The order must match ml_experiments_names, which labels the entries by position.
ml_experiments = [
               [{'name': 'Encoded Diagnosis', 'type': 'categorical_diagnostic'}],
               [{'name': 'Amount of SET perceptions', 'type': 'numeric'},
                {'name': 'Amount of ST perceptions', 'type': 'numeric'},
                {'name': 'Amount of P perceptions', 'type': 'numeric'},
                {'name': 'Amount of M perceptions', 'type': 'numeric'}
                ],
               [{'name': 'Has SET perceptions', 'type': 'binary'},
                {'name': 'Has ST perceptions', 'type': 'binary'},
                {'name': 'Has P perceptions', 'type': 'binary'},
                {'name': 'Has M perceptions', 'type': 'binary'}
                ],
               [{'name': 'All perceptions', 'type': 'string'}],
               [{'name': 'Medical Perceptions', 'type': 'string'}],
               [{'name': 'Psychologist Perceptions', 'type': 'string'}],
               [{'name': 'Speech Therapist Perceptions', 'type': 'string'}],
               [{'name': 'Special Education Teacher Perceptions', 'type': 'string'}],
               [{'name': 'All perceptions', 'type': 'string'}, {'name': 'Encoded Diagnosis', 'type': 'categorical_diagnostic'}],
               [{'name': 'Special Education Teacher Perceptions', 'type': 'special_string', 'special_token': 'set'}, 
                {'name': 'Speech Therapist Perceptions', 'type': 'special_string', 'special_token': 'st'},
                {'name': 'Psychologist Perceptions', 'type': 'special_string', 'special_token': 'p'},
                {'name': 'Medical Perceptions', 'type': 'special_string', 'special_token': 'm'}],
               [{'name': 'Encoded Diagnosis', 'type': 'categorical_diagnostic'},
                {'name': 'Special Education Teacher Perceptions', 'type': 'special_string', 'special_token': 'set'}, 
                {'name': 'Speech Therapist Perceptions', 'type': 'special_string', 'special_token': 'st'},
                {'name': 'Psychologist Perceptions', 'type': 'special_string', 'special_token': 'p'},
                {'name': 'Medical Perceptions', 'type': 'special_string', 'special_token': 'm'}],
               [{'name': 'Medical Perceptions', 'type': 'string'}, {'name': 'Encoded Diagnosis', 'type': 'categorical_diagnostic'}],
               [{'name': 'Psychologist Perceptions', 'type': 'string'}, {'name': 'Encoded Diagnosis', 'type': 'categorical_diagnostic'}],
               [{'name': 'Speech Therapist Perceptions', 'type': 'string'}, {'name': 'Encoded Diagnosis', 'type': 'categorical_diagnostic'}],
               [{'name': 'Special Education Teacher Perceptions', 'type': 'string'}, {'name': 'Encoded Diagnosis', 'type': 'categorical_diagnostic'}],
               [{'name': 'All perceptions', 'type': 'embed_string', 'n_features': 100, 'special_token': '', 'transformation': 0}],
               [{'name': 'All perceptions', 'type': 'embed_string', 'n_features': 100, 'special_token': '', 'transformation': 0}, 
                {'name': 'Encoded Diagnosis', 'type': 'categorical_diagnostic'}],
              # Disabled configurations (BETO and sentence-embedding inputs) are
              # kept below for reference; their names are commented out in
              # ml_experiments_names as well.
              #  [{'name': 'All perceptions', 'type': 'BETO_string', 'transformation': 0}],
              #  [{'name': 'All perceptions', 'type': 'BETO_string', 'transformation': 0}, 
              #   {'name': 'Encoded Diagnosis', 'type': 'categorical_diagnostic'}],
              #  [{'name': 'Special Education Teacher Perceptions', 'type': 'BETO_string', 'transformation': 0, 'code': 'set'},
              #   {'name': 'Speech Therapist Perceptions', 'type': 'BETO_string', 'transformation': 0, 'code': 'st'},
              #   {'name': 'Psychologist Perceptions', 'type': 'BETO_string', 'transformation': 0, 'code': 'p'},
              #   {'name': 'Medical Perceptions', 'type': 'BETO_string', 'transformation': 0, 'code': 'm'}
              #   ],
              #  [{'name': 'Special Education Teacher Perceptions', 'type': 'BETO_string', 'transformation': 0, 'code': 'set'},
              #   {'name': 'Speech Therapist Perceptions', 'type': 'BETO_string', 'transformation': 0, 'code': 'st'},
              #   {'name': 'Psychologist Perceptions', 'type': 'BETO_string', 'transformation': 0, 'code': 'p'},
              #   {'name': 'Medical Perceptions', 'type': 'BETO_string', 'transformation': 0, 'code': 'm'}, 
              #   {'name': 'Encoded Diagnosis', 'type': 'categorical_diagnostic'}
              #   ],
              #  [{'name': 'All perceptions', 'type': 'sentence_embedding'}],
              #  [{'name': 'All perceptions', 'type': 'sentence_embedding'}, 
              #   {'name': 'Encoded Diagnosis', 'type': 'categorical_diagnostic'}],
              #  [{'name': 'Special Education Teacher Perceptions', 'type': 'sentence_embedding'},
              #   {'name': 'Speech Therapist Perceptions', 'type': 'sentence_embedding'},
              #   {'name': 'Psychologist Perceptions', 'type': 'sentence_embedding'},
              #   {'name': 'Medical Perceptions', 'type': 'sentence_embedding'}],
              #  [{'name': 'Special Education Teacher Perceptions', 'type': 'sentence_embedding'},
              #   {'name': 'Speech Therapist Perceptions', 'type': 'sentence_embedding'},
              #   {'name': 'Psychologist Perceptions', 'type': 'sentence_embedding'},
              #   {'name': 'Medical Perceptions', 'type': 'sentence_embedding'},
              #   {'name': 'Encoded Diagnosis', 'type': 'categorical_diagnostic'}
              #   ]
               ]
# Display names for the entries of ml_experiments, matched by position: the
# 17 active names below correspond to the 17 active configurations. Names of
# disabled configurations stay commented out so both lists remain aligned.
ml_experiments_names = [
                     "Only diagnosis (OHE)",
                     "Amount of perceptions (numeric attributes)",
                     "Has perceptions? (binary attributes)",
                     "All joined perceptions (string attribute)",
                     "Medical perceptions only (string attribute)",
                     "Psychologist perceptions only (string attribute)",
                     "Speech therapist perceptions only (string attribute)",
                     "Special education teacher perceptions only (string attribute)",
                     "All joined perceptions (string attribute) and encoded diagnosis",
                     "Perceptions (string attribute) with special token to differentiate them",
                     "Perceptions (string attribute) with special token and encoded diagnosis",
                     "Medical perceptions + diagnosis",
                     "Psychologist perceptions + diagnosis",
                     "Speech therapist perceptions + diagnosis",
                     "Special education teacher + diagnosis",
                     "All perceptions with word embedding",
                     "All perceptions (word embedding) + diagnosis",
                    #  "All perceptions with BETO (average)",
                    #  "All perceptions (BETO) + diagnosis (average)",
                    #  "Different perceptions with BETO (average)",
                    #  "Different perceptions (BETO) + diagnosis (average)",
                    #  "Sentence BERT embedding",
                    #  "Sentence + diagnosis",
                    #  "Different perceptions",
                    #  "Different + diagnosis"
]

In [None]:
def _vectorizer_vocabulary(vectorizer):
  """Return the fitted feature names of a scikit-learn vectorizer as plain strings."""
  # get_feature_names() was removed in scikit-learn 1.2, while releases before 1.0
  # do not offer get_feature_names_out(); use whichever the installed version has.
  getter = getattr(vectorizer, 'get_feature_names_out', None)
  if getter is None:
    getter = vectorizer.get_feature_names
  return [str(name) for name in getter()]


def _bag_of_words_frames(vectorizer, train_texts, val_texts, show_vocabulary=False):
  """Fit ``vectorizer`` on the training texts and return (train, val) count frames."""
  vectorizer.fit(train_texts)
  vocabulary = _vectorizer_vocabulary(vectorizer)
  if show_vocabulary:
    print(vocabulary)
  train_df = pd.DataFrame(vectorizer.transform(train_texts).toarray(), columns=vocabulary)
  val_df = pd.DataFrame(vectorizer.transform(val_texts).toarray(), columns=vocabulary)
  return train_df, val_df


def _dense_vectors_frame(vectors, n_features):
  """Wrap a list of fixed-length vectors in a frame with '<k>_feature' column names."""
  return pd.DataFrame(vectors, columns=[str(k+1)+'_feature' for k in range(n_features)])


def _experiment_input_frames(feature_spec, X_train, X_val):
  """Build the (train, val) feature frames for one input spec of an experiment.

  Returns None when ``feature_spec['type']`` is not one of the handled types,
  so that the caller skips it (the original code ignored such inputs as well).
  """
  kind = feature_spec['type']
  name = feature_spec.get('name')

  # Encoding diagnosis as categorical attribute
  if kind == 'categorical_diagnostic':
    enc = OneHotEncoder(handle_unknown='ignore')
    # Series.append was removed in pandas 2.0; pd.concat is the equivalent call.
    all_values = pd.concat([X_train[name], X_val[name]])
    enc.fit(np.asarray(all_values).reshape(-1, 1))
    # NOTE(review): the column labels come from the global `diagnoses_codes`; this
    # assumes the encoder's categories match its keys in number and order - confirm.
    columns = list(diagnoses_codes.keys())
    train_df = pd.DataFrame(enc.transform(np.asarray(X_train[name]).reshape(-1, 1)).toarray(), columns=columns)
    val_df = pd.DataFrame(enc.transform(np.asarray(X_val[name]).reshape(-1, 1)).toarray(), columns=columns)
    return train_df, val_df

  # Copying numeric and binary attributes
  if kind == 'numeric' or kind == 'binary':
    return (pd.DataFrame({name: X_train[name].to_numpy()}),
            pd.DataFrame({name: X_val[name].to_numpy()}))

  # Encoding strings
  if kind == 'string':
    vectorizer = CountVectorizer(strip_accents='unicode', lowercase=True, stop_words=stopwords,
                                 ngram_range=(1,3), max_df=0.8, min_df=0.05)
    return _bag_of_words_frames(vectorizer, X_train[name], X_val[name], show_vocabulary=True)

  # Encoding strings with special tokens
  if kind == 'special_string':
    special_token = feature_spec['special_token']
    vectorizer = CountVectorizer(strip_accents='unicode',
                                 tokenizer=custom_tokenizer(special_token),
                                 lowercase=True,
                                 stop_words=[special_token + "_" + word for word in stopwords],
                                 ngram_range=(1,3),
                                 max_df=0.8,
                                 min_df=0.05)
    return _bag_of_words_frames(vectorizer, X_train[name], X_val[name])

  # Word2Vec trained on the training perceptions, pooled into one vector per document
  if kind == "embed_string":
    n_features = feature_spec['n_features']
    transformation = feature_spec['transformation']
    if transformation == 0:
      pool = normalize_word_vectors
    elif transformation == 1:
      pool = average_word_vectors
    else:
      # Previously this fell through and either raised NameError or silently
      # reused the vectors computed for an earlier input.
      raise ValueError("Unknown 'transformation' for embed_string input: %r" % (transformation,))

    c_tokenizer = custom_tokenizer(feature_spec['special_token'])
    tokenized_train_perceptions = [c_tokenizer(doc) for doc in X_train[name]]
    tokenized_val_perceptions = [c_tokenizer(doc) for doc in X_val[name]]
    # NOTE(review): `size=` and `wv.vocab` are the gensim 3.x spellings (gensim 4
    # renamed them to `vector_size` and `wv.key_to_index`); confirm the installed version.
    embedding_model = Word2Vec(
        tokenized_train_perceptions,
        min_count=1,
        window=3,
        sg=1,
        size=n_features)
    vocab = embedding_model.wv.vocab
    train_vectors = [pool(tokens, embedding_model, vocab, n_features) for tokens in tokenized_train_perceptions]
    val_vectors = [pool(tokens, embedding_model, vocab, n_features) for tokens in tokenized_val_perceptions]
    return (_dense_vectors_frame(train_vectors, n_features),
            _dense_vectors_frame(val_vectors, n_features))

  if kind == "BETO_string":
    transformation = feature_spec['transformation']
    train_vectors = [transform_perception_to_feature_vector(perception, transformation)
                     for perception in X_train[name]]
    val_vectors = [transform_perception_to_feature_vector(perception, transformation)
                   for perception in X_val[name]]
    return (_dense_vectors_frame(train_vectors, len(train_vectors[0])),
            _dense_vectors_frame(val_vectors, len(val_vectors[0])))

  if kind == "sentence_embedding":
    train_vectors = [sentence_model.encode(perception) for perception in X_train[name]]
    val_vectors = [sentence_model.encode(perception) for perception in X_val[name]]
    return (_dense_vectors_frame(train_vectors, len(train_vectors[0])),
            _dense_vectors_frame(val_vectors, len(val_vectors[0])))

  # The label-as-feature types ("binary_labels", "binary_single_label") were
  # disabled in the original cell and remain unhandled here.
  return None


def classifier_chain_experiments(X_train, Y_train, X_val, Y_val, y_keys, experiments, order):
  """Fit one logistic-regression ClassifierChain per experiment and predict on validation.

  Args:
    X_train, X_val: frames holding the raw input columns named by the experiment specs.
    Y_train: label frame the chain is fitted on.
    Y_val, y_keys: accepted for interface compatibility; not used by this function.
    experiments: list of experiments; each experiment is a list of input specs
      (dicts with at least 'type', plus 'name' and type-specific keys).
      Empty experiments are announced but skipped.
    order: label order passed to ClassifierChain.

  Returns:
    A list with the ``classifier.predict`` output of every non-empty experiment,
    in the order the experiments were given.

  Raises:
    ValueError: if an 'embed_string' input has an unknown 'transformation', or if
      a non-empty experiment yields no feature columns at all.
  """
  preds = []
  for number, experiment in enumerate(experiments, start=1):
    print("\t"+str(number)+'° experimento')
    if len(experiment) == 0:
      continue

    base_lr = LogisticRegression(penalty='l2', dual=True, solver='liblinear', max_iter=10000)
    classifier = ClassifierChain(base_lr, order=order)

    # Collect one frame per input and concatenate once, instead of growing a
    # DataFrame column block by column block.
    train_blocks = []
    val_blocks = []
    for feature_spec in experiment:
      frames = _experiment_input_frames(feature_spec, X_train, X_val)
      if frames is not None:
        train_blocks.append(frames[0])
        val_blocks.append(frames[1])
    if not train_blocks:
      raise ValueError("Experiment %d produced no features (unhandled input types)" % number)

    X_train_transformed = pd.concat(train_blocks, axis=1)
    X_val_transformed = pd.concat(val_blocks, axis=1)
    del train_blocks, val_blocks

    classifier.fit(X_train_transformed, Y_train)
    preds.append(classifier.predict(X_val_transformed))
    del X_train_transformed, X_val_transformed
  return preds

In [None]:
# Chain order: positions of the label columns, most unbalanced strategies first.
label_columns = list(Y_val.columns)
order = []
for strategy_group in (most_unbalanced_strategies, less_unbalanced_strategies):
    for strat in strategy_group:
        order.append(label_columns.index(strat))
print(order)
ml_results = classifier_chain_experiments(X_train, Y_train, X_val, Y_val, y_keys, ml_experiments, order)

In [None]:
def _best_score_mark(score, ranked_scores, difference_val):
  """Return the marker for one score: "" unless it is the best, else "(*)" or "(**)"."""
  if score != ranked_scores[0]:
    return ""
  # Scores are rounded to two decimals, so compare the rounded gap: the naive
  # `score >= runner_up + difference_val` fails for exact gaps because of float
  # error (e.g. 0.8 + 0.04 == 0.8400000000000001 > 0.84).
  if len(ranked_scores) > 1 and round(score - ranked_scores[1], 2) >= difference_val:
    return "(**)"
  return "(*)"


def make_table_for_latex_multi_label_experiments(experiment_preds, Y_vals, labels_to_eval, exps_names, difference_val=0.04):
  """Tabulate accuracy, kappa, macro-F1 and AUC per label and per experiment.

  Args:
    experiment_preds: one 2-D prediction array per experiment; column j holds the
      predictions for the j-th column of ``Y_vals``.
    Y_vals: frame with the true labels.
    labels_to_eval: names of the ``Y_vals`` columns to report.
    exps_names: row labels, one per entry of ``experiment_preds``.
    difference_val: minimum gap between the best and second-best distinct score
      for the best one to be marked "(**)" instead of "(*)".

  Returns:
    The string produced by ``tabulate``: one row per experiment, four columns per
    evaluated label, and four trailing columns with the per-experiment means.

  The header also reads the number of cases per label from the module-level
  ``strats_amounts`` mapping.
  """
  metric_keys = ('accs', 'kappas', 'f1s', 'aucs')
  label_columns = list(Y_vals.columns)

  # Score every experiment on every requested label (rounded to two decimals).
  strats_results = []
  for strategy in labels_to_eval:
    label_index = label_columns.index(strategy)
    y_val = Y_vals[strategy]
    scores = {'name': strategy, 'accs': [], 'kappas': [], 'f1s': [], 'aucs': []}
    for preds in experiment_preds:
      y_pred = preds[:, label_index]
      scores['accs'].append(round(accuracy_score(y_val, y_pred), 2))
      scores['kappas'].append(round(cohen_kappa_score(y_val, y_pred), 2))
      # Mean of the F1 of the positive and of the negative class.
      scores['f1s'].append(round((f1_score(y_val, y_pred, pos_label=1) + f1_score(y_val, y_pred, pos_label=0))/2, 2))
      scores['aucs'].append(round(roc_auc_score(y_val, y_pred), 2))
    strats_results.append(scores)

  header_columns = ['Experiment names']
  experiments = [[name] for name in exps_names]
  experiments_means = [[0, 0, 0, 0] for _ in exps_names]
  for strat_result in strats_results:
    # Distinct scores per metric, best first, used to decide the markers.
    ranked = {key: sorted(set(strat_result[key]), reverse=True) for key in metric_keys}
    for row in range(len(strat_result['accs'])):
      for column, key in enumerate(metric_keys):
        score = strat_result[key][row]
        experiments[row].append(str(score) + _best_score_mark(score, ranked[key], difference_val))
        experiments_means[row][column] += score
    header_columns.extend(
        [strat_result['name']+(" ("+str(strats_amounts[strat_result['name']])+" cases)"), 'Kappa', 'F1', 'AUC'])

  n_labels = len(strats_results)
  for row, sums in enumerate(experiments_means):
    experiments[row].extend(round(total/n_labels, 2) for total in sums)
  header_columns.extend(
        ["Means", 'Kappa', 'F1', 'AUC'])
  return tabulate(experiments, headers=header_columns)

In [None]:
# Results table restricted to the less unbalanced strategies.
print(make_table_for_latex_multi_label_experiments(
    experiment_preds=ml_results,
    Y_vals=Y_val,
    labels_to_eval=less_unbalanced_strategies,
    exps_names=ml_experiments_names))

In [None]:
# Results table restricted to the most unbalanced strategies.
print(make_table_for_latex_multi_label_experiments(
    experiment_preds=ml_results,
    Y_vals=Y_val,
    labels_to_eval=most_unbalanced_strategies,
    exps_names=ml_experiments_names))

In [None]:
# Results table over every label listed in y_keys.
print(make_table_for_latex_multi_label_experiments(
    experiment_preds=ml_results,
    Y_vals=Y_val,
    labels_to_eval=y_keys,
    exps_names=ml_experiments_names))

In [None]:
# Display the current contents of most_unbalanced_strategies (the labels placed first in the chain order).
most_unbalanced_strategies

['Tutoría entre pares',
 'Apoyo fonoaudióloga(o)',
 'Apoyo Kinesióloga(o)',
 'Apoyo Médico General',
 'Apoyo Terapeuta Ocupacional',
 'Control Neurólogo',
 'Adecuación curricular de objetivos']