In [None]:
from google.colab import drive

# Directory under which Google Drive is mounted for this Colab session.
DRIVE_MOUNT_POINT = '/content/gdrive'
drive.mount(DRIVE_MOUNT_POINT)

Mounted at /content/gdrive


In [None]:
!pip install --upgrade gensim
!pip install transformers
!pip install -U sentence-transformers

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import ast
from collections import Counter
import re

import nltk
import numpy as np
from nltk.collocations import BigramCollocationFinder, BigramAssocMeasures
# Download the NLTK 'stopwords' corpus and the 'punkt' tokenizer data.
for nltk_resource in ('stopwords', 'punkt'):
    nltk.download(nltk_resource)

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.dummy import DummyClassifier
from sklearn.metrics import cohen_kappa_score, accuracy_score, f1_score, roc_auc_score, precision_score
from sklearn.preprocessing import OneHotEncoder
from sklearn.feature_extraction.text import CountVectorizer

from tabulate import tabulate

from gensim.models import Word2Vec

from sentence_transformers import SentenceTransformer

from transformers import BertTokenizer, BertModel, FeatureExtractionPipeline

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


In [None]:
# Read the anonymised dataset. The path is relative to the working directory
# (NOTE(review): presumably /content, where Drive was mounted - confirm).
students_strats = pd.read_csv('gdrive/My Drive/magister/anonimized_dataset.csv')
columns = students_strats.columns

# Every column other than 'Diagnoses' and 'Index' is stored as the string form
# of a Python literal; parse those cells back into Python objects.
for var in columns:
    if var in ('Diagnoses', 'Index'):
        continue
    students_strats[var] = students_strats[var].apply(ast.literal_eval)

students_strats.columns

Index(['Index', 'Diagnoses', 'Special Education Teacher Perceptions',
       'Psychological Perceptions', 'Medical Perceptions',
       'Speech Therapist Perceptions', 'Written Strategies',
       'Encoded Strategies'],
      dtype='object')

In [None]:
def _summarise_perceptions(perception_lists):
  """Summarise one professional's perceptions column.

  Args:
    perception_lists: iterable holding, for every student, the list of
      perception strings written by one kind of professional.

  Returns:
    A tuple ``(joined, amounts, flags)`` of three lists with one entry per
    student, in input order:
      * joined: the perceptions concatenated, each followed by one space
        (so a non-empty text ends with a trailing space);
      * amounts: how many perceptions the student has;
      * flags: 1 when the student has at least one perception, else 0.
  """
  joined = []
  amounts = []
  flags = []
  for perceptions in perception_lists:
    joined.append("".join(perception + " " for perception in perceptions))
    amounts.append(len(perceptions))
    flags.append(1 if len(perceptions) > 0 else 0)
  return joined, amounts, flags


# The columns are walked positionally (row order of the frame) instead of being
# looked up through the values of the 'Index' column, so the results always
# line up with the other columns of `students_strats`.
joined_set_perceptions, amount_set_perceptions, has_set = _summarise_perceptions(
    students_strats['Special Education Teacher Perceptions'])
joined_st_perceptions, amount_st_perceptions, has_st = _summarise_perceptions(
    students_strats['Speech Therapist Perceptions'])
joined_p_perceptions, amount_p_perceptions, has_p = _summarise_perceptions(
    students_strats['Psychological Perceptions'])
joined_m_perceptions, amount_m_perceptions, has_m = _summarise_perceptions(
    students_strats['Medical Perceptions'])

# Full text per student: special education teacher, speech therapist,
# psychological and medical perceptions, in that order.
joined_perceptions = [
    set_text + st_text + p_text + m_text
    for set_text, st_text, p_text, m_text in zip(
        joined_set_perceptions, joined_st_perceptions,
        joined_p_perceptions, joined_m_perceptions)
]

In [None]:
# Strategy code -> strategy name. The codes are not contiguous: 2, 5 and 6
# have no entry in this map.
categories_number_words = {
        1: "Apoyo Pedagógico en asignaturas",
        3: "Apoyo pedagógico personal",
        4: "Tutoría entre pares",
        7: "Hacer a la familia partícipe del proceso",
        8: "Apoyo psicóloga(o)",
        9: "Apoyo fonoaudióloga(o)",
        10: "Apoyo Educador(a) Diferencial",
        11: "Apoyo Kinesióloga(o)",
        12: "Apoyo Médico General",
        13: "Apoyo Terapeuta Ocupacional",
        14: "Control Neurólogo",
        15: "Apoyo Interdisciplinario",
        16: "Adecuación curricular de acceso",
        17: "Adecuación curricular de objetivos"
    }
# Inverse lookup: strategy name -> strategy code.
categories_words_number = {v: k for k, v in categories_number_words.items()}

# Diagnosis name -> integer code (0 to 11).
diagnoses_codes = {
    "Trastorno específico del lenguaje": 0,
    "Trastorno por déficit atencional": 1,
    "Dificultad específica de aprendizaje": 2,
    "Discapacidad intelectual": 3,
    "Discapacidad visual": 4,
    "Trastorno del espectro autista": 5,
    "Discapacidad auditiva - Hipoacusia": 6,
    "Funcionamiento intelectual limítrofe": 7,
    "Síndrome de Down": 8,
    "Trastorno motor": 9,
    "Multidéficit": 10,
    "Retraso global del desarrollo": 11
}

# strat_present: strategy name -> list with one 0/1 flag per student telling
# whether that strategy's code appears in the student's 'Encoded Strategies'.
# diag_codes: integer code of every student's diagnosis (KeyError on a
# diagnosis that is missing from `diagnoses_codes`).
strat_present = {strat: [] for strat in categories_words_number}
diag_codes = []

# Rows are walked positionally (frame order) rather than looked up through the
# values of the 'Index' column, so the flags always line up with the other
# columns of `students_strats`.
for diag, encoded_strategies in zip(students_strats['Diagnoses'],
                                    students_strats['Encoded Strategies']):
  diag_codes.append(diagnoses_codes[diag])
  for strat_number, strat_name in categories_number_words.items():
    strat_present[strat_name].append(1 if strat_number in encoded_strategies else 0)

In [None]:
# Columns of the dataset to export: column name -> one value per student.
# 'Diagnosis' is taken straight from the source frame (a pandas Series); the
# other entries are the plain lists built in the previous cells.
new_dataset_to_export = {
    'Encoded Diagnosis': diag_codes,
    'Diagnosis': students_strats['Diagnoses'],
    'All perceptions': joined_perceptions,
    'Special Education Teacher Perceptions': joined_set_perceptions,
    'Speech Therapist Perceptions': joined_st_perceptions,
    'Psychologist Perceptions': joined_p_perceptions,
    'Medical Perceptions': joined_m_perceptions,
    'Amount of SET perceptions': amount_set_perceptions,
    'Amount of ST perceptions': amount_st_perceptions,
    'Amount of P perceptions': amount_p_perceptions,
    'Amount of M perceptions': amount_m_perceptions,
    'Has SET perceptions': has_set,
    'Has ST perceptions': has_st,
    'Has P perceptions': has_p,
    'Has M perceptions': has_m,
}
# Captured before the strategy columns are merged in, so `x_keys` names only
# the non-strategy columns.
x_keys = list(new_dataset_to_export.keys())
# Add one 0/1 column per strategy name.
new_dataset_to_export.update(strat_present)

In [None]:
# Sanity check: every column must hold the same number of entries.
for column_name, column_values in new_dataset_to_export.items():
  print(column_name, len(column_values))

Encoded Diagnosis 3035
Diagnosis 3035
All perceptions 3035
Special Education Teacher Perceptions 3035
Speech Therapist Perceptions 3035
Psychologist Perceptions 3035
Medical Perceptions 3035
Amount of SET perceptions 3035
Amount of ST perceptions 3035
Amount of P perceptions 3035
Amount of M perceptions 3035
Has SET perceptions 3035
Has ST perceptions 3035
Has P perceptions 3035
Has M perceptions 3035
Apoyo Pedagógico en asignaturas 3035
Apoyo pedagógico personal 3035
Tutoría entre pares 3035
Hacer a la familia partícipe del proceso 3035
Apoyo psicóloga(o) 3035
Apoyo fonoaudióloga(o) 3035
Apoyo Educador(a) Diferencial 3035
Apoyo Kinesióloga(o) 3035
Apoyo Médico General 3035
Apoyo Terapeuta Ocupacional 3035
Control Neurólogo 3035
Apoyo Interdisciplinario 3035
Adecuación curricular de acceso 3035
Adecuación curricular de objetivos 3035


In [None]:
# Label columns (one per strategy) and the frame holding every column.
y_keys = list(strat_present.keys())
df = pd.DataFrame(data=new_dataset_to_export)
# X deliberately keeps the label columns: iterative_stratification reads them
# from X itself.
X = df
Y = df[y_keys]

# Order in which the strategies are listed in `strats_amounts`.
strats_amounts_order = [
    'Adecuación curricular de acceso',
    'Hacer a la familia partícipe del proceso',
    'Apoyo Interdisciplinario',
    'Apoyo Educador(a) Diferencial',
    'Apoyo pedagógico personal',
    'Apoyo fonoaudióloga(o)',
    'Apoyo psicóloga(o)',
    'Apoyo Terapeuta Ocupacional',
    'Tutoría entre pares',
    'Control Neurólogo',
    'Apoyo Médico General',
    'Apoyo Kinesióloga(o)',
    'Adecuación curricular de objetivos',
    'Apoyo Pedagógico en asignaturas',
]
# Number of students with each strategy, counted from the data (the columns
# hold 0/1 flags) rather than hard-coded, so the counts cannot drift from `df`.
strats_amounts = {strategy: int(df[strategy].sum()) for strategy in strats_amounts_order}

# A strategy counts as "most unbalanced" when fewer than 15% or more than 85%
# of the students have it; every other strategy is "less unbalanced".
most_unbalanced_strategies = [
    strategy for strategy in y_keys
    if (strats_amounts[strategy] < len(X)*0.15 or strats_amounts[strategy] > len(X)*0.85)
]
less_unbalanced_strategies = [strategy for strategy in y_keys if strategy not in most_unbalanced_strategies]

In [None]:
def _pick_partition(train_need, val_need, test_need):
  """Return 'train', 'val' or 'test' given how many examples each still needs.

  The partition with the largest remaining need wins. Ties are resolved as
  follows: train wins a two-way tie for the maximum, val wins a tie with test,
  and a three-way tie goes to test.
  """
  all_tied = train_need == val_need == test_need
  if train_need >= val_need and train_need >= test_need and not all_tied:
    return 'train'
  if val_need > train_need and val_need >= test_need:
    return 'val'
  return 'test'


def iterative_stratification(X, y_labels, train_d, val_d, test_d, verbose=True):
  """Split a multi-label dataset into train/val/test keeping label proportions.

  The label with the fewest unassigned positive rows is processed first. Each
  of its rows goes to the partition that still needs the most positives of
  that label, and the needs of every label present on the row are reduced.
  Rows that have no positive label at all are handed out last, each to the
  partition with the largest remaining overall need (ties: train, val, test).

  Args:
    X: DataFrame containing, among others, one 0/1 column per label. It is
      not modified.
    y_labels: names of the label columns in `X`.
    train_d: fraction of the examples wanted in the train partition.
    val_d: fraction wanted in the validation partition.
    test_d: fraction wanted in the test partition.
    verbose: when True, print the number of unassigned rows and the label
      being processed on every round.

  Returns:
    Tuple ``(train_partition, val_partition, test_partition)`` of DataFrames
    with the columns and dtypes of `X` and a fresh 0..n-1 index. Rows appear
    in the order in which they were assigned.
  """
  partition_names = ('train', 'val', 'test')
  proportions = {'train': train_d, 'val': val_d, 'test': test_d}
  y_labels = list(y_labels)
  n_rows = X.shape[0]

  # Work on plain Python lists addressed by row position: no per-row
  # DataFrame append/drop, and no dependence on the index labels of X.
  is_positive = {label: (X[label] == 1).tolist() for label in y_labels}
  row_labels = [
      [label for label in y_labels if is_positive[label][position]]
      for position in range(n_rows)
  ]

  # Unassigned positives per label; 0 for a label that never occurs.
  remaining_per_label = {label: sum(is_positive[label]) for label in y_labels}

  # Examples each partition still needs, per label and overall.
  needed_per_label = {
      name: {label: remaining_per_label[label] * proportions[name] for label in y_labels}
      for name in partition_names
  }
  needed_total = {name: n_rows * proportions[name] for name in partition_names}

  assigned = {name: [] for name in partition_names}
  unassigned = [True] * n_rows
  n_unassigned = n_rows

  def _assign(position, name):
    """Put the row at `position` into partition `name` and update the needs."""
    assigned[name].append(position)
    unassigned[position] = False
    for label in row_labels[position]:
      needed_per_label[name][label] -= 1
      remaining_per_label[label] -= 1
    needed_total[name] -= 1

  while n_unassigned > 0:
    if verbose:
      print(n_unassigned)

    # Rarest label among those that still have unassigned positives; the
    # first one in `y_labels` order wins a tie.
    min_label = None
    for label in y_labels:
      occurrences = remaining_per_label[label]
      if occurrences > 0 and (min_label is None
                              or occurrences < remaining_per_label[min_label]):
        min_label = label
    if min_label is None:
      # Only rows without any positive label are left.
      break
    if verbose:
      print(min_label)

    positions = [
        position for position in range(n_rows)
        if unassigned[position] and is_positive[min_label][position]
    ]
    for position in positions:
      target = _pick_partition(needed_per_label['train'][min_label],
                               needed_per_label['val'][min_label],
                               needed_per_label['test'][min_label])
      _assign(position, target)
      n_unassigned -= 1

  # Rows with no positive label: fill the partitions up to their overall size.
  for position in range(n_rows):
    if unassigned[position]:
      target = max(partition_names, key=lambda name: needed_total[name])
      _assign(position, target)

  train_partition = X.iloc[assigned['train']].reset_index(drop=True)
  val_partition = X.iloc[assigned['val']].reset_index(drop=True)
  test_partition = X.iloc[assigned['test']].reset_index(drop=True)
  return train_partition, val_partition, test_partition

In [None]:
# 60% train / 20% validation / 20% test, stratified on every strategy column.
train, val, test = iterative_stratification(X, y_keys, train_d=0.6, val_d=0.2, test_d=0.2)

3035
Apoyo Kinesióloga(o)
3003
Control Neurólogo
2942
Apoyo Médico General
2883
Apoyo Terapeuta Ocupacional
2757
Adecuación curricular de objetivos
2494
Tutoría entre pares
2214
Apoyo fonoaudióloga(o)
1914
Apoyo psicóloga(o)
1516
Apoyo pedagógico personal
869
Apoyo Educador(a) Diferencial
491
Apoyo Pedagógico en asignaturas
336
Apoyo Interdisciplinario
188
Hacer a la familia partícipe del proceso
23
Adecuación curricular de acceso


In [None]:
# (rows, columns) of the training partition.
train.shape

(1836, 29)

In [None]:
# Preview the first rows of the training partition.
train.head()

Unnamed: 0,Encoded Diagnosis,Diagnosis,All perceptions,Special Education Teacher Perceptions,Speech Therapist Perceptions,Psychologist Perceptions,Medical Perceptions,Amount of SET perceptions,Amount of ST perceptions,Amount of P perceptions,Amount of M perceptions,Has SET perceptions,Has ST perceptions,Has P perceptions,Has M perceptions,Apoyo Pedagógico en asignaturas,Apoyo pedagógico personal,Tutoría entre pares,Hacer a la familia partícipe del proceso,Apoyo psicóloga(o),Apoyo fonoaudióloga(o),Apoyo Educador(a) Diferencial,Apoyo Kinesióloga(o),Apoyo Médico General,Apoyo Terapeuta Ocupacional,Control Neurólogo,Apoyo Interdisciplinario,Adecuación curricular de acceso,Adecuación curricular de objetivos
0,3,Discapacidad intelectual,"En cuanto a Lenguaje y Comunicacion, [ESTUDIAN...","En cuanto a Lenguaje y Comunicacion, [ESTUDIAN...",,-Establece relaciones sociales principalmente ...,Estudiante con atenciones medicas debido a su ...,1,0,1,1,1,0,1,1,0,0,0,0,0,0,1,1,0,1,0,0,0,0
1,3,Discapacidad intelectual,"En cuanto a Lenguaje y Comunicacion, [ESTUDIAN...","En cuanto a Lenguaje y Comunicacion, [ESTUDIAN...",,-Establece relaciones sociales principalmente ...,-Estudiante con atenciones medicas debido a su...,1,0,1,1,1,0,1,1,0,0,0,1,0,0,1,1,0,1,0,0,0,0
2,3,Discapacidad intelectual,"Habilidades (Cognitivas, comunicativas, social...","Habilidades (Cognitivas, comunicativas, social...",,Comunica sus deseos y emociones de manera mas ...,Controles periodicos al dia. Equipo multidisci...,1,0,1,1,1,0,1,1,1,0,0,1,0,1,0,1,0,0,0,1,1,0
3,3,Discapacidad intelectual,"scar, no presenta dificultades en el desarroll...","scar, no presenta dificultades en el desarroll...",,"[ESTUDIANTE], es un niño entusiasta y colabora...","Estudiante con un estado sano de salud, pero c...",1,0,1,1,1,0,1,1,1,0,0,1,0,1,0,1,0,0,1,1,1,0
4,9,Trastorno motor,En [ESTUDIANTE] se evidencia preferencia por e...,En [ESTUDIANTE] se evidencia preferencia por e...,,Estudiante cariñoso y respetuoso con sus compa...,,1,0,2,0,1,0,1,0,1,0,0,0,0,0,1,1,0,1,0,1,0,0


In [None]:
# (rows, columns) of the test partition.
test.shape

(592, 29)

In [None]:
# (rows, columns) of the validation partition.
val.shape

(607, 29)

In [None]:
# Write the training partition to train_ds.csv (relative path), without the index.
train.to_csv('train_ds.csv', index=False)

In [None]:
# Write the test partition to test_ds.csv (relative path), without the index.
test.to_csv('test_ds.csv', index=False)

In [None]:
# Write the validation partition to val_ds.csv (relative path), without the index.
val.to_csv('val_ds.csv', index=False)

In [None]:
# For every strategy, print its name and then the number of positive examples
# in the train, validation and test partitions.
for strat in y_keys:
  print(strat)
  # Comparing with 1 and summing gives 0 for a partition without positives,
  # where `value_counts()[1]` would raise a KeyError.
  print(*((partition[strat] == 1).sum() for partition in (train, val, test)))

Apoyo Pedagógico en asignaturas
788 263 263
Apoyo pedagógico personal
744 248 248
Tutoría entre pares
210 70 70
Hacer a la familia partícipe del proceso
1229 410 409
Apoyo psicóloga(o)
353 118 117
Apoyo fonoaudióloga(o)
227 76 75
Apoyo Educador(a) Diferencial
786 262 262
Apoyo Kinesióloga(o)
19 7 6
Apoyo Médico General
38 13 12
Apoyo Terapeuta Ocupacional
91 31 30
Control Neurólogo
38 13 12
Apoyo Interdisciplinario
865 288 288
Adecuación curricular de acceso
1333 474 455
Adecuación curricular de objetivos
169 56 56
