### **Definir el directorio de trabajo con Google Colab**
Define el directorio de trabajo como la carpeta `data` de la carpeta compartida `DS4A-Team12` de Drive.

In [None]:
import os
import sys
from google.colab import drive 
# Mount Google Drive and make the project's 'data' folder the working directory
drive.mount('/content/drive')
os.chdir('/content/drive/MyDrive/Colab Notebooks/ICBF/data')
# Make modules under ../scripts/0_utils importable (inserted first on sys.path)
sys.path.insert(0, '../scripts/0_utils')
# Print the working directory to confirm the change
!pwd

Mounted at /content/drive
/content/drive/My Drive/Colab Notebooks/ICBF/data


In [None]:
# Librerias relevantes 
import pandas                  as pd
import numpy                   as np
import json
import time
import matplotlib.pyplot       as plt
import seaborn                 as sns
import statsmodels.api         as sm
import statsmodels.formula.api as smf
import statsmodels.stats.multitest as smm
import os
from scipy import stats, special
from sklearn.model_selection import train_test_split
from sklearn.metrics import (mean_squared_error, 
                             confusion_matrix, 
                             plot_confusion_matrix,
                             f1_score, 
                             precision_score, 
                             recall_score)
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn import tree
from sklearn import metrics
from sklearn import neighbors
from sklearn import ensemble
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import (RandomForestRegressor, RandomForestClassifier)
from matplotlib import pyplot
import random
%matplotlib inline
plt.style.use('ggplot')
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import (RandomForestRegressor, RandomForestClassifier)
import joblib 
# Opciones de visualización
pd.options.display.max_columns = 100
pd.options.display.max_rows = 100

  import pandas.util.testing as tm


### **Abrir base de datos unidos**
Abre el archivo `Sociodemo_pre.parquet` (5945349 registros, 148 MB) como un dataframe de nombre `soc`, con toda la base de datos, preprocesada para asignarle a cada variable el tipo de dato correspondiente.

Abre también el diccionario del archivo `sociodemo_datatypes_low_memo.json` que permite asignar a las variables tipos de datos para un uso más eficiente de la memoria (641 MB).

In [None]:


def print_score(label, prediction, train=True):
    """Print accuracy, classification report and confusion matrix for one split.

    Args:
        label: True class labels of the split being evaluated.
        prediction: Predicted class labels, aligned with ``label``.
        train: If truthy the report is headed "Train Result", otherwise
            "Test Result".

    Returns:
        None. The report is written to standard output.
    """
    split_name = "Train" if train else "Test"
    clf_report = pd.DataFrame(classification_report(label, prediction, output_dict=True))
    print(f"{split_name} Result:\n================================================")
    print(f"Accuracy Score: {accuracy_score(label, prediction) * 100:.2f}%")
    print("_______________________________________________")
    print(f"Classification Report:\n{clf_report}")
    print("_______________________________________________")
    # Always score against `label` (never a notebook-global y_train), so the
    # matrix matches the split that was actually passed in.
    print(f"Confusion Matrix: \n {confusion_matrix(label, prediction)}\n")

In [None]:
time0 = time.time()
# One history frame per file historia_1..3_train.parquet; the training loop
# picks his[tomas - 1]
his = []
for i in range(3):
  tom = pd.read_parquet(f'historical_data/historia_{i + 1}_train.parquet')
  # Group the columns by name prefix (cols_estado is computed but not selected)
  cols_edad = [col for col in tom.columns if col.startswith('Edad')]
  cols_medidas = [col for col in tom.columns if col.startswith(('Peso', 'Talla'))]
  cols_zscore = [col for col in tom.columns if col.startswith('ZScore')]
  cols_estado = [col for col in tom.columns if col.startswith('Estado')]
  cols_tom = [
      'IdBeneficiario', 'desnutricion_previa', 'tiempo', 'Desnutricion',
      *cols_edad, *cols_medidas, *cols_zscore,
  ]
  his.append(tom[cols_tom])

# Sociodemographic table, without the 'cod_mpio' and 'Id' columns
soc = pd.read_parquet('clean_data/sociodemo_models.parquet').drop(columns=['cod_mpio', 'Id'])


In [None]:
# Columns kept from historia_2_train.parquet (second entry of `his`)
his[1].columns

Index(['IdBeneficiario', 'desnutricion_previa', 'tiempo', 'Desnutricion',
       'EdadMeses-2', 'EdadMeses-1', 'EdadMeses-0', 'Peso-2', 'Talla-2',
       'Peso-1', 'Talla-1', 'ZScoreTallaEdad-2', 'ZScorePesoEdad-2',
       'ZScorePesoTalla-2', 'ZScoreIMC-2', 'ZScoreTallaEdad-1',
       'ZScorePesoEdad-1', 'ZScorePesoTalla-1', 'ZScoreIMC-1'],
      dtype='object', name='varname')

In [None]:
# Every (tiempo, tomas, sociodemo) combination: 4 x 3 x 4 = 48 model
# configurations, with `tiempo` varying slowest and `sociodemo` fastest
modelos = pd.MultiIndex.from_product(
    [range(1, 5), range(1, 4), range(4)],
    names=['tiempo', 'tomas', 'sociodemo'],
).to_frame(index=False)
modelos[20:30]

Unnamed: 0,tiempo,tomas,sociodemo
20,2,3,0
21,2,3,1
22,2,3,2
23,2,3,3
24,3,1,0
25,3,1,1
26,3,1,2
27,3,1,3
28,3,2,0
29,3,2,1


In [None]:
# Columns of `soc` whose name starts with 'cod_dpto'
cols_dpto = [name for name in soc.columns if name.startswith('cod_dpto')]
# Variables kept for the reduced ("important variables") sociodemographic set
cols_top = [
    'Ind_estudia',
    'Edad_padres_menor',
    'ingresos_promP_imp',
    'Edad_padres_mayor',
    'Tip_cuidado_ninos_2',
    'gasto_ppers_imp',
    'porc_gasto_alim',
    'n_privaciones',
    'Ind_nivel_sisben_4',
    'gasto_alim_ppers_imp',
    'Tip_cuidado_ninos_1',
    'n_personas_hogar',
    'Ind_ninguna_discapac',
    'Nivel_educ_madre',
    'Nivel_educ_padre',
    'n_afec_evento_natural',
    'Uni_dias_agua',
    'n_ninos',
]

In [None]:
# Three sociodemographic feature sets; the training loop picks
# soc_df[sociodemo - 1]. They are derived from `soc` without rebinding it, so
# the cell can be re-run and the full table stays available afterwards.
soc_no_geo = soc.drop(columns=cols_dpto)
soc_df = [
    soc,                                        # all variables
    soc_no_geo,                                 # excluding the geographic (cod_dpto*) columns
    soc_no_geo[['IdBeneficiario'] + cols_top],  # only the important variables
]

In [None]:
# Hyper-parameter grid explored by GridSearchCV
parameters = {'max_depth': [2 * i for i in range(1, 10)],
              'min_samples_leaf': range(1, 5),
              'n_estimators': [50, 100, 150]}


def train_test_rf(df, seed=1, test_size=0.3, parameters=parameters, verbose=1,
                  n_jobs=10, cv=5):
  """Grid-search a random forest on a balanced train split and score it.

  Rows with missing values are dropped. Beneficiary ids are split into train
  and test, so one beneficiary never has rows in both. Within the train split
  the negative class (``Desnutricion == 0``) is undersampled to the number of
  positive rows; the test split is left as it is.

  Args:
    df: Frame with 'IdBeneficiario', the target 'Desnutricion' and the
      feature columns.
    seed: Random state for the split, the undersampling, the shuffle and the
      forest; also used as the index of the returned metrics row.
    test_size: Fraction of beneficiary ids assigned to the test split.
    parameters: Parameter grid passed to GridSearchCV.
    verbose: Verbosity passed to GridSearchCV.
    n_jobs: Number of parallel jobs for GridSearchCV.
    cv: Number of cross-validation folds for GridSearchCV.

  Returns:
    Tuple ``(model, metrics, importance)``: the best estimator, a one-row
    DataFrame with train/test accuracy, precision and recall plus the chosen
    'max_depth' and 'n_trees', and a Series of feature importances sorted in
    descending order.
  """
  df = df.dropna()
  beneficiary_ids = list(df['IdBeneficiario'].unique())
  train_ids, test_ids = train_test_split(beneficiary_ids, test_size=test_size,
                                         random_state=seed)

  df_train_unbalanced = df[df['IdBeneficiario'].isin(train_ids)]
  df_train_positive = df_train_unbalanced[df_train_unbalanced['Desnutricion'] == 1]
  df_train_negative = df_train_unbalanced[df_train_unbalanced['Desnutricion'] == 0]
  # Undersample the negatives to the number of positives, then shuffle the rows
  df_train_negative_balanced = df_train_negative.sample(n=len(df_train_positive),
                                                        random_state=seed)
  df_train = pd.concat([df_train_positive, df_train_negative_balanced]).\
    sample(frac=1, random_state=seed)
  df_test = df[df['IdBeneficiario'].isin(test_ids)]

  non_features = ['Desnutricion', 'IdBeneficiario']
  X_train = df_train.drop(columns=non_features)
  X_test = df_test.drop(columns=non_features)
  y_train = df_train['Desnutricion']
  y_test = df_test['Desnutricion']

  classifier = GridSearchCV(RandomForestClassifier(criterion='gini', random_state=seed),
                            parameters, n_jobs=n_jobs, cv=cv, verbose=verbose)
  classifier.fit(X=X_train, y=y_train)
  model = classifier.best_estimator_
  y_pred_train = model.predict(X_train)
  y_pred_test = model.predict(X_test)

  importance = pd.Series(dict(zip(X_train.columns, model.feature_importances_)))
  # Named metrics_row so the imported sklearn `metrics` module is not shadowed
  metrics_row = pd.DataFrame({
      'random_state': seed,
      'accuracy_train': accuracy_score(y_train, y_pred_train),
      'precision_train': precision_score(y_train, y_pred_train),
      'recall_train': recall_score(y_train, y_pred_train),
      'accuracy_test': accuracy_score(y_test, y_pred_test),
      'precision_test': precision_score(y_test, y_pred_test),
      'recall_test': recall_score(y_test, y_pred_test),
      'max_depth': classifier.best_params_['max_depth'],
      'n_trees': classifier.best_params_['n_estimators']
      }, index=[seed])
  return (model, metrics_row, importance.sort_values(ascending=False))

In [None]:
# Accumulators filled by the training loop: one metrics frame and one
# feature-importance series per trained model
metrics_list, importance_list = [], []

In [None]:
t0 = time.time()
# Train one model per configuration row. The slice starts at row 11, so rows
# 0-10 are not trained by this cell.
for i, model in modelos[11:].iterrows():
  # History frame for this number of previous measurements, restricted to the
  # configuration's `tiempo`
  df = his[model['tomas'] - 1]
  df = df[df['tiempo'] == model['tiempo']]
  # sociodemo == 0 uses the history columns only; 1-3 select a frame from soc_df
  if model['sociodemo'] > 0:
    df = df.merge(soc_df[model['sociodemo'] - 1], on='IdBeneficiario')
  # The row index doubles as the random seed
  training = train_test_rf(df=df, seed=i, test_size=0.3, parameters=parameters, verbose=0)
  features = pd.DataFrame({'tiempo': model['tiempo'],
                           'tomas': model['tomas'],
                           'sociodemo': model['sociodemo'],
                           'registros': df.shape[0],
                           'variables': df.shape[1]
                           }, index = [i])
  # The reported time is cumulative since t0, not per model
  print(f"\nModelo: {df.shape[1]} variables, incluyendo {model['tomas']} tomas anteriores, " + 
        f"para  {model['tiempo']} meses. Tiempo: {time.time() - t0:.2f} s.")
  model_details = pd.concat([training[1], features], axis=1)
  display(model_details)
  joblib.dump(training[0], f'models/random_forest_history/model_{i}.pickle')
  metrics_list.append(model_details)
  importance_list.append(training[2])
  # Checkpoint the accumulated results whenever the row index is a multiple of 5
  if i % 5 == 0:
    pd.concat(metrics_list).to_parquet('models/random_forest_history_metrics.parquet')
    pd.concat(importance_list, axis=1).transpose().\
      to_parquet('models/random_forest_history_importance.parquet')
 
# NOTE: `metrics` rebinds the name imported by `from sklearn import metrics`
metrics = pd.concat(metrics_list)
importance = pd.concat(importance_list, axis=1).transpose()


Modelo: 44 variables, incluyendo 3 tomas anteriores, para  1 meses. Tiempo: 110.94 s.


Unnamed: 0,random_state,accuracy_train,precision_train,recall_train,accuracy_test,precision_test,recall_test,max_depth,n_trees,tiempo,tomas,sociodemo,registros,variables
11,11,0.985714,0.981132,0.990476,0.854942,0.341538,0.902439,14,100,1,3,3,5626,44



Modelo: 12 variables, incluyendo 1 tomas anteriores, para  2 meses. Tiempo: 426.92 s.


Unnamed: 0,random_state,accuracy_train,precision_train,recall_train,accuracy_test,precision_test,recall_test,max_depth,n_trees,tiempo,tomas,sociodemo,registros,variables
12,12,0.808646,0.768343,0.883742,0.714951,0.200333,0.84778,6,150,2,1,0,60898,12



Modelo: 139 variables, incluyendo 1 tomas anteriores, para  2 meses. Tiempo: 645.78 s.


Unnamed: 0,random_state,accuracy_train,precision_train,recall_train,accuracy_test,precision_test,recall_test,max_depth,n_trees,tiempo,tomas,sociodemo,registros,variables
13,13,0.966687,0.97489,0.958051,0.743729,0.198758,0.833876,18,100,2,1,1,29041,139



Modelo: 106 variables, incluyendo 1 tomas anteriores, para  2 meses. Tiempo: 847.04 s.


Unnamed: 0,random_state,accuracy_train,precision_train,recall_train,accuracy_test,precision_test,recall_test,max_depth,n_trees,tiempo,tomas,sociodemo,registros,variables
14,14,0.947537,0.944127,0.951376,0.736078,0.2156,0.872024,14,150,2,1,2,29041,106



Modelo: 30 variables, incluyendo 1 tomas anteriores, para  2 meses. Tiempo: 1031.52 s.


Unnamed: 0,random_state,accuracy_train,precision_train,recall_train,accuracy_test,precision_test,recall_test,max_depth,n_trees,tiempo,tomas,sociodemo,registros,variables
15,15,0.896263,0.880569,0.916881,0.740683,0.211278,0.800878,14,100,2,1,3,29041,30



Modelo: 19 variables, incluyendo 2 tomas anteriores, para  2 meses. Tiempo: 1221.99 s.


Unnamed: 0,random_state,accuracy_train,precision_train,recall_train,accuracy_test,precision_test,recall_test,max_depth,n_trees,tiempo,tomas,sociodemo,registros,variables
16,16,0.844219,0.858503,0.824298,0.836439,0.214873,0.80146,6,100,2,2,0,45432,19



Modelo: 146 variables, incluyendo 2 tomas anteriores, para  2 meses. Tiempo: 1356.45 s.


Unnamed: 0,random_state,accuracy_train,precision_train,recall_train,accuracy_test,precision_test,recall_test,max_depth,n_trees,tiempo,tomas,sociodemo,registros,variables
17,17,0.92011,0.934473,0.903581,0.827913,0.196187,0.76,8,150,2,2,1,22009,146



Modelo: 113 variables, incluyendo 2 tomas anteriores, para  2 meses. Tiempo: 1489.86 s.


Unnamed: 0,random_state,accuracy_train,precision_train,recall_train,accuracy_test,precision_test,recall_test,max_depth,n_trees,tiempo,tomas,sociodemo,registros,variables
18,18,0.883673,0.900568,0.862585,0.826699,0.2014,0.81962,8,50,2,2,2,22009,113



Modelo: 37 variables, incluyendo 2 tomas anteriores, para  2 meses. Tiempo: 1609.09 s.


Unnamed: 0,random_state,accuracy_train,precision_train,recall_train,accuracy_test,precision_test,recall_test,max_depth,n_trees,tiempo,tomas,sociodemo,registros,variables
19,19,0.828169,0.844675,0.804225,0.84172,0.220844,0.782991,4,100,2,2,3,22009,37



Modelo: 26 variables, incluyendo 3 tomas anteriores, para  2 meses. Tiempo: 1760.63 s.


Unnamed: 0,random_state,accuracy_train,precision_train,recall_train,accuracy_test,precision_test,recall_test,max_depth,n_trees,tiempo,tomas,sociodemo,registros,variables
20,20,0.873052,0.889847,0.851512,0.849488,0.199455,0.778723,6,50,2,3,0,37183,26



Modelo: 153 variables, incluyendo 3 tomas anteriores, para  2 meses. Tiempo: 1880.25 s.


Unnamed: 0,random_state,accuracy_train,precision_train,recall_train,accuracy_test,precision_test,recall_test,max_depth,n_trees,tiempo,tomas,sociodemo,registros,variables
21,21,0.844796,0.845438,0.843866,0.840492,0.208333,0.843621,4,50,2,3,1,18116,153



Modelo: 120 variables, incluyendo 3 tomas anteriores, para  2 meses. Tiempo: 2001.50 s.


Unnamed: 0,random_state,accuracy_train,precision_train,recall_train,accuracy_test,precision_test,recall_test,max_depth,n_trees,tiempo,tomas,sociodemo,registros,variables
22,22,0.885586,0.899254,0.868468,0.836703,0.185223,0.809735,6,100,2,3,2,18116,120



Modelo: 44 variables, incluyendo 3 tomas anteriores, para  2 meses. Tiempo: 2112.30 s.


Unnamed: 0,random_state,accuracy_train,precision_train,recall_train,accuracy_test,precision_test,recall_test,max_depth,n_trees,tiempo,tomas,sociodemo,registros,variables
23,23,0.963138,0.980392,0.94518,0.85642,0.220046,0.757937,10,150,2,3,3,18116,44



Modelo: 12 variables, incluyendo 1 tomas anteriores, para  3 meses. Tiempo: 2655.02 s.


Unnamed: 0,random_state,accuracy_train,precision_train,recall_train,accuracy_test,precision_test,recall_test,max_depth,n_trees,tiempo,tomas,sociodemo,registros,variables
24,24,0.808359,0.794762,0.831422,0.773156,0.199774,0.835891,6,150,3,1,0,180654,12



Modelo: 139 variables, incluyendo 1 tomas anteriores, para  3 meses. Tiempo: 3036.41 s.


Unnamed: 0,random_state,accuracy_train,precision_train,recall_train,accuracy_test,precision_test,recall_test,max_depth,n_trees,tiempo,tomas,sociodemo,registros,variables
25,25,0.950584,0.944018,0.957977,0.775278,0.202507,0.841428,18,150,3,1,1,90073,139



Modelo: 106 variables, incluyendo 1 tomas anteriores, para  3 meses. Tiempo: 3418.33 s.


Unnamed: 0,random_state,accuracy_train,precision_train,recall_train,accuracy_test,precision_test,recall_test,max_depth,n_trees,tiempo,tomas,sociodemo,registros,variables
26,26,0.857597,0.839075,0.884909,0.774805,0.193875,0.835995,10,150,3,1,2,90073,106



Modelo: 30 variables, incluyendo 1 tomas anteriores, para  3 meses. Tiempo: 3782.77 s.


Unnamed: 0,random_state,accuracy_train,precision_train,recall_train,accuracy_test,precision_test,recall_test,max_depth,n_trees,tiempo,tomas,sociodemo,registros,variables
27,27,0.81665,0.799465,0.845342,0.76905,0.19784,0.862336,6,50,3,1,3,90073,30



Modelo: 19 variables, incluyendo 2 tomas anteriores, para  3 meses. Tiempo: 4245.64 s.


Unnamed: 0,random_state,accuracy_train,precision_train,recall_train,accuracy_test,precision_test,recall_test,max_depth,n_trees,tiempo,tomas,sociodemo,registros,variables
28,28,0.816344,0.831755,0.793118,0.830212,0.182688,0.785042,6,150,3,2,0,153459,19



Modelo: 146 variables, incluyendo 2 tomas anteriores, para  3 meses. Tiempo: 4530.75 s.


Unnamed: 0,random_state,accuracy_train,precision_train,recall_train,accuracy_test,precision_test,recall_test,max_depth,n_trees,tiempo,tomas,sociodemo,registros,variables
29,29,1.0,1.0,1.0,0.821967,0.183032,0.819672,18,150,3,2,1,77153,146



Modelo: 113 variables, incluyendo 2 tomas anteriores, para  3 meses. Tiempo: 4808.87 s.


Unnamed: 0,random_state,accuracy_train,precision_train,recall_train,accuracy_test,precision_test,recall_test,max_depth,n_trees,tiempo,tomas,sociodemo,registros,variables
30,30,0.95222,0.95337,0.950951,0.826414,0.183024,0.798457,12,150,3,2,2,77153,113



Modelo: 37 variables, incluyendo 2 tomas anteriores, para  3 meses. Tiempo: 5079.79 s.


Unnamed: 0,random_state,accuracy_train,precision_train,recall_train,accuracy_test,precision_test,recall_test,max_depth,n_trees,tiempo,tomas,sociodemo,registros,variables
31,31,0.835602,0.849592,0.815594,0.832363,0.181331,0.796524,6,50,3,2,3,77153,37



Modelo: 26 variables, incluyendo 3 tomas anteriores, para  3 meses. Tiempo: 5416.66 s.


Unnamed: 0,random_state,accuracy_train,precision_train,recall_train,accuracy_test,precision_test,recall_test,max_depth,n_trees,tiempo,tomas,sociodemo,registros,variables
32,32,0.835894,0.860653,0.801569,0.838324,0.157047,0.788069,6,100,3,3,0,122992,26



Modelo: 153 variables, incluyendo 3 tomas anteriores, para  3 meses. Tiempo: 5647.25 s.


Unnamed: 0,random_state,accuracy_train,precision_train,recall_train,accuracy_test,precision_test,recall_test,max_depth,n_trees,tiempo,tomas,sociodemo,registros,variables
33,33,0.951892,0.965017,0.937781,0.836883,0.16875,0.819364,12,50,3,3,1,62401,153



Modelo: 120 variables, incluyendo 3 tomas anteriores, para  3 meses. Tiempo: 5877.87 s.


Unnamed: 0,random_state,accuracy_train,precision_train,recall_train,accuracy_test,precision_test,recall_test,max_depth,n_trees,tiempo,tomas,sociodemo,registros,variables
34,34,0.860825,0.87037,0.847938,0.835805,0.164251,0.778255,6,150,3,3,2,62401,120



Modelo: 44 variables, incluyendo 3 tomas anteriores, para  3 meses. Tiempo: 6114.00 s.


Unnamed: 0,random_state,accuracy_train,precision_train,recall_train,accuracy_test,precision_test,recall_test,max_depth,n_trees,tiempo,tomas,sociodemo,registros,variables
35,35,0.862025,0.876812,0.842405,0.835421,0.156213,0.786885,6,100,3,3,3,62401,44



Modelo: 12 variables, incluyendo 1 tomas anteriores, para  4 meses. Tiempo: 6287.25 s.


Unnamed: 0,random_state,accuracy_train,precision_train,recall_train,accuracy_test,precision_test,recall_test,max_depth,n_trees,tiempo,tomas,sociodemo,registros,variables
36,36,0.784689,0.78266,0.788278,0.765583,0.169568,0.732591,6,150,4,1,0,40311,12



Modelo: 139 variables, incluyendo 1 tomas anteriores, para  4 meses. Tiempo: 6430.15 s.


Unnamed: 0,random_state,accuracy_train,precision_train,recall_train,accuracy_test,precision_test,recall_test,max_depth,n_trees,tiempo,tomas,sociodemo,registros,variables
37,37,0.911908,0.921642,0.900365,0.757911,0.159696,0.773006,12,50,4,1,1,19695,139



Modelo: 106 variables, incluyendo 1 tomas anteriores, para  4 meses. Tiempo: 6572.51 s.


Unnamed: 0,random_state,accuracy_train,precision_train,recall_train,accuracy_test,precision_test,recall_test,max_depth,n_trees,tiempo,tomas,sociodemo,registros,variables
38,38,0.861783,0.855,0.871338,0.756245,0.172177,0.758242,8,50,4,1,2,19695,106



Modelo: 30 variables, incluyendo 1 tomas anteriores, para  4 meses. Tiempo: 6698.81 s.


Unnamed: 0,random_state,accuracy_train,precision_train,recall_train,accuracy_test,precision_test,recall_test,max_depth,n_trees,tiempo,tomas,sociodemo,registros,variables
39,39,0.868655,0.874839,0.860406,0.779753,0.186676,0.745152,8,100,4,1,3,19695,30



Modelo: 19 variables, incluyendo 2 tomas anteriores, para  4 meses. Tiempo: 6865.95 s.


Unnamed: 0,random_state,accuracy_train,precision_train,recall_train,accuracy_test,precision_test,recall_test,max_depth,n_trees,tiempo,tomas,sociodemo,registros,variables
40,40,0.784072,0.808929,0.743842,0.785842,0.148722,0.730453,4,50,4,2,0,34891,19



Modelo: 146 variables, incluyendo 2 tomas anteriores, para  4 meses. Tiempo: 6994.51 s.


Unnamed: 0,random_state,accuracy_train,precision_train,recall_train,accuracy_test,precision_test,recall_test,max_depth,n_trees,tiempo,tomas,sociodemo,registros,variables
41,41,0.89849,0.924866,0.86745,0.816376,0.168651,0.688259,10,150,4,2,1,17076,146



Modelo: 113 variables, incluyendo 2 tomas anteriores, para  4 meses. Tiempo: 7117.65 s.


Unnamed: 0,random_state,accuracy_train,precision_train,recall_train,accuracy_test,precision_test,recall_test,max_depth,n_trees,tiempo,tomas,sociodemo,registros,variables
42,42,0.983156,0.98574,0.980496,0.785729,0.170523,0.713262,16,100,4,2,2,17076,113



Modelo: 37 variables, incluyendo 2 tomas anteriores, para  4 meses. Tiempo: 7226.93 s.


Unnamed: 0,random_state,accuracy_train,precision_train,recall_train,accuracy_test,precision_test,recall_test,max_depth,n_trees,tiempo,tomas,sociodemo,registros,variables
43,43,0.801085,0.818356,0.77396,0.793896,0.188093,0.751724,4,50,4,2,3,17076,37



Modelo: 26 variables, incluyendo 3 tomas anteriores, para  4 meses. Tiempo: 7354.38 s.


Unnamed: 0,random_state,accuracy_train,precision_train,recall_train,accuracy_test,precision_test,recall_test,max_depth,n_trees,tiempo,tomas,sociodemo,registros,variables
44,44,0.780788,0.801587,0.746305,0.784414,0.141345,0.789941,4,150,4,3,0,27567,26



Modelo: 153 variables, incluyendo 3 tomas anteriores, para  4 meses. Tiempo: 7465.29 s.


Unnamed: 0,random_state,accuracy_train,precision_train,recall_train,accuracy_test,precision_test,recall_test,max_depth,n_trees,tiempo,tomas,sociodemo,registros,variables
45,45,0.935366,0.951899,0.917073,0.797998,0.143705,0.647059,10,150,4,3,1,13626,153



Modelo: 120 variables, incluyendo 3 tomas anteriores, para  4 meses. Tiempo: 7572.62 s.


Unnamed: 0,random_state,accuracy_train,precision_train,recall_train,accuracy_test,precision_test,recall_test,max_depth,n_trees,tiempo,tomas,sociodemo,registros,variables
46,46,0.861538,0.883152,0.833333,0.78819,0.173362,0.792271,6,150,4,3,2,13626,120



Modelo: 44 variables, incluyendo 3 tomas anteriores, para  4 meses. Tiempo: 7674.47 s.


Unnamed: 0,random_state,accuracy_train,precision_train,recall_train,accuracy_test,precision_test,recall_test,max_depth,n_trees,tiempo,tomas,sociodemo,registros,variables
47,47,0.803991,0.831202,0.762911,0.802611,0.151972,0.766082,2,150,4,3,3,13626,44


In [None]:

# Combine the per-model results: one row per model in both frames
metrics = pd.concat(metrics_list)
importance = pd.concat(importance_list, axis=1).transpose()

In [None]:
# Persist the combined metrics and feature importances
metrics.to_parquet('models/random_forest_history_metrics.parquet')
importance.to_parquet('models/random_forest_history_importance.parquet')

In [None]:
# Rows with tiempo == 1, showing the columns from position 4 on (test metrics and configuration)
metrics[metrics['tiempo']==1][metrics.columns[4:]]

Unnamed: 0,accuracy_test,precision_test,recall_test,max_depth,n_trees,tiempo,tomas,sociodemo,registros,variables
0,0.734197,0.279882,0.855639,6,100,1,1,0,20476,12
1,0.748471,0.25498,0.774194,12,100,1,1,1,8301,139
2,0.781208,0.302469,0.841202,12,50,1,1,2,8301,106
3,0.697547,0.238202,0.851406,4,100,1,1,3,8301,30
4,0.824914,0.293587,0.827684,10,50,1,2,0,15862,19
5,0.867517,0.37386,0.788462,10,100,1,2,1,6471,146
6,0.856907,0.299419,0.844262,14,50,1,2,2,6471,113
7,0.828402,0.299287,0.84,6,50,1,2,3,6471,37
8,0.830464,0.302817,0.848684,16,150,1,3,0,13588,26
9,0.824882,0.273556,0.818182,2,150,1,3,1,5626,153


In [None]:
# Mean importance of each feature over the model rows, highest first (at most 100 shown)
importance.mean().sort_values(ascending=False)[:100]

ZScorePesoTalla-1            0.223570
ZScoreIMC-1                  0.184039
ZScorePesoEdad-1             0.091635
ZScorePesoTalla-2            0.083898
ZScoreIMC-2                  0.069314
ZScorePesoEdad-2             0.050367
ZScorePesoTalla-3            0.048083
ZScorePesoEdad-3             0.043870
ZScoreIMC-3                  0.037566
Peso-1                       0.030234
ZScoreTallaEdad-1            0.026507
Talla-1                      0.020613
EdadMeses-0                  0.018367
EdadMeses-1                  0.018024
ZScoreTallaEdad-2            0.017066
EdadMeses-2                  0.014745
EdadMeses-3                  0.014661
ZScoreTallaEdad-3            0.014076
Peso-2                       0.013363
Talla-2                      0.013295
gasto_ppers_imp              0.012918
ingresos_promP_imp           0.012728
Edad_padres_mayor            0.011746
Edad_padres_menor            0.011556
porc_gasto_alim              0.011277
n_privaciones                0.011004
Talla-3     

Unnamed: 0_level_0,accuracy_test,recall_test
tiempo,Unnamed: 1_level_1,Unnamed: 2_level_1
1,0.867517,0.902439
2,0.85642,0.872024
3,0.838324,0.862336
4,0.816376,0.792271


In [None]:
# NOTE(review): unlike the earlier training loop, this one never merges soc_df,
# so every model is trained on history columns only. It still records
# model['sociodemo'], appends to the same metrics_list / importance_list and
# writes to the same model_{i}.pickle paths — confirm whether this cell should
# still be run.
t0 = time.time()
for i, model in modelos[11:].iterrows():
  df = his[model['tomas'] - 1]
  df = df[df['tiempo'] == model['tiempo']]
  # The row index doubles as the random seed
  training = train_test_rf(df=df, seed=i, test_size=0.3, parameters=parameters, verbose=0)
  features = pd.DataFrame({'tiempo': model['tiempo'],
                           'tomas': model['tomas'],
                           'sociodemo': model['sociodemo'],
                           'registros': df.shape[0],
                           'variables': df.shape[1]
                           }, index = [i])
  # The reported time is cumulative since t0, not per model
  print(f"\nModelo: {df.shape[1]} variables, incluyendo {model['tomas']} tomas anteriores, " + 
        f"para  {model['tiempo']} meses. Tiempo: {time.time() - t0:.2f} s.")
  model_details = pd.concat([training[1], features], axis=1)
  display(model_details)
  joblib.dump(training[0], f'models/random_forest_history/model_{i}.pickle')
  metrics_list.append(model_details)
  importance_list.append(training[2])
  # Checkpoint the accumulated results whenever the row index is a multiple of 5
  if i % 5 == 0:
    pd.concat(metrics_list).to_parquet('models/random_forest_history_metrics.parquet')
    pd.concat(importance_list, axis=1).transpose().\
      to_parquet('models/random_forest_history_importance.parquet')

In [None]:
# Test accuracy against the number of previous measurements, coloured by the
# sociodemographic feature set.
# NOTE(review): the original read from `mod`, which is not defined anywhere in
# this notebook; `metrics` is the frame holding these three columns — confirm.
sns.scatterplot(x=metrics['tomas'], y=metrics['accuracy_test'], hue=metrics['sociodemo']);

In [None]:
# NOTE(review): this cell relies on `df` and `tom_cat` from kernel state;
# `tom_cat` is not defined in any cell shown here — confirm before running.
# Column groups selected by name prefix
cols_medidas = [col for col in df.columns if col[:8] == 'Medicion']#, 'Peso', 'Talla']
cols_zscore = [col for col in df.columns if col[:6] == 'ZScore']
cols_estado = [col for col in df.columns if col[:6] == 'Estado']
cols_zscore_peso = [col for col in df.columns if col[:10] == 'ZScorePeso']
cols_estado_peso = [col for col in df.columns if col[:10] == 'EstadoPeso']
# Columns whose missing values are later filled with 0
cols_nan = ['Ind_acudio_salud',
            'Ind_fue_atendido_salud',
            'Ind_recibe_comida',
            'Ind_leer_escribir']

# Encode each Estado* column as ordered-category integer codes; the categories
# come from tom_cat, keyed by the column name without its last two characters
for col in cols_estado:
  df[col] = df[col].astype('category').\
    cat.set_categories(tom_cat[col[:-2]], ordered=True)
  df[col] = df[col].cat.codes
df = df.drop(columns=cols_medidas)
# NOTE(review): indexing with a boolean frame keeps every row (NaN cells stay
# NaN), so this does not drop rows with missing values — confirm the intent.
df = df[~df.isna()]

In [None]:
# Missing values in the cols_nan columns are filled with 0
for col in cols_nan:
  df[col] = df[col].fillna(0)

# Mean-impute every column except the last two; a column whose mean cannot be
# computed is left unchanged
for col in df.columns[:-2]:
  try:
    df[col] = df[col].fillna(df[col].mean())
  except Exception:
    # Best effort (e.g. a non-numeric column has no mean). `Exception` instead
    # of a bare `except` so KeyboardInterrupt / SystemExit still propagate.
    continue
df.info()
df.head()

In [None]:
# Keep rows with FechaNacimiento > 2010, then drop the listed columns
# NOTE(review): the comparison is against the bare number 2010 — confirm the dtype of FechaNacimiento
df = df.query('FechaNacimiento > 2010')
df = df.drop(columns=['cod_mpio', 'cod_dpto', 'FechaNacimiento', 'Id',
                 'Ind_grupo_sisben_4'])

In [None]:
# Drop EstadoPesoTalla-0 — NOTE(review): presumably excluded because it would leak the target; confirm
df = df.drop(columns=['EstadoPesoTalla-0'])

In [None]:
# Hyper-parameter grid for GridSearchCV (same values as the grid defined next to train_test_rf)
parameters = {'max_depth': [2 * i for i in range(1, 10)], 
              'min_samples_leaf':range(1,5),
              'n_estimators':[50, 100, 150]}

In [None]:
# Restrict to tiempo == 3 and collect the unique beneficiary ids used for the split
df = df[df['tiempo'] == 3]
idB = list(df['IdBeneficiario'].unique())
# Cast the target to int and show the class proportions
df['Desnutricion'] = df['Desnutricion'].astype('int')
df['Desnutricion'].value_counts(normalize=True)

In [None]:
# Preview the first rows of the prepared frame
df.head()

In [None]:
m = 2012
# NOTE(review): `seed` was only ever a parameter of train_test_rf, never a
# notebook-level name; reusing `m` as the seed here is an assumption — confirm.
seed = m

# Model with all variables: split by beneficiary id, then balance the train
# split by undersampling the negative class
train_ids, test_ids = train_test_split(idB, test_size = 0.3, random_state=seed)
df_train_unbalanced = df[df['IdBeneficiario'].isin(train_ids)]
df_train_positive = df_train_unbalanced[df_train_unbalanced['Desnutricion'] == 1]
df_train_negative = df_train_unbalanced[df_train_unbalanced['Desnutricion'] == 0]
df_train_negative_balanced = df_train_negative.sample(n = len(df_train_positive), random_state=seed)
df_train = pd.concat([df_train_positive, df_train_negative_balanced]).sample(frac=1, random_state=seed)
df_test = df[df['IdBeneficiario'].isin(test_ids)]
X_train = df_train.drop(columns=['Desnutricion', 'IdBeneficiario'])
# Drop the same columns as for X_train: with IdBeneficiario left in, predict()
# would receive a feature set different from the one the model was fitted on
X_test = df_test.drop(columns=['Desnutricion', 'IdBeneficiario'])
y_train = df_train['Desnutricion']
y_test = df_test['Desnutricion']

classifier = GridSearchCV(RandomForestClassifier(criterion='gini', random_state=seed), 
                             parameters, n_jobs=10, cv=5, verbose=1) #5 kfolds
classifier.fit(X=X_train, y=y_train)
model = classifier.best_estimator_
y_pred_train = model.predict(X_train)
y_pred_test = model.predict(X_test)

# Each metric is computed once and reused in the summary frame below
train_accuracy = accuracy_score(y_train, y_pred_train)
precision_train = precision_score(y_train, y_pred_train)
recall_train = recall_score(y_train, y_pred_train)

test_accuracy = accuracy_score(y_test, y_pred_test)
precision_test = precision_score(y_test, y_pred_test)
recall_test = recall_score(y_test, y_pred_test)

importance = pd.Series(dict(zip(X_train.columns, model.feature_importances_)))
metrics_all = pd.DataFrame({
    'model': 'all',
    'accuracy_train': train_accuracy,
    'precision_train': precision_train,
    'recall_train': recall_train,
    'accuracy_test': test_accuracy,
    'precision_test': precision_test,
    'recall_test': recall_test,
    'max_depth': classifier.best_params_['max_depth'],
    'n_trees': classifier.best_params_['n_estimators']
    }, index=[m])
print('Most important variables:')
print(importance.sort_values(ascending=False)[:10])
display(metrics_all)

In [None]:
# Class counts in the test split
y_test.value_counts()

In [None]:
# Reload historia_3_train.parquet into `df` (replacing the previous frame) and list its columns
df = pd.read_parquet('historical_data/historia_3_train.parquet')
df.columns

In [None]:
# Drop every column whose name starts with one of these 5-character prefixes,
# then drop rows with missing values and keep only tiempo == 3
df = df.drop(columns=[col for col in df.columns if col[:5] in ['Fecha', 'Estad', 'Prese', 'Contr', 'Codig',
                                                               'Medic']]).\
 dropna()
df = df[df['tiempo'] == 3]
df.info()

In [None]:
m = 2012
# NOTE(review): `seed` was only ever a parameter of train_test_rf, never a
# notebook-level name; reusing `m` as the seed here is an assumption — confirm.
seed = m
# Recompute the ids from the frame being modelled; the earlier `idB` was built
# from a previous `df`, before it was reloaded from historia_3_train.parquet
idB = list(df['IdBeneficiario'].unique())

# Model with all variables: split by beneficiary id, then balance the train
# split by undersampling the negative class
train_ids, test_ids = train_test_split(idB, test_size = 0.3, random_state=seed)
df_train_unbalanced = df[df['IdBeneficiario'].isin(train_ids)]
df_train_positive = df_train_unbalanced[df_train_unbalanced['Desnutricion'] == 1]
df_train_negative = df_train_unbalanced[df_train_unbalanced['Desnutricion'] == 0]
df_train_negative_balanced = df_train_negative.sample(n = len(df_train_positive), random_state=seed)
df_train = pd.concat([df_train_positive, df_train_negative_balanced]).sample(frac=1, random_state=seed)
df_test = df[df['IdBeneficiario'].isin(test_ids)]
X_train = df_train.drop(columns=['Desnutricion', 'IdBeneficiario'])
X_test = df_test.drop(columns=['Desnutricion', 'IdBeneficiario'])
y_train = df_train['Desnutricion']
y_test = df_test['Desnutricion']

classifier = GridSearchCV(RandomForestClassifier(criterion='gini', random_state=seed), 
                             parameters, n_jobs=10, cv=5, verbose=1) #5 kfolds
classifier.fit(X=X_train, y=y_train)
model = classifier.best_estimator_
y_pred_train = model.predict(X_train)
y_pred_test = model.predict(X_test)

# Each metric is computed once and reused in the summary frame below
train_accuracy = accuracy_score(y_train, y_pred_train)
precision_train = precision_score(y_train, y_pred_train)
recall_train = recall_score(y_train, y_pred_train)

test_accuracy = accuracy_score(y_test, y_pred_test)
precision_test = precision_score(y_test, y_pred_test)
recall_test = recall_score(y_test, y_pred_test)

importance = pd.Series(dict(zip(X_train.columns, model.feature_importances_)))
metrics_all = pd.DataFrame({
    'model': 'all',
    'accuracy_train': train_accuracy,
    'precision_train': precision_train,
    'recall_train': recall_train,
    'accuracy_test': test_accuracy,
    'precision_test': precision_test,
    'recall_test': recall_test,
    'max_depth': classifier.best_params_['max_depth'],
    'n_trees': classifier.best_params_['n_estimators']
    }, index=[m])
print('Most important variables:')
print(importance.sort_values(ascending=False))
display(metrics_all)

In [None]:
# Submuestreo de la clase minoritaria
df3_Desnutricion = df3.loc[df3['Desnutricion']]
df3_NoDesnutricion = df3.loc[~df3['Desnutricion']].sample(n=df3['Desnutricion'].sum(), random_state=12)
dfXy = pd.concat([df3_Desnutricion, df3_NoDesnutricion]).sample(frac=1, random_state=12)
dfXy['Desnutricion'] = dfXy['Desnutricion'].astype('float')
dfXy.head()

In [None]:
# Features: everything except identifiers, dropped status/code columns and the target
X = dfXy.drop(columns=['FechaNacimiento', 'Ind_grupo_sisben_4', 'IdBeneficiario', 'EstadoPesoTalla-0', 'CodigoPesoTalla', 'Desnutricion'])
# The target is kept as a one-column DataFrame (double brackets), not a Series
y = dfXy[['Desnutricion']]
print(f"""X shape {X.shape}
y shape {y.shape}""")

# Row-level 70/30 split of the balanced frame (rows are split directly, not beneficiary ids)
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = \
    train_test_split(X, y, test_size=0.3, shuffle=True, random_state=1)

print(f"""X_train shape {X_train.shape}
y_train shape {y_train.shape}
X_test shape {X_test.shape}
y_test shape {y_test.shape}""")

In [None]:
# Class counts of the target in the training partition.
y_train.value_counts()

In [None]:
# Search grid: even depths from 2 to 18, 1-4 samples per leaf, 50/100/150 trees.
parameters = {
    'max_depth': list(range(2, 20, 2)),
    'min_samples_leaf': range(1, 5),
    'n_estimators': [50, 100, 150],
}
# Exhaustive search with 5-fold cross-validation on the training partition.
classifierRF1 = GridSearchCV(
    estimator=RandomForestClassifier(criterion='gini', random_state=12),
    param_grid=parameters,
    n_jobs=6,
    cv=5,
    verbose=1,
)
classifierRF1.fit(X_train, y_train)
clfRF_model1 = classifierRF1.best_estimator_
print('Best classifierRF ', classifierRF1.best_score_, classifierRF1.best_params_)

***Predicción RF All Features - Balanced dataset (undersampled)***

In [None]:
# Predict each partition once with the tuned forest.
pred_clf_RF_test = clfRF_model1.predict(X_test)
pred_clf_RF_train = clfRF_model1.predict(X_train)
# `pred_clf_rf` is read by the report cell; it is the same test-set prediction,
# so reuse it instead of running a second inference pass over X_test.
pred_clf_rf = pred_clf_RF_test
plot_confusion_matrix(clfRF_model1, X_test, y_test, cmap='coolwarm');

In [None]:
best_model_RF = clfRF_model1
# Importances labelled by feature name and sorted from most to least relevant.
# Labelling the Series directly avoids looking names up by position after
# `iterrows()`, which upcasts the positional index to float; indexing a pandas
# Index with a float is deprecated and rejected by recent pandas releases.
importance = pd.Series(best_model_RF.feature_importances_,
                       index=X_train.columns).sort_values(ascending=False)
# Summarize the ten most important features.
for feature, score in importance.head(10).items():
  print(f'Score: {score:.5%}, Feature: {feature}')
# Bar chart of all importances, in descending order.
pyplot.bar(range(len(importance)), importance.values)
pyplot.show()

In [None]:
# Display names for the two classes.
# NOTE(review): assumes label 0.0 is "no malnutrition" and 1.0 is "malnutrition".
target_names = ['No_desnutricion', 'Desnutricion']

# Per-class precision / recall / F1 on the test partition.
print(f"{'_' * 20} pred_clf_rf {'_' * 20}")
print(classification_report(y_test, pred_clf_rf, target_names=target_names))
print('_' * 60 + ' \n')

score_result = {}
# `print_score` is a helper defined elsewhere in the notebook.
print_score(y_train, pred_clf_RF_train, train=True)
print_score(y_test, pred_clf_RF_test, train=False)

# Train vs. test accuracy of the random forest.
score_result.update(RF={
    'Train': accuracy_score(y_train, pred_clf_RF_train),
    'Test': accuracy_score(y_test, pred_clf_RF_test),
})
print(score_result)

### Los niños del train y del test son distintos

In [None]:
# Split by beneficiary so that no child contributes rows to both partitions.
# The split is done over the *unique* ids: splitting the raw id column lets an
# id that appears in several rows land on both sides, and the `isin` filters
# below would then copy all of that child's rows into train AND test.
id_train, id_test = train_test_split(
    dfXy['IdBeneficiario'].drop_duplicates(),
    test_size=0.3, shuffle=True, random_state=12)

# Remove the same non-feature columns that are dropped when building `X`, so
# this model is trained on the same predictors as the row-level split model.
non_feature_cols = ['FechaNacimiento', 'Ind_grupo_sisben_4', 'IdBeneficiario',
                    'EstadoPesoTalla-0', 'CodigoPesoTalla']
df_train = dfXy[dfXy['IdBeneficiario'].isin(id_train)].drop(columns=non_feature_cols)
df_test = dfXy[dfXy['IdBeneficiario'].isin(id_test)].drop(columns=non_feature_cols)

X_train = df_train.drop(columns=['Desnutricion'])
X_test = df_test.drop(columns=['Desnutricion'])

# Target kept as a one-column DataFrame, as in the row-level split.
y_train = df_train[['Desnutricion']]
y_test = df_test[['Desnutricion']]
print(f"""X shape {X.shape}
y shape {y.shape}""")
print(f"""X_train shape {X_train.shape}
y_train shape {y_train.shape}
X_test shape {X_test.shape}
y_test shape {y_test.shape}""")

# Class counts of the target in the new training partition.
y_train.value_counts()

In [None]:
# Search grid: even depths from 2 to 18, 1-4 samples per leaf, 50/100/150 trees.
parameters = {'max_depth': [2 * i for i in range(1, 10)],
              'min_samples_leaf': range(1, 5), 'n_estimators': [50, 100, 150]}
classifierRF2 = GridSearchCV(RandomForestClassifier(criterion='gini', random_state=12),
                             parameters, n_jobs=6, cv=5, verbose=1)  # 5-fold CV
classifierRF2.fit(X=X_train, y=y_train)
# Take the winner of THIS search. The previous code read
# `classifierRF1.best_estimator_`, so every later evaluation of "model 2"
# silently used the model fitted on the earlier row-level split.
clfRF_model2 = classifierRF2.best_estimator_
print('Best classifierRF ', classifierRF2.best_score_, classifierRF2.best_params_)

In [None]:
# Predict each partition once with the second tuned forest.
pred_clf_RF_test = clfRF_model2.predict(X_test)
pred_clf_RF_train = clfRF_model2.predict(X_train)
# `pred_clf_rf` is read by the report cell; it is the same test-set prediction,
# so reuse it instead of running a second inference pass over X_test.
pred_clf_rf = pred_clf_RF_test
plot_confusion_matrix(clfRF_model2, X_test, y_test, cmap='coolwarm');

In [None]:
# Display names for the two classes.
# NOTE(review): assumes label 0.0 is "no malnutrition" and 1.0 is "malnutrition".
target_names = ['No_desnutricion', 'Desnutricion']

# Per-class precision / recall / F1 on the test partition.
print(f"{'_' * 20} pred_clf_rf {'_' * 20}")
print(classification_report(y_test, pred_clf_rf, target_names=target_names))
print('_' * 60 + ' \n')

score_result = {}
# `print_score` is a helper defined elsewhere in the notebook.
print_score(y_train, pred_clf_RF_train, train=True)
print_score(y_test, pred_clf_RF_test, train=False)

# Train vs. test accuracy of the random forest.
score_result.update(RF={
    'Train': accuracy_score(y_train, pred_clf_RF_train),
    'Test': accuracy_score(y_test, pred_clf_RF_test),
})
print(score_result)

In [None]:
best_model_RF = clfRF_model2
# Importances labelled with the columns of the frame this model is evaluated
# on (`X_train` of the beneficiary-level split) rather than with `X.columns`,
# and sorted from most to least relevant. Labelling the Series directly also
# avoids looking names up by position after `iterrows()`, which upcasts the
# positional index to float; indexing a pandas Index with a float is
# deprecated and rejected by recent pandas releases.
importance = pd.Series(best_model_RF.feature_importances_,
                       index=X_train.columns).sort_values(ascending=False)
# Summarize the ten most important features.
for feature, score in importance.head(10).items():
  print(f'Score: {score:.5%}, Feature: {feature}')
# Bar chart of all importances, in descending order.
pyplot.bar(range(len(importance)), importance.values)
pyplot.show()

### Modelo mínimo (Z-scores de PesoTalla e IMC, y EdadMeses)

In [None]:
# Columns of the minimal model: names that, without their last two characters,
# equal one of the three base names. Defined here so that this cell no longer
# depends on a later cell having been run first (NameError on a fresh kernel).
keep = [col for col in dfXy.columns if col[:-2] in ('ZScorePesoTalla', 'ZScoreIMC', 'EdadMeses')]
keep

In [None]:
# Restrict the existing frames to the minimal feature set; no new split is
# made here, the current train/test partitions are reused.
keep = [
    name
    for name in dfXy.columns
    if name[:-2] in ('ZScorePesoTalla', 'ZScoreIMC', 'EdadMeses')
]
X3, X3_train, X3_test = X[keep], X_train[keep], X_test[keep]

print(f"X shape {X3.shape}", f"y shape {y.shape}", sep="\n")
print(
    f"X_train shape {X3_train.shape}",
    f"y_train shape {y_train.shape}",
    f"X_test shape {X3_test.shape}",
    f"y_test shape {y_test.shape}",
    sep="\n",
)

# Class counts of the target in the training partition.
y_train.value_counts()

In [None]:
# Search grid: even depths from 2 to 18, 1-4 samples per leaf, 50/100/150 trees.
parameters = {
    'max_depth': list(range(2, 20, 2)),
    'min_samples_leaf': range(1, 5),
    'n_estimators': [50, 100, 150],
}
# Exhaustive search with 5-fold cross-validation on the minimal feature set.
classifierRF3 = GridSearchCV(
    estimator=RandomForestClassifier(criterion='gini', random_state=12),
    param_grid=parameters,
    n_jobs=6,
    cv=5,
    verbose=1,
)
classifierRF3.fit(X3_train, y_train)
clfRF_model3 = classifierRF3.best_estimator_
print('Best classifierRF ', classifierRF3.best_score_, classifierRF3.best_params_)

In [None]:
# Predict each partition once with the minimal-feature forest.
pred_clf_RF_test = clfRF_model3.predict(X3_test)
pred_clf_RF_train = clfRF_model3.predict(X3_train)
# `pred_clf_rf` is read by the report cell; it is the same test-set prediction,
# so reuse it instead of running a second inference pass over X3_test.
pred_clf_rf = pred_clf_RF_test
plot_confusion_matrix(clfRF_model3, X3_test, y_test, cmap='coolwarm');

In [None]:
# Display names for the two classes.
# NOTE(review): assumes label 0.0 is "no malnutrition" and 1.0 is "malnutrition".
target_names = ['No_desnutricion', 'Desnutricion']

# Per-class precision / recall / F1 on the test partition.
print(f"{'_' * 20} pred_clf_rf {'_' * 20}")
print(classification_report(y_test, pred_clf_rf, target_names=target_names))
print('_' * 60 + ' \n')

score_result = {}
# `print_score` is a helper defined elsewhere in the notebook.
print_score(y_train, pred_clf_RF_train, train=True)
print_score(y_test, pred_clf_RF_test, train=False)

# Train vs. test accuracy of the random forest.
score_result.update(RF={
    'Train': accuracy_score(y_train, pred_clf_RF_train),
    'Test': accuracy_score(y_test, pred_clf_RF_test),
})
print(score_result)

In [None]:
best_model_RF = clfRF_model3
# Importances labelled by feature name (columns of the minimal training frame)
# and sorted from most to least relevant. Labelling the Series directly avoids
# looking names up by position after `iterrows()`, which upcasts the positional
# index to float; indexing a pandas Index with a float is deprecated and
# rejected by recent pandas releases.
importance = pd.Series(best_model_RF.feature_importances_,
                       index=X3_train.columns).sort_values(ascending=False)
# Summarize the (up to) ten most important features.
for feature, score in importance.head(10).items():
  print(f'Score: {score:.5%}, Feature: {feature}')
# Bar chart of all importances, in descending order.
pyplot.bar(range(len(importance)), importance.values)
pyplot.show()