## ETAPA 4: MODELOS DE MACHINE LEARNING

In [90]:
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt

import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense
from tensorflow.keras.utils import plot_model

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import RobustScaler
from sklearn.model_selection import GridSearchCV

# modelos lineales
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import HuberRegressor 
from sklearn.linear_model import RANSACRegressor
from sklearn.linear_model import TheilSenRegressor
from sklearn.linear_model import Ridge, RidgeCV, Lasso, LassoCV, ElasticNet, ElasticNetCV

# modelos de arboles
from sklearn.tree import DecisionTreeRegressor 
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import HistGradientBoostingRegressor
#!pip install xgboost
#pip install pydot
from xgboost import XGBRegressor

from sklearn.neighbors import KNeighborsRegressor
from sklearn.svm import SVR
from sklearn.neural_network import MLPRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, mean_absolute_percentage_error
from sklearn.metrics import r2_score



### *Preparar Datos*

In [2]:
# Importar el dataset
df = pd.read_csv("../13 - Exports (preprocesamiento)/inmigrantes_merge.csv")

df.info()
df

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9360 entries, 0 to 9359
Data columns (total 22 columns):
 #   Column                           Non-Null Count  Dtype  
---  ------                           --------------  -----  
 0   Year                             9360 non-null   int64  
 1   Nationality code                 9360 non-null   object 
 2   Sex                              9360 non-null   object 
 3   Age group                        9360 non-null   object 
 4   Immigrant count                  9360 non-null   int64  
 5   Liberal democracy index          9360 non-null   float64
 6   Continent                        9360 non-null   object 
 7   Sub-region                       9360 non-null   object 
 8   Health equality                  9360 non-null   float64
 9   Judicial accountability          9360 non-null   float64
 10  Public sector corrupt exchanges  9360 non-null   float64
 11  One-sided violence_deaths        9360 non-null   int64  
 12  Non-state_deaths    

Unnamed: 0,Year,Nationality code,Sex,Age group,Immigrant count,Liberal democracy index,Continent,Sub-region,Health equality,Judicial accountability,...,Non-state_deaths,Intrastate_deaths,Interstate_deaths,Number of residents,Political regime,Homicide Rate,Number of Turist,Spanish language,Restricciones_pandemia,Año post_pandemia
0,2008,DZA,Both,0 - 14,759,0.164,Africa,Africa,0.61,0.39,...,0,345,0,51922,3,0.95,44400000,0,0,0
1,2008,PER,Males,35 - 44,2938,0.649,America,South America,0.40,0.44,...,0,40,0,60185,7,5.27,44400000,1,0,0
2,2008,PER,Males,45 - 54,1128,0.649,America,South America,0.40,0.44,...,0,40,0,60185,7,5.27,44400000,1,0,0
3,2008,PER,Males,55 - 64,265,0.649,America,South America,0.40,0.44,...,0,40,0,60185,7,5.27,44400000,1,0,0
4,2008,PER,Males,65+,156,0.649,America,South America,0.40,0.44,...,0,40,0,60185,7,5.27,44400000,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9355,2022,PAK,Males,55 - 64,330,0.234,Asia,Asia,0.27,0.22,...,0,670,0,68821,6,4.21,59310000,0,0,1
9356,2022,PAK,Females,55 - 64,146,0.234,Asia,Asia,0.27,0.22,...,0,670,0,31675,6,4.21,59310000,0,0,1
9357,2022,PAK,Both,65+,169,0.234,Asia,Asia,0.27,0.22,...,0,670,0,100496,6,4.21,59310000,0,0,1
9358,2022,PAK,Males,65+,99,0.234,Asia,Asia,0.27,0.22,...,0,670,0,68821,6,4.21,59310000,0,0,1


Antes de proceder, debemos resolver la variable "Year" a una variable ordinal (los regímenes políticos ya están en formato ordinal) y el resto de variables categóricas a variables dummy.

En el caso de Year, simplemente restaremos 2007 a la columna entera, y para el resto de las variables objeto usaremos la funcion *.get_dummies()*.

In [3]:
# hacer copia del df
df_copy = df.copy()

# Transformar Year a variable ordinal de 1 (2008) a 15 (2022)
df_copy['Year'] = df_copy['Year'] - 2007

# Generar variables dummies a partir de nuestras variables categóricas "object" (no ordinales)
df_copy = pd.get_dummies(df_copy)

# Convertir las variables dummies booleanas en "int"
col_bool = df_copy.select_dtypes(include = ['bool']).columns
df_copy[col_bool] = df_copy[col_bool].astype(int)

# Verificar cambio
df_copy.info()
df_copy

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9360 entries, 0 to 9359
Data columns (total 65 columns):
 #   Column                                    Non-Null Count  Dtype  
---  ------                                    --------------  -----  
 0   Year                                      9360 non-null   int64  
 1   Immigrant count                           9360 non-null   int64  
 2   Liberal democracy index                   9360 non-null   float64
 3   Health equality                           9360 non-null   float64
 4   Judicial accountability                   9360 non-null   float64
 5   Public sector corrupt exchanges           9360 non-null   float64
 6   One-sided violence_deaths                 9360 non-null   int64  
 7   Non-state_deaths                          9360 non-null   int64  
 8   Intrastate_deaths                         9360 non-null   int64  
 9   Interstate_deaths                         9360 non-null   int64  
 10  Number of residents                 

Unnamed: 0,Year,Immigrant count,Liberal democracy index,Health equality,Judicial accountability,Public sector corrupt exchanges,One-sided violence_deaths,Non-state_deaths,Intrastate_deaths,Interstate_deaths,...,Continent_America,Continent_Asia,Continent_Europe,Sub-region_Africa,Sub-region_Asia,Sub-region_Central America and Caribbean,Sub-region_European Union,Sub-region_North America,Sub-region_Rest of Europe,Sub-region_South America
0,1,759,0.164,0.61,0.39,0.35,0,0,345,0,...,0,0,0,1,0,0,0,0,0,0
1,1,2938,0.649,0.40,0.44,0.30,0,0,40,0,...,1,0,0,0,0,0,0,0,0,1
2,1,1128,0.649,0.40,0.44,0.30,0,0,40,0,...,1,0,0,0,0,0,0,0,0,1
3,1,265,0.649,0.40,0.44,0.30,0,0,40,0,...,1,0,0,0,0,0,0,0,0,1
4,1,156,0.649,0.40,0.44,0.30,0,0,40,0,...,1,0,0,0,0,0,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9355,15,330,0.234,0.27,0.22,0.23,68,0,670,0,...,0,1,0,0,1,0,0,0,0,0
9356,15,146,0.234,0.27,0.22,0.23,68,0,670,0,...,0,1,0,0,1,0,0,0,0,0
9357,15,169,0.234,0.27,0.22,0.23,68,0,670,0,...,0,1,0,0,1,0,0,0,0,0
9358,15,99,0.234,0.27,0.22,0.23,68,0,670,0,...,0,1,0,0,1,0,0,0,0,0


In [6]:
# hacer copia del df removiendo Senegal que presenta datos nulos para tasa de homicidios
df_nonull = df[df['Nationality code'] != 'SEN'].copy()

# Transformar Year a variable ordinal de 1 (2008) a 15 (2022)
df_nonull['Year'] = df_nonull['Year'] - 2007

# Generar variables dummies a partir de nuestras variables categóricas "object" (no ordinales)
df_nonull = pd.get_dummies(df_nonull)

# Convertir las variables dummies booleanas en "int"
col_bool = df_nonull.select_dtypes(include = ['bool']).columns
df_nonull[col_bool] = df_nonull[col_bool].astype(int)

# Verificar cambio
df_nonull.info()
df_nonull

<class 'pandas.core.frame.DataFrame'>
Index: 9000 entries, 0 to 9359
Data columns (total 64 columns):
 #   Column                                    Non-Null Count  Dtype  
---  ------                                    --------------  -----  
 0   Year                                      9000 non-null   int64  
 1   Immigrant count                           9000 non-null   int64  
 2   Liberal democracy index                   9000 non-null   float64
 3   Health equality                           9000 non-null   float64
 4   Judicial accountability                   9000 non-null   float64
 5   Public sector corrupt exchanges           9000 non-null   float64
 6   One-sided violence_deaths                 9000 non-null   int64  
 7   Non-state_deaths                          9000 non-null   int64  
 8   Intrastate_deaths                         9000 non-null   int64  
 9   Interstate_deaths                         9000 non-null   int64  
 10  Number of residents                      

Unnamed: 0,Year,Immigrant count,Liberal democracy index,Health equality,Judicial accountability,Public sector corrupt exchanges,One-sided violence_deaths,Non-state_deaths,Intrastate_deaths,Interstate_deaths,...,Continent_America,Continent_Asia,Continent_Europe,Sub-region_Africa,Sub-region_Asia,Sub-region_Central America and Caribbean,Sub-region_European Union,Sub-region_North America,Sub-region_Rest of Europe,Sub-region_South America
0,1,759,0.164,0.61,0.39,0.35,0,0,345,0,...,0,0,0,1,0,0,0,0,0,0
1,1,2938,0.649,0.40,0.44,0.30,0,0,40,0,...,1,0,0,0,0,0,0,0,0,1
2,1,1128,0.649,0.40,0.44,0.30,0,0,40,0,...,1,0,0,0,0,0,0,0,0,1
3,1,265,0.649,0.40,0.44,0.30,0,0,40,0,...,1,0,0,0,0,0,0,0,0,1
4,1,156,0.649,0.40,0.44,0.30,0,0,40,0,...,1,0,0,0,0,0,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9355,15,330,0.234,0.27,0.22,0.23,68,0,670,0,...,0,1,0,0,1,0,0,0,0,0
9356,15,146,0.234,0.27,0.22,0.23,68,0,670,0,...,0,1,0,0,1,0,0,0,0,0
9357,15,169,0.234,0.27,0.22,0.23,68,0,670,0,...,0,1,0,0,1,0,0,0,0,0
9358,15,99,0.234,0.27,0.22,0.23,68,0,670,0,...,0,1,0,0,1,0,0,0,0,0


### *Separar conjunto de entrenamiento y prueba*

In [5]:
# Separar variables input y variable target "Immigrant count" de df_copy
X = df_copy.drop("Immigrant count", axis = 1) # variables predictoras
y = df_copy["Immigrant count"]  # Target

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.25, random_state = 58) # separar datos en conjunto train y test en un 75% / 25%
scaler = RobustScaler(quantile_range=(15.0, 85.0)) # definir scaler de datos 
X_train = scaler.fit_transform(X_train) # escalar los datos de entrenamiento
X_test = scaler.fit_transform(X_test) # # escalar los datos de prueba

In [65]:
# Separar variables input y variable target "Immigrant count" de df_copy
X2 = df_nonull.drop("Immigrant count", axis = 1) # variables predictoras
y2 = df_nonull["Immigrant count"]  # Target

X2_train, X2_test, y2_train, y2_test = train_test_split(X2, y2, test_size = 0.3, random_state = 58) # separar datos en conjunto train y test en un 75% / 25%
scaler = MinMaxScaler() # definir scaler de datos 
X2_train = scaler.fit_transform(X2_train) # escalar los datos de entrenamiento
X2_test = scaler.fit_transform(X2_test) # # escalar los datos de prueba

### *Modelos Lineales de Machine Learning*

In [76]:
# Separar variables input y variable target "Immigrant count" de df_null (dataframe sin atos nulos)
X = df_nonull.drop("Immigrant count", axis = 1) # variables predictoras
y = df_nonull["Immigrant count"]  # Target

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.25, random_state = 58) # separar datos en conjunto train y test en un 75% / 25%
scaler = MinMaxScaler() # definir scaler de datos 
X_train = scaler.fit_transform(X_train) # escalar los datos de entrenamiento
X_test = scaler.fit_transform(X_test) # # escalar los datos de prueba

#### Regresion Lineal

In [13]:
model_lineal = LinearRegression() # definicion del modelo 

model_lineal.fit(X_train, y_train) # ajuste del modelo 

# Aplicar modelo sobre los datos de traint y test para predecir el target
y_train_pred_lineal = model_lineal.predict(X_train)
y_test_pred_lineal = model_lineal.predict(X_test)

# Comparar métricas en conjunto train y test
print("R2 train:", np.round(r2_score(y_train, y_train_pred_lineal), 3))
print("MSE - train:", np.round(mean_squared_error(y_train, y_train_pred_lineal), 2))
print("MAE - train:", np.round(mean_absolute_error(y_train, y_train_pred_lineal), 2))
print("MAPE - train:", np.round(mean_absolute_percentage_error(y_train, y_train_pred_lineal), 2))
print("")
print("R2 test:", np.round(r2_score(y_test, y_test_pred_lineal), 3))
print("MSE - test:", np.round(mean_squared_error(y_test, y_test_pred_lineal), 2))
print("MAE - test:", np.round(mean_absolute_error(y_test, y_test_pred_lineal), 2))
print("MAPE - test:", np.round(mean_absolute_percentage_error(y_test, y_test_pred_lineal), 2))

R2 train: 0.436
MSE - train: 17297750.66
MAE - train: 1770.51
MAPE - train: 6.43

R2 test: 0.496
MSE - test: 15192071.07
MAE - test: 1812.01
MAPE - test: 4.38


#### Regresion lineal - Huber (ventaja: bajo efecto de outliers)

In [20]:
model_huber = HuberRegressor(epsilon=1.15, alpha = 0.05) # definicion del modelo 

model_huber.fit(X_train, y_train) # ajuste del modelo 

# Aplicar modelo sobre los datos de traint y test para predecir el target
y_train_pred_huber = model_huber.predict(X_train)
y_test_pred_huber = model_huber.predict(X_test)

# Comparar métricas en conjunto train y test
print("R2 train:", np.round(r2_score(y_train, y_train_pred_huber), 3))
print("MSE - train:", np.round(mean_squared_error(y_train, y_train_pred_huber), 2))
print("MAE - train:", np.round(mean_absolute_error(y_train, y_train_pred_huber), 2))
print("MAPE - train:", np.round(mean_absolute_percentage_error(y_train, y_train_pred_huber), 2))
print("")
print("R2 test:", np.round(r2_score(y_test, y_test_pred_huber), 3))
print("MSE - test:", np.round(mean_squared_error(y_test, y_test_pred_huber), 2))
print("MAE - test:", np.round(mean_absolute_error(y_test, y_test_pred_huber), 2))
print("MAPE - test:", np.round(mean_absolute_percentage_error(y_test, y_test_pred_huber), 2))

R2 train: 0.246
MSE - train: 23138000.38
MAE - train: 1314.28
MAPE - train: 1.41

R2 test: 0.272
MSE - test: 21949164.75
MAE - test: 1381.02
MAPE - test: 0.94


#### Regresion lineal - RANSAC (ventaja: bueno para grandes outliers en "y")

In [63]:
model_ransac = RANSACRegressor(min_samples = 15) # definicion del modelo 

model_ransac.fit(X_train, y_train) # ajuste del modelo 

# Aplicar modelo sobre los datos de traint y test para predecir el target
y_train_pred_ransac = model_ransac.predict(X_train)
y_test_pred_ransac = model_ransac.predict(X_test)

# Comparar métricas en conjunto train y test
print("R2 train:", np.round(r2_score(y_train, y_train_pred_ransac), 3))
print("MSE - train:", np.round(mean_squared_error(y_train, y_train_pred_ransac), 2))
print("MAE - train:", np.round(mean_absolute_error(y_train, y_train_pred_ransac), 2))
print("MAPE - train:", np.round(mean_absolute_percentage_error(y_train, y_train_pred_ransac), 2))
print("")
print("R2 test:", np.round(r2_score(y_test, y_test_pred_ransac), 3))
print("MSE - test:", np.round(mean_squared_error(y_test, y_test_pred_ransac), 2))
print("MAE - test:", np.round(mean_absolute_error(y_test, y_test_pred_ransac), 2))
print("MAPE - test:", np.round(mean_absolute_percentage_error(y_test, y_test_pred_ransac), 2))

R2 train: 0.046
MSE - train: 29276267.87
MAE - train: 1623.37
MAPE - train: 0.82

R2 test: 0.046
MSE - test: 28747029.95
MAE - test: 1705.79
MAPE - test: 0.66


#### Regresion lineal - TheilSen (ventaja: bueno para outliers pequeños tanto en "X" como en "y")

In [65]:
model_theilsen = TheilSenRegressor() # definicion del modelo 

model_theilsen.fit(X_train, y_train) # ajuste del modelo 

# Aplicar modelo sobre los datos de traint y test para predecir el target
y_train_pred_theilsen = model_theilsen.predict(X_train)
y_test_pred_theilsen = model_theilsen.predict(X_test)

# Comparar métricas en conjunto train y test
print("R2 train:", np.round(r2_score(y_train, y_train_pred_theilsen), 3))
print("MSE - train:", np.round(mean_squared_error(y_train, y_train_pred_theilsen), 2))
print("MAE - train:", np.round(mean_absolute_error(y_train, y_train_pred_theilsen), 2))
print("MAPE - train:", np.round(mean_absolute_percentage_error(y_train, y_train_pred_theilsen), 2))
print("")
print("R2 test:", np.round(r2_score(y_test, y_test_pred_theilsen), 3))
print("MSE - test:", np.round(mean_squared_error(y_test, y_test_pred_theilsen), 2))
print("MAE - test:", np.round(mean_absolute_error(y_test, y_test_pred_theilsen), 2))
print("MAPE - test:", np.round(mean_absolute_percentage_error(y_test, y_test_pred_theilsen), 2))

R2 train: 0.401
MSE - train: 18369197.59
MAE - train: 1512.02
MAPE - train: 4.62

R2 test: 0.455
MSE - test: 16439423.53
MAE - test: 1559.53
MAPE - test: 3.43


#### Modelos lineales regularizados (Ridge, Lasso, E-Net)

**Buscra ALfa Optimo**

In [69]:
# Definir modelo Ridge y para Evaluar el valor del "alpha" óptimo
ridgecv = RidgeCV()
ridgecv.fit(X_train, y_train)
print("Alfa Optimo Ridge:", ridgecv.alpha_)

# Definir modelo Lasso y para Evaluar el valor del "alpha" óptimo
lassocv = LassoCV()
lassocv.fit(X_train, y_train)
print("Alfa Optimo Lasso:", lassocv.alpha_)

# Definir modelo E-Net y para Evaluar el valor del "alpha" óptimo
enetcv = ElasticNetCV()
enetcv.fit(X_train, y_train)
print("Alfa Optimo E-Net:", enetcv.alpha_)

Alfa Optimo Ridge: 1.0
Alfa Optimo Lasso: 2512.9660996867165
Alfa Optimo E-Net: 16457.662076138495


In [70]:
# Ingresamos el valor de alpha en una variable
alpha_opt_ridge = ridgecv.alpha_

# Ingresamos el valor de alpha en una variable
alpha_opt_lasso = lassocv.alpha_

# Ingresamos el valor de alpha en una variable
alpha_opt_enet = enetcv.alpha_

**Entrenar modelo con Alfa optimo**

In [71]:
# Definir modelo Ridge con nuestro valor optimo de alpha, entrenar y predecir
modelo_ridge = Ridge(alpha = alpha_opt_ridge)
y_test_ridge = modelo_ridge.fit(X_train, y_train).predict(X_test)

# Definir modelo Lasso con nuestro valor optimo de alpha, entrenar y predecir
modelo_lasso = Lasso(alpha = alpha_opt_lasso)
y_test_lasso = modelo_lasso.fit(X_train, y_train).predict(X_test)

# Definir modelo E-Net con nuestro valor optimo de alpha, entrenar y predecir
modelo_enet = ElasticNet(alpha = alpha_opt_enet)
y_test_enet = modelo_enet.fit(X_train, y_train).predict(X_test)

In [72]:
# Observar Coeficientes de cada variable para cada modelo en un dataframe
coefficients = pd.DataFrame({'Variable':df_nonull.drop(["Immigrant count"], axis=1, inplace=False).columns})
coefficients['modelo_ridge']= modelo_ridge.coef_
coefficients['modelo_lasso']= modelo_lasso.coef_
coefficients['modelo_net']= modelo_enet.coef_

# Mostrar coeficientes
coefficients

Unnamed: 0,Variable,modelo_ridge,modelo_lasso,modelo_net
0,Year,-2245.293336,0.0,0.0
1,Liberal democracy index,1350.160038,0.0,0.0
2,Health equality,-303.050285,-0.0,-0.0
3,Judicial accountability,914.793364,0.0,0.0
4,Public sector corrupt exchanges,-948.422426,-0.0,-0.0
...,...,...,...,...
58,Sub-region_Central America and Caribbean,286.563277,-0.0,-0.0
59,Sub-region_European Union,-230.070279,-0.0,0.0
60,Sub-region_North America,-405.188699,-0.0,-0.0
61,Sub-region_Rest of Europe,187.228141,-0.0,-0.0


**Evaluar y Comparar Métricas**

In [73]:
# Métricas en test - Ridge
print("R2 test - Ridge:", np.round(r2_score(y_test, y_test_ridge), 3))
print("MSE test - Ridge:", np.round(mean_squared_error(y_test, y_test_ridge), 2))
print("MAE test - Ridge:", np.round(mean_absolute_error(y_test, y_test_ridge), 2))
print("MAPE test - Ridge:", np.round(mean_absolute_percentage_error(y_test, y_test_ridge), 2))

R2 test - Ridge: 0.496
MSE test - Ridge: 15178889.63
MAE test - Ridge: 1819.62
MAPE test - Ridge: 4.45


In [74]:
# Métricas en test - Lasso
print("R2 test - Lasso:", np.round(r2_score(y_test, y_test_lasso), 3))
print("MSE test - Lasso:", np.round(mean_squared_error(y_test, y_test_lasso), 2))
print("MAE test - Lasso:", np.round(mean_absolute_error(y_test, y_test_lasso), 2))
print("MAPE test - Lasso:", np.round(mean_absolute_percentage_error(y_test, y_test_lasso), 2))

R2 test - Lasso: 0.058
MSE test - Lasso: 28379278.52
MAE test - Lasso: 2474.32
MAPE test - Lasso: 6.75


In [75]:
# Métricas en test - E-Net
print("R2 test - Lasso:", np.round(r2_score(y_test, y_test_enet), 3))
print("MSE test - Lasso:", np.round(mean_squared_error(y_test, y_test_enet), 2))
print("MAE test - Lasso:", np.round(mean_absolute_error(y_test, y_test_enet), 2))
print("MAPE test - Lasso:", np.round(mean_absolute_percentage_error(y_test, y_test_enet), 2))

R2 test - Lasso: 0.021
MSE test - Lasso: 29495631.89
MAE test - Lasso: 2550.14
MAPE test - Lasso: 7.1


#### Desicion Tree

In [77]:
# Definir diccionario de valores para parámetros 
params = {'max_depth': range(5,9), 
          'min_samples_leaf' : [1, 3, 4, 5], 
          'min_samples_split': [20, 30], 
          "criterion" : ["squared_error", "friedman_mse", "absolute_error", "poisson"] 
          } 

# Definir modelo y aplicar combinaciones de parametros según el diccinario 
tree = DecisionTreeRegressor() 
tree_cv = GridSearchCV(tree, params, cv = 3, refit = True, scoring = "neg_mean_squared_error") # elegir scoring deseano (r2, mae, mse, mape...) neg_mean_squared_error  neg_median_absolute_error  neg_mean_absolute_percentage_error

# Entrenar modelo con cada combinación de parámetro 
tree_cv.fit(X_train, y_train) 

# Montrar los valores de los parámetros 
print(tree_cv.best_params_)

{'criterion': 'absolute_error', 'max_depth': 8, 'min_samples_leaf': 5, 'min_samples_split': 20}


In [80]:
# Definir modelo con los mejores valores de parámetros 
tree_best =  DecisionTreeRegressor(max_depth = tree_cv.best_params_['max_depth'], 
                                   min_samples_leaf = tree_cv.best_params_['min_samples_leaf'],
                                   min_samples_split = 15, 
                                   criterion = tree_cv.best_params_['criterion'],) 

# Entrenar con el conjunto de entrenamiento 
tree_best.fit(X_train, y_train) 

 
# Aplicar modelo sobre los datos de traint y test para predecir el target
y_test_pred_tree = tree_best.predict(X_test) 
y_train_pred_tree = tree_best.predict(X_train) 

 
# Comparar métricas en conjunto train y test
print("R2 - train:", np.round(r2_score(y_train, y_train_pred_tree), 3))
print("MSE - train:", np.round(mean_squared_error(y_train, y_train_pred_tree), 2))
print("MAE - train:", np.round(mean_absolute_error(y_train, y_train_pred_tree), 2))
print("MAPE - train:", np.round(mean_absolute_percentage_error(y_train, y_train_pred_tree), 2))
print("")
print("R2 - test:", np.round(r2_score(y_test, y_test_pred_tree), 3))
print("MSE - test:", np.round(mean_squared_error(y_test, y_test_pred_tree), 2))
print("MAE - test:", np.round(mean_absolute_error(y_test, y_test_pred_tree), 2))
print("MAPE - test:", np.round(mean_absolute_percentage_error(y_test, y_test_pred_tree), 2))

R2 - train: 0.708
MSE - train: 8975782.89
MAE - train: 844.31
MAPE - train: 0.64

R2 - test: 0.68
MSE - test: 9644547.7
MAE - test: 1010.72
MAPE - test: 0.63


#### Random Forest

In [98]:
# Definir diccionario de valores para parámetros 
params = {'n_estimators': [100], 
	      'criterion' : ['squared_error', 'friedman_mse', 'poisson'],
          "min_samples_split": [30, 50, 70], 
          'min_samples_leaf' : [2, 3, 5],
          "max_depth": [7, 8],
          'max_features' : [1.0, 0.5, 0.7]
          }

# Definir modelo y aplicar combinaciones de parametros según el diccinario 
rf = RandomForestRegressor() 
rf_cv = GridSearchCV(rf, params, cv=3, scoring='neg_mean_squared_error').fit(X2_train, y2_train) # elegir scoring deseano (r2, mae, mse, mape...)

# Motrar mejores valores para parámeros
rf_cv.best_estimator_

NameError: name 'X2_train' is not defined

In [88]:
# Definir modelo con los mejores valores de parámetros (Nota: usar los mejores, pero hacer modificaciones para comparar metricas)
rf_best = RandomForestRegressor(n_estimators = 100, 
                           max_depth = 8, 
                           criterion = 'poisson', 
                           min_samples_split = 20,  #se ajustó a 20 para mejorar resultados en test
                           min_samples_leaf = 2,  
                          # max_features = 1.0, 
                           )

# Entrenar con el conjunto de entrenamiento 
rf_best.fit(X_train, y_train) 

# Aplicar modelo sobre los datos de traint y test para predecir el target
y_train_pred_rf = rf_best.predict(X_train)
y_test_pred_rf = rf_best.predict(X_test)

# Comparar métricas en conjunto train y test (agregar la metrica deseada)
print("R2 - train:", np.round(r2_score(y_train, y_train_pred_rf), 3))
print("MSE - train:", np.round(mean_squared_error(y_train, y_train_pred_rf), 2))
print("MAE - train:", np.round(mean_absolute_error(y_train, y_train_pred_rf), 2))
print("MAPE - train:", np.round(mean_absolute_percentage_error(y_train, y_train_pred_rf), 2))
print("")
print("R2 - test:", np.round(r2_score(y_test, y_test_pred_rf), 3))
print("MSE - test:", np.round(mean_squared_error(y_test, y_test_pred_rf), 2))
print("MAE - test:", np.round(mean_absolute_error(y_test, y_test_pred_rf), 2))
print("MAPE - test:", np.round(mean_absolute_percentage_error(y_test, y_test_pred_rf), 2))

R2 - train: 0.8
MSE - train: 6139996.98
MAE - train: 770.69
MAPE - train: 0.85

R2 - test: 0.736
MSE - test: 7957175.2
MAE - test: 933.62
MAPE - test: 0.82


In [89]:
# Extraer importancia relativa de variables en el modelo
imp_rel_rf = rf_best.feature_importances_
importancias_rf = pd.DataFrame({"variable": X.columns, "importancia relativa": imp_rel_rf}) \
    .sort_values(by='importancia relativa', ascending = False)

# Mostrar top 10 variables más importantes
importancias_rf[:10]

Unnamed: 0,variable,importancia relativa
51,Age group_All,0.469616
9,Number of residents,0.275581
0,Year,0.039341
12,Number of Turist,0.028233
46,Age group_25 - 34,0.027795
11,Homicide Rate,0.024287
15,Año post_pandemia,0.020533
50,Age group_65+,0.01721
2,Health equality,0.012928
49,Age group_55 - 64,0.011524


#### Hist Gradient Boosting

In [155]:
# Separar variables input y variable target "Immigrant count" de df_copy
X = df_copy.drop("Immigrant count", axis = 1) # variables predictoras
y = df_copy["Immigrant count"]  # Target

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.25, random_state = 58) # separar datos en conjunto train y test en un 75% / 25%
scaler = MinMaxScaler() # definir scaler de datos 
X_train = scaler.fit_transform(X_train) # escalar los datos de entrenamiento
X_test = scaler.fit_transform(X_test) # # escalar los datos de prueba

In [97]:
# Definir diccionario de valores para parámetros 
params = {'max_iter': [120], 
	      'loss' : ['squared_error', 'gamma', 'poisson'],
          "learning_rate": [0.1, 0.01, 0.001], 
          'min_samples_leaf' : [2, 3, 5],
          "max_depth": [6, 7, 8],
        #  'max_features' : [1.0, 0.5, 0.7],
          'l2_regularization' : [0.0, 0.1, 0.3] #usar si se tienen muchas variables
          }

# Definir modelo y aplicar combinaciones de parametros según el diccinario 
hgb = HistGradientBoostingRegressor() 
hgb_cv = GridSearchCV(hgb, params, cv=3, scoring='neg_mean_squared_error').fit(X_train, y_train) # elegir scoring deseano (r2, mae, mse, mape...)

hgb_cv.best_estimator_

243 fits failed out of a total of 729.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
243 fits failed with the following error:
Traceback (most recent call last):
  File "c:\ProgramFiles\Anaconda3\Lib\site-packages\sklearn\model_selection\_validation.py", line 686, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "c:\ProgramFiles\Anaconda3\Lib\site-packages\sklearn\ensemble\_hist_gradient_boosting\gradient_boosting.py", line 353, in fit
    self._validate_params()
  File "c:\ProgramFiles\Anaconda3\Lib\site-packages\sklearn\base.py", line 600, in _validate_params
    validate_parameter_constraints(
  File "c:\ProgramFiles\Anaconda3\Lib\site-packages\sklearn\utils\_param_validation.py", line 97, in validate_param

In [156]:
# Definir modelo con los mejores valores de parámetros (Nota: usar los mejores, pero hacer modificaciones para comparar metricas)
hgb_best = HistGradientBoostingRegressor(
                           max_iter = 100, 
                           max_depth = 8,
                           loss = 'poisson', 
                          # min_samples_split = 20,
                           learning_rate = 0.09, 
                           min_samples_leaf = 35,
                           max_leaf_nodes = 22,
                        #  l2_regularization = 0.1,  
                        #  max_features = 1.0, 
                           )

# Entrenar con el conjunto de entrenamiento 
hgb_best.fit(X_train, y_train) 

# Aplicar modelo sobre los datos de traint y test para predecir el target
y_train_pred_hgb = hgb_best.predict(X_train)
y_test_pred_hgb = hgb_best.predict(X_test)

# Comparar métricas en conjunto train y test (agregar la metrica deseada)
print("R2 - train:", np.round(r2_score(y_train, y_train_pred_hgb), 3))
print("MSE - train:", np.round(mean_squared_error(y_train, y_train_pred_hgb), 2))
print("MAE - train:", np.round(mean_absolute_error(y_train, y_train_pred_hgb), 2))
print("MAPE - train:", np.round(mean_absolute_percentage_error(y_train, y_train_pred_hgb), 2))
print("")
print("R2 - test:", np.round(r2_score(y_test, y_test_pred_hgb), 3))
print("MSE - test:", np.round(mean_squared_error(y_test, y_test_pred_hgb), 2))
print("MAE - test:", np.round(mean_absolute_error(y_test, y_test_pred_hgb), 2))
print("MAPE - test:", np.round(mean_absolute_percentage_error(y_test, y_test_pred_hgb), 2))

R2 - train: 0.979
MSE - train: 618081.69
MAE - train: 353.11
MAPE - train: 0.51

R2 - test: 0.939
MSE - test: 1826561.42
MAE - test: 477.12
MAPE - test: 0.58


NOTA: USAR QUANTILES EN EL hgb PARA ESTIMAR INTERVALO DE CONFIANZA.

#### XGBoost

#### RNN