In [46]:
# Librerías 
import pandas as pd
import polars as pl
import numpy as np
from statistics import mean
from sklearn.model_selection import train_test_split, RandomizedSearchCV, KFold
from sklearn.preprocessing import MinMaxScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report
from pprint import pprint
import pickle

In [2]:
# Lectura 
encig19_inc_raw = pd.read_csv("../data/2_interim/encig19_inc_clean.csv", encoding = "latin1")
variables_contextuales = pd.read_csv("../data/3_final/indicadores_fs.csv", encoding = "utf-8")

In [3]:
# Vista de los DFs 
encig19_inc_raw.head()
encig19_inc_raw.shape

(255, 6)

In [4]:
variables_contextuales.head()

Unnamed: 0,nom_ent,nom_mun,mun_inegi,anio,pobtot,cs_revenue_perCapita,cs_total_general_fund,cs_intergov_revs_total_revs,cs_propety_tax_total_revs,bs_operating_exp,...,egresos_inversiones_financieras_y_otras_provisiones,prop_corrup_per17,prop_corrup_inc17,prop_corrup5_inc17,prop_corrup_per19,prop_corrup_inc19,prop_corrup5_inc19,prop_corrup_per21,prop_corrup_inc21,prop_corrup5_inc21
0,Aguascalientes,Aguascalientes,1001,2018,948990,3845.651965,0.332662,0.606587,0.121541,0.713073,...,0,34.710836,12.990008,14.104639,,,,,,
1,Aguascalientes,Asientos,1002,2018,51536,3608.715888,0.063044,0.337801,0.021163,0.218249,...,0,,,,,,,,,
2,Aguascalientes,Calvillo,1003,2018,58250,6231.560086,0.115889,2.429067,0.031448,1.500763,...,0,,,,,,,,,
3,Aguascalientes,Cosío,1004,2018,17000,7270.570529,0.047614,0.027078,0.009705,0.017147,...,0,,,,,,,,,
4,Aguascalientes,Jesús María,1005,2018,129929,3913.122729,0.351805,0.481168,0.198061,0.593563,...,0,46.724076,4.166667,7.569346,,,,,,


In [5]:
# Limpieza

# Se delimitan las variables contextuales al año 2019
variables_contextuales19 = variables_contextuales[variables_contextuales.anio == 2019]

# Se crea el indicador de corrupcion
encig19_inc_clean = encig19_inc_raw
encig19_inc_clean["mean_corrup19"] = encig19_inc_clean.prop_corrup_inc19.mean()
encig19_inc_clean["corrup"] = np.where(encig19_inc_clean.prop_corrup_inc19 > encig19_inc_clean.prop_corrup_inc19.mean(), 1, 0)
encig19_inc_clean.head()
encig19_inc_clean.shape

(255, 8)

In [6]:
# Se crea un DF con la proporcion de corrupcion
efipem19_corr_raw = pd.merge(encig19_inc_clean, variables_contextuales19, on = "mun_inegi", how = "left")
efipem19_corr_raw.columns

Index(['mun_inegi', 'anio_x', 'nom_ent_x', 'nom_mun_x', 'prop_corrup_inc19_x',
       'prop_corrup5_inc19_x', 'mean_corrup19', 'corrup', 'nom_ent_y',
       'nom_mun_y', 'anio_y', 'pobtot', 'cs_revenue_perCapita',
       'cs_total_general_fund', 'cs_intergov_revs_total_revs',
       'cs_propety_tax_total_revs', 'bs_operating_exp', 'bs_operating_balance',
       'bs_expenditure_perCapita', 'lrs_direct_long_term_debt_pobtot',
       'lrs_debt_service_total_rev', 'mun_tipo', 'graproes', 'pea', 'pe_inac',
       'pocupada', 'pdesocup', 'psinder', 'pder_ss', 'analf', 'sbasc', 'ovsde',
       'ovsee', 'ovsae', 'ovpt', 'vhac', 'pl.5000', 'po2sm', 'im_2020',
       'gm_2020', 'imn_2020', 'egresos_total_de_egresos',
       'egresos_servicios_personales', 'egresos_materiales_y_suministros',
       'egresos_servicios_generales',
       'egresos_transferencias_asignaciones_subsidios_y_otras_ayudas',
       'egresos_bienes_muebles_inmuebles_e_intangibles',
       'egresos_inversion_publica', 'egres

In [9]:
# Se verifica por NAs
for col in pl.DataFrame(efipem19_corr_raw).get_columns():
    print(f'{col.name} - {col.is_null().sum()}') # Ya no hay NAs

mun_inegi - 0
anio_x - 0
nom_ent_x - 0
nom_mun_x - 0
prop_corrup_inc19_x - 0
prop_corrup5_inc19_x - 1
mean_corrup19 - 0
corrup - 0
nom_ent_y - 22
nom_mun_y - 22
anio_y - 22
pobtot - 22
cs_revenue_perCapita - 22
cs_total_general_fund - 22
cs_intergov_revs_total_revs - 22
cs_propety_tax_total_revs - 36
bs_operating_exp - 22
bs_operating_balance - 22
bs_expenditure_perCapita - 22
lrs_direct_long_term_debt_pobtot - 83
lrs_debt_service_total_rev - 83
mun_tipo - 22
graproes - 22
pea - 22
pe_inac - 22
pocupada - 22
pdesocup - 22
psinder - 22
pder_ss - 22
analf - 22
sbasc - 22
ovsde - 22
ovsee - 22
ovsae - 22
ovpt - 22
vhac - 22
pl.5000 - 22
po2sm - 22
im_2020 - 22
gm_2020 - 22
imn_2020 - 22
egresos_total_de_egresos - 22
egresos_servicios_personales - 22
egresos_materiales_y_suministros - 22
egresos_servicios_generales - 22
egresos_transferencias_asignaciones_subsidios_y_otras_ayudas - 22
egresos_bienes_muebles_inmuebles_e_intangibles - 22
egresos_inversion_publica - 22
egresos_deuda_publica -

In [12]:
# Se eliminan las columnas extras
efipem19_corr_clean = (
                            pl.DataFrame(efipem19_corr_raw)
                            .lazy()
                            .select(pl.col("mun_inegi", "nom_ent_x", "nom_mun_x", 'mun_tipo', 'graproes', 'pea', 'pe_inac', 'pocupada', 'pdesocup',
       'psinder', 'pder_ss', 'analf', 'sbasc', 'ovsde', 'ovsee', 'ovsae',
       'ovpt', 'vhac', 'pl.5000', 'po2sm', 'im_2020',
       'egresos_total_de_egresos', 'egresos_servicios_personales',
       'egresos_materiales_y_suministros', 'egresos_servicios_generales',
       'egresos_transferencias_asignaciones_subsidios_y_otras_ayudas',
       'egresos_bienes_muebles_inmuebles_e_intangibles',
       'egresos_inversion_publica', 'egresos_deuda_publica',
       'ingresos_total_de_ingresos', 'ingresos_impuestos',
       'ingresos_contribuciones_de_mejoras', 'ingresos_derechos',
       'ingresos_productos', 'ingresos_aprovechamientos',
       'ingresos_participaciones_federales',
       'ingresos_aportaciones_federales_y_estatales',
       'ingresos_financiamiento', 'egresos_disponibilidad_final',
       'egresos_otros_egresos', 'ingresos_otros_ingresos',
       'ingresos_disponibilidad_inicial',
       'egresos_inversiones_financieras_y_otras_provisiones', "corrup").fill_null(0))
                            .collect()
                            .to_dummies(columns = ["mun_tipo"])
                            .drop("mun_tipo_0")
                        )
efipem19_corr_clean

mun_inegi,nom_ent_x,nom_mun_x,mun_tipo_metropolitano,mun_tipo_rural,mun_tipo_transición_rural-urbano,mun_tipo_urbano,graproes,pea,pe_inac,pocupada,pdesocup,psinder,pder_ss,analf,sbasc,ovsde,ovsee,ovsae,ovpt,vhac,pl.5000,po2sm,im_2020,egresos_total_de_egresos,egresos_servicios_personales,egresos_materiales_y_suministros,egresos_servicios_generales,egresos_transferencias_asignaciones_subsidios_y_otras_ayudas,egresos_bienes_muebles_inmuebles_e_intangibles,egresos_inversion_publica,egresos_deuda_publica,ingresos_total_de_ingresos,ingresos_impuestos,ingresos_contribuciones_de_mejoras,ingresos_derechos,ingresos_productos,ingresos_aprovechamientos,ingresos_participaciones_federales,ingresos_aportaciones_federales_y_estatales,ingresos_financiamiento,egresos_disponibilidad_final,egresos_otros_egresos,ingresos_otros_ingresos,ingresos_disponibilidad_inicial,egresos_inversiones_financieras_y_otras_provisiones,corrup
i64,str,str,u8,u8,u8,u8,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,i32
1001,"""Aguascalientes...","""Aguascalientes...",1,0,0,0,10.84,486675.0,269195.0,476502.0,10173.0,173199.0,774363.0,1.644738,20.36722,0.104799,0.113169,0.37861,0.591434,10.33953,7.523683,54.226594,60.318795,3.5741e9,1.5872e9,6.1592189e8,5.81511256e8,4.87114758e8,3.1274269e7,1.95502027e8,7.5553581e7,3.5741e9,5.31295259e8,135340.0,6.40793344e8,5.570778e6,4.2059037e7,1.4938e9,7.74585855e8,8.5844402e7,0.0,0.0,0.0,0.0,0.0,1
1005,"""Aguascalientes...","""Jesús María""",0,0,0,1,10.22,65839.0,33247.0,64945.0,894.0,27299.0,102428.0,2.380588,26.692477,0.277034,0.354957,0.860426,1.312652,16.404575,37.164143,56.748753,59.011762,4.93812771e8,2.93205233e8,3.8786739e7,7.1202044e7,4.0418053e7,6.240976e6,2.7401471e7,9.05315e6,4.93812771e8,1.11248243e8,246385.0,5.4634723e7,2.514085e6,3.135603e6,2.02849402e8,1.1918433e8,0.0,7.505105e6,0.0,0.0,0.0,0.0,0
2001,"""Baja Californi...","""Ensenada""",1,0,0,0,10.34,231560.0,127562.0,228454.0,3106.0,77808.0,363941.0,2.287974,25.243932,0.346372,1.410374,4.574525,1.717626,13.80882,13.570088,79.431857,58.082238,1.8143e9,1.2600e9,1.12529943e8,1.80159171e8,6.7657917e7,5.326569e6,5.1925519e7,1.02192096e8,1.8143e9,3.75957159e8,0.0,2.37359543e8,5.385153e6,5.1052491e7,5.91814205e8,5.5177064e8,924431.0,3.4442092e7,0.0,0.0,0.0,0.0,1
2002,"""Baja Californi...","""Mexicali""",1,0,0,0,10.53,548784.0,311290.0,540835.0,7949.0,190666.0,854633.0,1.618303,21.501444,0.155113,0.396584,0.676314,1.226287,12.588088,11.394543,69.895625,59.384989,4.2665e9,2.6893e9,2.42923802e8,5.03028228e8,2.02289134e8,6.1974603e7,3.49582895e8,1.48544928e8,4.2665e9,9.3605859e8,0.0,4.84640503e8,4.2277257e7,1.56424139e8,1.3703e9,1.2705e9,0.0,6.8805483e7,0.0,6.226597e6,0.0,0.0,0
2004,"""Baja Californi...","""Tijuana""",1,0,0,0,10.18,1.051417e6,501232.0,1.03846e6,12957.0,481326.0,1.426523e6,1.46975,24.583879,0.1561,0.184237,1.159385,2.147023,14.972565,1.704115,74.592469,59.140287,7.6687e9,4.0493e9,9.12150806e8,1.0134e9,4.22021032e8,2.09646745e8,6.75515275e8,2.87177314e8,7.6687e9,2.0113e9,0.0,5.37315285e8,5.0323219e7,1.99445086e8,2.5413e9,2.3258e9,0.0,8.9613204e7,9.886582e6,3.271233e6,0.0,0.0,0
3003,"""Baja Californi...","""La Paz""",1,0,0,0,11.07,154830.0,84914.0,152265.0,2565.0,41161.0,249591.0,1.783446,19.888742,0.436283,1.077344,2.36889,2.106201,11.964861,9.818609,48.345112,59.853253,1.5905e9,7.96103297e8,1.99049175e8,2.85823447e8,6.9698011e7,6.7252376e7,1.28395181e8,4.4178072e7,1.5905e9,3.22094049e8,0.0,9.5047053e7,2.829005e6,1.0950621e7,5.97748127e8,3.56272638e8,1.81651856e8,0.0,0.0,2.390621e7,0.0,0.0,0
4002,"""Campeche""","""Campeche""",1,0,0,0,10.72,149442.0,89926.0,147105.0,2337.0,57005.0,236338.0,3.308902,22.971498,0.716927,0.340677,0.655017,0.980697,24.155493,9.925292,67.749146,58.640701,1.6501e9,5.9914607e8,1.03725697e8,3.21835076e8,2.1815469e8,2.0599468e7,8.8877863e7,9.8279901e7,1.6501e9,1.06001123e8,0.0,2.59227688e8,1.5310541e7,1.5323761e7,6.81212532e8,5.04044024e8,6.9e7,1.99500904e8,0.0,0.0,0.0,0.0,0
4003,"""Campeche""","""Carmen""",1,0,0,0,10.26,125297.0,70830.0,122704.0,2593.0,57600.0,187057.0,3.928864,26.493564,0.666552,0.663278,5.758929,2.418576,27.367546,14.859451,55.028952,58.095083,1.7477e9,7.26456043e8,1.1174574e8,3.69468848e8,1.38229914e8,1.0199904e7,1.70118866e8,2.21508706e8,1.7477e9,1.10959683e8,0.0,1.35210295e8,9.580904e6,5.6254141e7,7.17380783e8,6.36085256e8,8.1572059e7,0.0,0.0,684900.0,0.0,0.0,0
5002,"""Coahuila de Za...","""Acuña""",1,0,0,0,9.4,82890.0,42887.0,81675.0,1215.0,31769.0,131001.0,1.684883,27.118247,0.169818,0.316255,0.590671,0.896511,18.769361,1.737419,87.167368,58.481712,6.02008103e8,2.36892629e8,4.4466243e7,8.6126467e7,9.4765013e7,1.4651265e7,1.18634582e8,6.471904e6,6.02008103e8,9.207206e7,61720.0,8.0511324e7,5.999949e6,8.162555e6,2.36954078e8,1.55947624e8,2.2298793e7,0.0,0.0,0.0,0.0,0.0,1
5010,"""Coahuila de Za...","""Frontera""",0,0,0,1,10.0,36013.0,28388.0,34852.0,1161.0,13095.0,69234.0,1.500982,21.339072,0.130126,0.154449,0.383081,0.322275,17.841221,8.696866,50.310298,60.084271,2.64593807e8,1.18168716e8,2.7657376e7,6.3984621e7,9.123231e6,6.686434e6,3.8973429e7,0.0,2.64593807e8,4.1438118e7,0.0,1.5167503e7,205481.0,1.17755e6,1.3132082e8,7.2671271e7,2.613064e6,0.0,0.0,0.0,0.0,0.0,0


In [13]:
# Se verifica por NAs
for col in efipem19_corr_clean.get_columns():
    print(f'{col.name} - {col.is_null().sum()}') # Ya no hay NAs

mun_inegi - 0
nom_ent_x - 0
nom_mun_x - 0
mun_tipo_metropolitano - 0
mun_tipo_rural - 0
mun_tipo_transición_rural-urbano - 0
mun_tipo_urbano - 0
graproes - 0
pea - 0
pe_inac - 0
pocupada - 0
pdesocup - 0
psinder - 0
pder_ss - 0
analf - 0
sbasc - 0
ovsde - 0
ovsee - 0
ovsae - 0
ovpt - 0
vhac - 0
pl.5000 - 0
po2sm - 0
im_2020 - 0
egresos_total_de_egresos - 0
egresos_servicios_personales - 0
egresos_materiales_y_suministros - 0
egresos_servicios_generales - 0
egresos_transferencias_asignaciones_subsidios_y_otras_ayudas - 0
egresos_bienes_muebles_inmuebles_e_intangibles - 0
egresos_inversion_publica - 0
egresos_deuda_publica - 0
ingresos_total_de_ingresos - 0
ingresos_impuestos - 0
ingresos_contribuciones_de_mejoras - 0
ingresos_derechos - 0
ingresos_productos - 0
ingresos_aprovechamientos - 0
ingresos_participaciones_federales - 0
ingresos_aportaciones_federales_y_estatales - 0
ingresos_financiamiento - 0
egresos_disponibilidad_final - 0
egresos_otros_egresos - 0
ingresos_otros_ingresos -

In [33]:
indicadores_fs_raw = pd.read_csv("../data/3_final/indicadores_fs.csv")
indicadores_fs_clean = indicadores_fs_raw[indicadores_fs_raw.prop_corrup_inc19.notnull()]

# Remove non-relevant columns
indicadores_fs_clean = indicadores_fs_clean.drop(["nom_ent", "nom_mun", "mun_inegi", "anio", "gm_2020", "prop_corrup_per19", "prop_corrup5_inc19", "ovsae", "ovpt", "po2sm"], axis = 1)

# Remove measure of corruption from other years
indicadores_fs_clean = indicadores_fs_clean.loc[:, ~indicadores_fs_clean.columns.str.endswith("17")]
indicadores_fs_clean = indicadores_fs_clean.loc[:, ~indicadores_fs_clean.columns.str.endswith("21")]

na_cols = []
for col in indicadores_fs_clean.columns:
  na_count = indicadores_fs_clean[col].isna().sum()
  if na_count > 0:
    print(f'{col} - {na_count}')
    na_cols.append(col) 

# Fill NAs
indicadores_fs_clean = indicadores_fs_clean.fillna(value = {"cs_propety_tax_total_revs": indicadores_fs_clean.cs_propety_tax_total_revs.median(),
                                                    "lrs_direct_long_term_debt_pobtot": indicadores_fs_clean.lrs_direct_long_term_debt_pobtot.median(),
                                                    "lrs_debt_service_total_rev": indicadores_fs_clean.lrs_debt_service_total_rev.median()})

indicadores_fs_clean = pd.get_dummies(indicadores_fs_clean, columns = ["mun_tipo"])

# Add feature variable 
indicadores_fs_clean["corrup"] = np.where(indicadores_fs_clean.prop_corrup_inc19 > indicadores_fs_clean.prop_corrup_inc19.mean(), 1, 0)
indicadores_fs_clean = indicadores_fs_clean.drop(["prop_corrup_inc19"], axis = 1)

# Distribution of classes
indicadores_fs_clean.corrup.value_counts()

cs_propety_tax_total_revs - 14
lrs_direct_long_term_debt_pobtot - 61
lrs_debt_service_total_rev - 61


0    119
1    114
Name: corrup, dtype: int64

In [34]:
# Split features from target
X = np.array(indicadores_fs_clean.drop(["corrup"], axis = 1))
Y = np.array(indicadores_fs_clean["corrup"])

# Normalize features
scaler = MinMaxScaler()
X = scaler.fit_transform(X)

# Split Train, test datasets
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size = 0.2, random_state = 42)

print("X_train:", X_train.shape)
print("X_test:", X_test.shape)

print("y_train:", y_train.shape)
print("y_test:", y_test.shape)

# Compare size of classes in sets
class_train, counts_train = np.unique(y_train, return_counts = True)
class_test, counts_test = np.unique(y_test, return_counts = True)

print("Y train")
print(np.asarray((class_train, counts_train)).T)
print("Y test")
print(np.asarray((class_test, counts_test)).T)


X_train: (186, 51)
X_test: (47, 51)
y_train: (186,)
y_test: (47,)
Y train
[[ 0 94]
 [ 1 92]]
Y test
[[ 0 25]
 [ 1 22]]


In [14]:
# Se separan las features del target
X = efipem19_corr_clean.drop(["mun_inegi", "nom_ent_x", "nom_mun_x", "corrup"]).to_numpy()
Y = efipem19_corr_clean.select(pl.col("corrup")).to_numpy()

# Se normalizan los datos en X
scaler = MinMaxScaler()
X = scaler.fit_transform(X)

print("Conjunto completo:", efipem19_corr_clean.shape)
print("Features:", X.shape)
print("Target:", Y.shape)

# Se crean los conjuntos de entrenamiento y prueba
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size = 0.2, random_state = 42)

print("X_train:", X_train.shape)
print("X_test:", X_test.shape)

print("y_train:", y_train.shape)
print("y_test:", y_test.shape)

Conjunto completo: (255, 47)
Features: (255, 43)
Target: (255, 1)
X_train: (204, 43)
X_test: (51, 43)
y_train: (204, 1)
y_test: (51, 1)


In [35]:
# Se compara la prevalencia de la clase positiva en ambos conjuntos
class_train, counts_train = np.unique(y_train, return_counts = True)
class_test, counts_test = np.unique(y_test, return_counts = True)

print("Y train")
print(np.asarray((class_train, counts_train)).T)
print("Y test")
print(np.asarray((class_test, counts_test)).T)
# Lucen bien balanceadas

Y train
[[ 0 94]
 [ 1 92]]
Y test
[[ 0 25]
 [ 1 22]]


In [72]:
# Cross Validation

## Random Search

# Number of trees in random forest
n_estimators = [int(x) for x in np.linspace(start = 100, stop = 1000, num = 10)]

# Number of features to consider at every split
max_features = ['auto', 'sqrt']
# Maximum number of levels in tree
max_depth = [int(x) for x in np.linspace(10, 200, num = 20)]
max_depth.append(None)

# Minimum number of samples required to split a node
min_samples_split = [2, 5, 10, 15, 20, 25, 30, 35, 40]
# Minimum number of samples required at each leaf node
min_samples_leaf = [1, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20]
# Method of selecting samples for training each tree
bootstrap = [True, False]

# Create the random grid
random_grid = {'n_estimators': n_estimators,
               'max_features': max_features,
               'max_depth': max_depth,
               'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf,
               'bootstrap': bootstrap}

pprint(random_grid)


{'bootstrap': [True, False],
 'max_depth': [10,
               20,
               30,
               40,
               50,
               60,
               70,
               80,
               90,
               100,
               110,
               120,
               130,
               140,
               150,
               160,
               170,
               180,
               190,
               200,
               None],
 'max_features': ['auto', 'sqrt'],
 'min_samples_leaf': [1, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20],
 'min_samples_split': [2, 5, 10, 15, 20, 25, 30, 35, 40],
 'n_estimators': [100, 200, 300, 400, 500, 600, 700, 800, 900, 1000]}


In [83]:
# Use the random grid to search for best hyperparameters
# First create the base model to tune
rf = RandomForestClassifier(random_state = 42)

print("Parameters of RF:\n")
pprint(rf.get_params())
# Random search of parameters, using 3 fold cross validation, 
# search across 100 different combinations, and use all available cores
rf_random = RandomizedSearchCV(estimator=rf, param_distributions=random_grid,
                              n_iter = 100, scoring='neg_mean_absolute_error', 
                              cv = 3, verbose=2, random_state=42, n_jobs=2,
                              return_train_score=True)

# Fit the random search model
rf_random.fit(X_train, y_train);

print("BEST PARAMS RF:\n")
print(rf_random.best_params_)

def evaluate(model, test_features, test_labels):
    predictions = model.predict(test_features)
    accuracy = accuracy_score(y_true = test_labels, y_pred = predictions)
    print('Model Performance')
    print('Accuracy = {:0.2f}%.'.format(accuracy*100))
    
    return accuracy

# Evaluate base model
base_model = RandomForestClassifier(n_estimators = 10, random_state = 42)
base_model.fit(X_train, y_train)
base_accuracy = evaluate(base_model, X_test, y_test)

# Evaluate random search model
best_random = rf_random.best_estimator_
random_accuracy = evaluate(best_random, X_test, y_test)

print('Improvement of {:0.2f}%.'.format( 100 * (random_accuracy - base_accuracy) / base_accuracy))

Parameters of RF:

{'bootstrap': True,
 'ccp_alpha': 0.0,
 'class_weight': None,
 'criterion': 'gini',
 'max_depth': None,
 'max_features': 'sqrt',
 'max_leaf_nodes': None,
 'max_samples': None,
 'min_impurity_decrease': 0.0,
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'min_weight_fraction_leaf': 0.0,
 'n_estimators': 100,
 'n_jobs': None,
 'oob_score': False,
 'random_state': 42,
 'verbose': 0,
 'warm_start': False}
Fitting 3 folds for each of 100 candidates, totalling 300 fits
BEST PARAMS RF:

{'n_estimators': 100, 'min_samples_split': 5, 'min_samples_leaf': 6, 'max_features': 'auto', 'max_depth': 190, 'bootstrap': True}
Model Performance
Accuracy = 48.94%.
Model Performance
Accuracy = 55.32%.
Improvement of 13.04%.


  warn(


In [87]:
# Validacion cruzada

kf = KFold(n_splits = 10)
CLF_rf = []
f1_rf = []
for train_idx, val_idx in kf.split(X_train):

    # Conjuntos de entrenamiento y validacion
    X_train_kf, y_train_kf = X_train[train_idx], y_train[train_idx]
    X_val, y_val = X_train[val_idx], y_train[val_idx]

    # Random Forest
    clf = RandomForestClassifier(n_estimators = rf_random.best_params_["n_estimators"],
                                 min_samples_split = rf_random.best_params_["min_samples_leaf"],
                                 min_samples_leaf = rf_random.best_params_["min_samples_leaf"],
                                max_features = rf_random.best_params_["max_features"],
                                max_depth = rf_random.best_params_["max_depth"],
                                bootstrap = rf_random.best_params_["bootstrap"]).fit(X_train_kf, y_train_kf)

    # Predicciones
    y_pred = clf.predict(X_val)

    CLF_rf.append(clf)

    print("-"*50)
    print("Resultados del entrenamiento:", len(CLF_rf)-1)
    report = classification_report(y_val, y_pred, digits = 4, output_dict = True)
    f1 = report["macro avg"]["f1-score"]
    f1_rf.append(f1)
    print("Macro avg - F1-score:", round(f1, 4))

# Media total 
print("F1 medio: ", mean(f1_rf))


# Evaluacion final
y_pred_final = CLF_rf[f1_rf.index(max(f1_rf))].predict(X_test)
print(f"RANDOM FOREST EVALUATION\nUsing model with F1 = {max(f1_rf)}")
print(classification_report(y_test, y_pred_final))

  warn(


--------------------------------------------------
Resultados del entrenamiento: 0
Macro avg - F1-score: 0.4722


  warn(


--------------------------------------------------
Resultados del entrenamiento: 1
Macro avg - F1-score: 0.7339


  warn(


--------------------------------------------------
Resultados del entrenamiento: 2
Macro avg - F1-score: 0.5128


  warn(


--------------------------------------------------
Resultados del entrenamiento: 3
Macro avg - F1-score: 0.4722


  warn(


--------------------------------------------------
Resultados del entrenamiento: 4
Macro avg - F1-score: 0.5682


  warn(


--------------------------------------------------
Resultados del entrenamiento: 5
Macro avg - F1-score: 0.6275


  warn(


--------------------------------------------------
Resultados del entrenamiento: 6
Macro avg - F1-score: 0.2571


  warn(


--------------------------------------------------
Resultados del entrenamiento: 7
Macro avg - F1-score: 0.6494


  warn(


--------------------------------------------------
Resultados del entrenamiento: 8
Macro avg - F1-score: 0.55


  warn(


--------------------------------------------------
Resultados del entrenamiento: 9
Macro avg - F1-score: 0.6099
F1 medio:  0.5453191940498442
RANDOM FOREST EVALUATION
Using model with F1 = 0.7338935574229692
              precision    recall  f1-score   support

           0       0.59      0.76      0.67        25
           1       0.60      0.41      0.49        22

    accuracy                           0.60        47
   macro avg       0.60      0.58      0.58        47
weighted avg       0.60      0.60      0.58        47



In [18]:
# Guardar modelo
pickle.dump(CLF_rf[0], open("../models/rf_90acc.sav", "wb"))