In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.model_selection import ShuffleSplit, StratifiedShuffleSplit
from sklearn.ensemble import RandomForestClassifier
from sklearn.impute import SimpleImputer

from joblib import Parallel, delayed

from time import time

import plotly

import optuna
from optuna.visualization import plot_param_importances, plot_contour,  plot_slice, plot_optimization_history

In [2]:
# Locations of the input dataset, saved models and the Optuna database.
# NOTE(review): these are absolute, machine-specific paths; the notebook will
# only run elsewhere after editing them.
dataset_path = 'C:/Eugenio/Maestria/DMEyF/datasets/'
dataset_file = 'competencia_01_fe.csv'

modelos_path = 'C:/Eugenio/Maestria/DMEyF/modelos/' # where models are saved so they do not have to be re-run
db_path = 'C:/Eugenio/Maestria/DMEyF/db/'

# Random seeds used as `random_state` for the models below.
semillas = [122219, 109279, 400391, 401537, 999961]

# Full dataset; later cells filter it by the `foto_mes` column.
data = pd.read_csv(dataset_path+dataset_file)

In [3]:
# Values of `foto_mes` used to select the training and test rows
# (presumably encoded as YYYYMM -- confirm against the dataset).
mes_train = 202102
mes_test = 202104

In [5]:
# conjunto de train
X = data[data['foto_mes'] == mes_train]
y = X['clase_ternaria']
X = X.drop(columns=['clase_ternaria'])

# conjunto de test
X_futuro = data[data['foto_mes'] == mes_test]
y_futuro = X_futuro['clase_ternaria']
X_futuro = X_futuro.drop(columns=['clase_ternaria'])

In [7]:
# Payoff constants read by `ganancia_prob`:
# gain for each flagged row whose actual class is "BAJA+2" ...
ganancia_acierto = 273000
# ... and cost charged for every other flagged row.
costo_estimulo = 7000

In [8]:
# Gain function taking probabilities directly, so it can be used more generically.

def ganancia_prob(y_hat, y, prop=1, class_index=1, threshold=0.025):
    """Total gain obtained by flagging every row whose score reaches `threshold`.

    Parameters
    ----------
    y_hat : 2-D array-like
        Per-class scores, one row per sample (e.g. `predict_proba` output).
    y : 1-D array-like
        Actual class labels, aligned with the rows of `y_hat`.
    prop : float, default 1
        Divisor applied to the total, to rescale a gain computed on a sample.
    class_index : int, default 1
        Column of `y_hat` holding the score of the "BAJA+2" class.
    threshold : float, default 0.025
        Minimum score for a row to be flagged.

    Returns
    -------
    float
        Sum over flagged rows of `ganancia_acierto` (actual is "BAJA+2") or
        `-costo_estimulo` (any other class), divided by `prop`.
    """
    scores = np.asarray(y_hat)[:, class_index]
    actual = np.asarray(y)

    # Payoff each row would yield if it were flagged.
    payoff = np.where(actual == "BAJA+2", ganancia_acierto, -costo_estimulo)

    # The caller's `threshold` is honoured here; rows below it (and NaN scores,
    # which compare False) contribute nothing.
    flagged = scores >= threshold

    return (payoff * flagged).sum() / prop

## Random Forest

Los parámetros que se pueden ajustar en el **rf** son

1. **n_estimators**: Número de árboles en el bosque.
2. **max_depth**: Profundidad máxima de los árboles.
3. **min_samples_split**: Número mínimo de muestras requeridas para dividir un nodo interno.
4. **min_samples_leaf**: Número mínimo de muestras requeridas para estar en un nodo hoja.
5. **max_features**: Número (o fracción) de features a considerar al buscar el mejor split en cada nodo. **sqrt** es una elección histórica.
6. **max_leaf_nodes**: Número máximo de nodos hoja en cada árbol.
7. **oob_score**: Indica si se usa la muestra fuera de bolsa (out-of-bag) para estimar la calidad del modelo. Para evitar hacer un **montecarlo-cross-validation** que se toma su tiempo, usaremos esta opción para buscar el mejor modelo. No es la mejor opción. Pero no es tan mala.
8. **n_jobs**: Siempre -1, para que use todos los cores presentes en la máquina.
9. **max_samples**: Fracción de los samples que se usa para entrenar cada árbol.

Finalmente, nuestra función de optimización queda de la siguiente forma:

### Optimizacion

In [9]:
def objective(trial):
    """Optuna objective: out-of-bag gain of a random forest fitted on (X, y).

    Samples `max_depth`, `min_samples_split`, `min_samples_leaf` and
    `max_features` from `trial`, fits a 100-tree forest on the notebook-level
    `X` and `y`, and scores its out-of-bag class probabilities with
    `ganancia_prob`.

    Parameters
    ----------
    trial : optuna.trial.Trial
        Trial used to sample the hyperparameters.

    Returns
    -------
    float
        Gain computed on the out-of-bag predictions (to be maximized).

    Raises
    ------
    ValueError
        If "BAJA+2" is not among the classes seen during fitting.
    """
    max_depth = trial.suggest_int('max_depth', 2, 32)
    min_samples_split = trial.suggest_int('min_samples_split', 2, 2000)
    min_samples_leaf = trial.suggest_int('min_samples_leaf', 1, 200)
    max_features = trial.suggest_float('max_features', 0.05, 0.7)

    model = RandomForestClassifier(
        n_estimators=100,
        max_depth=max_depth,
        min_samples_split=min_samples_split,
        min_samples_leaf=min_samples_leaf,
        max_features=max_features,
        max_samples=0.7,
        random_state=semillas[0],
        n_jobs=-1,
        oob_score=True
    )

    model.fit(X, y)

    # Look up the "BAJA+2" column explicitly instead of relying on it being
    # column 1; the columns of oob_decision_function_ follow model.classes_.
    class_index = list(model.classes_).index("BAJA+2")

    return ganancia_prob(model.oob_decision_function_, y, class_index=class_index)


[I 2024-09-29 16:57:56,969] Using an existing study with name 'exp_206_random-forest-opt' instead of creating a new one.


In [None]:
# NOTE(author): a separate database file was probably not needed; the same
# database can likely be reused and only the study name has to change.
study_name = "exp_206_random-forest-opt"
storage_name = f"sqlite:///{db_path}optimization_forest.db"

# Create the study, or reopen it if one with this name is already stored.
study = optuna.create_study(
    study_name=study_name,
    storage=storage_name,
    direction="maximize",
    load_if_exists=True,
)

In [86]:
# Hyperparameter search. Presumably left commented out so that re-running the
# notebook does not repeat it: the study is reloaded from storage, and the
# logged trials below each took minutes to finish.
# study.optimize(objective, n_trials=100)

[I 2024-09-21 20:49:23,138] Trial 0 finished with value: 98987000.0 and parameters: {'max_depth': 9, 'min_samples_split': 85, 'min_samples_leaf': 27, 'max_features': 0.3490297402571165}. Best is trial 0 with value: 98987000.0.
[I 2024-09-21 20:56:18,995] Trial 1 finished with value: 94850000.0 and parameters: {'max_depth': 26, 'min_samples_split': 888, 'min_samples_leaf': 71, 'max_features': 0.6338603400919427}. Best is trial 0 with value: 98987000.0.
[I 2024-09-21 21:03:58,897] Trial 2 finished with value: 97790000.0 and parameters: {'max_depth': 24, 'min_samples_split': 2, 'min_samples_leaf': 56, 'max_features': 0.6717375095431893}. Best is trial 0 with value: 98987000.0.
[I 2024-09-21 21:05:14,947] Trial 3 finished with value: 98266000.0 and parameters: {'max_depth': 32, 'min_samples_split': 123, 'min_samples_leaf': 153, 'max_features': 0.13875635509093515}. Best is trial 0 with value: 98987000.0.
[I 2024-09-21 21:06:15,570] Trial 4 finished with value: 86037000.0 and parameters: {'

In [87]:
# Objective value per trial, with the best value reached so far.
plot_optimization_history(study)

In [10]:
# Importance of each searched hyperparameter for the objective value.
plot_param_importances(study)

In [11]:
# Objective value against each hyperparameter, one panel per parameter.
plot_slice(study)

In [None]:
# Hyperparameters of the best trial stored in the study.
study.best_trial.params

### Modelo

In [12]:
# Final random forest with hand-entered hyperparameters.
# NOTE(review): these values are typed in by hand rather than read from
# study.best_trial.params, and max_samples=0.7 (used during the search) is
# not applied here -- confirm both are intentional.
model = RandomForestClassifier(n_estimators=100,
                               max_features=0.35,
                               max_depth=28,
                               min_samples_split=199,
                               min_samples_leaf=95,
                               random_state=semillas[3],
                               n_jobs=-1)  # use all cores, as in the search; does not change the fitted model

model.fit(X, y)

In [15]:
# Gain of the fitted model on its own training data (in-sample, so optimistic).
# `ganancia` is not defined in this notebook, so the gain is computed with
# `ganancia_prob` on the predicted probabilities; the "BAJA+2" column is
# looked up in model.classes_, whose order predict_proba follows.
idx_baja2 = list(model.classes_).index("BAJA+2")
print(f"Ganancia de modelo Base: {ganancia_prob(model.predict_proba(X), y, class_index=idx_baja2)}")

Ganancia de modelo Base: 176225000.0


In [16]:
# Pair every training column with the importance the fitted forest gave it.
feature_importances = pd.DataFrame(
    {
        'feature': X.columns,
        'importance': model.feature_importances_,
    }
)

# Show the 20 most important features, highest first.
feature_importances.sort_values(by='importance', ascending=False).head(20)

Unnamed: 0,feature,importance
107,ctrx_quarter,0.095718
286,avg3_ctrx_quarter,0.095591
2,active_quarter,0.058121
201,avg3_mcaja_ahorro,0.036834
166,consumos_mas_DA,0.034722
33,mprestamos_personales,0.030956
204,avg3_mcuentas_saldo,0.03082
22,mcuentas_saldo,0.030752
154,suma_alt,0.030239
18,mcaja_ahorro,0.029581
