In [1]:
import pandas as pd
from sklearn.tree import plot_tree
import matplotlib.pyplot as plt
import optuna
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.impute import SimpleImputer
import polars as pl

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
dataset = pl.read_csv(
    "competencia_02_normalizada_1.csv",
    infer_schema_length=10000,
    schema_overrides={"Master_madelantodolares": pl.Float64}
)

In [3]:
dataset.shape

(4735593, 145)

In [4]:
competencia_02 = dataset.to_pandas()

### Feature Engineering

In [5]:
important_features = ['mpayroll', 'mpasivos_margen', 'mcomisiones', 'mcuentas_saldo']

In [6]:
competencia_02 = competencia_02.sort_values(by=['numero_de_cliente', 'foto_mes'])

In [7]:
for columna in important_features:
    competencia_02[f'{columna}_acumulado'] = competencia_02.groupby('numero_de_cliente')[columna].cumsum()

In [8]:
competencia_02[['numero_de_cliente','foto_mes','mpasivos_margen', 'mpasivos_margen_acumulado']]

Unnamed: 0,numero_de_cliente,foto_mes,mpasivos_margen,mpasivos_margen_acumulado
0,249221109,201901,67.683421,67.683421
124752,249221109,201902,53.145455,120.828876
250551,249221109,201903,57.412308,178.241183
376987,249221109,201904,78.893023,257.134207
503983,249221109,201905,0.000000,257.134207
...,...,...,...,...
4735588,1603590310,202108,0.000000,0.000000
4735589,1603703854,202108,0.009396,0.009396
4735590,1603775178,202108,0.357088,0.357088
4735591,1603805076,202108,0.000000,0.000000


### Optimización

In [9]:
dataset = competencia_02.copy()

In [10]:
dtrain = dataset[dataset["foto_mes"] <= 202106][dataset["foto_mes"]>202012]
dapply = dataset[dataset["foto_mes"] == 202108]

  dtrain = dataset[dataset["foto_mes"] <= 202106][dataset["foto_mes"]>202012]


In [11]:
min(dtrain['foto_mes'])

202101

In [12]:
max(dtrain['foto_mes'])

202106

In [13]:
X_train = dtrain.drop(columns=["clase_ternaria", "foto_mes"])
y_train = dtrain["clase_ternaria"]

In [14]:
X_apply = dapply.drop(columns=["clase_ternaria", "foto_mes"])

In [15]:
imp_mean = SimpleImputer(missing_values=np.nan, strategy='mean')
X_train = imp_mean.fit_transform(X_train)
X_apply = imp_mean.fit_transform(X_apply)

In [16]:
ganancia_acierto = 273000
costo_estimulo = 7000
semillas = [1994]

In [17]:
def ganancia_prob(y_hat, y, prop=1, class_index=1, threshold=0.025):
  @np.vectorize
  def ganancia_row(predicted, actual, threshold=0.025):
    return  (predicted >= threshold) * (ganancia_acierto if actual == "BAJA+2" else -costo_estimulo)

  return ganancia_row(y_hat[:,class_index], y).sum() / prop

In [18]:
def objective(trial):
    max_depth = trial.suggest_int('max_depth', 2, 32)
    min_samples_split = trial.suggest_int('min_samples_split', 2, 2000)
    min_samples_leaf = trial.suggest_int('min_samples_leaf', 1, 200)
    max_features = trial.suggest_float('max_features', 0.05, 0.7)

    model = RandomForestClassifier(
        n_estimators=100,
        max_depth=max_depth,
        min_samples_split=min_samples_split,
        min_samples_leaf=min_samples_leaf,
        max_features=max_features,
        max_samples=0.7,
        random_state=semillas[0],
        n_jobs=-1,
        oob_score=True
    )

    model.fit(X_train, y_train)

    return ganancia_prob(model.oob_decision_function_, y_train)

#storage_name = "sqlite:///" + db_path + "optimization_tree.db"
study_name = "exp_206_random-forest-opt"

study = optuna.create_study(
    direction="maximize",
    study_name=study_name,
    #storage=storage_name,
    load_if_exists=True,
)

[I 2024-12-19 20:43:53,472] A new study created in memory with name: exp_206_random-forest-opt


In [None]:
#study.optimize(objective, n_trials=100)

In [None]:
parameters: {'max_depth': 21, 'min_samples_split': 159, 'min_samples_leaf': 84, 'max_features': 0.3320855181470695}. Best is trial 76 with value: 624995000.0.

In [20]:
# Crear y entrenar el modelo de árbol de decisión
modelo = RandomForestClassifier(
        max_depth=21,
        min_samples_split=159,
        min_samples_leaf=84,
        max_features=0.3320855181470695,
        random_state=12000,
        n_jobs=-1,
        oob_score=True
    )

In [None]:
modelo.fit(X_train, y_train)

In [None]:
imputer = SimpleImputer(strategy='mean')
X_apply_imputed = imputer.fit_transform(X_apply)
prediccion = modelo.predict_proba(X_apply_imputed)
dapply["prob_baja2"] = prediccion[:, list(modelo.classes_).index("BAJA+2")]
dapply["Predicted"] = (dapply["prob_baja2"] > 1/40).astype(int)
dapply[dapply['Predicted']==1]
dapply.shape
dapply[["numero_de_cliente", "Predicted"]].to_csv("K02_shap_modelo_base.csv", index=False, sep=",")

In [19]:
semillas = [100]
def correr_modelo_varias_semillas(semilla):
    modelo = RandomForestClassifier(
        max_depth=21,
        min_samples_split=159,
        min_samples_leaf=84,
        max_features=0.3320855181470695,
        random_state=semilla,
        n_jobs=-1,
        oob_score=True
    )
    modelo.fit(X_train, y_train)
    imputer = SimpleImputer(strategy='mean')
    X_apply_imputed = imputer.fit_transform(X_apply)
    prediccion = modelo.predict_proba(X_apply_imputed)
    dapply["prob_baja2"] = prediccion[:, list(modelo.classes_).index("BAJA+2")]
    dapply["Predicted"] = (dapply["prob_baja2"] > 1/40).astype(int)
    dapply[["numero_de_cliente", "Predicted"]].to_csv(f"K02_shap_modelo_base_replicabilidad_{semilla}.csv", index=False, sep=",")
    return

In [20]:
for semilla in semillas:
    correr_modelo_varias_semillas(semilla)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dapply["prob_baja2"] = prediccion[:, list(modelo.classes_).index("BAJA+2")]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dapply["Predicted"] = (dapply["prob_baja2"] > 1/40).astype(int)
