In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.tree import DecisionTreeClassifier, plot_tree,  _tree
from sklearn.model_selection import train_test_split
from sklearn.model_selection import ShuffleSplit, StratifiedShuffleSplit
from sklearn.ensemble import RandomForestClassifier
from sklearn.impute import SimpleImputer

from joblib import Parallel, delayed
from pathlib import Path
import optuna
from optuna.visualization import plot_optimization_history, plot_param_importances, plot_slice, plot_contour

from time import time

import pickle

In [3]:
import os
print(os.path.getsize("data/df_confe.csv") / (1024**3), "GB")


2.5967229744419456 GB


In [4]:
import polars as pl

# Leer CSV completo
data = pl.read_csv("data/df_confe.csv")

print(data.shape)        # filas, columnas
print(data.head())


(978439, 755)
shape: (5, 755)
┌───────────┬──────────┬───────────┬───────────┬───┬───────────┬───────────┬───────────┬───────────┐
│ numero_de ┆ foto_mes ┆ active_qu ┆ cliente_v ┆ … ┆ Visa_cade ┆ Visa_cade ┆ Visa_mpag ┆ Visa_mpag │
│ _cliente  ┆ ---      ┆ arter     ┆ ip        ┆   ┆ lantosefe ┆ lantosefe ┆ ominimo_d ┆ ominimo_d │
│ ---       ┆ i64      ┆ ---       ┆ ---       ┆   ┆ ctivo_dif ┆ ctivo_dif ┆ iff_prev  ┆ iff_prev2 │
│ i64       ┆          ┆ i64       ┆ i64       ┆   ┆ f_p…      ┆ f_p…      ┆ ---       ┆ ---       │
│           ┆          ┆           ┆           ┆   ┆ ---       ┆ ---       ┆ f64       ┆ f64       │
│           ┆          ┆           ┆           ┆   ┆ i64       ┆ i64       ┆           ┆           │
╞═══════════╪══════════╪═══════════╪═══════════╪═══╪═══════════╪═══════════╪═══════════╪═══════════╡
│ 249320580 ┆ 202101   ┆ 1         ┆ 0         ┆ … ┆ null      ┆ null      ┆ null      ┆ null      │
│ 249320580 ┆ 202102   ┆ 1         ┆ 0         ┆ … ┆ null    

In [5]:
#tiene "clase_ternaria"?
print("clase_ternaria" in data.columns)

True


In [6]:
SEMILLAS = [550007, 550019, 550031, 550033, 550047]

mes_train = 202102
mes_validacion = 202103
mes_test = 202104
mes_kaggle = 202106
ganancia_acierto = 780000
costo_estimulo = 20000
# =====================

In [7]:
# Filtrar solo el mes de train
df_train = data.filter(pl.col("foto_mes") == mes_train)

# Contar nulos por columna
null_counts = (
    df_train.select([
        pl.col(col).null_count().alias(col) for col in df_train.columns
    ])
    .to_dict(as_series=False)
)

# Pasarlo a un DataFrame ordenado
null_df = (
    pl.DataFrame({
        "columna": list(null_counts.keys()),
        "nulos": [v[0] for v in null_counts.values()]
    })
    .sort("nulos", descending=True)
)

#cuantos registros tiene df_train
n_filas = df_train.height
# Imprimir las primeras 15 filas
print(null_df.head(15))

shape: (15, 2)
┌─────────────────────────────────┬────────┐
│ columna                         ┆ nulos  │
│ ---                             ┆ ---    │
│ str                             ┆ i64    │
╞═════════════════════════════════╪════════╡
│ active_quarter_diff_prev2       ┆ 162155 │
│ cliente_vip_diff_prev2          ┆ 162155 │
│ internet_diff_prev2             ┆ 162155 │
│ cliente_edad_diff_prev2         ┆ 162155 │
│ cliente_antiguedad_diff_prev2   ┆ 162155 │
│ …                               ┆ …      │
│ cproductos_diff_prev2           ┆ 162155 │
│ tcuentas_diff_prev2             ┆ 162155 │
│ ccuenta_corriente_diff_prev2    ┆ 162155 │
│ mcuenta_corriente_adicional_di… ┆ 162155 │
│ mcuenta_corriente_diff_prev2    ┆ 162155 │
└─────────────────────────────────┴────────┘


In [8]:

#cuantas filas tiene el dataframe
n_filas = df_train.height    
# Imprimir las primeras 20 filas
print(null_df.head(20))

shape: (20, 2)
┌─────────────────────────────────┬────────┐
│ columna                         ┆ nulos  │
│ ---                             ┆ ---    │
│ str                             ┆ i64    │
╞═════════════════════════════════╪════════╡
│ active_quarter_diff_prev2       ┆ 162155 │
│ cliente_vip_diff_prev2          ┆ 162155 │
│ internet_diff_prev2             ┆ 162155 │
│ cliente_edad_diff_prev2         ┆ 162155 │
│ cliente_antiguedad_diff_prev2   ┆ 162155 │
│ …                               ┆ …      │
│ ccaja_ahorro_diff_prev2         ┆ 162155 │
│ mcaja_ahorro_diff_prev2         ┆ 162155 │
│ mcaja_ahorro_adicional_diff_pr… ┆ 162155 │
│ mcaja_ahorro_dolares_diff_prev… ┆ 162155 │
│ cdescubierto_preacordado_diff_… ┆ 162155 │
└─────────────────────────────────┴────────┘


In [9]:
#tiene mi df_train la columna "clase_ternaria"?
print("clase_ternaria" in df_train.columns)

True


In [10]:
# Filtrar train
X = data.filter(pl.col("foto_mes") == mes_train)
y = X["clase_ternaria"]
X = X.drop("clase_ternaria")

# Filtrar validación/futuro
X_futuro = data.filter(pl.col("foto_mes") == mes_validacion)
y_futuro = X_futuro["clase_ternaria"]
X_futuro = X_futuro.drop("clase_ternaria")


In [11]:

def ganancia_prob(y_hat, y, prop=1, class_index=1, threshold=0.025):
  @np.vectorize
  def ganancia_row(predicted, actual, threshold=0.025):
    return  (predicted >= threshold) * (ganancia_acierto if actual == "BAJA+2" else -costo_estimulo)

  return ganancia_row(y_hat[:,class_index], y).sum() / prop


     

In [13]:
#Parametros optimizados
param_opt = {'criterion': 'entropy',
             'max_depth': 20,
             'min_samples_split': 145,
             'min_samples_leaf': 14,
             'max_leaf_nodes': 13}

model_opt = DecisionTreeClassifier(random_state=SEMILLAS[0], **param_opt)

model_opt.fit(X, y)
y_pred_opt = model_opt.predict_proba(X_futuro)
print(f"Ganancia de modelo Opt: {ganancia_prob(y_pred_opt, y_futuro)}")

MemoryError: Unable to allocate 933. MiB for an array with shape (754, 162155) and data type object

#Ganancia modelo clase 211 MARS sin nada

## DF FINAL KAGGLE


In [None]:
# Concatenar los datos de entrenamiento
meses_train = [mes_train, mes_validacion, mes_test]
train_data = data[data['foto_mes'].isin(meses_train)]

X_train = train_data.drop(columns=['clase_ternaria'])  # numero_de_cliente queda
y_train = train_data['clase_ternaria']

# Datos de Kaggle
X_kaggle = data[data['foto_mes'] == mes_kaggle]
clientes_kaggle = X_kaggle['numero_de_cliente']
X_kaggle = X_kaggle.drop(columns=['clase_ternaria'])  # numero_de_cliente queda

# Modelo
param_opt = {'criterion': 'entropy',
             'max_depth': 20,
             'min_samples_split': 145,
             'min_samples_leaf': 14,
             'max_leaf_nodes': 13}

model_opt = DecisionTreeClassifier(random_state=SEMILLAS[0], **param_opt)
model_opt.fit(X_train, y_train)

# Predicciones probabilísticas
y_pred_prob = model_opt.predict_proba(X_kaggle)

# Transformar probabilidades en 0/1 con threshold 0.025
threshold = 0.025
class_index = list(model_opt.classes_).index("BAJA+2")
y_pred_bin = (y_pred_prob[:, class_index] > threshold).astype(int)

# Crear DataFrame final con las dos columnas
submission = pd.DataFrame({
    'numero_de_cliente': clientes_kaggle,
    'Predicted': y_pred_bin
})

submission.to_csv("predicciones_kaggle.csv", index=False)
submission.head()

Unnamed: 0,numero_de_cliente,Predicted
5,249221323,0
11,249227600,0
17,249234235,0
23,249244449,0
29,249244739,0


In [None]:
# Número de filas
print(submission.shape)  # devuelve (filas, columnas)

# Columnas
print(submission.columns)  # debe dar Index(['numero_de_cliente', 'Predicted'], dtype='object')

(164313, 2)
Index(['numero_de_cliente', 'Predicted'], dtype='object')
