#Clase ternaria

In [1]:
import os, sys
print(os.getcwd())   # current working directory of the notebook
print(sys.path)      # list of paths searched for modules
print(os.listdir())  # entries in the current directory
import sys, os
import polars as pl
import duckdb 
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

c:\Users\Flor\Documents\UBA\DMEyF
['C:\\Program Files\\WindowsApps\\PythonSoftwareFoundation.Python.3.11_3.11.2544.0_x64__qbz5n2kfra8p0\\python311.zip', 'C:\\Program Files\\WindowsApps\\PythonSoftwareFoundation.Python.3.11_3.11.2544.0_x64__qbz5n2kfra8p0\\DLLs', 'C:\\Program Files\\WindowsApps\\PythonSoftwareFoundation.Python.3.11_3.11.2544.0_x64__qbz5n2kfra8p0\\Lib', 'C:\\Program Files\\WindowsApps\\PythonSoftwareFoundation.Python.3.11_3.11.2544.0_x64__qbz5n2kfra8p0', 'c:\\Users\\Flor\\Documents\\UBA\\DMEyF\\venv', '', 'c:\\Users\\Flor\\Documents\\UBA\\DMEyF\\venv\\Lib\\site-packages', 'c:\\Users\\Flor\\Documents\\UBA\\DMEyF\\venv\\Lib\\site-packages\\win32', 'c:\\Users\\Flor\\Documents\\UBA\\DMEyF\\venv\\Lib\\site-packages\\win32\\lib', 'c:\\Users\\Flor\\Documents\\UBA\\DMEyF\\venv\\Lib\\site-packages\\Pythonwin']
['.git', '.gitignore', 'Clase_ternaria.py', 'data', 'Experimentos 1+slope.ipynb', 'Experimentos 2.ipynb', 'Experimentos.ipynb', 'FeatureEng_video.ipynb', 'logs', 'main.py', 

#Feature Engineering

In [3]:
def col_selection(df: pl.DataFrame) -> tuple[list[str], list[list[str]]]:
    """Pick the columns that feed the feature-engineering SQL builders.

    A fixed set of columns (ids, period, target and a few others) is always
    excluded. Of the rest, only columns whose dtype is Int32, Int64, Float32
    or Float64 are considered.

    Parameters
    ----------
    df : pl.DataFrame
        Table whose column names, dtypes and distinct-value counts are
        inspected. It is not modified.

    Returns
    -------
    tuple[list[str], list[list[str]]]
        * Considered columns with more than five distinct values, in the
          order they appear in ``df``.
        * ``[m_column, c_column]`` pairs: for every considered column whose
          name starts with ``"c"``, the first considered column starting with
          ``"m"`` that has the same name after dropping the first character.
          ``c`` columns without such a partner are skipped.
    """
    # Columns that must never be used as feature sources.
    excluded = {
        "numero_de_cliente", "foto_mes", "active_quarter", "clase_ternaria",
        "cliente_edad", "cliente_antiguedad",
        "Visa_fultimo_cierre", "Master_fultimo_cierre",
        "Visa_Fvencimiento", "Master_Fvencimiento"
    }
    numeric_dtypes = (pl.Int32, pl.Int64, pl.Float32, pl.Float64)

    # Every later selection is a subset of these, so filter by name and
    # dtype only once.
    candidates = [
        c for c in df.columns
        if c not in excluded and df[c].dtype in numeric_dtypes
    ]

    # Columns with five or fewer distinct values are treated as categorical
    # and left out of the numeric feature list.
    num_cols = [
        c for c in candidates
        if df.select(pl.col(c).n_unique()).item() > 5
    ]

    # --- Ratios: match c-columns with m-columns (same suffix) ---
    # setdefault keeps the first "m" column seen for each suffix, which is the
    # same column a front-to-back linear scan would return.
    first_m_by_suffix: dict[str, str] = {}
    for m in candidates:
        if m.startswith("m"):
            first_m_by_suffix.setdefault(m[1:], m)

    cols_ratios = [
        [first_m_by_suffix[c[1:]], c]
        for c in candidates
        if c.startswith("c") and c[1:] in first_m_by_suffix
    ]

    return num_cols, cols_ratios

def run_duckdb_query(df: pl.DataFrame, sql: str) -> pl.DataFrame:
    """Run ``sql`` on an in-memory DuckDB connection and return a polars frame.

    ``df`` is registered on the connection under the table name ``df``, so
    the query can refer to it with ``FROM df``. The connection is closed when
    the ``with`` block exits.
    """
    with duckdb.connect(database=":memory:") as conn:
        conn.register("df", df)
        # .pl() materialises the result before the connection is closed.
        return conn.execute(sql).pl()

def feature_engineering_pipeline(df: pl.DataFrame, config: dict) -> pl.DataFrame:
    """
    Build one SQL statement with every requested feature and run it on ``df``.

    The statement starts as ``SELECT *`` and gets one extra select expression
    per generated feature, so all original columns are kept.

    Parameters:
    -----------
    df : pl.DataFrame
        Input table; it is exposed to the query as ``df``.
    config : dict
        Pipeline configuration. Every key is optional; a section is applied
        only when its key is present. Example:

        "lag": {
            "columns": ["col1", "col2"],
            "n": 2   # lag offset
        },
        "delta": {
            "columns": ["col1", "col2"],
            "n": 2   # deltas against lags 1..n
        },
        "minmax": {
            "columns": ["col1", "col2"]
        },
        "ratio": {
            "pairs": [["monto", "cantidad"], ["ingresos", "clientes"]]
        },
        "linreg": {
            "columns": ["col1"],
            "window": 3  # optional, defaults to 3
        }

    Returns:
    --------
    pl.DataFrame
        Result of the query: the original columns plus the new features.
    """
    # Sections that only contribute select expressions, in emission order.
    simple_sections = (
        ("lag", add_lag_sql),
        ("delta", add_delta_sql),
        ("minmax", add_minmax_sql),
        ("ratio", add_ratio_sql),
    )

    select_parts = ["SELECT *"]
    for section, build_sql in simple_sections:
        if section in config:
            select_parts.append(build_sql(config[section]))

    # The regression section also needs a named WINDOW clause after FROM.
    window_clause = ""
    if "linreg" in config:
        slope_sql, window_clause = add_linreg_sql(config["linreg"])
        select_parts.append(slope_sql)

    query = "".join(select_parts) + " FROM df" + window_clause
    return run_duckdb_query(df, query)

def add_lag_sql(config_lag: dict) -> str:
    """Return the select expressions for lagged copies of the configured columns.

    For each name in ``config_lag["columns"]`` one expression is produced,
    aliased ``<col>_lag_<n>``, holding the value ``config_lag["n"]`` rows back
    within the same ``numero_de_cliente`` ordered by ``foto_mes``. Only the
    single offset ``n`` is generated, not every offset from 1 to ``n``.

    Every expression starts with ``", "`` so the result can be appended
    directly to a select list; an empty column list yields ``""``.
    """
    return "".join(
        f", lag({col}, {config_lag['n']}) OVER (PARTITION BY numero_de_cliente ORDER BY foto_mes) AS {col}_lag_{config_lag['n']}"
        for col in config_lag["columns"]
    )

def add_delta_sql(config_delta: dict) -> str:
    """Return the select expressions for differences against previous rows.

    For each name in ``config_delta["columns"]`` and each offset ``i`` from 1
    to ``config_delta["n"]`` inclusive, one expression is produced, aliased
    ``<col>_delta_<i>``: the current value minus the value ``i`` rows back
    within the same ``numero_de_cliente`` ordered by ``foto_mes``.

    Every expression starts with ``", "``; no columns or ``n < 1`` yields ``""``.
    """
    return "".join(
        f", {col} - lag({col}, {i}) OVER (PARTITION BY numero_de_cliente ORDER BY foto_mes) AS {col}_delta_{i}"
        for col in config_delta["columns"]
        for i in range(1, config_delta["n"] + 1)
    )

def add_minmax_sql(config_minmax: dict) -> str:
    """Return the select expressions for per-client maximum and minimum.

    For each name in ``config_minmax["columns"]`` two expressions are
    produced, aliased ``<col>_MAX`` and ``<col>_MIN``. The window is
    partitioned by ``numero_de_cliente`` and has no ORDER BY or frame.

    Every expression starts with ``", "``; an empty column list yields ``""``.
    """
    fragments = []
    for col in config_minmax["columns"]:
        fragments.append(f", MAX({col}) OVER (PARTITION BY numero_de_cliente) AS {col}_MAX")
        fragments.append(f", MIN({col}) OVER (PARTITION BY numero_de_cliente) AS {col}_MIN")
    return "".join(fragments)

def add_ratio_sql(config_ratio: dict) -> str:
    """Return the select expressions for column-to-column ratios.

    Each entry of ``config_ratio["pairs"]`` is read as
    ``[numerator, denominator]`` and becomes one expression aliased
    ``ratio_<numerator>_<denominator>``. The expression evaluates to 0 when
    the denominator equals 0 and to ``numerator / denominator`` otherwise.

    Every expression starts with ``", "``; an empty pair list yields ``""``.
    """
    fragments = []
    for pair in config_ratio["pairs"]:
        numerator, denominator = pair[0], pair[1]
        fragments.append(
            f", IF({denominator} = 0, 0, {numerator} / {denominator}) AS ratio_{numerator}_{denominator}"
        )
    return "".join(fragments)

def add_linreg_sql(config_linreg: dict) -> tuple:
    """Return the rolling-slope select expressions and their WINDOW clause.

    For each name in ``config_linreg["columns"]`` one
    ``REGR_SLOPE(<col>, cliente_antiguedad)`` expression is produced, aliased
    ``slope_<col>`` and evaluated over the named window ``ventana_<window>``.
    ``window`` comes from ``config_linreg["window"]`` and defaults to 3.

    Returns
    -------
    tuple
        ``(select_sql, window_clause)``. ``select_sql`` starts with ``", "``
        per expression. ``window_clause`` defines the named window and is
        always returned, even with no columns. Its frame runs from ``window``
        rows preceding to the current row, i.e. ``window + 1`` rows per
        ``numero_de_cliente`` ordered by ``foto_mes``.
    """
    window_size = config_linreg.get("window", 3)
    window_name = f"ventana_{window_size}"

    linreg_sql = "".join(
        f", REGR_SLOPE({col}, cliente_antiguedad) OVER {window_name} AS slope_{col}"
        for col in config_linreg["columns"]
    )
    window_clause = (
        f" WINDOW {window_name} AS (PARTITION BY numero_de_cliente ORDER BY foto_mes"
        f" ROWS BETWEEN {window_size} PRECEDING AND CURRENT ROW)"
    )
    return linreg_sql, window_clause

In [4]:
# Load the input CSV into a polars DataFrame
df = pl.read_csv("data/competencia_01_con_clase_ternaria.csv")

In [5]:
# Column list for lag/delta/min-max/regression features, and [m, c] pairs for ratios
cols_lag_delta_max_min_regl, cols_ratios = col_selection(df)

In [6]:
# 2. Feature engineering: add the lag at offset 2 and the deltas against
# lags 1 and 2 for every selected column. The minmax, ratio and linreg
# sections are commented out, so those features are not generated here.
df = feature_engineering_pipeline(df, {
  "lag": {
    "columns": cols_lag_delta_max_min_regl,
    "n": 2
  },
  "delta": {
    "columns": cols_lag_delta_max_min_regl,
    "n": 2
  },
  #  "minmax": {
  #      "columns": cols_lag_delta_max_min_regl
  #  },
  # "ratio": {
  #   "pairs": cols_ratios
  # },
  # "linreg": {
  #   "columns": cols_lag_delta_max_min_regl,
  #   "window": 3
  # }
})

FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

In [7]:
# Preview the first rows, including the newly added feature columns
df.head()

numero_de_cliente,foto_mes,active_quarter,cliente_vip,internet,cliente_edad,cliente_antiguedad,mrentabilidad,mrentabilidad_annual,mcomisiones,mactivos_margen,mpasivos_margen,cproductos,tcuentas,ccuenta_corriente,mcuenta_corriente_adicional,mcuenta_corriente,ccaja_ahorro,mcaja_ahorro,mcaja_ahorro_adicional,mcaja_ahorro_dolares,cdescubierto_preacordado,mcuentas_saldo,ctarjeta_debito,ctarjeta_debito_transacciones,mautoservicio,ctarjeta_visa,ctarjeta_visa_transacciones,mtarjeta_visa_consumo,ctarjeta_master,ctarjeta_master_transacciones,mtarjeta_master_consumo,cprestamos_personales,mprestamos_personales,cprestamos_prendarios,mprestamos_prendarios,cprestamos_hipotecarios,…,Master_cadelantosefectivo_delta_2,Master_mpagominimo_delta_1,Master_mpagominimo_delta_2,Visa_mfinanciacion_limite_delta_1,Visa_mfinanciacion_limite_delta_2,Visa_msaldototal_delta_1,Visa_msaldototal_delta_2,Visa_msaldopesos_delta_1,Visa_msaldopesos_delta_2,Visa_msaldodolares_delta_1,Visa_msaldodolares_delta_2,Visa_mconsumospesos_delta_1,Visa_mconsumospesos_delta_2,Visa_mconsumosdolares_delta_1,Visa_mconsumosdolares_delta_2,Visa_mlimitecompra_delta_1,Visa_mlimitecompra_delta_2,Visa_madelantopesos_delta_1,Visa_madelantopesos_delta_2,Visa_madelantodolares_delta_1,Visa_madelantodolares_delta_2,Visa_mpagado_delta_1,Visa_mpagado_delta_2,Visa_mpagospesos_delta_1,Visa_mpagospesos_delta_2,Visa_mpagosdolares_delta_1,Visa_mpagosdolares_delta_2,Visa_fechaalta_delta_1,Visa_fechaalta_delta_2,Visa_mconsumototal_delta_1,Visa_mconsumototal_delta_2,Visa_cconsumos_delta_1,Visa_cconsumos_delta_2,Visa_cadelantosefectivo_delta_1,Visa_cadelantosefectivo_delta_2,Visa_mpagominimo_delta_1,Visa_mpagominimo_delta_2
i64,i64,i64,i64,i64,i64,i64,f64,f64,f64,f64,f64,i64,i64,i64,f64,f64,i64,f64,f64,f64,i64,f64,i64,i64,f64,i64,i64,f64,i64,i64,f64,i64,f64,i64,f64,i64,…,i64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,i64,i64,f64,f64,i64,i64,i64,i64,f64,f64
249320580,202101,1,0,0,63,279,508.8,17140.27,896.11,0.0,-330.19,8,1,1,0.0,0.0,4,43297.51,0.0,100.35,1,77358.39,2,9,27877.92,1,0,0.0,1,0,0.0,0,0.0,0,0.0,0,…,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
249320580,202102,1,0,0,63,280,2494.79,18472.46,2015.94,0.66,407.57,8,1,1,0.0,-31.07,4,34954.32,0.0,103.25,1,80323.91,2,9,30659.34,1,0,0.0,1,0,0.0,0,0.0,0,0.0,0,…,,0.0,,0.0,,0.0,,0.0,,0.0,,,,,,0.0,,,,,,0.0,,,,,,28.0,,,,,,,,0.0,
249320580,202103,1,0,0,63,281,3207.66,19030.88,2519.75,0.0,586.45,8,1,1,0.0,0.0,4,44960.66,0.0,105.74,1,69725.36,2,13,41845.43,1,0,0.0,1,0,0.0,0,0.0,0,0.0,0,…,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,,,,0.0,0.0,,,,,0.0,0.0,,,,,31.0,59.0,,,,,,,0.0,0.0
249320580,202104,1,0,0,64,282,3122.15,19147.05,2472.63,0.0,553.72,8,1,1,0.0,0.0,4,43728.23,0.0,107.54,1,91103.07,2,11,34122.88,1,0,0.0,1,0,0.0,0,0.0,0,0.0,0,…,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,,,,0.0,0.0,,,,,0.0,0.0,,,,,30.0,61.0,,,,,,,0.0,0.0
249320580,202105,1,0,0,64,283,3455.86,20443.48,2520.36,0.0,797.53,8,1,1,0.0,0.0,4,60910.65,0.0,108.84,1,95203.09,2,11,45436.8,1,0,0.0,1,0,0.0,0,0.0,0,0.0,0,…,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,,,,0.0,0.0,,,,,0.0,0.0,,,,,31.0,61.0,,,,,,,0.0,0.0


In [8]:
# Estimated size of df, divided by 1024**3 to express it in GiB
df.estimated_size() / (1024 ** 3)

3.896825742907822

In [9]:

# Training split: rows whose foto_mes is 202101, 202102 or 202103
train_months = [202101, 202102, 202103]
df_train = df.filter(pl.col("foto_mes").is_in(train_months))


In [10]:
# Save the training split to CSV
df_train.write_csv("data/competencia_01_02_03_dl.csv")

In [11]:
# Test split: rows whose foto_mes is 202104
is_test_month = pl.col("foto_mes") == 202104
df_test = df.filter(is_test_month)

In [12]:
# Save the test split to CSV
df_test.write_csv("data/competencia_04_dl.csv")

In [13]:
# Rows whose foto_mes is 202106 (presumably the month scored on Kaggle; confirm)
is_kaggle_month = pl.col("foto_mes") == 202106
df_kaggle = df.filter(is_kaggle_month)

In [14]:
# Save the 202106 rows to CSV
df_kaggle.write_csv("data/competencia_06_dl.csv")

In [15]:
# (rows, columns) of the 202106 subset
df_kaggle.shape

(164313, 530)