#Clase ternaria

In [1]:
import os, sys
print(os.getcwd())   # current working directory of the notebook
print(sys.path)      # list of paths searched when importing modules
print(os.listdir())  # entries present in the current directory
import sys, os
import polars as pl
import duckdb 
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

c:\Users\Flor\Documents\UBA\DMEyF
['C:\\Program Files\\WindowsApps\\PythonSoftwareFoundation.Python.3.11_3.11.2544.0_x64__qbz5n2kfra8p0\\python311.zip', 'C:\\Program Files\\WindowsApps\\PythonSoftwareFoundation.Python.3.11_3.11.2544.0_x64__qbz5n2kfra8p0\\DLLs', 'C:\\Program Files\\WindowsApps\\PythonSoftwareFoundation.Python.3.11_3.11.2544.0_x64__qbz5n2kfra8p0\\Lib', 'C:\\Program Files\\WindowsApps\\PythonSoftwareFoundation.Python.3.11_3.11.2544.0_x64__qbz5n2kfra8p0', 'c:\\Users\\Flor\\Documents\\UBA\\DMEyF\\venv', '', 'c:\\Users\\Flor\\Documents\\UBA\\DMEyF\\venv\\Lib\\site-packages', 'c:\\Users\\Flor\\Documents\\UBA\\DMEyF\\venv\\Lib\\site-packages\\win32', 'c:\\Users\\Flor\\Documents\\UBA\\DMEyF\\venv\\Lib\\site-packages\\win32\\lib', 'c:\\Users\\Flor\\Documents\\UBA\\DMEyF\\venv\\Lib\\site-packages\\Pythonwin']
['.git', '.gitignore', 'Clase_ternaria.py', 'data', 'Experimentos 2.ipynb', 'Experimentos.ipynb', 'FeatureEng_video.ipynb', 'logs', 'main.py', 'modelo.ipynb', 'Optimizacion 

#Feature Engineering

In [2]:
def col_selection(df: pl.DataFrame) -> tuple[list[str], list[list[str]]]:
    """Pick the columns that feed the feature-engineering pipeline.

    Only columns whose dtype is Int32, Int64, Float32 or Float64 and that are
    not in the exclusion set below are considered.

    Parameters
    ----------
    df : pl.DataFrame
        Input frame whose columns are inspected.

    Returns
    -------
    tuple[list[str], list[list[str]]]
        * Numeric columns with more than 5 distinct values, intended for
          lags, deltas, min/max and regression features.
        * ``[m_column, c_column]`` pairs for ratio features: every eligible
          column starting with ``"c"`` is paired with the eligible column
          starting with ``"m"`` that has the same name after the first
          character (e.g. ``mcaja_ahorro`` / ``ccaja_ahorro``).
    """
    # Columns that must never be used as a feature source.
    col_drops = {
        "numero_de_cliente", "foto_mes", "active_quarter", "clase_ternaria",
        "cliente_edad", "cliente_antiguedad",
        "Visa_fultimo_cierre", "Master_fultimo_cierre",
        "Visa_Fvencimiento", "Master_Fvencimiento",
    }
    numeric_dtypes = (pl.Int32, pl.Int64, pl.Float32, pl.Float64)

    # Single pass over (name, dtype): eligible = numeric and not excluded.
    eligible = [
        name
        for name, dtype in zip(df.columns, df.dtypes)
        if name not in col_drops and dtype in numeric_dtypes
    ]

    # Columns with at most 5 distinct values are treated as categorical and
    # therefore left out of the lag/delta/min-max/regression list.
    cols_lag_delta_max_min_regl = [
        name
        for name in eligible
        if df.select(pl.col(name).n_unique()).item() > 5
    ]

    # Index the "m" columns by their suffix so each "c" column is matched with
    # a dictionary lookup instead of a scan over every "m" column.
    # Column names are unique, so each suffix maps to exactly one "m" column.
    m_by_suffix = {name[1:]: name for name in eligible if name.startswith("m")}
    cols_ratios = [
        [m_by_suffix[name[1:]], name]
        for name in eligible
        if name.startswith("c") and name[1:] in m_by_suffix
    ]

    return cols_lag_delta_max_min_regl, cols_ratios

def run_duckdb_query(df: pl.DataFrame, sql: str) -> pl.DataFrame:
    """Run ``sql`` on a throw-away in-memory DuckDB database.

    The frame is registered under the table name ``df`` so the query can refer
    to it; the result is converted to a polars DataFrame before the connection
    is closed.
    """
    with duckdb.connect(database=":memory:") as connection:
        connection.register("df", df)
        return connection.execute(sql).pl()

def feature_engineering_pipeline(df: pl.DataFrame, config: dict) -> pl.DataFrame:
    """
    Run the complete feature-engineering pipeline as a single DuckDB query.

    The query is ``SELECT *`` plus one SQL fragment per section present in
    ``config``, always emitted in the order lag, delta, minmax, ratio, linreg.

    Parameters:
    -----------
    df : pl.DataFrame
        Input data; it is exposed to the query as the table ``df``.
    config : dict
        Pipeline configuration. Every section is optional. Example:

        "lag": {
            "columns": ["col1", "col2"],
            "n": 2   # lag order: one lag column of order n per column
        },
        "delta": {
            "columns": ["col1", "col2"],
            "n": 2   # deltas against lags 1..n
        },
        "minmax": {
            "columns": ["col1", "col2"]
        },
        "ratio": {
            "pairs": [["monto", "cantidad"], ["ingresos", "clientes"]]
        },
        "linreg": {
            "columns": ["col1"],
            "window": 3  # optional, rows preceding the current one
        }

    Returns:
    --------
    pl.DataFrame
        DataFrame with the new feature columns appended.
    """
    # Sections that only contribute to the SELECT list, in emission order.
    select_builders = (
        ("lag", add_lag_sql),
        ("delta", add_delta_sql),
        ("minmax", add_minmax_sql),
        ("ratio", add_ratio_sql),
    )

    query_parts = ["SELECT *"]
    for section, builder in select_builders:
        if section in config:
            query_parts.append(builder(config[section]))

    # The regression section also needs a named WINDOW clause after FROM.
    named_window = ""
    if "linreg" in config:
        slope_fragment, named_window = add_linreg_sql(config["linreg"])
        query_parts.append(slope_fragment)

    query_parts.append(" FROM df")
    query_parts.append(named_window)

    return run_duckdb_query(df, "".join(query_parts))

def add_lag_sql(config_lag: dict) -> str:
    """Build the SELECT-list fragment with one lag column per configured column.

    For every name in ``config_lag["columns"]`` a single ``lag`` of order
    ``config_lag["n"]`` is emitted, partitioned by ``numero_de_cliente`` and
    ordered by ``foto_mes``, aliased ``<col>_lag_<n>``. Each term starts with
    a comma so the fragment can be appended after ``SELECT *``.
    """
    lag_terms = (
        f", lag({name}, {config_lag['n']}) OVER (PARTITION BY numero_de_cliente ORDER BY foto_mes) AS {name}_lag_{config_lag['n']}"
        for name in config_lag["columns"]
    )
    return "".join(lag_terms)

def add_delta_sql(config_delta: dict) -> str:
    """Build the SELECT-list fragment with the delta columns.

    For every configured column and every order ``k`` from 1 to
    ``config_delta["n"]`` the fragment contains ``col - lag(col, k)``
    (partitioned by ``numero_de_cliente``, ordered by ``foto_mes``) aliased
    ``<col>_delta_<k>``. Each term starts with a comma.
    """
    delta_terms = [
        f", {name} - lag({name}, {step}) OVER (PARTITION BY numero_de_cliente ORDER BY foto_mes) AS {name}_delta_{step}"
        for name in config_delta["columns"]
        for step in range(1, config_delta["n"] + 1)
    ]
    return "".join(delta_terms)

def add_minmax_sql(config_minmax: dict) -> str:
    """Build the SELECT-list fragment with per-client maximum and minimum.

    Each configured column yields ``<col>_MAX`` and ``<col>_MIN``, computed
    over the whole ``numero_de_cliente`` partition (no ordering or frame).
    Each pair of terms starts with a comma.
    """
    extreme_terms = [
        f", MAX({name}) OVER (PARTITION BY numero_de_cliente) AS {name}_MAX, MIN({name}) OVER (PARTITION BY numero_de_cliente) AS {name}_MIN"
        for name in config_minmax["columns"]
    ]
    return "".join(extreme_terms)

def add_ratio_sql(config_ratio: dict) -> str:
    """Build the SELECT-list fragment with the ratio columns.

    Every entry of ``config_ratio["pairs"]`` is read as
    ``[numerator, denominator]``. The generated expression yields 0 when the
    denominator equals 0 and ``numerator / denominator`` otherwise, aliased
    ``ratio_<numerator>_<denominator>``. Each term starts with a comma.
    """
    ratio_terms = []
    for pair in config_ratio["pairs"]:
        numerator = pair[0]
        denominator = pair[1]
        ratio_terms.append(
            f", IF({denominator} = 0, 0, {numerator} / {denominator}) AS ratio_{numerator}_{denominator}"
        )
    return "".join(ratio_terms)

def add_linreg_sql(config_linreg: dict) -> tuple:
    """Build the SQL pieces for the rolling regression-slope features.

    Returns a ``(select_fragment, window_clause)`` pair:

    * ``select_fragment`` has one ``REGR_SLOPE(col, cliente_antiguedad)`` term
      per configured column, aliased ``slope_<col>`` and evaluated over the
      named window ``ventana_<window>``.
    * ``window_clause`` defines that named window: partitioned by
      ``numero_de_cliente``, ordered by ``foto_mes``, spanning ``window``
      preceding rows plus the current row. ``window`` defaults to 3.
    """
    window_size = config_linreg.get("window", 3)

    slope_terms = [
        f", REGR_SLOPE({name}, cliente_antiguedad) OVER ventana_{window_size} AS slope_{name}"
        for name in config_linreg["columns"]
    ]

    window_clause = f" WINDOW ventana_{window_size} AS (PARTITION BY numero_de_cliente ORDER BY foto_mes ROWS BETWEEN {window_size} PRECEDING AND CURRENT ROW)"

    return "".join(slope_terms), window_clause

In [3]:
# Load the input dataset from the CSV under data/ into a polars DataFrame
df = pl.read_csv("data/competencia_01_con_clase_ternaria.csv")

In [4]:
# Column list for lag/delta/min-max/regression features and [m, c] pairs for ratios
cols_lag_delta_max_min_regl, cols_ratios = col_selection(df)

In [5]:
# 2. Feature Engineering
# Adds lag, delta, min/max and ratio features; the "linreg" (slope) section is
# currently disabled via the commented-out block below.
df = feature_engineering_pipeline(df, {
  "lag": {
    "columns": cols_lag_delta_max_min_regl,
    "n": 2
  },
  "delta": {
    "columns": cols_lag_delta_max_min_regl,
    "n": 2
  },
   "minmax": {
       "columns": cols_lag_delta_max_min_regl
   },
  "ratio": {
    "pairs": cols_ratios
  },
#   "linreg": {
#     "columns": cols_lag_delta_max_min_regl,
#     "window": 3
#   }
})

FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

In [6]:
# Preview the first rows of the engineered frame
df.head()

numero_de_cliente,foto_mes,active_quarter,cliente_vip,internet,cliente_edad,cliente_antiguedad,mrentabilidad,mrentabilidad_annual,mcomisiones,mactivos_margen,mpasivos_margen,cproductos,tcuentas,ccuenta_corriente,mcuenta_corriente_adicional,mcuenta_corriente,ccaja_ahorro,mcaja_ahorro,mcaja_ahorro_adicional,mcaja_ahorro_dolares,cdescubierto_preacordado,mcuentas_saldo,ctarjeta_debito,ctarjeta_debito_transacciones,mautoservicio,ctarjeta_visa,ctarjeta_visa_transacciones,mtarjeta_visa_consumo,ctarjeta_master,ctarjeta_master_transacciones,mtarjeta_master_consumo,cprestamos_personales,mprestamos_personales,cprestamos_prendarios,mprestamos_prendarios,cprestamos_hipotecarios,…,Visa_mpagospesos_MAX,Visa_mpagospesos_MIN,Visa_mpagosdolares_MAX,Visa_mpagosdolares_MIN,Visa_fechaalta_MAX,Visa_fechaalta_MIN,Visa_mconsumototal_MAX,Visa_mconsumototal_MIN,Visa_cconsumos_MAX,Visa_cconsumos_MIN,Visa_cadelantosefectivo_MAX,Visa_cadelantosefectivo_MIN,Visa_mpagominimo_MAX,Visa_mpagominimo_MIN,ratio_mcuenta_corriente_ccuenta_corriente,ratio_mcaja_ahorro_ccaja_ahorro,ratio_mprestamos_personales_cprestamos_personales,ratio_mprestamos_prendarios_cprestamos_prendarios,ratio_mprestamos_hipotecarios_cprestamos_hipotecarios,ratio_minversion2_cinversion2,ratio_mcuenta_debitos_automaticos_ccuenta_debitos_automaticos,ratio_mpagodeservicios_cpagodeservicios,ratio_mpagomiscuentas_cpagomiscuentas,ratio_mcajeros_propios_descuentos_ccajeros_propios_descuentos,ratio_mtarjeta_visa_descuentos_ctarjeta_visa_descuentos,ratio_mtarjeta_master_descuentos_ctarjeta_master_descuentos,ratio_mcomisiones_mantenimiento_ccomisiones_mantenimiento,ratio_mcomisiones_otras_ccomisiones_otras,ratio_mforex_buy_cforex_buy,ratio_mforex_sell_cforex_sell,ratio_mtransferencias_recibidas_ctransferencias_recibidas,ratio_mtransferencias_emitidas_ctransferencias_emitidas,ratio_mextraccion_autoservicio_cextraccion_autoservicio,ratio_mcheques_depositados_ccheques_depositados,ratio_mcheques_emitidos_ccheques_emitidos,ratio_mcheques_deposit
ados_rechazados_ccheques_depositados_rechazados,ratio_mcheques_emitidos_rechazados_ccheques_emitidos_rechazados
i64,i64,i64,i64,i64,i64,i64,f64,f64,f64,f64,f64,i64,i64,i64,f64,f64,i64,f64,f64,f64,i64,f64,i64,i64,f64,i64,i64,f64,i64,i64,f64,i64,f64,i64,f64,i64,…,f64,f64,f64,f64,i64,i64,f64,f64,i64,i64,i64,i64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64
433987585,202101,1,0,0,55,213,1744.54,13880.46,1628.24,-244.28,343.43,7,1,1,0.0,0.0,2,15086.53,0.0,20480.18,1,25040.88,2,0,0.0,1,11,10339.17,1,5,10433.07,0,0.0,0,0.0,0,…,-6965.93,-15422.9,0.0,0.0,9024,8874,13931.87,950.02,5,4,0,0,10334.13,1348.95,0.0,7543.265,0.0,0.0,0.0,0.0,11601.846667,0.0,0.0,0.0,0.0,0.0,1622.81,162.824,0.0,0.0,878.967778,9878.7125,0.0,0.0,0.0,0.0,0.0
433987585,202102,1,0,0,55,214,1985.94,15161.1,1570.57,108.37,245.74,7,1,1,0.0,0.0,2,13032.05,0.0,21072.94,1,29252.33,2,0,0.0,1,10,8827.78,1,5,14532.15,0,0.0,0,0.0,0,…,-6965.93,-15422.9,0.0,0.0,9024,8874,13931.87,950.02,5,4,0,0,10334.13,1348.95,0.0,6516.025,0.0,0.0,0.0,0.0,11601.846667,0.0,0.0,0.0,0.0,0.0,1622.81,157.057,0.0,0.0,1832.223333,175.95,0.0,0.0,0.0,0.0,0.0
433987585,202103,1,0,0,55,215,2153.47,16605.53,1428.76,106.8,511.03,7,1,1,0.0,0.0,2,30634.4,0.0,21579.68,1,58428.45,2,0,0.0,1,10,9870.7,1,5,14596.67,0,0.0,0,0.0,0,…,-6965.93,-15422.9,0.0,0.0,9024,8874,13931.87,950.02,5,4,0,0,10334.13,1348.95,0.0,15317.2,0.0,0.0,0.0,0.0,12007.086667,0.0,0.0,0.0,0.0,0.0,1622.81,142.876,0.0,0.0,13196.25,0.0,0.0,0.0,0.0,0.0,0.0
433987585,202104,1,0,0,55,216,2086.96,17692.74,1379.7,190.94,412.01,7,1,1,0.0,0.0,2,20291.57,0.0,21948.0,1,80860.8,2,0,0.0,1,10,15240.02,1,6,16551.32,0,0.0,0,0.0,0,…,-6965.93,-15422.9,0.0,0.0,9024,8874,13931.87,950.02,5,4,0,0,10334.13,1348.95,0.0,10145.785,0.0,0.0,0.0,0.0,12007.086667,0.0,304.98,0.0,0.0,0.0,1622.81,114.975,0.0,0.0,4692.0,1957.345,0.0,0.0,0.0,0.0,0.0
433987585,202105,1,0,0,55,217,2787.97,19227.0,1619.97,-306.89,1302.63,7,1,1,0.0,0.0,2,82734.23,0.0,22213.1,1,178007.73,2,0,0.0,1,8,6883.32,1,7,17791.44,0,0.0,0,0.0,0,…,-6965.93,-15422.9,0.0,0.0,9024,8874,13931.87,950.02,5,4,0,0,10334.13,1348.95,0.0,41367.115,0.0,0.0,0.0,0.0,13774.353333,0.0,300.325,0.0,0.0,0.0,1622.81,115.712143,0.0,0.0,66861.0,4875.77,0.0,0.0,0.0,0.0,0.0


In [7]:
# Estimated in-memory size of df, converted to GiB (divides by 1024**3)
df.estimated_size() / (1024 ** 3)

5.89540798868984

In [None]:
# Keep only the months used for training.
# `foto_mes` is an integer column (YYYYMM), so it is compared against integer
# values rather than strings, and rows are selected with `filter`, which is the
# polars way to apply a boolean condition to a DataFrame.
df_train = df.filter(pl.col("foto_mes").is_in([202101, 202102, 202103]))


In [None]:
# Save the filtered training frame to CSV
# NOTE(review): the file name mentions "slope", but the "linreg" section of the
# pipeline call is commented out in this notebook — confirm the intended name.
df_train.write_csv("data/competencia_01_02_03_slope.csv")