# Feature Engineering en SQL

A continuación, veremos cómo calcular diferentes variables para el feature engineering utilizando SQL.


In [2]:
%pip install duckdb
%pip install jupysql
%pip install duckdb-engine

Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.



In [1]:
import duckdb
import pandas as pd

%load_ext sql
%config SqlMagic.autopandas = True
%config SqlMagic.feedback = False
%config SqlMagic.displaycon = False

%sql duckdb:///:memory:
%reload_ext sql

In [2]:
dataset_path = 'C:/Users/Federico/Desktop/Maestria Data mining/DM EyF/datasets/'
dataset_file = 'competencia_01.csv'

In [3]:
%%sql
create or replace table competencia_01 as
select
    *
from read_csv_auto("{{dataset_path + dataset_file}}")

Unnamed: 0,Success


In [4]:
df=pd.read_csv(dataset_path + dataset_file)

In [5]:
campos=df.columns.to_list()
campos

['numero_de_cliente',
 'foto_mes',
 'active_quarter',
 'cliente_vip',
 'internet',
 'cliente_edad',
 'cliente_antiguedad',
 'mrentabilidad',
 'mrentabilidad_annual',
 'mcomisiones',
 'mactivos_margen',
 'mpasivos_margen',
 'cproductos',
 'tcuentas',
 'ccuenta_corriente',
 'mcuenta_corriente_adicional',
 'mcuenta_corriente',
 'ccaja_ahorro',
 'mcaja_ahorro',
 'mcaja_ahorro_adicional',
 'mcaja_ahorro_dolares',
 'cdescubierto_preacordado',
 'mcuentas_saldo',
 'ctarjeta_debito',
 'ctarjeta_debito_transacciones',
 'mautoservicio',
 'ctarjeta_visa',
 'ctarjeta_visa_transacciones',
 'mtarjeta_visa_consumo',
 'ctarjeta_master',
 'ctarjeta_master_transacciones',
 'mtarjeta_master_consumo',
 'cprestamos_personales',
 'mprestamos_personales',
 'cprestamos_prendarios',
 'mprestamos_prendarios',
 'cprestamos_hipotecarios',
 'mprestamos_hipotecarios',
 'cplazo_fijo',
 'mplazo_fijo_dolares',
 'mplazo_fijo_pesos',
 'cinversion1',
 'minversion1_pesos',
 'minversion1_dolares',
 'cinversion2',
 'minversi

Ahora creo una tabla con las nuevas columnas, tengo que hacer la macro por que estoy dividiendo por 0.. 

In [7]:
%%sql
CREATE OR REPLACE MACRO suma_sin_null(a, b) AS ifnull(a, 0) + ifnull(b, 0)

Unnamed: 0,Success


In [8]:
%%sql
CREATE OR REPLACE MACRO div_sin_cero(a, b) as
    case
        when ifnull(b, 0) = 0 THEN NULL
        else ifnull(a, 0) / ifnull(b, 1)
    end

Unnamed: 0,Success


In [None]:
campos

# Separar las columnas por tipo
master = [col for col in campos if 'Master' in col]
visa = [col for col in campos if 'Visa' in col]

sumas = []

for m_col, v_col in zip(master, visa):
    # Crear la suma usando la función 'suma_sin_null'
    suma = f"suma_sin_null({m_col}, {v_col}) as {m_col.replace('Master_', '')}_total"
    sumas.append(suma)

consulta_sql = f"""
select 
    numero_de_cliente,
    {', '.join(sumas)}
from competencia_01;
"""

print(consulta_sql)

In [35]:
%%sql
create or replace table competencia_01_feature_new as
select
    *,
    ifnull(cliente_antiguedad, 0) as cliente_antiguedad_no_null,
    ifnull(mrentabilidad, 0) as mrentabilidad_no_null,
    ifnull(mrentabilidad_annual, 0) as mrentabilidad_annual_no_null,
    case
        when cliente_antiguedad_no_null > 11 then 
            cast(mrentabilidad_no_null > (mrentabilidad_annual_no_null / 12) as integer)
        else 
            cast(mrentabilidad_no_null > (mrentabilidad_annual_no_null / cliente_antiguedad_no_null) as integer)
    end as es_rentable,
    suma_sin_null(Master_delinquency, Visa_delinquency) as delinquency_total,
    suma_sin_null(Master_status, Visa_status) as status_total,
    suma_sin_null(Master_mfinanciacion_limite, Visa_mfinanciacion_limite) as mfinanciacion_limite_total,
    suma_sin_null(Master_Fvencimiento, Visa_Fvencimiento) as Fvencimiento_total,
    suma_sin_null(Master_Finiciomora, Visa_Finiciomora) as Finiciomora_total,
    suma_sin_null(Master_msaldototal, Visa_msaldototal) as msaldototal_total,
    suma_sin_null(Master_msaldopesos, Visa_msaldopesos) as msaldopesos_total,
    suma_sin_null(Master_msaldodolares, Visa_msaldodolares) as msaldodolares_total,
    suma_sin_null(Master_mconsumospesos, Visa_mconsumospesos) as mconsumospesos_total,
    suma_sin_null(Master_mconsumosdolares, Visa_mconsumosdolares) as mconsumosdolares_total,
    suma_sin_null(Master_mlimitecompra, Visa_mlimitecompra) as mlimitecompra_total,
    suma_sin_null(Master_madelantopesos, Visa_madelantopesos) as madelantopesos_total,
    suma_sin_null(Master_madelantodolares, Visa_madelantodolares) as madelantodolares_total,
    suma_sin_null(Master_fultimo_cierre, Visa_fultimo_cierre) as fultimo_cierre_total,
    suma_sin_null(Master_mpagado, Visa_mpagado) as mpagado_total,
    suma_sin_null(Master_mpagospesos, Visa_mpagospesos) as mpagospesos_total,
    suma_sin_null(Master_mpagosdolares, Visa_mpagosdolares) as mpagosdolares_total,
    suma_sin_null(Master_fechaalta, Visa_fechaalta) as fechaalta_total,
    suma_sin_null(Master_mconsumototal, Visa_mconsumototal) as mconsumototal_total,
    suma_sin_null(Master_cconsumos, Visa_cconsumos) as cconsumos_total,
    suma_sin_null(Master_cadelantosefectivo, Visa_cadelantosefectivo) as cadelantosefectivo_total,
    suma_sin_null(Master_mpagominimo, Visa_mpagominimo) as mpagominimo_total,
    
    suma_sin_null(ctarjeta_visa, ctarjeta_master) as cant_tarjetas,
    suma_sin_null(ctarjeta_visa_transacciones, ctarjeta_master_transacciones) as cant_transac_tarjetas,
    suma_sin_null(mtarjeta_visa_consumo, mtarjeta_master_consumo) as mtarjetas_consumo,
    suma_sin_null(cprestamos_personales,cprestamos_prendarios) as c_prestamos,
    suma_sin_null(mpayroll, mpayroll2) as m_payroll_total, -- monto acreditacion
    suma_sin_null(cpayroll_trx, cpayroll2_trx) as m_cpayroll_total, -- cantidad de acreditaciones
    suma_sin_null(minversion1_dolares, minversion1_pesos) as minversion1_total,
    suma_sin_null(mplazo_fijo_dolares, mplazo_fijo_pesos) as mplazofijo_total,
    suma_sin_null(suma_sin_null(cseguro_vida, cseguro_auto), suma_sin_null(cseguro_vivienda, cseguro_accidentes_personales)) as cseguros_total,

    div_sin_cero(mautoservicio, ctarjeta_debito_transacciones) as m_transaccion, 
    (ifnull(cliente_antiguedad, 0) / 12.0) as cliente_antiguedad_años,
    div_sin_cero(cliente_antiguedad_años, cliente_edad)  as antiguedad_edad,
    div_sin_cero(mconsumototal_total, mlimitecompra_total) as ratio_uso_credito,
    div_sin_cero(mpagado_total, mconsumototal_total) as ratio_pago_vs_consumo,
    div_sin_cero(mpagominimo_total, mpagado_total) as ratio_pago_min_vs_total,
    div_sin_cero(cant_transac_tarjetas, cant_tarjetas) as transacciones_por_tarjeta,
    div_sin_cero(mconsumospesos_total, mconsumosdolares_total) as ratio_consumo_pesos_dolares,
    div_sin_cero(madelantopesos_total + madelantodolares_total, mlimitecompra_total) as ratio_adelantos_vs_limite,
    div_sin_cero(c_prestamos, m_payroll_total) as ratio_prestamos_vs_acreditaciones,
    div_sin_cero(cseguros_total, mconsumototal_total) as ratio_seguros_vs_consumo,
    div_sin_cero(minversion1_total, msaldototal_total) as ratio_inversiones_vs_saldo
from competencia_01


Unnamed: 0,Success


lag y delta para algunas variables

In [43]:
campos = ["ctrx_quarter",
    'mfinanciacion_limite_total', 
    'mconsumototal_total',
      'mconsumospesos_total',
        'mprestamos_personales',
        'mconsumosdolares_total',
          'mpagado_total', 'mpagospesos_total','mcuentas_saldo','mcaja_ahorro',
    'mpagosdolares_total', 'mpasivos_margen','madelantopesos_total', 'madelantodolares_total', 
    'ratio_uso_credito', 'ratio_pago_vs_consumo', 'ratio_pago_min_vs_total', 
    'ratio_adelantos_vs_limite', 'mplazofijo_total',"m_cpayroll_total", 'c_prestamos', 
    'm_payroll_total', 'ratio_inversiones_vs_saldo', 'minversion1_total', 'mcomisiones_mantenimiento',
    'mpagominimo_total', 'cseguros_total', 'cant_transac_tarjetas','cproductos','cdescubierto_preacordado','msaldopesos_total',
    'transacciones_por_tarjeta', 'antiguedad_edad','mrentabilidad_annual_no_null','mrentabilidad_no_null','mtarjetas_consumo','msaldodolares_total'
]

In [44]:
nuevos_features = ""
for campo in campos:
  nuevos_features += f"\n, lag({campo}, 1) over (partition by numero_de_cliente order by foto_mes) as lag_1_{campo}"
  nuevos_features += f"\n, {campo} - lag_1_{campo} as delta_1_{campo}"
  nuevos_features += f"\n, lag({campo}, 2) over (partition by numero_de_cliente order by foto_mes) as lag_2_{campo}"
  nuevos_features += f"\n, {campo} - lag_2_{campo} as delta_2_{campo}"
  nuevos_features += f"\n, lag({campo}, 3) over (partition by numero_de_cliente order by foto_mes) as lag_3_{campo}"
  nuevos_features += f"\n, {campo} - lag_3_{campo} as delta_3_{campo}"


In [45]:
print(nuevos_features)


, lag(ctrx_quarter, 1) over (partition by numero_de_cliente order by foto_mes) as lag_1_ctrx_quarter
, ctrx_quarter - lag_1_ctrx_quarter as delta_1_ctrx_quarter
, lag(ctrx_quarter, 2) over (partition by numero_de_cliente order by foto_mes) as lag_2_ctrx_quarter
, ctrx_quarter - lag_2_ctrx_quarter as delta_2_ctrx_quarter
, lag(ctrx_quarter, 3) over (partition by numero_de_cliente order by foto_mes) as lag_3_ctrx_quarter
, ctrx_quarter - lag_3_ctrx_quarter as delta_3_ctrx_quarter
, lag(mfinanciacion_limite_total, 1) over (partition by numero_de_cliente order by foto_mes) as lag_1_mfinanciacion_limite_total
, mfinanciacion_limite_total - lag_1_mfinanciacion_limite_total as delta_1_mfinanciacion_limite_total
, lag(mfinanciacion_limite_total, 2) over (partition by numero_de_cliente order by foto_mes) as lag_2_mfinanciacion_limite_total
, mfinanciacion_limite_total - lag_2_mfinanciacion_limite_total as delta_2_mfinanciacion_limite_total
, lag(mfinanciacion_limite_total, 3) over (partition b

In [46]:
%%sql
create or replace table competencia_01_feature_new as
select
    *
    , lag(ctrx_quarter, 1) over (partition by numero_de_cliente order by foto_mes) as lag_1_ctrx_quarter
    , ctrx_quarter - lag_1_ctrx_quarter as delta_1_ctrx_quarter
    , lag(ctrx_quarter, 2) over (partition by numero_de_cliente order by foto_mes) as lag_2_ctrx_quarter
    , ctrx_quarter - lag_2_ctrx_quarter as delta_2_ctrx_quarter
    , lag(ctrx_quarter, 3) over (partition by numero_de_cliente order by foto_mes) as lag_3_ctrx_quarter
    , ctrx_quarter - lag_3_ctrx_quarter as delta_3_ctrx_quarter
    , lag(mfinanciacion_limite_total, 1) over (partition by numero_de_cliente order by foto_mes) as lag_1_mfinanciacion_limite_total
    , mfinanciacion_limite_total - lag_1_mfinanciacion_limite_total as delta_1_mfinanciacion_limite_total
    , lag(mfinanciacion_limite_total, 2) over (partition by numero_de_cliente order by foto_mes) as lag_2_mfinanciacion_limite_total
    , mfinanciacion_limite_total - lag_2_mfinanciacion_limite_total as delta_2_mfinanciacion_limite_total
    , lag(mfinanciacion_limite_total, 3) over (partition by numero_de_cliente order by foto_mes) as lag_3_mfinanciacion_limite_total
    , mfinanciacion_limite_total - lag_3_mfinanciacion_limite_total as delta_3_mfinanciacion_limite_total
    , lag(mconsumototal_total, 1) over (partition by numero_de_cliente order by foto_mes) as lag_1_mconsumototal_total
    , mconsumototal_total - lag_1_mconsumototal_total as delta_1_mconsumototal_total
    , lag(mconsumototal_total, 2) over (partition by numero_de_cliente order by foto_mes) as lag_2_mconsumototal_total
    , mconsumototal_total - lag_2_mconsumototal_total as delta_2_mconsumototal_total
    , lag(mconsumototal_total, 3) over (partition by numero_de_cliente order by foto_mes) as lag_3_mconsumototal_total
    , mconsumototal_total - lag_3_mconsumototal_total as delta_3_mconsumototal_total
    , lag(mconsumospesos_total, 1) over (partition by numero_de_cliente order by foto_mes) as lag_1_mconsumospesos_total
    , mconsumospesos_total - lag_1_mconsumospesos_total as delta_1_mconsumospesos_total
    , lag(mconsumospesos_total, 2) over (partition by numero_de_cliente order by foto_mes) as lag_2_mconsumospesos_total
    , mconsumospesos_total - lag_2_mconsumospesos_total as delta_2_mconsumospesos_total
    , lag(mconsumospesos_total, 3) over (partition by numero_de_cliente order by foto_mes) as lag_3_mconsumospesos_total
    , mconsumospesos_total - lag_3_mconsumospesos_total as delta_3_mconsumospesos_total
    , lag(mprestamos_personales, 1) over (partition by numero_de_cliente order by foto_mes) as lag_1_mprestamos_personales
    , mprestamos_personales - lag_1_mprestamos_personales as delta_1_mprestamos_personales
    , lag(mprestamos_personales, 2) over (partition by numero_de_cliente order by foto_mes) as lag_2_mprestamos_personales
    , mprestamos_personales - lag_2_mprestamos_personales as delta_2_mprestamos_personales
    , lag(mprestamos_personales, 3) over (partition by numero_de_cliente order by foto_mes) as lag_3_mprestamos_personales
    , mprestamos_personales - lag_3_mprestamos_personales as delta_3_mprestamos_personales
    , lag(mconsumosdolares_total, 1) over (partition by numero_de_cliente order by foto_mes) as lag_1_mconsumosdolares_total
    , mconsumosdolares_total - lag_1_mconsumosdolares_total as delta_1_mconsumosdolares_total
    , lag(mconsumosdolares_total, 2) over (partition by numero_de_cliente order by foto_mes) as lag_2_mconsumosdolares_total
    , mconsumosdolares_total - lag_2_mconsumosdolares_total as delta_2_mconsumosdolares_total
    , lag(mconsumosdolares_total, 3) over (partition by numero_de_cliente order by foto_mes) as lag_3_mconsumosdolares_total
    , mconsumosdolares_total - lag_3_mconsumosdolares_total as delta_3_mconsumosdolares_total
    , lag(mpagado_total, 1) over (partition by numero_de_cliente order by foto_mes) as lag_1_mpagado_total
    , mpagado_total - lag_1_mpagado_total as delta_1_mpagado_total
    , lag(mpagado_total, 2) over (partition by numero_de_cliente order by foto_mes) as lag_2_mpagado_total
    , mpagado_total - lag_2_mpagado_total as delta_2_mpagado_total
    , lag(mpagado_total, 3) over (partition by numero_de_cliente order by foto_mes) as lag_3_mpagado_total
    , mpagado_total - lag_3_mpagado_total as delta_3_mpagado_total
    , lag(mpagospesos_total, 1) over (partition by numero_de_cliente order by foto_mes) as lag_1_mpagospesos_total
    , mpagospesos_total - lag_1_mpagospesos_total as delta_1_mpagospesos_total
    , lag(mpagospesos_total, 2) over (partition by numero_de_cliente order by foto_mes) as lag_2_mpagospesos_total
    , mpagospesos_total - lag_2_mpagospesos_total as delta_2_mpagospesos_total
    , lag(mpagospesos_total, 3) over (partition by numero_de_cliente order by foto_mes) as lag_3_mpagospesos_total
    , mpagospesos_total - lag_3_mpagospesos_total as delta_3_mpagospesos_total
    , lag(mcuentas_saldo, 1) over (partition by numero_de_cliente order by foto_mes) as lag_1_mcuentas_saldo
    , mcuentas_saldo - lag_1_mcuentas_saldo as delta_1_mcuentas_saldo
    , lag(mcuentas_saldo, 2) over (partition by numero_de_cliente order by foto_mes) as lag_2_mcuentas_saldo
    , mcuentas_saldo - lag_2_mcuentas_saldo as delta_2_mcuentas_saldo
    , lag(mcuentas_saldo, 3) over (partition by numero_de_cliente order by foto_mes) as lag_3_mcuentas_saldo
    , mcuentas_saldo - lag_3_mcuentas_saldo as delta_3_mcuentas_saldo
    , lag(mcaja_ahorro, 1) over (partition by numero_de_cliente order by foto_mes) as lag_1_mcaja_ahorro
    , mcaja_ahorro - lag_1_mcaja_ahorro as delta_1_mcaja_ahorro
    , lag(mcaja_ahorro, 2) over (partition by numero_de_cliente order by foto_mes) as lag_2_mcaja_ahorro
    , mcaja_ahorro - lag_2_mcaja_ahorro as delta_2_mcaja_ahorro
    , lag(mcaja_ahorro, 3) over (partition by numero_de_cliente order by foto_mes) as lag_3_mcaja_ahorro
    , mcaja_ahorro - lag_3_mcaja_ahorro as delta_3_mcaja_ahorro
    , lag(mpagosdolares_total, 1) over (partition by numero_de_cliente order by foto_mes) as lag_1_mpagosdolares_total
    , mpagosdolares_total - lag_1_mpagosdolares_total as delta_1_mpagosdolares_total
    , lag(mpagosdolares_total, 2) over (partition by numero_de_cliente order by foto_mes) as lag_2_mpagosdolares_total
    , mpagosdolares_total - lag_2_mpagosdolares_total as delta_2_mpagosdolares_total
    , lag(mpagosdolares_total, 3) over (partition by numero_de_cliente order by foto_mes) as lag_3_mpagosdolares_total
    , mpagosdolares_total - lag_3_mpagosdolares_total as delta_3_mpagosdolares_total
    , lag(mpasivos_margen, 1) over (partition by numero_de_cliente order by foto_mes) as lag_1_mpasivos_margen
    , mpasivos_margen - lag_1_mpasivos_margen as delta_1_mpasivos_margen
    , lag(mpasivos_margen, 2) over (partition by numero_de_cliente order by foto_mes) as lag_2_mpasivos_margen
    , mpasivos_margen - lag_2_mpasivos_margen as delta_2_mpasivos_margen
    , lag(mpasivos_margen, 3) over (partition by numero_de_cliente order by foto_mes) as lag_3_mpasivos_margen
    , mpasivos_margen - lag_3_mpasivos_margen as delta_3_mpasivos_margen
    , lag(madelantopesos_total, 1) over (partition by numero_de_cliente order by foto_mes) as lag_1_madelantopesos_total
    , madelantopesos_total - lag_1_madelantopesos_total as delta_1_madelantopesos_total
    , lag(madelantopesos_total, 2) over (partition by numero_de_cliente order by foto_mes) as lag_2_madelantopesos_total
    , madelantopesos_total - lag_2_madelantopesos_total as delta_2_madelantopesos_total
    , lag(madelantopesos_total, 3) over (partition by numero_de_cliente order by foto_mes) as lag_3_madelantopesos_total
    , madelantopesos_total - lag_3_madelantopesos_total as delta_3_madelantopesos_total
    , lag(madelantodolares_total, 1) over (partition by numero_de_cliente order by foto_mes) as lag_1_madelantodolares_total
    , madelantodolares_total - lag_1_madelantodolares_total as delta_1_madelantodolares_total
    , lag(madelantodolares_total, 2) over (partition by numero_de_cliente order by foto_mes) as lag_2_madelantodolares_total
    , madelantodolares_total - lag_2_madelantodolares_total as delta_2_madelantodolares_total
    , lag(madelantodolares_total, 3) over (partition by numero_de_cliente order by foto_mes) as lag_3_madelantodolares_total
    , madelantodolares_total - lag_3_madelantodolares_total as delta_3_madelantodolares_total
    , lag(ratio_uso_credito, 1) over (partition by numero_de_cliente order by foto_mes) as lag_1_ratio_uso_credito
    , ratio_uso_credito - lag_1_ratio_uso_credito as delta_1_ratio_uso_credito
    , lag(ratio_uso_credito, 2) over (partition by numero_de_cliente order by foto_mes) as lag_2_ratio_uso_credito
    , ratio_uso_credito - lag_2_ratio_uso_credito as delta_2_ratio_uso_credito
    , lag(ratio_uso_credito, 3) over (partition by numero_de_cliente order by foto_mes) as lag_3_ratio_uso_credito
    , ratio_uso_credito - lag_3_ratio_uso_credito as delta_3_ratio_uso_credito
    , lag(ratio_pago_vs_consumo, 1) over (partition by numero_de_cliente order by foto_mes) as lag_1_ratio_pago_vs_consumo
    , ratio_pago_vs_consumo - lag_1_ratio_pago_vs_consumo as delta_1_ratio_pago_vs_consumo
    , lag(ratio_pago_vs_consumo, 2) over (partition by numero_de_cliente order by foto_mes) as lag_2_ratio_pago_vs_consumo
    , ratio_pago_vs_consumo - lag_2_ratio_pago_vs_consumo as delta_2_ratio_pago_vs_consumo
    , lag(ratio_pago_vs_consumo, 3) over (partition by numero_de_cliente order by foto_mes) as lag_3_ratio_pago_vs_consumo
    , ratio_pago_vs_consumo - lag_3_ratio_pago_vs_consumo as delta_3_ratio_pago_vs_consumo
    , lag(ratio_pago_min_vs_total, 1) over (partition by numero_de_cliente order by foto_mes) as lag_1_ratio_pago_min_vs_total
    , ratio_pago_min_vs_total - lag_1_ratio_pago_min_vs_total as delta_1_ratio_pago_min_vs_total
    , lag(ratio_pago_min_vs_total, 2) over (partition by numero_de_cliente order by foto_mes) as lag_2_ratio_pago_min_vs_total
    , ratio_pago_min_vs_total - lag_2_ratio_pago_min_vs_total as delta_2_ratio_pago_min_vs_total
    , lag(ratio_pago_min_vs_total, 3) over (partition by numero_de_cliente order by foto_mes) as lag_3_ratio_pago_min_vs_total
    , ratio_pago_min_vs_total - lag_3_ratio_pago_min_vs_total as delta_3_ratio_pago_min_vs_total
    , lag(ratio_adelantos_vs_limite, 1) over (partition by numero_de_cliente order by foto_mes) as lag_1_ratio_adelantos_vs_limite
    , ratio_adelantos_vs_limite - lag_1_ratio_adelantos_vs_limite as delta_1_ratio_adelantos_vs_limite
    , lag(ratio_adelantos_vs_limite, 2) over (partition by numero_de_cliente order by foto_mes) as lag_2_ratio_adelantos_vs_limite
    , ratio_adelantos_vs_limite - lag_2_ratio_adelantos_vs_limite as delta_2_ratio_adelantos_vs_limite
    , lag(ratio_adelantos_vs_limite, 3) over (partition by numero_de_cliente order by foto_mes) as lag_3_ratio_adelantos_vs_limite
    , ratio_adelantos_vs_limite - lag_3_ratio_adelantos_vs_limite as delta_3_ratio_adelantos_vs_limite
    , lag(mplazofijo_total, 1) over (partition by numero_de_cliente order by foto_mes) as lag_1_mplazofijo_total
    , mplazofijo_total - lag_1_mplazofijo_total as delta_1_mplazofijo_total
    , lag(mplazofijo_total, 2) over (partition by numero_de_cliente order by foto_mes) as lag_2_mplazofijo_total
    , mplazofijo_total - lag_2_mplazofijo_total as delta_2_mplazofijo_total
    , lag(mplazofijo_total, 3) over (partition by numero_de_cliente order by foto_mes) as lag_3_mplazofijo_total
    , mplazofijo_total - lag_3_mplazofijo_total as delta_3_mplazofijo_total
    , lag(m_cpayroll_total, 1) over (partition by numero_de_cliente order by foto_mes) as lag_1_m_cpayroll_total
    , m_cpayroll_total - lag_1_m_cpayroll_total as delta_1_m_cpayroll_total
    , lag(m_cpayroll_total, 2) over (partition by numero_de_cliente order by foto_mes) as lag_2_m_cpayroll_total
    , m_cpayroll_total - lag_2_m_cpayroll_total as delta_2_m_cpayroll_total
    , lag(m_cpayroll_total, 3) over (partition by numero_de_cliente order by foto_mes) as lag_3_m_cpayroll_total
    , m_cpayroll_total - lag_3_m_cpayroll_total as delta_3_m_cpayroll_total
    , lag(c_prestamos, 1) over (partition by numero_de_cliente order by foto_mes) as lag_1_c_prestamos
    , c_prestamos - lag_1_c_prestamos as delta_1_c_prestamos
    , lag(c_prestamos, 2) over (partition by numero_de_cliente order by foto_mes) as lag_2_c_prestamos
    , c_prestamos - lag_2_c_prestamos as delta_2_c_prestamos
    , lag(c_prestamos, 3) over (partition by numero_de_cliente order by foto_mes) as lag_3_c_prestamos
    , c_prestamos - lag_3_c_prestamos as delta_3_c_prestamos
    , lag(m_payroll_total, 1) over (partition by numero_de_cliente order by foto_mes) as lag_1_m_payroll_total
    , m_payroll_total - lag_1_m_payroll_total as delta_1_m_payroll_total
    , lag(m_payroll_total, 2) over (partition by numero_de_cliente order by foto_mes) as lag_2_m_payroll_total
    , m_payroll_total - lag_2_m_payroll_total as delta_2_m_payroll_total
    , lag(m_payroll_total, 3) over (partition by numero_de_cliente order by foto_mes) as lag_3_m_payroll_total
    , m_payroll_total - lag_3_m_payroll_total as delta_3_m_payroll_total
    , lag(ratio_inversiones_vs_saldo, 1) over (partition by numero_de_cliente order by foto_mes) as lag_1_ratio_inversiones_vs_saldo
    , ratio_inversiones_vs_saldo - lag_1_ratio_inversiones_vs_saldo as delta_1_ratio_inversiones_vs_saldo
    , lag(ratio_inversiones_vs_saldo, 2) over (partition by numero_de_cliente order by foto_mes) as lag_2_ratio_inversiones_vs_saldo
    , ratio_inversiones_vs_saldo - lag_2_ratio_inversiones_vs_saldo as delta_2_ratio_inversiones_vs_saldo
    , lag(ratio_inversiones_vs_saldo, 3) over (partition by numero_de_cliente order by foto_mes) as lag_3_ratio_inversiones_vs_saldo
    , ratio_inversiones_vs_saldo - lag_3_ratio_inversiones_vs_saldo as delta_3_ratio_inversiones_vs_saldo
    , lag(minversion1_total, 1) over (partition by numero_de_cliente order by foto_mes) as lag_1_minversion1_total
    , minversion1_total - lag_1_minversion1_total as delta_1_minversion1_total
    , lag(minversion1_total, 2) over (partition by numero_de_cliente order by foto_mes) as lag_2_minversion1_total
    , minversion1_total - lag_2_minversion1_total as delta_2_minversion1_total
    , lag(minversion1_total, 3) over (partition by numero_de_cliente order by foto_mes) as lag_3_minversion1_total
    , minversion1_total - lag_3_minversion1_total as delta_3_minversion1_total
    , lag(mcomisiones_mantenimiento, 1) over (partition by numero_de_cliente order by foto_mes) as lag_1_mcomisiones_mantenimiento
    , mcomisiones_mantenimiento - lag_1_mcomisiones_mantenimiento as delta_1_mcomisiones_mantenimiento
    , lag(mcomisiones_mantenimiento, 2) over (partition by numero_de_cliente order by foto_mes) as lag_2_mcomisiones_mantenimiento
    , mcomisiones_mantenimiento - lag_2_mcomisiones_mantenimiento as delta_2_mcomisiones_mantenimiento
    , lag(mcomisiones_mantenimiento, 3) over (partition by numero_de_cliente order by foto_mes) as lag_3_mcomisiones_mantenimiento
    , mcomisiones_mantenimiento - lag_3_mcomisiones_mantenimiento as delta_3_mcomisiones_mantenimiento
    , lag(mpagominimo_total, 1) over (partition by numero_de_cliente order by foto_mes) as lag_1_mpagominimo_total
    , mpagominimo_total - lag_1_mpagominimo_total as delta_1_mpagominimo_total
    , lag(mpagominimo_total, 2) over (partition by numero_de_cliente order by foto_mes) as lag_2_mpagominimo_total
    , mpagominimo_total - lag_2_mpagominimo_total as delta_2_mpagominimo_total
    , lag(mpagominimo_total, 3) over (partition by numero_de_cliente order by foto_mes) as lag_3_mpagominimo_total
    , mpagominimo_total - lag_3_mpagominimo_total as delta_3_mpagominimo_total
    , lag(cseguros_total, 1) over (partition by numero_de_cliente order by foto_mes) as lag_1_cseguros_total
    , cseguros_total - lag_1_cseguros_total as delta_1_cseguros_total
    , lag(cseguros_total, 2) over (partition by numero_de_cliente order by foto_mes) as lag_2_cseguros_total
    , cseguros_total - lag_2_cseguros_total as delta_2_cseguros_total
    , lag(cseguros_total, 3) over (partition by numero_de_cliente order by foto_mes) as lag_3_cseguros_total
    , cseguros_total - lag_3_cseguros_total as delta_3_cseguros_total
    , lag(cant_transac_tarjetas, 1) over (partition by numero_de_cliente order by foto_mes) as lag_1_cant_transac_tarjetas
    , cant_transac_tarjetas - lag_1_cant_transac_tarjetas as delta_1_cant_transac_tarjetas
    , lag(cant_transac_tarjetas, 2) over (partition by numero_de_cliente order by foto_mes) as lag_2_cant_transac_tarjetas
    , cant_transac_tarjetas - lag_2_cant_transac_tarjetas as delta_2_cant_transac_tarjetas
    , lag(cant_transac_tarjetas, 3) over (partition by numero_de_cliente order by foto_mes) as lag_3_cant_transac_tarjetas
    , cant_transac_tarjetas - lag_3_cant_transac_tarjetas as delta_3_cant_transac_tarjetas
    , lag(cproductos, 1) over (partition by numero_de_cliente order by foto_mes) as lag_1_cproductos
    , cproductos - lag_1_cproductos as delta_1_cproductos
    , lag(cproductos, 2) over (partition by numero_de_cliente order by foto_mes) as lag_2_cproductos
    , cproductos - lag_2_cproductos as delta_2_cproductos
    , lag(cproductos, 3) over (partition by numero_de_cliente order by foto_mes) as lag_3_cproductos
    , cproductos - lag_3_cproductos as delta_3_cproductos
    , lag(cdescubierto_preacordado, 1) over (partition by numero_de_cliente order by foto_mes) as lag_1_cdescubierto_preacordado
    , cdescubierto_preacordado - lag_1_cdescubierto_preacordado as delta_1_cdescubierto_preacordado
    , lag(cdescubierto_preacordado, 2) over (partition by numero_de_cliente order by foto_mes) as lag_2_cdescubierto_preacordado
    , cdescubierto_preacordado - lag_2_cdescubierto_preacordado as delta_2_cdescubierto_preacordado
    , lag(cdescubierto_preacordado, 3) over (partition by numero_de_cliente order by foto_mes) as lag_3_cdescubierto_preacordado
    , cdescubierto_preacordado - lag_3_cdescubierto_preacordado as delta_3_cdescubierto_preacordado
    , lag(msaldopesos_total, 1) over (partition by numero_de_cliente order by foto_mes) as lag_1_msaldopesos_total
    , msaldopesos_total - lag_1_msaldopesos_total as delta_1_msaldopesos_total
    , lag(msaldopesos_total, 2) over (partition by numero_de_cliente order by foto_mes) as lag_2_msaldopesos_total
    , msaldopesos_total - lag_2_msaldopesos_total as delta_2_msaldopesos_total
    , lag(msaldopesos_total, 3) over (partition by numero_de_cliente order by foto_mes) as lag_3_msaldopesos_total
    , msaldopesos_total - lag_3_msaldopesos_total as delta_3_msaldopesos_total
    , lag(transacciones_por_tarjeta, 1) over (partition by numero_de_cliente order by foto_mes) as lag_1_transacciones_por_tarjeta
    , transacciones_por_tarjeta - lag_1_transacciones_por_tarjeta as delta_1_transacciones_por_tarjeta
    , lag(transacciones_por_tarjeta, 2) over (partition by numero_de_cliente order by foto_mes) as lag_2_transacciones_por_tarjeta
    , transacciones_por_tarjeta - lag_2_transacciones_por_tarjeta as delta_2_transacciones_por_tarjeta
    , lag(transacciones_por_tarjeta, 3) over (partition by numero_de_cliente order by foto_mes) as lag_3_transacciones_por_tarjeta
    , transacciones_por_tarjeta - lag_3_transacciones_por_tarjeta as delta_3_transacciones_por_tarjeta
    , lag(antiguedad_edad, 1) over (partition by numero_de_cliente order by foto_mes) as lag_1_antiguedad_edad
    , antiguedad_edad - lag_1_antiguedad_edad as delta_1_antiguedad_edad
    , lag(antiguedad_edad, 2) over (partition by numero_de_cliente order by foto_mes) as lag_2_antiguedad_edad
    , antiguedad_edad - lag_2_antiguedad_edad as delta_2_antiguedad_edad
    , lag(antiguedad_edad, 3) over (partition by numero_de_cliente order by foto_mes) as lag_3_antiguedad_edad
    , antiguedad_edad - lag_3_antiguedad_edad as delta_3_antiguedad_edad
    , lag(mrentabilidad_annual_no_null, 1) over (partition by numero_de_cliente order by foto_mes) as lag_1_mrentabilidad_annual_no_null
    , mrentabilidad_annual_no_null - lag_1_mrentabilidad_annual_no_null as delta_1_mrentabilidad_annual_no_null
    , lag(mrentabilidad_annual_no_null, 2) over (partition by numero_de_cliente order by foto_mes) as lag_2_mrentabilidad_annual_no_null
    , mrentabilidad_annual_no_null - lag_2_mrentabilidad_annual_no_null as delta_2_mrentabilidad_annual_no_null
    , lag(mrentabilidad_annual_no_null, 3) over (partition by numero_de_cliente order by foto_mes) as lag_3_mrentabilidad_annual_no_null
    , mrentabilidad_annual_no_null - lag_3_mrentabilidad_annual_no_null as delta_3_mrentabilidad_annual_no_null
    , lag(mrentabilidad_no_null, 1) over (partition by numero_de_cliente order by foto_mes) as lag_1_mrentabilidad_no_null
    , mrentabilidad_no_null - lag_1_mrentabilidad_no_null as delta_1_mrentabilidad_no_null
    , lag(mrentabilidad_no_null, 2) over (partition by numero_de_cliente order by foto_mes) as lag_2_mrentabilidad_no_null
    , mrentabilidad_no_null - lag_2_mrentabilidad_no_null as delta_2_mrentabilidad_no_null
    , lag(mrentabilidad_no_null, 3) over (partition by numero_de_cliente order by foto_mes) as lag_3_mrentabilidad_no_null
    , mrentabilidad_no_null - lag_3_mrentabilidad_no_null as delta_3_mrentabilidad_no_null
    , lag(mtarjetas_consumo, 1) over (partition by numero_de_cliente order by foto_mes) as lag_1_mtarjetas_consumo
    , mtarjetas_consumo - lag_1_mtarjetas_consumo as delta_1_mtarjetas_consumo
    , lag(mtarjetas_consumo, 2) over (partition by numero_de_cliente order by foto_mes) as lag_2_mtarjetas_consumo
    , mtarjetas_consumo - lag_2_mtarjetas_consumo as delta_2_mtarjetas_consumo
    , lag(mtarjetas_consumo, 3) over (partition by numero_de_cliente order by foto_mes) as lag_3_mtarjetas_consumo
    , mtarjetas_consumo - lag_3_mtarjetas_consumo as delta_3_mtarjetas_consumo
    , lag(msaldodolares_total, 1) over (partition by numero_de_cliente order by foto_mes) as lag_1_msaldodolares_total
    , msaldodolares_total - lag_1_msaldodolares_total as delta_1_msaldodolares_total
    , lag(msaldodolares_total, 2) over (partition by numero_de_cliente order by foto_mes) as lag_2_msaldodolares_total
    , msaldodolares_total - lag_2_msaldodolares_total as delta_2_msaldodolares_total
    , lag(msaldodolares_total, 3) over (partition by numero_de_cliente order by foto_mes) as lag_3_msaldodolares_total
    , msaldodolares_total - lag_3_msaldodolares_total as delta_3_msaldodolares_total
from competencia_01_feature_new
    

Unnamed: 0,Success


In [47]:
%%sql
COPY competencia_01_feature_new TO 'competencia_01_feature_new.csv' (FORMAT CSV, HEADER TRUE)

Unnamed: 0,Success


In [27]:
%%sql
select
    lag_2_cseguros_total
    ,clase_ternaria
from competencia_01_lag_delta

Unnamed: 0,lag_2_cseguros_total,clase_ternaria
0,,CONTINUA
1,,CONTINUA
2,1.0,CONTINUA
3,1.0,CONTINUA
4,1.0,
...,...,...
981941,,CONTINUA
981942,0.0,CONTINUA
981943,0.0,CONTINUA
981944,0.0,


TAREA: Escriba una macro para hacer un ratio de dos variables que sea seguro, donde no solo hay campos con null, también esta el problema de la división por cero. Como es costumbre comparta su solución por este canal. Lea https://duckdb.org/docs/sql/functions/numeric.html para referencias de funciones que puede usar.

---

"Claro!" me dirá, mientras lee esto con un mate en la mano, "para cosas fáciles usar SQL alcanza, pero para algo más complicado como crear campos contra el data drifting es difícil".... elija su medicina:

In [25]:
%%sql
select
    foto_mes
    , numero_de_cliente
    , cliente_antiguedad
    , row_number() over (partition by numero_de_cliente order by foto_mes) as cliente_antiguedad_2
    , percent_rank() over (partition by foto_mes order by cliente_antiguedad) as cliente_antiguedad_3
    , cume_dist() over (partition by foto_mes order by cliente_antiguedad) as cliente_antiguedad_4
    , ntile(4) over (partition by foto_mes order by cliente_antiguedad) as cliente_antiguedad_5
    , ntile(10) over (partition by foto_mes order by cliente_antiguedad) as cliente_antiguedad_6
from competencia_01
order by numero_de_cliente, cliente_antiguedad


Unnamed: 0,foto_mes,numero_de_cliente,cliente_antiguedad,cliente_antiguedad_2,cliente_antiguedad_3,cliente_antiguedad_4,cliente_antiguedad_5,cliente_antiguedad_6
0,202101,249221109,300,1,0.947564,0.947947,4,10
1,202102,249221109,301,2,0.947831,0.948213,4,10
2,202103,249221109,302,3,0.948291,0.948670,4,10
3,202104,249221109,303,4,0.948638,0.949016,4,10
4,202105,249221109,304,5,0.948950,0.949327,4,10
...,...,...,...,...,...,...,...,...
981941,202106,1598444941,1,1,0.000000,0.001553,1,1
981942,202106,1598471047,1,1,0.000000,0.001553,1,1
981943,202106,1598515897,1,1,0.000000,0.001553,1,1
981944,202106,1598517059,1,1,0.000000,0.001553,1,1


Qué paso? use las hermosas funciones analíticas de SQL. Al campo cliente_antiguedad (que no sufre de data drifting, solo esta para dar el ejemplo) para cada período (partition by foto_mes) la ordeno (order by cliente_antiguedad) y luego calculo las métricas de orden que pueden encontrar acá https://duckdb.org/docs/sql/window_functions.html#general-purpose-window-functions.

Seguiremos usando las funciones analíticas de SQL, esta vez para calcular features que utilizan valores del pasado.

Qué pasa si quiero agregar un feature que muestre el valor del periodo anterior?


In [None]:
%%sql
select
  numero_de_cliente
  , foto_mes
  , ctrx_quarter
  , lag(ctrx_quarter, 1) over (partition by numero_de_cliente order by foto_mes) as lag_1_ctrx_quarter
from competencia_01
limit 10


Unnamed: 0,numero_de_cliente,foto_mes,ctrx_quarter,lag_1_ctrx_quarter
0,249223005,202101,182,
1,249223005,202102,208,182.0
2,249223005,202103,201,208.0
3,249223005,202104,194,201.0
4,249223005,202105,171,194.0
5,249223005,202106,172,171.0
6,249237079,202101,141,
7,249237079,202102,149,141.0
8,249237079,202103,153,149.0
9,249237079,202104,160,153.0


Podemos calcular el delta (diferencia) entre el valor pasado y el presente, para uno o varios meses


In [None]:
%%sql
select
  numero_de_cliente
  , foto_mes
  , ctrx_quarter
  , lag(ctrx_quarter, 1) over (partition by numero_de_cliente order by foto_mes) as lag_1_ctrx_quarter
  , ctrx_quarter - lag_1_ctrx_quarter as delta_1_ctrx_quarter
  , ctrx_quarter - lag(ctrx_quarter, 2) over (partition by numero_de_cliente order by foto_mes) as lag_2_ctrx_quarter
from competencia_01
limit 10


Unnamed: 0,numero_de_cliente,foto_mes,ctrx_quarter,lag_1_ctrx_quarter,delta_1_ctrx_quarter,lag_2_ctrx_quarter
0,249223005,202101,182,,,
1,249223005,202102,208,182.0,26.0,
2,249223005,202103,201,208.0,-7.0,19.0
3,249223005,202104,194,201.0,-7.0,-14.0
4,249223005,202105,171,194.0,-23.0,-30.0
5,249223005,202106,172,171.0,1.0,-22.0
6,249237079,202101,141,,,
7,249237079,202102,149,141.0,8.0,
8,249237079,202103,153,149.0,4.0,12.0
9,249237079,202104,160,153.0,7.0,11.0


Si necesitamos ya no solo traer un valor del pasado, sino una secuencia de valores, por ejemplo para calcular la media móvil con los últimos 3 meses anteriores? se puede hacer fácilmente


In [None]:
%%sql
select
  numero_de_cliente
  , foto_mes
  , ctrx_quarter
  , lag(ctrx_quarter, 1) over (partition by numero_de_cliente order by foto_mes) as lag_1_ctrx_quarter
  , lag(ctrx_quarter, 2) over (partition by numero_de_cliente order by foto_mes) as lag_2_ctrx_quarter
  , lag(ctrx_quarter, 3) over (partition by numero_de_cliente order by foto_mes) as lag_3_ctrx_quarter
  , avg(ctrx_quarter) over (partition by numero_de_cliente
                            order by foto_mes
                            rows between 3 preceding and current row) as avg_3_ctrx_quarter
from competencia_01
order by numero_de_cliente, foto_mes desc
limit 10


Unnamed: 0,numero_de_cliente,foto_mes,ctrx_quarter,lag_1_ctrx_quarter,lag_2_ctrx_quarter,lag_3_ctrx_quarter,avg_3_ctrx_quarter
0,249221109,202106,199,199.0,188.0,174.0,190.0
1,249221109,202105,199,188.0,174.0,161.0,180.5
2,249221109,202104,188,174.0,161.0,166.0,172.25
3,249221109,202103,174,161.0,166.0,,167.0
4,249221109,202102,161,166.0,,,163.5
5,249221109,202101,166,,,,166.0
6,249221468,202106,191,182.0,182.0,170.0,181.25
7,249221468,202105,182,182.0,170.0,154.0,172.0
8,249221468,202104,182,170.0,154.0,145.0,162.75
9,249221468,202103,170,154.0,145.0,,156.333333


Si embargo puede resultar incómodo escribir constantemente el over partition sobre todo si se buscan aplicar muchas veces para distintas funciones. Para reducir el código se puede usar la siguiente sintaxis



In [None]:
%%sql
select
  numero_de_cliente
  , foto_mes
  , ctrx_quarter
  , avg(ctrx_quarter) over ventana_3 as ctrx_quarter_media_3
  , max(ctrx_quarter) over ventana_3 as ctrx_quarter_max_3
  , min(ctrx_quarter) over ventana_3 as ctrx_quarter_min_3
from competencia_01
window ventana_3 as (partition by numero_de_cliente order by foto_mes rows between 3 preceding and current row)
limit 10


Unnamed: 0,numero_de_cliente,foto_mes,ctrx_quarter,ctrx_quarter_media_3,ctrx_quarter_max_3,ctrx_quarter_min_3
0,249223005,202101,182,182.0,182,182
1,249223005,202102,208,195.0,208,182
2,249223005,202103,201,197.0,208,182
3,249223005,202104,194,196.25,208,182
4,249223005,202105,171,193.5,208,171
5,249223005,202106,172,184.5,201,171
6,249237079,202101,141,141.0,141,141
7,249237079,202102,149,145.0,149,141
8,249237079,202103,153,147.666667,153,141
9,249237079,202104,160,150.75,160,141


Para saber más que funciones tenemos disponibles, recomiendo ver los siguientes links:

https://duckdb.org/docs/archive/0.8.1/sql/window_functions
https://duckdb.org/docs/archive/0.8.1/sql/aggregates
Un caso más, que ni me voy a molestar en explicar que significa...


In [None]:
%%sql
select
  numero_de_cliente
  , foto_mes
  , ctrx_quarter
  ,regr_slope(ctrx_quarter, cliente_antiguedad) over ventana_3 as ctrx_quarter_slope_3
from competencia_01
window ventana_3 as (partition by numero_de_cliente order by foto_mes rows between 3 preceding and current row)
limit 10


... Alguno dirá "tenemos que escribir todo esto a mano? Son muchas variables!". Bueno no, use los conocimientos de programación para que la computadora trabaje para usted. Si tenemos una lista de campos


In [None]:
campos = ['active_quarter', 'cliente_vip', 'internet', 'cliente_edad', 'cliente_antiguedad', 'mrentabilidad']


Podemos hacer un script muy sencillo que nos genere el texto que hay que poner en una query para generar esas variables


In [None]:
nuevos_features = ""
for campo in campos:
  nuevos_features += f"\n, regr_slope({campo}, cliente_antiguedad) over ventana_3 as ctrx_{campo}_slope_3"
print(nuevos_features)



, regr_slope(active_quarter, cliente_antiguedad) over ventana_3 as ctrx_active_quarter_slope_3
, regr_slope(cliente_vip, cliente_antiguedad) over ventana_3 as ctrx_cliente_vip_slope_3
, regr_slope(internet, cliente_antiguedad) over ventana_3 as ctrx_internet_slope_3
, regr_slope(cliente_edad, cliente_antiguedad) over ventana_3 as ctrx_cliente_edad_slope_3
, regr_slope(cliente_antiguedad, cliente_antiguedad) over ventana_3 as ctrx_cliente_antiguedad_slope_3
, regr_slope(mrentabilidad, cliente_antiguedad) over ventana_3 as ctrx_mrentabilidad_slope_3





Con la salida de esa celda, arme la query agregando las nuevas líneas y la ejecuta.

Lo que acabamos de hacer de manera muy simple es como "funcionan" sistemas como **dbt** que están tan de moda en el mundo de los datos.

La última reflexión, la creación de nuevas features es un proceso computacionalmente rápido pero intenso. Si ejecutó lo anterior pudo haber visto que en poco minutos tenía sus nuevas variables. Pero, también pudo haberle fallado por temas de recursos. Miles de variables necesitan los recursos adecuados. Use la nube, una máquina grande, al menos que sepa bien como optimizar las queries.


Y a no olvidarse guardar las nueva tabla

In [None]:
%%sql
COPY competencia_01 TO '{dataset_path}competencia_01_fe.csv' (FORMAT CSV, HEADER TRUE);
