# Feature Engineering en SQL

A continuación, veremos cómo calcular diferentes variables para el feature engineering utilizando SQL.


In [1]:
import duckdb
import pandas as pd
from sqlalchemy import create_engine

%load_ext sql
%config SqlMagic.autopandas = True
%config SqlMagic.feedback = False
%config SqlMagic.displaycon = False

%sql duckdb:///:memory:

In [2]:
base_path = '/home/fedepicado/'
modelos_path = base_path + 'buckets/b1/modelos/'
db_path = base_path + 'buckets/b1/db/'
dataset_path = base_path + 'buckets/b1/datasets/'
exp_path = base_path + 'buckets/b1/exp/'
dataset_file = 'competencia_02_DQ.parquet'
full_path = dataset_path + dataset_file


In [3]:
%%sql
ROLLBACK;

Unnamed: 0,Success


In [4]:
%%sql
create or replace table competencia_02_DQ as
select
    *
from read_parquet('../../../buckets/b1/datasets/competencia_02_DQ.parquet')

Unnamed: 0,Success


In [5]:
df=pd.read_parquet(full_path)

In [6]:
df.shape

(4735593, 149)

In [7]:
campos=df.columns.to_list()
print(campos[-1])
campos.pop(-1) #elimino clase ternaria
print(campos[-1])

clase_ternaria
Visa_mpagominimo


In [8]:
campos

['numero_de_cliente',
 'foto_mes',
 'active_quarter',
 'cliente_vip',
 'internet',
 'cliente_edad',
 'cliente_antiguedad',
 'mrentabilidad',
 'mrentabilidad_annual',
 'mcomisiones',
 'mactivos_margen',
 'mpasivos_margen',
 'cproductos',
 'tcuentas',
 'ccuenta_corriente',
 'mcuenta_corriente_adicional',
 'mcuenta_corriente',
 'ccaja_ahorro',
 'mcaja_ahorro',
 'mcaja_ahorro_adicional',
 'mcaja_ahorro_dolares',
 'cdescubierto_preacordado',
 'mcuentas_saldo',
 'ctarjeta_debito',
 'ctarjeta_debito_transacciones',
 'mautoservicio',
 'ctarjeta_visa',
 'ctarjeta_visa_transacciones',
 'mtarjeta_visa_consumo',
 'ctarjeta_master',
 'ctarjeta_master_transacciones',
 'mtarjeta_master_consumo',
 'cprestamos_prendarios',
 'mprestamos_prendarios',
 'cprestamos_hipotecarios',
 'mprestamos_hipotecarios',
 'cplazo_fijo',
 'mplazo_fijo_dolares',
 'mplazo_fijo_pesos',
 'cinversion1',
 'minversion1_pesos',
 'minversion1_dolares',
 'cinversion2',
 'minversion2',
 'cseguro_vida',
 'cseguro_auto',
 'cseguro_vi

In [9]:
%%sql
CREATE OR REPLACE MACRO suma_sin_null(a, b) AS ifnull(a, 0) + ifnull(b, 0)

Unnamed: 0,Success


In [10]:
%%sql
CREATE OR REPLACE MACRO div_sin_cero(a, b) as
    case
        when ifnull(b, 0) = 0 THEN NULL
        else ifnull(a, 0) / ifnull(b, 1)
    end

Unnamed: 0,Success


### Master y Visa, una sola

In [35]:
# Separar las columnas por tipo
master = [col for col in campos if 'Master' in col]
visa = [col for col in campos if 'Visa' in col]

sumas = []

for m_col, v_col in zip(master, visa):
    # Crear la suma usando la función 'suma_sin_null'
    suma = f"\n    suma_sin_null({m_col}, {v_col}) as {m_col.replace('Master_', '')}_total"
    sumas.append(suma)

consulta_sql = f"""
select 
    numero_de_cliente,
    {', '.join(sumas)}
from competencia_01;
"""

print(consulta_sql)


select 
    numero_de_cliente,
    
    suma_sin_null(Master_delinquency, Visa_delinquency) as delinquency_total, 
    suma_sin_null(Master_status, Visa_status) as status_total, 
    suma_sin_null(Master_mfinanciacion_limite, Visa_mfinanciacion_limite) as mfinanciacion_limite_total, 
    suma_sin_null(Master_Fvencimiento, Visa_Fvencimiento) as Fvencimiento_total, 
    suma_sin_null(Master_msaldototal, Visa_msaldototal) as msaldototal_total, 
    suma_sin_null(Master_msaldopesos, Visa_msaldopesos) as msaldopesos_total, 
    suma_sin_null(Master_msaldodolares, Visa_msaldodolares) as msaldodolares_total, 
    suma_sin_null(Master_mconsumospesos, Visa_mconsumospesos) as mconsumospesos_total, 
    suma_sin_null(Master_mconsumosdolares, Visa_mconsumosdolares) as mconsumosdolares_total, 
    suma_sin_null(Master_mlimitecompra, Visa_mlimitecompra) as mlimitecompra_total, 
    suma_sin_null(Master_madelantopesos, Visa_madelantopesos) as madelantopesos_total, 
    suma_sin_null(Master_madelanto

In [11]:
%%sql
create or replace table competencia_02_DQ_agrupacion_var as
select
    *,
    suma_sin_null(Master_delinquency, Visa_delinquency) as delinquency_total,
    suma_sin_null(Master_status, Visa_status) as status_total,
    suma_sin_null(Master_mfinanciacion_limite, Visa_mfinanciacion_limite) as mfinanciacion_limite_total,
    suma_sin_null(Master_msaldototal, Visa_msaldototal) as msaldototal_total,
    suma_sin_null(Master_msaldopesos, Visa_msaldopesos) as msaldopesos_total,
    suma_sin_null(Master_msaldodolares, Visa_msaldodolares) as msaldodolares_total,
    suma_sin_null(Master_mconsumospesos, Visa_mconsumospesos) as mconsumospesos_total,
    suma_sin_null(Master_mconsumosdolares, Visa_mconsumosdolares) as mconsumosdolares_total,
    suma_sin_null(Master_mlimitecompra, Visa_mlimitecompra) as mlimitecompra_total,
    suma_sin_null(Master_madelantopesos, Visa_madelantopesos) as madelantopesos_total,
    suma_sin_null(Master_madelantodolares, Visa_madelantodolares) as madelantodolares_total,
    suma_sin_null(Master_mpagado, Visa_mpagado) as mpagado_total,
    suma_sin_null(Master_mpagospesos, Visa_mpagospesos) as mpagospesos_total,
    suma_sin_null(Master_mpagosdolares, Visa_mpagosdolares) as mpagosdolares_total,
    suma_sin_null(Master_mconsumototal, Visa_mconsumototal) as mconsumototal_total,
    suma_sin_null(Master_cconsumos, Visa_cconsumos) as cconsumos_total,
    suma_sin_null(Master_cadelantosefectivo, Visa_cadelantosefectivo) as cadelantosefectivo_total,
    suma_sin_null(Master_mpagominimo, Visa_mpagominimo) as mpagominimo_total,
    suma_sin_null(ctarjeta_visa, ctarjeta_master) as cant_tarjetas,
    suma_sin_null(ctarjeta_visa_transacciones, ctarjeta_master_transacciones) as cant_transac_tarjetas,
    suma_sin_null(mtarjeta_visa_consumo, mtarjeta_master_consumo) as mtarjetas_consumo,
    suma_sin_null(suma_sin_null(cseguro_vida, cseguro_auto), suma_sin_null(cseguro_vivienda, cseguro_accidentes_personales)) as c_seguros_total
from competencia_02_DQ

Unnamed: 0,Success


In [22]:
%%sql
select
    foto_mes
    , mfinanciacion_limite_total
from competencia_02_DQ_agrupacion_var
limit 5

Unnamed: 0,foto_mes,mfinanciacion_limite_total
0,201901,483906.16
1,201902,501305.87
2,201903,549818.98
3,201904,608083.2
4,201905,608083.2


In [13]:
149-46+22

125

In [15]:
columns_to_drop = [
    "Master_delinquency", "Visa_delinquency", "Master_status", "Visa_status",
    "Master_mfinanciacion_limite", "Visa_mfinanciacion_limite", "Master_msaldototal", "Visa_msaldototal",
    "Master_msaldopesos", "Visa_msaldopesos", "Master_msaldodolares", "Visa_msaldodolares",
    "Master_mconsumospesos", "Visa_mconsumospesos", "Master_mconsumosdolares", "Visa_mconsumosdolares",
    "Master_mlimitecompra", "Visa_mlimitecompra", "Master_madelantopesos", "Visa_madelantopesos",
    "Master_madelantodolares", "Visa_madelantodolares", "Master_mpagado", "Visa_mpagado",
    "Master_mpagospesos", "Visa_mpagospesos", "Master_mpagosdolares", "Visa_mpagosdolares",
    "Master_mconsumototal", "Visa_mconsumototal", "Master_cconsumos", "Visa_cconsumos",
    "Master_cadelantosefectivo", "Visa_cadelantosefectivo", "Master_mpagominimo", "Visa_mpagominimo",
    "ctarjeta_visa", "ctarjeta_master", "ctarjeta_visa_transacciones", "ctarjeta_master_transacciones",
    "mtarjeta_visa_consumo", "mtarjeta_master_consumo",
    "cseguro_vida", "cseguro_auto", "cseguro_vivienda", "cseguro_accidentes_personales"
]

In [16]:
len(columns_to_drop)

46

In [17]:
# Ejecutar cada consulta individualmente
for column in columns_to_drop:
    query = f"ALTER TABLE competencia_02_DQ_agrupacion_var DROP COLUMN {column};"
    get_ipython().run_line_magic('sql', query)
    print(f"Columna {column} eliminada.")


Columna Master_delinquency eliminada.
Columna Visa_delinquency eliminada.
Columna Master_status eliminada.
Columna Visa_status eliminada.
Columna Master_mfinanciacion_limite eliminada.
Columna Visa_mfinanciacion_limite eliminada.
Columna Master_msaldototal eliminada.
Columna Visa_msaldototal eliminada.
Columna Master_msaldopesos eliminada.
Columna Visa_msaldopesos eliminada.
Columna Master_msaldodolares eliminada.
Columna Visa_msaldodolares eliminada.
Columna Master_mconsumospesos eliminada.
Columna Visa_mconsumospesos eliminada.
Columna Master_mconsumosdolares eliminada.
Columna Visa_mconsumosdolares eliminada.
Columna Master_mlimitecompra eliminada.
Columna Visa_mlimitecompra eliminada.
Columna Master_madelantopesos eliminada.
Columna Visa_madelantopesos eliminada.
Columna Master_madelantodolares eliminada.
Columna Visa_madelantodolares eliminada.
Columna Master_mpagado eliminada.
Columna Visa_mpagado eliminada.
Columna Master_mpagospesos eliminada.
Columna Visa_mpagospesos eliminada

In [18]:
## guardo el df con la agrupacion de variables
dataset_output= dataset_path + 'competencia_02_DQ_agrupacion_var.parquet'

In [32]:
%%sql
SELECT COUNT(*) AS num_columnas
FROM pragma_table_info('competencia_02_DQ_agrupacion_var');

Unnamed: 0,num_columnas
0,126


In [33]:
%%sql
ROLLBACK;

Unnamed: 0,Success


In [34]:
%%sql
COPY competencia_02_DQ_agrupacion_var TO '{{dataset_output}}' (FORMAT PARQUET);

Unnamed: 0,Success


In [35]:
var_monetarias= ["mrentabilidad","mrentabilidad_annual","mcomisiones","mactivos_margen","mpasivos_margen","mcuenta_corriente",
               "mcaja_ahorro","mcaja_ahorro_adicional","mcaja_ahorro_dolares","mcuentas_saldo","mautoservicio","mtarjeta_visa_consumo",
               "mtarjeta_master_consumo","mtarjeta_master_consumo","mprestamos_prendarios","mprestamos_hipotecarios","mplazo_fijo_dolares",
               "mplazo_fijo_pesos","minversion1_pesos","minversion1_dolares","minversion2","mpayroll","mpayroll2","mcuenta_debitos_automaticos",
               "mttarjeta_visa_debitos_automaticos","mttarjeta_master_debitos_automaticos","mpagomiscuentas","mcajeros_propios_descuentos",
               "mtarjeta_visa_descuentos","mtarjeta_master_descuentos","mcomisiones_mantenimiento","mcomisiones_otras","mforex_buy",
               "mtransferencias_recibidas","mtransferencias_emitidas","mextraccion_autoservicio","mcheques_depositados","mcheques_emitidos",
               "mcheques_depositados_rechazados","mcheques_emitidos_rechazados","matm","matm_other","mfinanciacion_limite_total", 
                "msaldototal_total", "msaldopesos_total","msaldodolares_total", "mconsumospesos_total",
                "mconsumosdolares_total", "mlimitecompra_total", "madelantopesos_total", "madelantodolares_total", 
                "mpagado_total", "mpagospesos_total", "mpagosdolares_total", "mconsumototal_total", 
                "mpagominimo_total", "mtarjetas_consumo"]

## NTILE

In [36]:
nuevos_ratios = ""
for campo in var_monetarias:
    # Añadir las columnas de lag y delta que ya tienes
    nuevos_ratios += f"\n    , ntile(10) over (partition by foto_mes order by {campo}) as {campo}_cuantile"

consulta_sql_Ntile = f"""
CREATE OR REPLACE TABLE competencia_01_NTILE(10) AS
select
    numero_de_cliente,
    foto_mes
    {nuevos_ratios.strip()}
from competencia_01_DQ
"""

print(consulta_sql_Ntile)


CREATE OR REPLACE TABLE competencia_01_NTILE(10) AS
select
    numero_de_cliente,
    foto_mes
    , ntile(10) over (partition by foto_mes order by mrentabilidad) as mrentabilidad_cuantile
    , ntile(10) over (partition by foto_mes order by mrentabilidad_annual) as mrentabilidad_annual_cuantile
    , ntile(10) over (partition by foto_mes order by mcomisiones) as mcomisiones_cuantile
    , ntile(10) over (partition by foto_mes order by mactivos_margen) as mactivos_margen_cuantile
    , ntile(10) over (partition by foto_mes order by mpasivos_margen) as mpasivos_margen_cuantile
    , ntile(10) over (partition by foto_mes order by mcuenta_corriente) as mcuenta_corriente_cuantile
    , ntile(10) over (partition by foto_mes order by mcaja_ahorro) as mcaja_ahorro_cuantile
    , ntile(10) over (partition by foto_mes order by mcaja_ahorro_adicional) as mcaja_ahorro_adicional_cuantile
    , ntile(10) over (partition by foto_mes order by mcaja_ahorro_dolares) as mcaja_ahorro_dolares_cuantile
  

In [35]:
%%sql
CREATE OR REPLACE TABLE competencia_01_NTILE_10 AS
select
    numero_de_cliente,
    foto_mes
    , ntile(10) over (partition by foto_mes order by mpayroll) as mpayroll_cuantile
    , ntile(10) over (partition by foto_mes order by ctrx_quarter) as ctrx_quarter_cuantile
    , ntile(10) over (partition by foto_mes order by mprestamos_personales) as mprestamos_personales_cuantile
    , ntile(10) over (partition by foto_mes order by mactivos_margen) as mactivos_margen_cuantile
    , ntile(10) over (partition by foto_mes order by mpasivos_margen) as mpasivos_margen_cuantile
    , ntile(10) over (partition by foto_mes order by mcuentas_saldo) as mcuentas_saldo_cuantile
    , ntile(10) over (partition by foto_mes order by mcaja_ahorro) as mcaja_ahorro_cuantile
    , ntile(10) over (partition by foto_mes order by mcomisiones_mantenimiento) as mcomisiones_mantenimiento_cuantile
    , ntile(10) over (partition by foto_mes order by internet) as internet_cuantile
    , ntile(10) over (partition by foto_mes order by tcallcenter) as tcallcenter_cuantile
    , ntile(10) over (partition by foto_mes order by mpagomiscuentas) as mpagomiscuentas_cuantile
    , ntile(10) over (partition by foto_mes order by ccaja_ahorro) as ccaja_ahorro_cuantile
    , ntile(10) over (partition by foto_mes order by ccomisiones_mantenimiento) as ccomisiones_mantenimiento_cuantile
from competencia_01_DQ
where foto_mes =202104 or foto_mes =202106

Unnamed: 0,Success


## Lag delta r_diff

In [55]:
nuevos_campos = ""

for campo in campos:
    nuevos_campos += f"""
    , {campo}
    , LAG({campo}, 1) OVER (PARTITION BY numero_de_cliente ORDER BY foto_mes) AS lag_1_{campo}
    , LAG({campo}, 2) OVER (PARTITION BY numero_de_cliente ORDER BY foto_mes) AS lag_2_{campo}
    , LAG({campo}, 3) OVER (PARTITION BY numero_de_cliente ORDER BY foto_mes) AS lag_3_{campo}
    , {campo} - LAG({campo}, 1) OVER (PARTITION BY numero_de_cliente ORDER BY foto_mes) AS dif1_{campo}
    , ({campo} - LAG({campo}, 1) OVER (PARTITION BY numero_de_cliente ORDER BY foto_mes)) - LAG({campo} - LAG({campo}, 1) OVER (PARTITION BY numero_de_cliente ORDER BY foto_mes), 1) OVER (PARTITION BY numero_de_cliente ORDER BY foto_mes) AS dif2_{campo}
    , AVG({campo}) OVER (PARTITION BY numero_de_cliente ORDER BY foto_mes ROWS BETWEEN 4 PRECEDING AND CURRENT ROW) AS {campo}_media_5
    , LAG(AVG({campo}) OVER (PARTITION BY numero_de_cliente ORDER BY foto_mes ROWS BETWEEN 4 PRECEDING AND CURRENT ROW), 1) OVER (PARTITION BY numero_de_cliente ORDER BY foto_mes) AS lagged_{campo}_media_5
    """

consulta_sql = f"""
CREATE OR REPLACE TABLE competencia_01_features AS
SELECT
    numero_de_cliente,
    foto_mes
    {nuevos_campos}
FROM competencia_01;
"""

print(consulta_sql)


CREATE OR REPLACE TABLE competencia_01_lag_delta_ratios AS
select
    numero_de_cliente,
    foto_mes
    , lag(ctrx_quarter, 1) over (partition by numero_de_cliente order by foto_mes) as lag_1_ctrx_quarter
    , ctrx_quarter - lag_1_ctrx_quarter as delta_1_ctrx_quarter
    , lag(ctrx_quarter, 2) over (partition by numero_de_cliente order by foto_mes) as lag_2_ctrx_quarter
    , ctrx_quarter - lag_2_ctrx_quarter as delta_2_ctrx_quarter
    , div_sin_cero(delta_1_ctrx_quarter,delta_2_ctrx_quarter) as ratio_ctrx_quarter
    , lag(cpayroll_trx, 1) over (partition by numero_de_cliente order by foto_mes) as lag_1_cpayroll_trx
    , cpayroll_trx - lag_1_cpayroll_trx as delta_1_cpayroll_trx
    , lag(cpayroll_trx, 2) over (partition by numero_de_cliente order by foto_mes) as lag_2_cpayroll_trx
    , cpayroll_trx - lag_2_cpayroll_trx as delta_2_cpayroll_trx
    , div_sin_cero(delta_1_cpayroll_trx,delta_2_cpayroll_trx) as ratio_cpayroll_trx
    , lag(mpayroll, 1) over (partition by numero_de_c

In [37]:
# %%sql
# CREATE OR REPLACE TABLE competencia_01_lag_delta_ratios AS
# select
#     numero_de_cliente,
#     foto_mes
#     , lag(ctrx_quarter, 1) over (partition by numero_de_cliente order by foto_mes) as lag_1_ctrx_quarter
#     , ctrx_quarter - lag_1_ctrx_quarter as delta_1_ctrx_quarter
#     , lag(ctrx_quarter, 2) over (partition by numero_de_cliente order by foto_mes) as lag_2_ctrx_quarter
#     , ctrx_quarter - lag_2_ctrx_quarter as delta_2_ctrx_quarter
#     , div_sin_cero(delta_1_ctrx_quarter,delta_2_ctrx_quarter) as ratio_ctrx_quarter
#     , lag(cpayroll_trx, 1) over (partition by numero_de_cliente order by foto_mes) as lag_1_cpayroll_trx
#     , cpayroll_trx - lag_1_cpayroll_trx as delta_1_cpayroll_trx
#     , lag(cpayroll_trx, 2) over (partition by numero_de_cliente order by foto_mes) as lag_2_cpayroll_trx
#     , cpayroll_trx - lag_2_cpayroll_trx as delta_2_cpayroll_trx
#     , div_sin_cero(delta_1_cpayroll_trx,delta_2_cpayroll_trx) as ratio_cpayroll_trx
#     , lag(mpayroll, 1) over (partition by numero_de_cliente order by foto_mes) as lag_1_mpayroll
#     , mpayroll - lag_1_mpayroll as delta_1_mpayroll
#     , lag(mpayroll, 2) over (partition by numero_de_cliente order by foto_mes) as lag_2_mpayroll
#     , mpayroll - lag_2_mpayroll as delta_2_mpayroll
#     , div_sin_cero(delta_1_mpayroll,delta_2_mpayroll) as ratio_mpayroll
#     , lag(mpayroll2, 1) over (partition by numero_de_cliente order by foto_mes) as lag_1_mpayroll2
#     , mpayroll2 - lag_1_mpayroll2 as delta_1_mpayroll2
#     , lag(mpayroll2, 2) over (partition by numero_de_cliente order by foto_mes) as lag_2_mpayroll2
#     , mpayroll2 - lag_2_mpayroll2 as delta_2_mpayroll2
#     , div_sin_cero(delta_1_mpayroll2,delta_2_mpayroll2) as ratio_mpayroll2
#     , lag(cpayroll2_trx, 1) over (partition by numero_de_cliente order by foto_mes) as lag_1_cpayroll2_trx
#     , cpayroll2_trx - lag_1_cpayroll2_trx as delta_1_cpayroll2_trx
#     , lag(cpayroll2_trx, 2) over (partition by numero_de_cliente order by foto_mes) as lag_2_cpayroll2_trx
#     , cpayroll2_trx - lag_2_cpayroll2_trx as delta_2_cpayroll2_trx
#     , div_sin_cero(delta_1_cpayroll2_trx,delta_2_cpayroll2_trx) as ratio_cpayroll2_trx
#     , lag(mpasivos_margen, 1) over (partition by numero_de_cliente order by foto_mes) as lag_1_mpasivos_margen
#     , mpasivos_margen - lag_1_mpasivos_margen as delta_1_mpasivos_margen
#     , lag(mpasivos_margen, 2) over (partition by numero_de_cliente order by foto_mes) as lag_2_mpasivos_margen
#     , mpasivos_margen - lag_2_mpasivos_margen as delta_2_mpasivos_margen
#     , div_sin_cero(delta_1_mpasivos_margen,delta_2_mpasivos_margen) as ratio_mpasivos_margen
#     , lag(mprestamos_personales, 1) over (partition by numero_de_cliente order by foto_mes) as lag_1_mprestamos_personales
#     , mprestamos_personales - lag_1_mprestamos_personales as delta_1_mprestamos_personales
#     , lag(mprestamos_personales, 2) over (partition by numero_de_cliente order by foto_mes) as lag_2_mprestamos_personales
#     , mprestamos_personales - lag_2_mprestamos_personales as delta_2_mprestamos_personales
#     , div_sin_cero(delta_1_mprestamos_personales,delta_2_mprestamos_personales) as ratio_mprestamos_personales
#     , lag(mcuentas_saldo, 1) over (partition by numero_de_cliente order by foto_mes) as lag_1_mcuentas_saldo
#     , mcuentas_saldo - lag_1_mcuentas_saldo as delta_1_mcuentas_saldo
#     , lag(mcuentas_saldo, 2) over (partition by numero_de_cliente order by foto_mes) as lag_2_mcuentas_saldo
#     , mcuentas_saldo - lag_2_mcuentas_saldo as delta_2_mcuentas_saldo
#     , div_sin_cero(delta_1_mcuentas_saldo,delta_2_mcuentas_saldo) as ratio_mcuentas_saldo
#     , lag(mcaja_ahorro, 1) over (partition by numero_de_cliente order by foto_mes) as lag_1_mcaja_ahorro
#     , mcaja_ahorro - lag_1_mcaja_ahorro as delta_1_mcaja_ahorro
#     , lag(mcaja_ahorro, 2) over (partition by numero_de_cliente order by foto_mes) as lag_2_mcaja_ahorro
#     , mcaja_ahorro - lag_2_mcaja_ahorro as delta_2_mcaja_ahorro
#     , div_sin_cero(delta_1_mcaja_ahorro,delta_2_mcaja_ahorro) as ratio_mcaja_ahorro
#     , lag(mcomisiones_mantenimiento, 1) over (partition by numero_de_cliente order by foto_mes) as lag_1_mcomisiones_mantenimiento
#     , mcomisiones_mantenimiento - lag_1_mcomisiones_mantenimiento as delta_1_mcomisiones_mantenimiento
#     , lag(mcomisiones_mantenimiento, 2) over (partition by numero_de_cliente order by foto_mes) as lag_2_mcomisiones_mantenimiento
#     , mcomisiones_mantenimiento - lag_2_mcomisiones_mantenimiento as delta_2_mcomisiones_mantenimiento
#     , div_sin_cero(delta_1_mcomisiones_mantenimiento,delta_2_mcomisiones_mantenimiento) as ratio_mcomisiones_mantenimiento
#     , lag(cproductos, 1) over (partition by numero_de_cliente order by foto_mes) as lag_1_cproductos
#     , cproductos - lag_1_cproductos as delta_1_cproductos
#     , lag(cproductos, 2) over (partition by numero_de_cliente order by foto_mes) as lag_2_cproductos
#     , cproductos - lag_2_cproductos as delta_2_cproductos
#     , div_sin_cero(delta_1_cproductos,delta_2_cproductos) as ratio_cproductos
#     , lag(cdescubierto_preacordado, 1) over (partition by numero_de_cliente order by foto_mes) as lag_1_cdescubierto_preacordado
#     , cdescubierto_preacordado - lag_1_cdescubierto_preacordado as delta_1_cdescubierto_preacordado
#     , lag(cdescubierto_preacordado, 2) over (partition by numero_de_cliente order by foto_mes) as lag_2_cdescubierto_preacordado
#     , cdescubierto_preacordado - lag_2_cdescubierto_preacordado as delta_2_cdescubierto_preacordado
#     , div_sin_cero(delta_1_cdescubierto_preacordado,delta_2_cdescubierto_preacordado) as ratio_cdescubierto_preacordado
#     , lag(mtarjeta_visa_consumo, 1) over (partition by numero_de_cliente order by foto_mes) as lag_1_mtarjeta_visa_consumo
#     , mtarjeta_visa_consumo - lag_1_mtarjeta_visa_consumo as delta_1_mtarjeta_visa_consumo
#     , lag(mtarjeta_visa_consumo, 2) over (partition by numero_de_cliente order by foto_mes) as lag_2_mtarjeta_visa_consumo
#     , mtarjeta_visa_consumo - lag_2_mtarjeta_visa_consumo as delta_2_mtarjeta_visa_consumo
#     , div_sin_cero(delta_1_mtarjeta_visa_consumo,delta_2_mtarjeta_visa_consumo) as ratio_mtarjeta_visa_consumo
#     , lag(cprestamos_personales, 1) over (partition by numero_de_cliente order by foto_mes) as lag_1_cprestamos_personales
#     , cprestamos_personales - lag_1_cprestamos_personales as delta_1_cprestamos_personales
#     , lag(cprestamos_personales, 2) over (partition by numero_de_cliente order by foto_mes) as lag_2_cprestamos_personales
#     , cprestamos_personales - lag_2_cprestamos_personales as delta_2_cprestamos_personales
#     , div_sin_cero(delta_1_cprestamos_personales,delta_2_cprestamos_personales) as ratio_cprestamos_personales
#     , lag(mcuenta_corriente, 1) over (partition by numero_de_cliente order by foto_mes) as lag_1_mcuenta_corriente
#     , mcuenta_corriente - lag_1_mcuenta_corriente as delta_1_mcuenta_corriente
#     , lag(mcuenta_corriente, 2) over (partition by numero_de_cliente order by foto_mes) as lag_2_mcuenta_corriente
#     , mcuenta_corriente - lag_2_mcuenta_corriente as delta_2_mcuenta_corriente
#     , div_sin_cero(delta_1_mcuenta_corriente,delta_2_mcuenta_corriente) as ratio_mcuenta_corriente
# from competencia_01
# where foto_mes =202104 or foto_mes =202106



Unnamed: 0,Success


In [36]:
# %%sql
# select *
# from competencia_01_lag_delta_ratios
# where foto_mes =202104 or foto_mes =202106


Unnamed: 0,numero_de_cliente,foto_mes,lag_1_ctrx_quarter,delta_1_ctrx_quarter,lag_2_ctrx_quarter,delta_2_ctrx_quarter,ratio_ctrx_quarter,lag_1_cpayroll_trx,delta_1_cpayroll_trx,lag_2_cpayroll_trx,...,lag_1_cprestamos_personales,delta_1_cprestamos_personales,lag_2_cprestamos_personales,delta_2_cprestamos_personales,ratio_cprestamos_personales,lag_1_mcuenta_corriente,delta_1_mcuenta_corriente,lag_2_mcuenta_corriente,delta_2_mcuenta_corriente,ratio_mcuenta_corriente
0,249221109,202104,174.0,14.0,161.0,27.0,0.518519,2.0,-1.0,1.0,...,0.0,0.0,0.0,0.0,,-485.88,485.88,0.00,0.00,
1,249278864,202104,110.0,0.0,,,,1.0,0.0,,...,0.0,0.0,,,,0.00,0.00,,,
2,249296729,202104,63.0,8.0,55.0,16.0,0.500000,2.0,-1.0,0.0,...,0.0,0.0,0.0,0.0,,2990.46,-2990.46,3196.71,-3196.71,0.935481
3,249470734,202104,116.0,-4.0,112.0,0.0,,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,,0.00,0.00,0.00,0.00,
4,249590846,202104,0.0,0.0,0.0,0.0,,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,,0.00,0.00,0.00,0.00,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
328961,1588492803,202104,9.0,39.0,,,,0.0,0.0,,...,0.0,0.0,,,,0.00,0.00,,,
328962,1588725801,202104,,,,,,,,,...,,,,,,,,,,
328963,1588924016,202104,2.0,8.0,,,,0.0,2.0,,...,0.0,0.0,,,,0.00,-1416.04,,,
328964,1591753879,202104,,,,,,,,,...,,,,,,,,,,


## Ahora voy con el promedio de cada campo, haciendo dev

In [38]:
nuevos_features = ""
for campo in campos:
    # Calcular el promedio móvil de los últimos 3 meses
    nuevos_features += f"\n    , avg({campo}) over (partition by numero_de_cliente order by foto_mes rows between 3 preceding and current row) as avg_3_{campo}"
    
    # Calcular la desviación del valor actual con respecto al promedio móvil
    nuevos_features += f"\n    , {campo} - avg_3_{campo} as dev_3_{campo}"

# Construir la consulta SQL
consulta_sql = f"""
CREATE OR REPLACE TABLE competencia_01_DQ_avg AS
select
    numero_de_cliente
    , foto_mes 
    {nuevos_features.strip()}
FROM competencia_01_DQ
where foto_mes =202104 or foto_mes =202106
"""

print(consulta_sql)


CREATE OR REPLACE TABLE competencia_01_DQ_avg AS
select
    numero_de_cliente
    , foto_mes 
    , avg(mpayroll) over (partition by numero_de_cliente order by foto_mes rows between 3 preceding and current row) as avg_3_mpayroll
    , mpayroll - avg_3_mpayroll as dev_3_mpayroll
    , avg(ctrx_quarter) over (partition by numero_de_cliente order by foto_mes rows between 3 preceding and current row) as avg_3_ctrx_quarter
    , ctrx_quarter - avg_3_ctrx_quarter as dev_3_ctrx_quarter
    , avg(mprestamos_personales) over (partition by numero_de_cliente order by foto_mes rows between 3 preceding and current row) as avg_3_mprestamos_personales
    , mprestamos_personales - avg_3_mprestamos_personales as dev_3_mprestamos_personales
    , avg(mactivos_margen) over (partition by numero_de_cliente order by foto_mes rows between 3 preceding and current row) as avg_3_mactivos_margen
    , mactivos_margen - avg_3_mactivos_margen as dev_3_mactivos_margen
    , avg(mpasivos_margen) over (partition by

In [39]:
%%sql

CREATE OR REPLACE TABLE competencia_01_DQ_avg AS
select
    numero_de_cliente
    , foto_mes 
    , avg(mpayroll) over (partition by numero_de_cliente order by foto_mes rows between 3 preceding and current row) as avg_3_mpayroll
    , mpayroll - avg_3_mpayroll as dev_3_mpayroll
    , avg(ctrx_quarter) over (partition by numero_de_cliente order by foto_mes rows between 3 preceding and current row) as avg_3_ctrx_quarter
    , ctrx_quarter - avg_3_ctrx_quarter as dev_3_ctrx_quarter
    , avg(mprestamos_personales) over (partition by numero_de_cliente order by foto_mes rows between 3 preceding and current row) as avg_3_mprestamos_personales
    , mprestamos_personales - avg_3_mprestamos_personales as dev_3_mprestamos_personales
    , avg(mactivos_margen) over (partition by numero_de_cliente order by foto_mes rows between 3 preceding and current row) as avg_3_mactivos_margen
    , mactivos_margen - avg_3_mactivos_margen as dev_3_mactivos_margen
    , avg(mpasivos_margen) over (partition by numero_de_cliente order by foto_mes rows between 3 preceding and current row) as avg_3_mpasivos_margen
    , mpasivos_margen - avg_3_mpasivos_margen as dev_3_mpasivos_margen
    , avg(mcuentas_saldo) over (partition by numero_de_cliente order by foto_mes rows between 3 preceding and current row) as avg_3_mcuentas_saldo
    , mcuentas_saldo - avg_3_mcuentas_saldo as dev_3_mcuentas_saldo
    , avg(mcaja_ahorro) over (partition by numero_de_cliente order by foto_mes rows between 3 preceding and current row) as avg_3_mcaja_ahorro
    , mcaja_ahorro - avg_3_mcaja_ahorro as dev_3_mcaja_ahorro
    , avg(internet) over (partition by numero_de_cliente order by foto_mes rows between 3 preceding and current row) as avg_3_internet
    , internet - avg_3_internet as dev_3_internet
    , avg(tcallcenter) over (partition by numero_de_cliente order by foto_mes rows between 3 preceding and current row) as avg_3_tcallcenter
    , tcallcenter - avg_3_tcallcenter as dev_3_tcallcenter
    , avg(mpagomiscuentas) over (partition by numero_de_cliente order by foto_mes rows between 3 preceding and current row) as avg_3_mpagomiscuentas
    , mpagomiscuentas - avg_3_mpagomiscuentas as dev_3_mpagomiscuentas
    , avg(ccaja_ahorro) over (partition by numero_de_cliente order by foto_mes rows between 3 preceding and current row) as avg_3_ccaja_ahorro
    , ccaja_ahorro - avg_3_ccaja_ahorro as dev_3_ccaja_ahorro
    , avg(mcomisiones_mantenimiento) over (partition by numero_de_cliente order by foto_mes rows between 3 preceding and current row) as avg_3_mcomisiones_mantenimiento
    , mcomisiones_mantenimiento - avg_3_mcomisiones_mantenimiento as dev_3_mcomisiones_mantenimiento
    , avg(ccomisiones_mantenimiento) over (partition by numero_de_cliente order by foto_mes rows between 3 preceding and current row) as avg_3_ccomisiones_mantenimiento
    , ccomisiones_mantenimiento - avg_3_ccomisiones_mantenimiento as dev_3_ccomisiones_mantenimiento
FROM competencia_01_DQ
where foto_mes =202104 or foto_mes =202106




Unnamed: 0,Success


## reg_slope

In [41]:
nuevos_features = ""
for campo in campos:
  nuevos_features +=  f"\n    ,regr_slope({campo}, cliente_antiguedad) over ventana_3 as {campo}_slope_3"


consulta_sql = f"""
CREATE OR REPLACE TABLE competencia_01_regr_slope AS
select
    numero_de_cliente
    , foto_mes 
    {nuevos_features.strip()}
from competencia_01
window ventana_3 as (partition by numero_de_cliente order by foto_mes rows between 3 preceding and current row)
"""

print(consulta_sql)


CREATE OR REPLACE TABLE competencia_01_regr_slope AS
select
    numero_de_cliente
    , foto_mes 
    ,regr_slope(mpayroll, cliente_antiguedad) over ventana_3 as mpayroll_slope_3
    ,regr_slope(ctrx_quarter, cliente_antiguedad) over ventana_3 as ctrx_quarter_slope_3
    ,regr_slope(mprestamos_personales, cliente_antiguedad) over ventana_3 as mprestamos_personales_slope_3
    ,regr_slope(mactivos_margen, cliente_antiguedad) over ventana_3 as mactivos_margen_slope_3
    ,regr_slope(mpasivos_margen, cliente_antiguedad) over ventana_3 as mpasivos_margen_slope_3
    ,regr_slope(mcuentas_saldo, cliente_antiguedad) over ventana_3 as mcuentas_saldo_slope_3
    ,regr_slope(mcaja_ahorro, cliente_antiguedad) over ventana_3 as mcaja_ahorro_slope_3
    ,regr_slope(internet, cliente_antiguedad) over ventana_3 as internet_slope_3
    ,regr_slope(tcallcenter, cliente_antiguedad) over ventana_3 as tcallcenter_slope_3
    ,regr_slope(mpagomiscuentas, cliente_antiguedad) over ventana_3 as mpagomiscuent

In [44]:
%%sql
CREATE OR REPLACE TABLE competencia_01_DQ_regr_slope AS
select
    numero_de_cliente
    ,foto_mes 
    ,regr_slope(mpayroll, cliente_antiguedad) over ventana_3 as mpayroll_slope_3
    ,regr_slope(ctrx_quarter, cliente_antiguedad) over ventana_3 as ctrx_quarter_slope_3
    ,regr_slope(mprestamos_personales, cliente_antiguedad) over ventana_3 as mprestamos_personales_slope_3
    ,regr_slope(mactivos_margen, cliente_antiguedad) over ventana_3 as mactivos_margen_slope_3
    ,regr_slope(mpasivos_margen, cliente_antiguedad) over ventana_3 as mpasivos_margen_slope_3
    ,regr_slope(mcuentas_saldo, cliente_antiguedad) over ventana_3 as mcuentas_saldo_slope_3
    ,regr_slope(mcaja_ahorro, cliente_antiguedad) over ventana_3 as mcaja_ahorro_slope_3
    ,regr_slope(internet, cliente_antiguedad) over ventana_3 as internet_slope_3
    ,regr_slope(tcallcenter, cliente_antiguedad) over ventana_3 as tcallcenter_slope_3
    ,regr_slope(mpagomiscuentas, cliente_antiguedad) over ventana_3 as mpagomiscuentas_slope_3
    ,regr_slope(ccaja_ahorro, cliente_antiguedad) over ventana_3 as ccaja_ahorro_slope_3
    ,regr_slope(mcomisiones_mantenimiento, cliente_antiguedad) over ventana_3 as mcomisiones_mantenimiento_slope_3
    ,regr_slope(ccomisiones_mantenimiento, cliente_antiguedad) over ventana_3 as ccomisiones_mantenimiento_slope_3
from competencia_01_DQ
window ventana_3 as (partition by numero_de_cliente order by foto_mes rows between 3 preceding and current row)


Unnamed: 0,Success


In [45]:
%%sql
select 
    *
from competencia_01_DQ_regr_slope

Unnamed: 0,numero_de_cliente,foto_mes,mpayroll_slope_3,ctrx_quarter_slope_3,mprestamos_personales_slope_3,mactivos_margen_slope_3,mpasivos_margen_slope_3,mcuentas_saldo_slope_3,mcaja_ahorro_slope_3,internet_slope_3,tcallcenter_slope_3,mpagomiscuentas_slope_3,ccaja_ahorro_slope_3,mcomisiones_mantenimiento_slope_3,ccomisiones_mantenimiento_slope_3
0,249223005,202101,,,,,,,,,,,,,
1,249223005,202102,16316.430,26.0,0.0,958.030,-160.780,10385.810,-5887.650,0.0,0.0,0.0,0.0,0.0,0.0
2,249223005,202103,5164.135,9.5,0.0,315.990,-62.735,-2649.690,-2105.080,0.0,0.5,0.0,0.0,0.0,0.0
3,249223005,202104,8159.975,2.9,0.0,202.097,-86.253,815.881,-106.823,0.0,0.1,0.0,0.0,0.0,0.0
4,249223005,202105,6988.969,-11.8,0.0,-57.591,84.583,-3422.456,8390.396,0.0,-0.1,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
981941,1541018080,202104,-443.596,-21.4,0.0,-76.472,-1032.619,22200.516,4729.535,0.0,0.0,0.0,0.0,0.0,0.0
981942,1541018080,202105,658.243,-16.9,0.0,296.678,-501.846,15068.193,4163.844,0.0,0.0,0.0,0.0,0.0,0.0
981943,1541018080,202106,24321.358,4.7,0.0,516.806,400.665,36037.387,28275.512,0.0,0.0,0.0,0.0,0.0,0.0
981944,1542134813,202101,,,,,,,,,,,,,


In [46]:
nuevos_features = ""
for campo in campos:
  nuevos_features +=  f"\n    ,percent_rank() OVER (ORDER BY {campo}) as percent_rank_{campo}"


consulta_sql = f"""
CREATE OR REPLACE TABLE competencia_01_percent_rank AS
select
    numero_de_cliente
    , foto_mes 
    {nuevos_features.strip()}
from competencia_01

"""

print(consulta_sql)


CREATE OR REPLACE TABLE competencia_01_percent_rank AS
select
    numero_de_cliente
    , foto_mes 
    ,percent_rank() OVER (ORDER BY mpayroll) as percent_rank_mpayroll
    ,percent_rank() OVER (ORDER BY ctrx_quarter) as percent_rank_ctrx_quarter
    ,percent_rank() OVER (ORDER BY mprestamos_personales) as percent_rank_mprestamos_personales
    ,percent_rank() OVER (ORDER BY mactivos_margen) as percent_rank_mactivos_margen
    ,percent_rank() OVER (ORDER BY mpasivos_margen) as percent_rank_mpasivos_margen
    ,percent_rank() OVER (ORDER BY mcuentas_saldo) as percent_rank_mcuentas_saldo
    ,percent_rank() OVER (ORDER BY mcaja_ahorro) as percent_rank_mcaja_ahorro
    ,percent_rank() OVER (ORDER BY internet) as percent_rank_internet
    ,percent_rank() OVER (ORDER BY tcallcenter) as percent_rank_tcallcenter
    ,percent_rank() OVER (ORDER BY mpagomiscuentas) as percent_rank_mpagomiscuentas
    ,percent_rank() OVER (ORDER BY ccaja_ahorro) as percent_rank_ccaja_ahorro
    ,percent_rank()

In [47]:
%%sql

CREATE OR REPLACE TABLE competencia_01_DQ_percent_rank AS
select
    numero_de_cliente
    , foto_mes 
    ,percent_rank() OVER (ORDER BY mpayroll) as percent_rank_mpayroll
    ,percent_rank() OVER (ORDER BY ctrx_quarter) as percent_rank_ctrx_quarter
    ,percent_rank() OVER (ORDER BY mprestamos_personales) as percent_rank_mprestamos_personales
    ,percent_rank() OVER (ORDER BY mactivos_margen) as percent_rank_mactivos_margen
    ,percent_rank() OVER (ORDER BY mpasivos_margen) as percent_rank_mpasivos_margen
    ,percent_rank() OVER (ORDER BY mcuentas_saldo) as percent_rank_mcuentas_saldo
    ,percent_rank() OVER (ORDER BY mcaja_ahorro) as percent_rank_mcaja_ahorro
    ,percent_rank() OVER (ORDER BY internet) as percent_rank_internet
    ,percent_rank() OVER (ORDER BY tcallcenter) as percent_rank_tcallcenter
    ,percent_rank() OVER (ORDER BY mpagomiscuentas) as percent_rank_mpagomiscuentas
    ,percent_rank() OVER (ORDER BY ccaja_ahorro) as percent_rank_ccaja_ahorro
    ,percent_rank() OVER (ORDER BY mcomisiones_mantenimiento) as percent_rank_mcomisiones_mantenimiento
    ,percent_rank() OVER (ORDER BY ccomisiones_mantenimiento) as percent_rank_ccomisiones_mantenimiento
from competencia_01_DQ
WHERE foto_mes = 202104 OR foto_mes = 202106

Unnamed: 0,Success


In [49]:
%%sql 
create or replace table competencia_01_DQ_fe as
SELECT * 
FROM (
    SELECT * 
    FROM competencia_01_DQ
    WHERE foto_mes = 202104 OR foto_mes = 202106
) AS base

LEFT JOIN competencia_01_NTILE_10 as ntile_10
ON base.numero_de_cliente = ntile_10.numero_de_cliente 
    AND base.foto_mes = ntile_10.foto_mes  

LEFT JOIN competencia_01_NTILE_20 as ntile_20
ON ntile_10.numero_de_cliente = ntile_20.numero_de_cliente 
    AND ntile_10.foto_mes = ntile_20.foto_mes  

LEFT JOIN competencia_01_NTILE_30 as ntile_30
ON ntile_20.numero_de_cliente = ntile_30.numero_de_cliente 
    AND ntile_20.foto_mes = ntile_30.foto_mes  

LEFT JOIN competencia_01_DQ_avg as avg 
ON ntile_30.numero_de_cliente = avg.numero_de_cliente 
    AND ntile_30.foto_mes = avg.foto_mes  
    
LEFT JOIN competencia_01_DQ_regr_slope as reg 
ON avg.numero_de_cliente = reg.numero_de_cliente 
    AND avg.foto_mes = reg.foto_mes

LEFT JOIN competencia_01_DQ_percent_rank as p_rank
ON reg.numero_de_cliente = p_rank.numero_de_cliente 
    AND reg.foto_mes = p_rank.foto_mes

Unnamed: 0,Success


In [50]:
%%sql
COPY competencia_01_DQ_fe TO '../../../datasets/competencia_01_DQ_fe_2.csv' (FORMAT CSV, HEADER TRUE)

Unnamed: 0,Success


In [52]:
%%sql
select 
    *
from competencia_01_DQ_fe
limit 4

Unnamed: 0,numero_de_cliente,foto_mes,active_quarter,cliente_vip,internet,cliente_edad,cliente_antiguedad,mrentabilidad,mrentabilidad_annual,mcomisiones,...,percent_rank_mactivos_margen,percent_rank_mpasivos_margen,percent_rank_mcuentas_saldo,percent_rank_mcaja_ahorro,percent_rank_internet,percent_rank_tcallcenter,percent_rank_mpagomiscuentas,percent_rank_ccaja_ahorro,percent_rank_mcomisiones_mantenimiento,percent_rank_ccomisiones_mantenimiento
0,345821831,202104,1,0,0,50,259,3164.74,143431.55,359.89,...,0.849902,0.805329,0.607411,0.842248,0.0,0.0,0.0,0.0,0.713486,0.669144
1,345821831,202106,1,0,0,50,261,4650.94,130432.35,2305.66,...,0.817215,0.783053,0.444047,0.813615,0.0,0.0,0.0,0.0,0.856909,0.669144
2,345847792,202104,1,0,0,54,89,3762.77,35013.28,4724.67,...,0.287359,0.562184,0.01768,8.8e-05,0.0,0.0,0.0,0.0,0.960418,0.669144
3,345847792,202106,1,0,0,54,91,3541.68,40052.31,4201.19,...,0.306093,0.60577,0.017558,8.8e-05,0.0,0.0,0.0,0.0,0.960418,0.669144
