# Crear la clase  Target



In [1]:
import duckdb
import pandas as pd
import os
import time

In [2]:
# Ruta absoluta al dataset crudo comprimido
DATASET_PATH = "/home/sanmartinofacundo/datasets/competencia_02_crudo.csv.gz"

In [3]:
conn = duckdb.connect(database=':memory:')

conn.execute(f"""
    CREATE TABLE competencia_02_crudo AS
    SELECT * FROM read_csv_auto('{DATASET_PATH}')
""")
print("✅ Tabla creada en memoria")

FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

✅ Tabla creada en memoria


In [4]:
start = time.time()
# Crear tabla etiquetada con la misma lógica SQL
conn.execute("""
    CREATE OR REPLACE TABLE competencia_02 AS
    WITH datos AS (
        SELECT *,
            LEAD(foto_mes, 1) OVER(PARTITION BY numero_de_cliente ORDER BY foto_mes) AS foto_mes_mas1,
            LEAD(foto_mes, 2) OVER(PARTITION BY numero_de_cliente ORDER BY foto_mes) AS foto_mes_mas2
        FROM competencia_02_crudo
    ),
    maximo_mes AS (
        SELECT MAX(foto_mes) AS maximo_foto_mes FROM competencia_02_crudo
    ),
    etiquetado AS (
        SELECT
            d.*,
            CASE
                WHEN d.foto_mes = m.maximo_foto_mes THEN NULL
                WHEN d.foto_mes_mas1 IS NULL THEN 'BAJA+1'
                WHEN d.foto_mes_mas2 IS NULL
                     AND d.foto_mes <> m.maximo_foto_mes
                     AND d.foto_mes <> m.maximo_foto_mes - 1 THEN 'BAJA+2'
                ELSE 'CONTINUA'
            END AS clase_ternaria
        FROM datos d
        CROSS JOIN maximo_mes m
    )
    SELECT * EXCLUDE (foto_mes_mas1, foto_mes_mas2)
    FROM etiquetado;
""")

end = time.time()
print(f"✅ Tabla 'competencia_02' creada en memoria en {end - start:.2f} segundos")

FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

✅ Tabla 'competencia_02' creada en memoria en 6.14 segundos


In [5]:
# Ejecutar la consulta SQL en DuckDB y obtener un DataFrame
df_resumen = conn.execute("""
    SELECT 
        foto_mes, 
        clase_ternaria, 
        COUNT(*) AS cantidad
    FROM competencia_02
    GROUP BY foto_mes, clase_ternaria
    ORDER BY foto_mes, clase_ternaria
""").df()

# Mostrar resultado
df_resumen.head(15)


Unnamed: 0,foto_mes,clase_ternaria,cantidad
0,201901,BAJA+1,635
1,201901,BAJA+2,720
2,201901,CONTINUA,122918
3,201902,BAJA+1,723
4,201902,BAJA+2,693
5,201902,CONTINUA,123985
6,201903,BAJA+1,694
7,201903,BAJA+2,738
8,201903,CONTINUA,124535
9,201904,BAJA+1,743


In [6]:
# Ejecutar la consulta PIVOT en DuckDB y obtener un DataFrame
df_pivot = conn.execute("""
    PIVOT competencia_02
    ON clase_ternaria
    USING count(numero_de_cliente)
    GROUP BY foto_mes
""").df()

# Mostrar resultado
df_pivot.head(36)

Unnamed: 0,foto_mes,BAJA+1,BAJA+2,CONTINUA
0,201901,635,720,122918
1,201902,723,693,123985
2,201903,694,738,124535
3,201904,743,502,125293
4,201905,505,681,126016
5,201906,685,596,127453
6,201907,599,670,128999
7,201908,671,567,130905
8,201909,570,571,132616
9,201910,581,609,134820


In [7]:
# Guardar en bucket de GCP (b1)
dataset_path_bucket = "/home/sanmartinofacundo/buckets/b1/competencia_02.csv.gz"

os.makedirs(os.path.dirname(dataset_path_bucket), exist_ok=True)

conn.execute(f"""
    COPY competencia_02 
    TO '{dataset_path_bucket}' 
    (FORMAT CSV, HEADER, COMPRESSION GZIP);
""")

print("✅ Guardado directamente en el bucket montado b1")

FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

✅ Guardado directamente en el bucket montado b1
