In [10]:
# Cargar sell-in.txt (puede ser un archivo grande, leer solo columnas necesarias)
import pandas as pd
# sell-in.txt can be large, so only the columns needed downstream are read.
sellin_cols = [
    'periodo',
    'customer_id',
    'product_id',
    'plan_precios_cuidados',
    'cust_request_qty',
    'cust_request_tn',
    'tn',
]
# The file is tab-separated.
df_sellin = pd.read_csv('sell-in.txt', usecols=sellin_cols, sep='\t')
df_sellin.head()

Unnamed: 0,periodo,customer_id,product_id,plan_precios_cuidados,cust_request_qty,cust_request_tn,tn
0,201701,10234,20524,0,2,0.053,0.053
1,201701,10032,20524,0,1,0.13628,0.13628
2,201701,10217,20524,0,1,0.03028,0.03028
3,201701,10125,20524,0,1,0.02271,0.02271
4,201701,10012,20524,0,11,1.54452,1.54452


In [28]:
# Rank products by their total tn, highest first.
# The grouped sum is a Series indexed by product_id and named 'tn', so
# reset_index() already yields the ['product_id', 'tn'] columns; no full copy of
# df_sellin is needed beforehand.
top_products = (
    df_sellin.groupby('product_id')['tn']
    .sum()
    .sort_values(ascending=False)
    .reset_index()
)
top_products.head()

Unnamed: 0,product_id,tn
0,20001,50340.39558
1,20002,36337.25439
2,20003,32004.15274
3,20004,24178.15379
4,20005,23191.21852


In [32]:
# Share (in percent) of the overall tn that each product represents.
top_products['tn_percentage'] = top_products['tn'] / top_products['tn'].sum() * 100

# Number of distinct periods in which each product appears.
# The grouped result is ordered by product_id, while top_products is ordered by
# descending tn, so the counts must be matched on product_id rather than
# assigned positionally.
periods_by_product = df_sellin.groupby('product_id')['periodo'].nunique()
top_products['periods_count'] = top_products['product_id'].map(periods_by_product)

top_products = top_products.sort_values(by='tn_percentage', ascending=False)
top_products = top_products.reset_index(drop=True)
top_products = top_products[['product_id', 'tn', 'tn_percentage', 'periods_count']]
top_products.head(30)

Unnamed: 0,product_id,tn,tn_percentage,periods_count
0,20001,50340.39558,3.799308,36
1,20002,36337.25439,2.742458,36
2,20003,32004.15274,2.415429,36
3,20004,24178.15379,1.824782,36
4,20005,23191.21852,1.750296,36
5,20007,22018.45234,1.661784,36
6,20006,21088.76007,1.591618,36
7,20008,19948.29352,1.505545,36
8,20010,18671.07918,1.40915,36
9,20012,17813.59935,1.344434,36


In [33]:
# Write the product ranking to an Excel file, omitting the row index.
top_products.to_excel(excel_writer='top_products.xlsx', index=False)

In [1]:
# Read the parquet file into df_full.
import pandas as pd
df_full = pd.read_parquet("df_full.parquet")

In [4]:
# Show the (rows, columns) dimensions of df_full.
df_full.shape

(17173448, 98)

In [5]:
# Suponemos que ya existe df_full con todas las features creadas
# Y una columna tn_t_plus_2 como variable objetivo

import lightgbm as lgb
import numpy as np

# 1. Pareto (80/20) selection of important products and customers:
#    rank each by total tn (descending) and keep those whose cumulative tn
#    stays within 80% of the overall total.
sellin_por_producto = df_full.groupby('product_id')['tn'].sum().sort_values(ascending=False)
limite_producto = 0.8 * sellin_por_producto.sum()
df_prod_importantes = sellin_por_producto.cumsum() <= limite_producto
productos_importantes = df_prod_importantes[df_prod_importantes].index.tolist()

sellin_por_cliente = df_full.groupby('customer_id')['tn'].sum().sort_values(ascending=False)
limite_cliente = 0.8 * sellin_por_cliente.sum()
df_clientes_importantes = sellin_por_cliente.cumsum() <= limite_cliente
clientes_importantes = df_clientes_importantes[df_clientes_importantes].index.tolist()

# 2. Split into 4 subsets (important / not important, for product and customer).
#    Each membership mask is computed once and reused, instead of re-running
#    isin() over the whole frame for every subset.
es_producto_importante = df_full['product_id'].isin(productos_importantes)
es_cliente_importante = df_full['customer_id'].isin(clientes_importantes)

df_1 = df_full[es_producto_importante & es_cliente_importante]
df_2 = df_full[es_producto_importante & ~es_cliente_importante]
df_3 = df_full[~es_producto_importante & es_cliente_importante]
df_4 = df_full[~es_producto_importante & ~es_cliente_importante]

In [9]:
# Show the (rows, columns) dimensions of df_4.
df_4.shape

(12302779, 98)

In [None]:
# Suponemos que ya existe df_full con todas las features creadas
# Y una columna tn_t_plus_2 como variable objetivo

import lightgbm as lgb
import numpy as np

# NOTE(review): feature_cols and best_params_1..best_params_4 are not defined in
# any cell visible here; they must exist in the kernel before this cell runs.
# TODO: define them in an earlier cell so the notebook survives Restart & Run All.


def _aggregate_subset(df, group_keys):
    """Sum every feature column, 'tn' and 'tn_t_plus_2' within each group.

    Args:
        df: Frame holding the grouping keys, the columns listed in
            feature_cols, 'tn' and 'tn_t_plus_2'.
        group_keys: List of column names to group by.

    Returns:
        One row per group, with the grouping keys kept as regular columns.
    """
    # NOTE(review): 'sum' skips NaN, so a group whose target is entirely NaN
    # gets a target of 0.0 — confirm tn_t_plus_2 has no missing values here.
    agg_spec = {**{col: 'sum' for col in feature_cols}, 'tn': 'sum', 'tn_t_plus_2': 'sum'}
    return df.groupby(group_keys, as_index=False).agg(agg_spec)


def _fit_weighted(df, params):
    """Fit an LGBMRegressor on feature_cols -> 'tn_t_plus_2', weighted by tn.

    Args:
        df: Training frame with feature_cols, 'tn' and 'tn_t_plus_2'.
        params: Keyword arguments forwarded to lgb.LGBMRegressor.

    Returns:
        The fitted regressor.
    """
    model = lgb.LGBMRegressor(**params)
    # Weights are tn floored at 0.1.
    weights = df['tn'].clip(lower=0.1)
    model.fit(df[feature_cols], df['tn_t_plus_2'], sample_weight=weights)
    return model


# 3. Aggregation for subsets 2, 3 and 4:
#    subset 2 by product and period, subset 3 by customer and period,
#    subset 4 by period only.
df_2_agg = _aggregate_subset(df_2, ['product_id', 'periodo'])
df_3_agg = _aggregate_subset(df_3, ['customer_id', 'periodo'])
df_4_agg = _aggregate_subset(df_4, ['periodo'])

# 4. Train the 4 models (subset 1 is used without aggregation).
model_1 = _fit_weighted(df_1, best_params_1)
model_2 = _fit_weighted(df_2_agg, best_params_2)
model_3 = _fit_weighted(df_3_agg, best_params_3)
model_4 = _fit_weighted(df_4_agg, best_params_4)

# 5. Guardar modelos o generar predicciones según próximo paso
