In [1]:
import pandas as pd

# Load sell-in.txt, reading only the columns used below (the file may be large).
sellin_cols = [
    'periodo',
    'customer_id',
    'product_id',
    'plan_precios_cuidados',
    'cust_request_qty',
    'cust_request_tn',
    'tn',
]
df_sellin = pd.read_csv(
    'sell-in.txt',
    sep='\t',
    usecols=sellin_cols,
)
df_sellin.head()

Unnamed: 0,periodo,customer_id,product_id,plan_precios_cuidados,cust_request_qty,cust_request_tn,tn
0,201701,10234,20524,0,2,0.053,0.053
1,201701,10032,20524,0,1,0.13628,0.13628
2,201701,10217,20524,0,1,0.03028,0.03028
3,201701,10125,20524,0,1,0.02271,0.02271
4,201701,10012,20524,0,11,1.54452,1.54452


In [2]:
# Count the distinct customer_id values in the sell-in data.
df_sellin['customer_id'].nunique()
# Alternative check (disabled): count the distinct product_id values.
#df_sellin['product_id'].nunique()
# Alternative check (disabled): count the distinct periodo values.
#df_sellin['periodo'].nunique()


597

In [46]:
# Parse the YYYYMM 'periodo' values into datetimes, then order the rows by
# product, customer and period and rebuild a clean 0..n-1 index.
periodo_ts = pd.to_datetime(df_sellin['periodo'], format='%Y%m')
df_sellin = (
    df_sellin
    .assign(periodo=periodo_ts)
    .sort_values(by=['product_id', 'customer_id', 'periodo'])
    .reset_index(drop=True)
)


In [47]:
# Total requested quantity and tons per (product_id, periodo); the group keys
# are turned back into ordinary columns.
por_producto_periodo = df_sellin.groupby(['product_id', 'periodo'])
df_agg = por_producto_periodo[['cust_request_qty', 'tn']].sum().reset_index()
df_agg.head()

Unnamed: 0,product_id,periodo,cust_request_qty,tn
0,20001,2017-01-01,479,934.77222
1,20001,2017-02-01,432,798.0162
2,20001,2017-03-01,509,1303.35771
3,20001,2017-04-01,279,1069.9613
4,20001,2017-05-01,701,1502.20132


Entrenamiento con AutoGluon

Preparar los datos de entrenamiento y test
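Entrenamiento: usa todas las filas donde tn_t_plus_2 no es NaN y el período es anterior a 201912 (para no usar datos del futuro).
Test: filtra las filas cuyo período es 201912 (diciembre de 2019), ya que para esas filas queremos predecir tn en 202002 (febrero de 2020).
Nota: las celdas de abajo no construyen la columna tn_t_plus_2; el horizonte de dos meses se fija con prediction_length=2 en TimeSeriesPredictor.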

Training con AutoGluon

In [49]:
# Rename the key columns to 'item_id' / 'timestamp', the names the forecasting
# cells below work with, then print the resulting column list as a check.
column_aliases = {'product_id': 'item_id', 'periodo': 'timestamp'}
df_agg = df_agg.rename(columns=column_aliases)

print(list(df_agg.columns))

['item_id', 'timestamp', 'cust_request_qty', 'tn']


In [23]:
import warnings
warnings.filterwarnings("ignore")
from autogluon.timeseries import TimeSeriesDataFrame, TimeSeriesPredictor



# Train the AutoGluon forecaster on the full aggregated history.
# Configuration: target column 'tn', a 2-step horizon at month-start ('MS')
# frequency, scored with the WQL metric.
predictor = TimeSeriesPredictor(
    prediction_length=2,
    freq='MS',
    target='tn',
    eval_metric='WQL',
)
# Fit with two validation windows, stepping one period between them.
predictor = predictor.fit(
    train_data=df_agg,
    num_val_windows=2,
    val_step_size=1,
)



Beginning AutoGluon training...
AutoGluon will save models to '/Users/fernandopedroarena/Documents/Documents/LABO III/AutogluonModels/ag-20250705_214310'
AutoGluon Version:  1.3.1
Python Version:     3.10.11
Operating System:   Darwin
Platform Machine:   arm64
Platform Version:   Darwin Kernel Version 24.5.0: Tue Apr 22 19:54:43 PDT 2025; root:xnu-11417.121.6~2/RELEASE_ARM64_T8132
CPU Count:          10
GPU Count:          0
Memory Avail:       3.46 GB / 16.00 GB (21.6%)
Disk Space Avail:   109.60 GB / 228.27 GB (48.0%)

Fitting with arguments:
{'enable_ensemble': True,
 'eval_metric': WQL,
 'freq': 'MS',
 'hyperparameters': 'default',
 'known_covariates_names': [],
 'num_val_windows': 2,
 'prediction_length': 2,
 'quantile_levels': [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9],
 'random_seed': 123,
 'refit_every_n_windows': 1,
 'refit_full': False,
 'skip_model_selection': False,
 'target': 'tn',
 'val_step_size': 1,
 'verbosity': 2}

train_data with frequency 'IRREG' has been resampl

Monitoring

In [None]:
# Show the leaderboard of the models held by `predictor`, to understand how
# each model contributes.

predictor.leaderboard()


In [None]:
# Original note: install bokeh before running this cell (command left disabled).
# Presumably it is only needed for plots in the summary -- TODO confirm it is required.
#pip install bokeh
predictor.fit_summary()

In [None]:
# Display feature importance for `predictor`, called with default arguments.
# NOTE(review): df_agg also carries 'cust_request_qty' next to the target;
# presumably that is the feature being scored -- confirm.
predictor.feature_importance()

### Predicción Feb. 2020

In [37]:
# Forecast from the full history in df_agg. No model is specified, so (per the
# logged output) the best-validation model is used. The predictor was built
# with prediction_length=2; the next cell keeps only the 2020-02-01 rows.
predictions = predictor.predict(df_agg)

data with frequency 'IRREG' has been resampled to frequency 'MS'.
Model not specified in predict, will default to the model with the best validation score: WeightedEnsemble


In [None]:
# Flatten the 'mean' forecast into columns and keep only the February 2020
# rows, reduced to the item id and its forecast value.
forecast_mean = predictions['mean'].reset_index()
is_feb_2020 = forecast_mean['timestamp'] == '2020-02-01'

# Select the two output columns and give them their submission names.
resultado = (
    forecast_mean
    .loc[is_feb_2020, ['item_id', 'mean']]
    .rename(columns={'item_id': 'product_id', 'mean': 'tn'})
)
resultado.head()


Unnamed: 0,product_id,tn
1,20001,1390.844229
3,20002,1096.937644
5,20003,718.693399
7,20004,549.85605
9,20005,534.158102


---

#### Predicciones para diciembre 2019 (validación para stacking)

In [None]:
# Filtro df_agg hasta octubre 2019
df_agg_v = df_agg[df_agg['timestamp'] <= '2019-10-01']
#df_agg_v.head()

Unnamed: 0,item_id,timestamp,cust_request_qty,tn
0,20001,2017-01-01,479,934.77222
1,20001,2017-02-01,432,798.0162
2,20001,2017-03-01,509,1303.35771
3,20001,2017-04-01,279,1069.9613
4,20001,2017-05-01,701,1502.20132


In [51]:
# Entrenar el modelo con AutoGluon
predictor2 = TimeSeriesPredictor(
    target='tn', 
    prediction_length=2,
    freq='MS',
    eval_metric='WQL'
).fit(
    train_data=df_agg_v, 
    num_val_windows=2,
    val_step_size=1
)

Beginning AutoGluon training...
AutoGluon will save models to '/Users/fernandopedroarena/Documents/Documents/LABO III/AutogluonModels/ag-20250705_222720'
AutoGluon Version:  1.3.1
Python Version:     3.10.11
Operating System:   Darwin
Platform Machine:   arm64
Platform Version:   Darwin Kernel Version 24.5.0: Tue Apr 22 19:54:43 PDT 2025; root:xnu-11417.121.6~2/RELEASE_ARM64_T8132
CPU Count:          10
GPU Count:          0
Memory Avail:       3.94 GB / 16.00 GB (24.6%)
Disk Space Avail:   109.23 GB / 228.27 GB (47.9%)

Fitting with arguments:
{'enable_ensemble': True,
 'eval_metric': WQL,
 'freq': 'MS',
 'hyperparameters': 'default',
 'known_covariates_names': [],
 'num_val_windows': 2,
 'prediction_length': 2,
 'quantile_levels': [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9],
 'random_seed': 123,
 'refit_every_n_windows': 1,
 'refit_full': False,
 'skip_model_selection': False,
 'target': 'tn',
 'val_step_size': 1,
 'verbosity': 2}

train_data with frequency 'IRREG' has been resampl

In [52]:
# Forecast from the truncated history in df_agg_v using predictor2 (built with
# prediction_length=2); the next cell keeps only the 2019-12-01 rows.
predictions2 = predictor2.predict(df_agg_v)

data with frequency 'IRREG' has been resampled to frequency 'MS'.
Model not specified in predict, will default to the model with the best validation score: WeightedEnsemble


In [53]:
# Flatten the 'mean' forecast into columns and keep only the December 2019
# rows, reduced to the item id and its forecast value.
validation_mean = predictions2['mean'].reset_index()
is_dec_2019 = validation_mean['timestamp'] == '2019-12-01'

# Select the two output columns and rename them to product_id / tn.
resultado2 = (
    validation_mean
    .loc[is_dec_2019, ['item_id', 'mean']]
    .rename(columns={'item_id': 'product_id', 'mean': 'tn'})
)
resultado2.head()

Unnamed: 0,product_id,tn
1,20001,1515.573302
3,20002,1239.561978
5,20003,942.939805
7,20004,768.731241
9,20005,730.530316


In [54]:
# Export the December 2019 forecasts (product_id, tn) to CSV, without the
# DataFrame index column.
resultado2.to_csv('ridge_val_autogluon.csv', index=False)

---

Archivo para Kaggle

In [42]:
# Load product_id_apredecir201912.txt, the list of product_ids to predict,
# and preview the first rows.
df_ids = pd.read_csv('product_id_apredecir201912.txt')
df_ids.head()

# Optional cast of product_id to string (disabled).
#df_ids['product_id'] = df_ids['product_id'].astype(str)

Unnamed: 0,product_id
0,20001
1,20002
2,20003
3,20004
4,20005


In [43]:
# Keep exactly the product_ids that must be predicted. The left join keeps
# every row of df_ids, so a listed product with no February 2020 forecast ends
# up with tn = NaN rather than being dropped.
resultado = df_ids.merge(resultado, on='product_id', how='left')

# Report missing forecasts instead of silently exporting blank tn values.
# print() is used rather than warnings.warn() because warnings are globally
# silenced earlier in this notebook.
sin_prediccion = resultado.loc[resultado['tn'].isna(), 'product_id']
if not sin_prediccion.empty:
    print(
        f"WARNING: {len(sin_prediccion)} product_id(s) have no forecast "
        f"(first 10: {sin_prediccion.tolist()[:10]})"
    )

# Write the submission file (product_id, tn) without the DataFrame index.
resultado.to_csv('submission_AGP.csv', index=False)

# Preview last so the cell actually displays it.
resultado.head()