In [1]:
import pandas as pd

# Cargar sell-in.txt (puede ser un archivo grande, leer solo columnas necesarias)
sellin_cols = ['periodo', 'customer_id', 'product_id', 'plan_precios_cuidados', 'cust_request_qty', 'cust_request_tn', 'tn']
df_sellin = pd.read_csv('sell-in.txt', sep='\t', usecols=sellin_cols)
df_sellin.head()

Unnamed: 0,periodo,customer_id,product_id,plan_precios_cuidados,cust_request_qty,cust_request_tn,tn
0,201701,10234,20524,0,2,0.053,0.053
1,201701,10032,20524,0,1,0.13628,0.13628
2,201701,10217,20524,0,1,0.03028,0.03028
3,201701,10125,20524,0,1,0.02271,0.02271
4,201701,10012,20524,0,11,1.54452,1.54452


In [2]:
# Contar valores únicos de customer_id
df_sellin['customer_id'].nunique()
# Contar valores únicos de product_id
#df_sellin['product_id'].nunique()
# Contar valores únicos de periodo
#df_sellin['periodo'].nunique()


597

In [3]:
# Si 'periodo' es tipo string o int, conviértelo a datetime para mayor facilidad
df_sellin['periodo'] = pd.to_datetime(df_sellin['periodo'], format='%Y%m')
df_sellin= df_sellin.sort_values(['product_id', 'customer_id', 'periodo']).reset_index(drop=True)


Hacer el producto cartesiano de producto-cliente-período y agregar 0s.

In [4]:
# 1. Listas de valores únicos
product_ids = df_sellin['product_id'].unique()
customer_ids = df_sellin['customer_id'].unique()
periodos = pd.date_range(
    start=df_sellin['periodo'].min(), 
    end=df_sellin['periodo'].max(), 
    freq='MS'  # Monthly start frequency
)

# 2. Producto cartesiano
cartesian = pd.MultiIndex.from_product(
    [product_ids, customer_ids, periodos], 
    names=['product_id', 'customer_id', 'periodo']
).to_frame(index=False)

In [5]:
# 3. Filtrar productos activos
periodo_producto = df_sellin.groupby('product_id')['periodo'].agg(['min', 'max']).reset_index()
periodo_producto.columns = ['product_id', 'periodo_min_producto', 'periodo_max_producto']

# 4. Filtrar clientes activos
periodo_customer = df_sellin.groupby('customer_id')['periodo'].agg(['min', 'max']).reset_index()
periodo_customer.columns = ['customer_id', 'periodo_min_customer', 'periodo_max_customer']

In [6]:
# 5. Merge para filtrar combinaciones válidas
cartesian = cartesian.merge(periodo_producto, on='product_id', how='left')
cartesian = cartesian.merge(periodo_customer, on='customer_id', how='left')

# 6. Filtrar combinaciones donde el periodo esté dentro del rango activo
cartesian = cartesian[
    (cartesian['periodo'] >= cartesian['periodo_min_producto']) & 
    (cartesian['periodo'] <= cartesian['periodo_max_producto']) &
    (cartesian['periodo'] >= cartesian['periodo_min_customer']) #&
    #(cartesian['periodo'] <= cartesian['periodo_max_customer'])
].copy()

# 7. Merge con el dataset original
df_final = cartesian.merge(
    df_sellin, 
    on=['product_id', 'customer_id', 'periodo'], 
    how='left'
)

In [7]:
df_final.shape

(17173448, 11)

In [8]:
# 8. Completar las ventas y cualquier otra variable faltante con 0
df_final['tn'] = df_final['tn'].fillna(0)
df_final['plan_precios_cuidados'] = df_final['plan_precios_cuidados'].fillna(0)
df_final['cust_request_qty'] = df_final['cust_request_qty'].fillna(0)
df_final['cust_request_tn'] = df_final['cust_request_tn'].fillna(0)
# Podés extender esto a otras variables que consideres relevantes.

# 9. Revisar el resultado
print(df_final.head())

   product_id  customer_id    periodo periodo_min_producto  \
0       20001        10001 2017-01-01           2017-01-01   
1       20001        10001 2017-02-01           2017-01-01   
2       20001        10001 2017-03-01           2017-01-01   
3       20001        10001 2017-04-01           2017-01-01   
4       20001        10001 2017-05-01           2017-01-01   

  periodo_max_producto periodo_min_customer periodo_max_customer  \
0           2019-12-01           2017-01-01           2019-12-01   
1           2019-12-01           2017-01-01           2019-12-01   
2           2019-12-01           2017-01-01           2019-12-01   
3           2019-12-01           2017-01-01           2019-12-01   
4           2019-12-01           2017-01-01           2019-12-01   

   plan_precios_cuidados  cust_request_qty  cust_request_tn         tn  
0                    0.0              11.0         99.43861   99.43861  
1                    0.0              23.0        198.84365  198.84365  

Creación de variable target: tn_t_plus_2

In [9]:
# Creamos la variable objetivo: tn en t+2 para cada combinación product_id y customer_id
# df_merged['tn_t_plus_2']

# Paso 1: Crear columna con periodo +2 meses
df_final['periodo_target'] = df_final['periodo'] + pd.DateOffset(months=2)

# Paso 2: Crear DataFrame con target
target_df = df_final[['product_id', 'customer_id', 'periodo', 'tn']].copy()
target_df.rename(columns={'periodo': 'periodo_target', 'tn': 'tn_t_plus_2'}, inplace=True)

# Paso 3: Hacer el merge
df_final = df_final.merge(
    target_df,
    on=['product_id', 'customer_id', 'periodo_target'],
    how='left'
)

# Paso 4: Validar
print(df_final[['product_id', 'customer_id', 'periodo', 'tn', 'tn_t_plus_2']].head(20))




    product_id  customer_id    periodo         tn  tn_t_plus_2
0        20001        10001 2017-01-01   99.43861     92.46537
1        20001        10001 2017-02-01  198.84365     13.29728
2        20001        10001 2017-03-01   92.46537    101.00563
3        20001        10001 2017-04-01   13.29728    128.04792
4        20001        10001 2017-05-01  101.00563    101.20711
5        20001        10001 2017-06-01  128.04792     43.33930
6        20001        10001 2017-07-01  101.20711    289.35024
7        20001        10001 2017-08-01   43.33930    222.11389
8        20001        10001 2017-09-01  289.35024    111.54944
9        20001        10001 2017-10-01  222.11389    131.27150
10       20001        10001 2017-11-01  111.54944     49.61857
11       20001        10001 2017-12-01  131.27150     88.44065
12       20001        10001 2018-01-01   49.61857    214.72336
13       20001        10001 2018-02-01   88.44065    132.83419
14       20001        10001 2018-03-01  214.72336    16

Feature engeneering

In [65]:
df_full = df_final.copy()

# Elimino columnas innecesarias
df_full.drop(columns=['periodo_min_producto', 'periodo_max_producto', 'periodo_min_customer', 'periodo_max_customer', 'periodo_target'], inplace=True)
df_full.head()

Unnamed: 0,product_id,customer_id,periodo,plan_precios_cuidados,cust_request_qty,cust_request_tn,tn,tn_t_plus_2
0,20001,10001,2017-01-01,0.0,11.0,99.43861,99.43861,92.46537
1,20001,10001,2017-02-01,0.0,23.0,198.84365,198.84365,13.29728
2,20001,10001,2017-03-01,0.0,33.0,92.46537,92.46537,101.00563
3,20001,10001,2017-04-01,0.0,8.0,13.29728,13.29728,128.04792
4,20001,10001,2017-05-01,0.0,15.0,101.20711,101.00563,101.20711


In [66]:
df_full.shape

(17173448, 8)

1. Lags (valores previos)


In [67]:
for lag in [1, 2, 3, 4, 5, 6, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24]:
    df_full[f'tn_lag_{lag}'] = df_full.groupby(['product_id', 'customer_id'])['tn'].shift(lag)

2. Rolling Mean (media móvil)
media de los últimos 3, 6, 9 y 12 meses:

In [68]:
for window in [3, 6, 9, 12, 15, 18, 21, 24]:  #15, 18, 21, 24
    df_full[f'tn_rollmean_{window}'] = (
        df_full.groupby(['product_id', 'customer_id'])['tn']
        .transform(lambda x: x.rolling(window, min_periods=1).mean())
    )

3. Rolling Sum (suma móvil)
la suma de los últimos 3 meses:

In [69]:
df_full['tn_rollsum_3'] = (
    df_full.groupby(['product_id', 'customer_id'])['tn']
    .transform(lambda x: x.rolling(3, min_periods=1).sum())
)

4. Promedio histórico por producto y cliente
Capturar el comportamiento típico de cada combinación:

In [70]:
# Promedio histórico de tn por product_id y customer_id hasta el periodo actual (excluyendo el actual)
df_full['tn_mean_hist'] = (
    df_full.groupby(['product_id', 'customer_id'])['tn']
    .transform(lambda x: x.expanding().mean().shift(1))
)

5. Lag y rolling para otras variables
Repetir la lógica para otras columnas como cust_request_qty y cust_request_tn:

In [71]:
for col in ['cust_request_qty', 'cust_request_tn']:
    for lag in [1, 2, 3]:
        df_full[f'{col}_lag_{lag}'] = df_full.groupby(['product_id', 'customer_id'])[col].shift(lag)
    
    for window in [3, 6, 12]:
        df_full[f'{col}_rollmean_{window}'] = (
            df_full.groupby(['product_id', 'customer_id'])[col]
            .transform(lambda x: x.rolling(window, min_periods=1).mean())
        )

In [None]:
# Probar combinaciones de producto - cliente
#df_full[(df_full['product_id'] == 20524) & (df_final['customer_id'] == 10125)].head(10)
#df_full.head()

Unnamed: 0,product_id,customer_id,periodo,plan_precios_cuidados,cust_request_qty,cust_request_tn,tn,tn_t_plus_2,tn_lag_1,tn_lag_2,...,tn_rollsum_3,tn_mean_hist,cust_request_qty_lag_1,cust_request_qty_lag_2,cust_request_qty_lag_3,cust_request_qty_rollmean_3,cust_request_tn_lag_1,cust_request_tn_lag_2,cust_request_tn_lag_3,cust_request_tn_rollmean_3
8959408,20524,10125,2017-01-01,0.0,1.0,0.02271,0.02271,0.0,,,...,0.02271,,,,,1.0,,,,0.02271
8959409,20524,10125,2017-02-01,0.0,0.0,0.0,0.0,0.00757,0.02271,,...,0.02271,0.02271,1.0,,,0.5,0.02271,,,0.011355
8959410,20524,10125,2017-03-01,0.0,0.0,0.0,0.0,0.00757,0.0,0.02271,...,0.02271,0.011355,0.0,1.0,,0.333333,0.0,0.02271,,0.00757
8959411,20524,10125,2017-04-01,0.0,1.0,0.00757,0.00757,0.0,0.0,0.0,...,0.00757,0.00757,0.0,0.0,1.0,0.333333,0.0,0.0,0.02271,0.002523
8959412,20524,10125,2017-05-01,0.0,1.0,0.00757,0.00757,0.0,0.00757,0.0,...,0.01514,0.00757,1.0,0.0,0.0,0.666667,0.00757,0.0,0.0,0.005047
8959413,20524,10125,2017-06-01,0.0,0.0,0.0,0.0,0.0,0.00757,0.00757,...,0.01514,0.00757,1.0,1.0,0.0,0.666667,0.00757,0.00757,0.0,0.005047
8959414,20524,10125,2017-07-01,0.0,0.0,0.0,0.0,0.00757,0.0,0.00757,...,0.00757,0.006308,0.0,1.0,1.0,0.333333,0.0,0.00757,0.00757,0.002523
8959415,20524,10125,2017-08-01,0.0,0.0,0.0,0.0,0.00757,0.0,0.0,...,0.0,0.005407,0.0,0.0,1.0,0.0,0.0,0.0,0.00757,0.0
8959416,20524,10125,2017-09-01,0.0,1.0,0.00757,0.00757,0.0,0.0,0.0,...,0.00757,0.004731,0.0,0.0,0.0,0.333333,0.0,0.0,0.0,0.002523
8959417,20524,10125,2017-10-01,0.0,1.0,0.00757,0.00757,0.0,0.00757,0.0,...,0.01514,0.005047,1.0,0.0,0.0,0.666667,0.00757,0.0,0.0,0.005047


In [64]:
'''
# Importo df_full
df_full = pd.read_csv('df_full_features.csv')
df_full['periodo'] = pd.to_datetime(df_full['periodo'])
df_full.head()'''

"\n# Importo df_full\ndf_full = pd.read_csv('df_full_features.csv')\ndf_full['periodo'] = pd.to_datetime(df_full['periodo'])\ndf_full.head()"

In [72]:
# Cambio el nombre de product_id a item_id
df_full['item_id'] = df_full['product_id'].astype(str) + '_' + df_full['customer_id'].astype(str)
df_full.rename(columns={'periodo': 'timestamp'}, inplace=True)
df_full.drop(columns=['product_id', 'customer_id'], inplace=True)
df_full.head()

Unnamed: 0,timestamp,plan_precios_cuidados,cust_request_qty,cust_request_tn,tn,tn_t_plus_2,tn_lag_1,tn_lag_2,tn_lag_3,tn_lag_4,...,cust_request_qty_rollmean_3,cust_request_qty_rollmean_6,cust_request_qty_rollmean_12,cust_request_tn_lag_1,cust_request_tn_lag_2,cust_request_tn_lag_3,cust_request_tn_rollmean_3,cust_request_tn_rollmean_6,cust_request_tn_rollmean_12,item_id
0,2017-01-01,0.0,11.0,99.43861,99.43861,92.46537,,,,,...,11.0,11.0,11.0,,,,99.43861,99.43861,99.43861,20001_10001
1,2017-02-01,0.0,23.0,198.84365,198.84365,13.29728,99.43861,,,,...,17.0,17.0,17.0,99.43861,,,149.14113,149.14113,149.14113,20001_10001
2,2017-03-01,0.0,33.0,92.46537,92.46537,101.00563,198.84365,99.43861,,,...,22.333333,22.333333,22.333333,198.84365,99.43861,,130.24921,130.24921,130.24921,20001_10001
3,2017-04-01,0.0,8.0,13.29728,13.29728,128.04792,92.46537,198.84365,99.43861,,...,21.333333,18.75,18.75,92.46537,198.84365,99.43861,101.535433,101.011227,101.011227,20001_10001
4,2017-05-01,0.0,15.0,101.20711,101.00563,101.20711,13.29728,92.46537,198.84365,99.43861,...,18.666667,18.0,18.0,13.29728,92.46537,198.84365,68.98992,101.050404,101.050404,20001_10001


Entrenamiento con AutoGluon

Preparar los datos de entrenamiento y test
Entrenamiento: Usa todos los datos donde tn_t_plus_2 no es NaN y el período es menor a 201912 (para no usar datos del futuro).
Test: Filtra las filas donde el período es 201912 (diciembre 2019), ya que para esas filas queremos predecir tn en 202002 (febrero 2020).

In [73]:
# Lista de features: incluye todas las columnas que empiezan con los prefijos de los features
feature_cols = [
    col for col in df_full.columns
    if (
        col.startswith('tn_lag_') or
        col.startswith('tn_roll') or
        col.startswith('cust_request_qty_lag_') or
        col.startswith('cust_request_qty_roll') or
        col.startswith('cust_request_tn_lag_') or
        col.startswith('cust_request_tn_roll') or
        col == 'plan_precios_cuidados'  
    )
]

In [None]:
'''# Entrenamiento y validación
train = df_full[df_full['periodo'] < pd.to_datetime('2019-10-01')]
train = train[train['tn_t_plus_2'].notnull()] # Filtro con target válido
valid = df_full[df_full['periodo'] == pd.to_datetime('2019-10-01')]
valid = valid[valid['tn_t_plus_2'].notnull()] # Filtro con target válido
test = df_full[df_full['periodo'] == pd.to_datetime('2019-12-01')]

# Seleccionar variables de entrada y target
train_data = train[feature_cols + ['tn_t_plus_2']]
valid_data = valid[feature_cols + ['tn_t_plus_2']]
'''

In [None]:
# Instalar AutoGluon
#!pip install autogluon
#!pip uninstall -y autogluon


In [None]:
#!pip install numpy==1.23.5 scipy==1.9.3


Training con AutoGluon

In [74]:
import warnings
warnings.filterwarnings("ignore")
from autogluon.timeseries import TimeSeriesDataFrame, TimeSeriesPredictor



# Entrenar el modelo con AutoGluon
predictor = TimeSeriesPredictor(
    target='tn', 
    prediction_length=2,
    freq='M',
    eval_metric='WQL'
).fit(
    train_data=df_full, 
    num_val_windows=3,
    val_step_size=1
)



Frequency 'M' stored as 'ME'
Beginning AutoGluon training...
AutoGluon will save models to '/Users/fernandopedroarena/Documents/Documents/LABO III/AutogluonModels/ag-20250624_221449'
AutoGluon Version:  1.3.1
Python Version:     3.10.11
Operating System:   Darwin
Platform Machine:   arm64
Platform Version:   Darwin Kernel Version 24.5.0: Tue Apr 22 19:54:43 PDT 2025; root:xnu-11417.121.6~2/RELEASE_ARM64_T8132
CPU Count:          10
GPU Count:          0
Memory Avail:       5.16 GB / 16.00 GB (32.3%)
Disk Space Avail:   112.18 GB / 228.27 GB (49.1%)

Fitting with arguments:
{'enable_ensemble': True,
 'eval_metric': WQL,
 'freq': 'ME',
 'hyperparameters': 'default',
 'known_covariates_names': [],
 'num_val_windows': 3,
 'prediction_length': 2,
 'quantile_levels': [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9],
 'random_seed': 123,
 'refit_every_n_windows': 1,
 'refit_full': False,
 'skip_model_selection': False,
 'target': 'tn',
 'val_step_size': 1,
 'verbosity': 2}

train_data with frequ

Monitoring

In [75]:
# Entender la contribución de cada modelo

predictor.leaderboard()


Unnamed: 0,model,score_val,pred_time_val,fit_time_marginal,fit_order
0,WeightedEnsemble,-0.377515,2161.506158,618.367109,13
1,TemporalFusionTransformer,-0.377515,2161.506158,1248.346686,9
2,ChronosFineTuned[bolt_small],-0.652207,283.754332,1227.323815,8
3,DirectTabular,-0.657187,43.014245,170.972487,3
4,DeepAR,-0.682906,434.120229,3300.522453,10
5,ChronosZeroShot[bolt_base],-0.684461,1132.841827,2098.666332,7
6,TiDE,-0.74524,205.046303,4906.54036,12
7,PatchTST,-0.750274,146.86325,535.141823,11
8,RecursiveTabular,-0.939097,14.281476,107.941354,2
9,NPTS,-0.996795,239.508562,563.39183,4


In [76]:
# Instalar bokeh
#pip install bokeh
predictor.fit_summary()

****************** Summary of fit() ******************
Estimated performance of each model:
                           model  score_val  pred_time_val  fit_time_marginal  \
0               WeightedEnsemble  -0.377515    2161.506158         618.367109   
1      TemporalFusionTransformer  -0.377515    2161.506158        1248.346686   
2   ChronosFineTuned[bolt_small]  -0.652207     283.754332        1227.323815   
3                  DirectTabular  -0.657187      43.014245         170.972487   
4                         DeepAR  -0.682906     434.120229        3300.522453   
5     ChronosZeroShot[bolt_base]  -0.684461    1132.841827        2098.666332   
6                           TiDE  -0.745240     205.046303        4906.540360   
7                       PatchTST  -0.750274     146.863250         535.141823   
8               RecursiveTabular  -0.939097      14.281476         107.941354   
9                           NPTS  -0.996795     239.508562         563.391830   
10               

{'model_types': {'SeasonalNaive': 'MultiWindowBacktestingModel',
  'RecursiveTabular': 'MultiWindowBacktestingModel',
  'DirectTabular': 'MultiWindowBacktestingModel',
  'NPTS': 'MultiWindowBacktestingModel',
  'DynamicOptimizedTheta': 'MultiWindowBacktestingModel',
  'AutoETS': 'MultiWindowBacktestingModel',
  'ChronosZeroShot[bolt_base]': 'MultiWindowBacktestingModel',
  'ChronosFineTuned[bolt_small]': 'MultiWindowBacktestingModel',
  'TemporalFusionTransformer': 'MultiWindowBacktestingModel',
  'DeepAR': 'MultiWindowBacktestingModel',
  'PatchTST': 'MultiWindowBacktestingModel',
  'TiDE': 'MultiWindowBacktestingModel',
  'WeightedEnsemble': 'GreedyEnsemble'},
 'model_performance': {'SeasonalNaive': -1.0352972329320778,
  'RecursiveTabular': -0.9390974972294966,
  'DirectTabular': -0.6571866668189991,
  'NPTS': -0.996795487619511,
  'DynamicOptimizedTheta': -1.0352972329320778,
  'AutoETS': -1.0352972329320778,
  'ChronosZeroShot[bolt_base]': -0.6844611506709013,
  'ChronosFineTuned[

In [77]:
predictor.feature_importance()

Computing feature importance


Unnamed: 0,importance,stdev,n,p99_low,p99_high
plan_precios_cuidados,-1e-06,4e-06,5.0,-1e-05,8e-06
cust_request_qty,0.072728,0.23193,5.0,-0.40482,0.550275
cust_request_tn,0.096041,0.090275,5.0,-0.089837,0.281919
tn_t_plus_2,1.172147,1.140201,5.0,-1.175543,3.519837
tn_lag_1,-0.018902,0.068052,5.0,-0.159022,0.121218
tn_lag_2,-0.02221,0.079021,5.0,-0.184917,0.140496
tn_lag_3,0.036667,0.061872,5.0,-0.090729,0.164063
tn_lag_4,-0.00311,0.014187,5.0,-0.032321,0.026101
tn_lag_5,0.005996,0.011846,5.0,-0.018395,0.030387
tn_lag_6,-0.00763,0.011877,5.0,-0.032084,0.016825


Predicting

In [78]:
# Última fecha en df
last_date = pd.to_datetime(df_full['timestamp']).max()

# Crear fechas futuras (por ejemplo, los próximos 2 meses)
future_dates = pd.date_range(start=last_date + pd.offsets.MonthBegin(1), periods=2, freq='M')

# Lista de productos únicos
products = df_full['item_id'].unique()

# Crear dataframe con combinaciones item_id x fechas futuras
future_df = pd.DataFrame([
    {'item_id': pid, 'timestamp': date}
    for pid in products
    for date in future_dates
])

future_df['timestamp'] = future_df['timestamp'].dt.strftime('%Y-%m-%d')  # si se necesita en string

# Asegurarse que future_df no tiene la columna target (tn) ni otras

future_df.head()

Unnamed: 0,item_id,timestamp
0,20001_10001,2020-01-31
1,20001_10001,2020-02-29
2,20001_10002,2020-01-31
3,20001_10002,2020-02-29
4,20001_10003,2020-01-31


In [79]:
# Covariables estáticas únicas por producto
# static_covs = df_full[['item_id', 'cat1', 'cat2', 'cat3', 'brand', 'sku_size']].drop_duplicates()

# Hacer merge para añadir covariables categóricas
known_covariates_future = future_df #.merge(static_covs, on='item_id', how='left')

# Definir las covariables continuas que usa el modelo
cont_cols = feature_cols


# Asignar cero a las covariables continuas (o el promedio histórico si prefieres)
for col in cont_cols:
    known_covariates_future[col] = 0

# Finalmente seleccionar solo las columnas de covariables para pasar al modelo
covariate_cols = cont_cols #+ ['cat1', 'cat2', 'cat3', 'brand', 'sku_size'] 
known_covariates_future_final = known_covariates_future[covariate_cols]

known_covariates_future_final.tail()

Unnamed: 0,plan_precios_cuidados,tn_lag_1,tn_lag_2,tn_lag_3,tn_lag_4,tn_lag_5,tn_lag_6,tn_lag_9,tn_lag_10,tn_lag_11,...,cust_request_qty_lag_3,cust_request_qty_rollmean_3,cust_request_qty_rollmean_6,cust_request_qty_rollmean_12,cust_request_tn_lag_1,cust_request_tn_lag_2,cust_request_tn_lag_3,cust_request_tn_rollmean_3,cust_request_tn_rollmean_6,cust_request_tn_rollmean_12
1444909,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1444910,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1444911,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1444912,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1444913,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [80]:
predictions = predictor.predict(df_full, known_covariates = known_covariates_future)

data with frequency 'IRREG' has been resampled to frequency 'ME'.
Model not specified in predict, will default to the model with the best validation score: TemporalFusionTransformer


In [81]:
# Copiar predicciones para no alterar el original
predictions_v1 = predictions.copy()

# Resetear índice para tener columnas planas
predictions_v1.reset_index(inplace=True)

# Filtrar solo las columnas necesarias
predictions_v1 = predictions_v1[["item_id", "timestamp", "mean"]]

# Filtrar por la fecha deseada
fecha_objetivo = "2020-02-29"
predictions_v1 = predictions_v1[predictions_v1["timestamp"] == fecha_objetivo]

# Renombrar columnas y eliminar la columna 'timestamp'
predictions_v1 = predictions_v1.rename(columns={"item_id": "product_id", "mean": "tn"}).drop(columns=["timestamp"])

# Mostrar primeras filas
predictions_v1.head(20)


Unnamed: 0,product_id,tn
1,20001_10001,233.617401
3,20001_10002,56.656555
5,20001_10003,91.280228
7,20001_10004,108.411636
9,20001_10005,11.065743
11,20001_10006,38.227592
13,20001_10007,110.163574
15,20001_10008,30.860634
17,20001_10009,18.745073
19,20001_10010,13.857268


In [82]:
predictions_v1.shape

(553419, 2)

In [83]:
# Separo el campo product_id en product_id y customer_id y sumo tn por product_id
predictions_v1[['product_id', 'customer_id']] = predictions_v1['product_id'].str.split('_', expand=True)   
predictions_v1.head() 


Unnamed: 0,product_id,tn,customer_id
1,20001,233.617401,10001
3,20001,56.656555,10002
5,20001,91.280228,10003
7,20001,108.411636,10004
9,20001,11.065743,10005


Archivo para Kaggle

In [90]:
# Agregar por product_id
submission_agg = predictions_v1.groupby('product_id')['tn'].sum().reset_index()
# Elimino la columna customer_id
submission_agg.head()

Unnamed: 0,product_id,tn
0,20001,1355.413574
1,20002,936.655701
2,20003,847.747986
3,20004,580.298401
4,20005,524.434875


In [93]:
# Cargar product_id_apredecir201912.txt
df_ids = pd.read_csv('product_id_apredecir201912.txt')
df_ids.head()

# Transformar product_id a string
df_ids['product_id'] = df_ids['product_id'].astype(str)

In [97]:
# Merge para obtener los product_id que se deben predecir
submission_agg = df_ids.merge(submission_agg, on='product_id', how='left')
#submission_agg.shape

# Exportar a CSV 
submission_agg.to_csv('submission_AG.csv', index=False)