In [1]:
import pandas as pd

# Cargar product_id_apredecir201912.txt
df_ids = pd.read_csv('product_id_apredecir201912.txt')
df_ids.head()

Unnamed: 0,product_id
0,20001
1,20002
2,20003
3,20004
4,20005


In [2]:
# Cargar sell-in.txt (puede ser un archivo grande, leer solo columnas necesarias)
sellin_cols = ['periodo', 'customer_id', 'product_id', 'plan_precios_cuidados', 'cust_request_qty', 'cust_request_tn', 'tn']
df_sellin = pd.read_csv('sell-in.txt', sep='\t', usecols=sellin_cols)
df_sellin.head()

Unnamed: 0,periodo,customer_id,product_id,plan_precios_cuidados,cust_request_qty,cust_request_tn,tn
0,201701,10234,20524,0,2,0.053,0.053
1,201701,10032,20524,0,1,0.13628,0.13628
2,201701,10217,20524,0,1,0.03028,0.03028
3,201701,10125,20524,0,1,0.02271,0.02271
4,201701,10012,20524,0,11,1.54452,1.54452


In [3]:
# Si 'periodo' es tipo string o int, conviértelo a datetime para mayor facilidad
df_sellin['periodo'] = pd.to_datetime(df_sellin['periodo'], format='%Y%m')
df_sellin= df_sellin.sort_values(['product_id', 'customer_id', 'periodo']).reset_index(drop=True)


In [4]:
# Agrupo por 'product_id' y período, y calculo la suma de 'cust_request_qty' y 'tn'
df_agg = df_sellin.groupby(['product_id', 'periodo']).agg({'cust_request_qty': 'sum','tn': 'sum'}).reset_index()   
df_agg.head() 

Unnamed: 0,product_id,periodo,cust_request_qty,tn
0,20001,2017-01-01,479,934.77222
1,20001,2017-02-01,432,798.0162
2,20001,2017-03-01,509,1303.35771
3,20001,2017-04-01,279,1069.9613
4,20001,2017-05-01,701,1502.20132


### Variable target tn+2

In [5]:
# Creamos la variable objetivo: tn en t+2 para cada combinación product_id y periodo

# Paso 1: Crear columna con periodo +2 meses
df_agg['periodo_target'] = df_agg['periodo'] + pd.DateOffset(months=2)

# Paso 2: Crear DataFrame con target
target_df = df_agg[['product_id', 'periodo', 'tn']].copy()
target_df.rename(columns={'periodo': 'periodo_target', 'tn': 'tn_t_plus_2'}, inplace=True)

# Paso 3: Hacer el merge
df_agg = df_agg.merge(
    target_df,
    on=['product_id', 'periodo_target'],
    how='left'
)

# Paso 4: Validar
print(df_agg[['product_id', 'periodo', 'tn', 'tn_t_plus_2']].head(20))

    product_id    periodo          tn  tn_t_plus_2
0        20001 2017-01-01   934.77222   1303.35771
1        20001 2017-02-01   798.01620   1069.96130
2        20001 2017-03-01  1303.35771   1502.20132
3        20001 2017-04-01  1069.96130   1520.06539
4        20001 2017-05-01  1502.20132   1030.67391
5        20001 2017-06-01  1520.06539   1267.39462
6        20001 2017-07-01  1030.67391   1316.94604
7        20001 2017-08-01  1267.39462   1439.75563
8        20001 2017-09-01  1316.94604   1580.47401
9        20001 2017-10-01  1439.75563   1049.38860
10       20001 2017-11-01  1580.47401   1169.07532
11       20001 2017-12-01  1049.38860   1043.76470
12       20001 2018-01-01  1169.07532   1856.83534
13       20001 2018-02-01  1043.76470   1251.28462
14       20001 2018-03-01  1856.83534   1293.89788
15       20001 2018-04-01  1251.28462   1150.79169
16       20001 2018-05-01  1293.89788   1470.41009
17       20001 2018-06-01  1150.79169   1800.96168
18       20001 2018-07-01  1470

### Feature Engineering

In [6]:
# Creo lags de tn 

for lag in range(1, 12):
    df_agg[f'tn_lag_{lag}'] = df_agg.groupby(['product_id'])['tn'].shift(lag)

In [7]:
df_agg.head(20)

Unnamed: 0,product_id,periodo,cust_request_qty,tn,periodo_target,tn_t_plus_2,tn_lag_1,tn_lag_2,tn_lag_3,tn_lag_4,tn_lag_5,tn_lag_6,tn_lag_7,tn_lag_8,tn_lag_9,tn_lag_10,tn_lag_11
0,20001,2017-01-01,479,934.77222,2017-03-01,1303.35771,,,,,,,,,,,
1,20001,2017-02-01,432,798.0162,2017-04-01,1069.9613,934.77222,,,,,,,,,,
2,20001,2017-03-01,509,1303.35771,2017-05-01,1502.20132,798.0162,934.77222,,,,,,,,,
3,20001,2017-04-01,279,1069.9613,2017-06-01,1520.06539,1303.35771,798.0162,934.77222,,,,,,,,
4,20001,2017-05-01,701,1502.20132,2017-07-01,1030.67391,1069.9613,1303.35771,798.0162,934.77222,,,,,,,
5,20001,2017-06-01,570,1520.06539,2017-08-01,1267.39462,1502.20132,1069.9613,1303.35771,798.0162,934.77222,,,,,,
6,20001,2017-07-01,381,1030.67391,2017-09-01,1316.94604,1520.06539,1502.20132,1069.9613,1303.35771,798.0162,934.77222,,,,,
7,20001,2017-08-01,643,1267.39462,2017-10-01,1439.75563,1030.67391,1520.06539,1502.20132,1069.9613,1303.35771,798.0162,934.77222,,,,
8,20001,2017-09-01,381,1316.94604,2017-11-01,1580.47401,1267.39462,1030.67391,1520.06539,1502.20132,1069.9613,1303.35771,798.0162,934.77222,,,
9,20001,2017-10-01,273,1439.75563,2017-12-01,1049.3886,1316.94604,1267.39462,1030.67391,1520.06539,1502.20132,1069.9613,1303.35771,798.0162,934.77222,,


### Filtro dataset

In [8]:
# Filtro por periodo == 201812 y product_ids 

magicos = [20002, 20003, 20006, 20010, 20011, 20018, 20019, 20021,
           20026, 20028, 20035, 20039, 20042, 20044, 20045, 20046, 
           20049, 20051, 20052, 20053, 20055, 20008, 20001, 
           20017, 20086, 20180, 20193, 20320, 20532, 
           20612, 20637, 20807, 20838]

df_filtrado = df_agg[(df_agg['periodo'] == '2019-10-01') & (df_agg['product_id'].isin(magicos))].reset_index(drop=True)
df_filtrado.head()

Unnamed: 0,product_id,periodo,cust_request_qty,tn,periodo_target,tn_t_plus_2,tn_lag_1,tn_lag_2,tn_lag_3,tn_lag_4,tn_lag_5,tn_lag_6,tn_lag_7,tn_lag_8,tn_lag_9,tn_lag_10,tn_lag_11
0,20001,2019-10-01,367,1561.50552,2019-12-01,1504.68856,1660.00561,1261.34529,1678.99318,1109.93769,1629.78233,1647.63848,1470.65653,1259.09363,1275.77351,1486.68669,1813.01511
1,20002,2019-10-01,312,1979.53635,2019-12-01,1087.30855,1090.18771,813.78215,1066.44999,928.36431,1034.98927,1287.62346,1083.62552,1043.01349,1266.78751,1009.45458,1766.81068
2,20003,2019-10-01,404,1081.36645,2019-12-01,892.50129,967.77116,635.59563,715.20314,662.38654,590.12515,565.33774,638.0401,758.32657,964.76919,769.82869,1206.91773
3,20006,2019-10-01,384,528.3263,2019-12-01,417.23228,409.95501,262.73593,343.11053,458.0418,527.68846,835.47883,502.43741,479.99914,578.74461,407.75925,566.66809
4,20008,2019-10-01,372,452.77197,2019-12-01,195.36854,330.56343,233.00983,524.04994,567.42091,486.36682,403.69191,454.57037,476.98787,543.27828,426.32899,433.5017


In [9]:
# Elimino columnas que no necesito
df_filtrado = df_filtrado.drop(columns=['periodo_target', 'cust_request_qty'])

In [10]:
# Ordeno las columnas y dejo 'tn_t_plus_2' al final
columnas_ordenadas = ['product_id', 'periodo', 'tn'] + [f'tn_lag_{lag}' for lag in range(1, 12)] + ['tn_t_plus_2']
df_filtrado = df_filtrado[columnas_ordenadas]
df_filtrado.head()

Unnamed: 0,product_id,periodo,tn,tn_lag_1,tn_lag_2,tn_lag_3,tn_lag_4,tn_lag_5,tn_lag_6,tn_lag_7,tn_lag_8,tn_lag_9,tn_lag_10,tn_lag_11,tn_t_plus_2
0,20001,2019-10-01,1561.50552,1660.00561,1261.34529,1678.99318,1109.93769,1629.78233,1647.63848,1470.65653,1259.09363,1275.77351,1486.68669,1813.01511,1504.68856
1,20002,2019-10-01,1979.53635,1090.18771,813.78215,1066.44999,928.36431,1034.98927,1287.62346,1083.62552,1043.01349,1266.78751,1009.45458,1766.81068,1087.30855
2,20003,2019-10-01,1081.36645,967.77116,635.59563,715.20314,662.38654,590.12515,565.33774,638.0401,758.32657,964.76919,769.82869,1206.91773,892.50129
3,20006,2019-10-01,528.3263,409.95501,262.73593,343.11053,458.0418,527.68846,835.47883,502.43741,479.99914,578.74461,407.75925,566.66809,417.23228
4,20008,2019-10-01,452.77197,330.56343,233.00983,524.04994,567.42091,486.36682,403.69191,454.57037,476.98787,543.27828,426.32899,433.5017,195.36854


### Modelo de regresión lineal

In [11]:
# Entreno un modelo de regresión lineal
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression      
from sklearn.metrics import mean_squared_error, r2_score
# Divido el dataset en entrenamiento y prueba
X = df_filtrado.drop(columns=['tn_t_plus_2', 'periodo', 'product_id'])
y = df_filtrado['tn_t_plus_2']
X_train = X
y_train = y


In [12]:
# Entreno el modelo
modelo = LinearRegression(fit_intercept=True)
modelo.fit(X_train, y_train)

In [13]:
# Creo dataset a predecir
df_201912 = df_agg[(df_agg['periodo'] == '2019-12-01') & (df_agg['product_id'].isin(magicos))].reset_index(drop=True)
# Elimino columnas que no necesito
df_201912 = df_201912.drop(columns=['periodo_target', 'cust_request_qty'])
df_201912.head()   

Unnamed: 0,product_id,periodo,tn,tn_t_plus_2,tn_lag_1,tn_lag_2,tn_lag_3,tn_lag_4,tn_lag_5,tn_lag_6,tn_lag_7,tn_lag_8,tn_lag_9,tn_lag_10,tn_lag_11
0,20001,2019-12-01,1504.68856,,1397.37231,1561.50552,1660.00561,1261.34529,1678.99318,1109.93769,1629.78233,1647.63848,1470.65653,1259.09363,1275.77351
1,20002,2019-12-01,1087.30855,,1423.57739,1979.53635,1090.18771,813.78215,1066.44999,928.36431,1034.98927,1287.62346,1083.62552,1043.01349,1266.78751
2,20003,2019-12-01,892.50129,,948.29393,1081.36645,967.77116,635.59563,715.20314,662.38654,590.12515,565.33774,638.0401,758.32657,964.76919
3,20006,2019-12-01,417.23228,,399.6142,528.3263,409.95501,262.73593,343.11053,458.0418,527.68846,835.47883,502.43741,479.99914,578.74461
4,20008,2019-12-01,195.36854,,396.49833,452.77197,330.56343,233.00983,524.04994,567.42091,486.36682,403.69191,454.57037,476.98787,543.27828


In [14]:
# Creo conjunto de test
X_test = df_201912.drop(columns=['tn_t_plus_2', 'periodo', 'product_id'])
#y_test = df_201912['tn_t_plus_2']

# Realizo predicciones
y_pred = modelo.predict(X_test)


In [15]:

# Muestro los coeficientes del modelo
coeficientes = pd.DataFrame(modelo.coef_, X.columns, columns=['Coeficiente'])
print(coeficientes.sort_values(by='Coeficiente', ascending=False))

           Coeficiente
tn_lag_2      0.611242
tn_lag_1      0.461337
tn_lag_7      0.392797
tn_lag_8      0.189717
tn_lag_6      0.086108
tn_lag_9      0.055790
tn_lag_11     0.048718
tn_lag_10     0.042796
tn            0.003652
tn_lag_3     -0.205062
tn_lag_4     -0.229673
tn_lag_5     -0.379827


In [16]:
print(y_pred)

[1564.61237241 1934.15084695  978.18509708  697.48204642  510.17716909
  503.12930096  348.15343843  401.72450459  412.12016099  423.22359125
  303.54412982  369.58388609  215.39799076  177.47143632  176.30145046
  221.22399792  262.26952719  195.32663243  183.0345783   203.32626462
  144.57868354  129.82260886  186.04149916  104.8795566    71.01661791
   20.80835256   22.95520486    4.56298158  -12.16684285   -9.12148062
  -13.12106638  -15.80166086  -16.00375677]


In [17]:
# Creo un df a partir de y_pred
df_pred = pd.DataFrame({
    'product_id': df_201912['product_id'],
    'tn_t_plus_2_pred': y_pred
})

# Cambio el nombre de la columna a tn_t_plus_2_pred a tn
df_pred.rename(columns={'tn_t_plus_2_pred': 'tn'}, inplace=True)
df_pred.head()


Unnamed: 0,product_id,tn
0,20001,1564.612372
1,20002,1934.150847
2,20003,978.185097
3,20006,697.482046
4,20008,510.177169


### Creo predicciones para los productos no mágicos

In [18]:
# Filtro productos no mágicos
df_resto = df_agg[(~df_agg['product_id'].isin(magicos))].reset_index(drop=True)
df_resto.head()

Unnamed: 0,product_id,periodo,cust_request_qty,tn,periodo_target,tn_t_plus_2,tn_lag_1,tn_lag_2,tn_lag_3,tn_lag_4,tn_lag_5,tn_lag_6,tn_lag_7,tn_lag_8,tn_lag_9,tn_lag_10,tn_lag_11
0,20004,2017-01-01,339,555.91614,2017-03-01,489.91328,,,,,,,,,,,
1,20004,2017-02-01,348,508.20044,2017-04-01,512.05402,555.91614,,,,,,,,,,
2,20004,2017-03-01,394,489.91328,2017-05-01,543.3667,508.20044,555.91614,,,,,,,,,
3,20004,2017-04-01,436,512.05402,2017-06-01,590.50779,489.91328,508.20044,555.91614,,,,,,,,
4,20004,2017-05-01,452,543.3667,2017-07-01,569.88117,512.05402,489.91328,508.20044,555.91614,,,,,,,


In [19]:
# Calculo la media de tn para los productos no mágicos para 2019
df_resto_2019 = df_resto[(df_resto['periodo'] >= '2018-11-01') & (df_resto['periodo'] <= '2019-10-01')]
media_tn_resto = df_resto_2019.groupby('product_id')['tn'].mean().reset_index()
media_tn_resto.head()

Unnamed: 0,product_id,tn
0,20004,629.387777
1,20005,638.415234
2,20007,439.204575
3,20009,533.015317
4,20012,343.879977


---

#### Predicciones para diciembre 2019 (validación para stacking)

In [20]:
# Creo dataset a predecir
df_201910 = df_agg[(df_agg['periodo'] == '2019-10-01') & (df_agg['product_id'].isin(magicos))].reset_index(drop=True)
# Elimino columnas que no necesito
df_201910 = df_201910.drop(columns=['periodo_target', 'cust_request_qty'])

# Creo conjunto de test
X_valid = df_201910.drop(columns=['tn_t_plus_2', 'periodo', 'product_id'])

# Realizo predicciones
y_valid = modelo.predict(X_valid)


# Creo un df a partir de y_valid
df_valid = pd.DataFrame({
    'product_id': df_201910['product_id'],
    'tn_t_plus_2_pred': y_valid
})

# Cambio el nombre de la columna a tn_t_plus_2_pred a tn
df_valid.rename(columns={'tn_t_plus_2_pred': 'tn'}, inplace=True)
df_valid.shape


(33, 2)

In [21]:
# Calculo la media de tn para los productos no mágicos entre 2018_10 y 2019_10
df_resto_valid = df_resto[(df_resto['periodo'] >= '2018-11-01') & (df_resto['periodo'] <= '2019-10-01')].reset_index(drop=True)
# Calculo la media de tn para los productos no mágicos
media_tn_valid = df_resto_valid.groupby('product_id')['tn'].mean().reset_index()
media_tn_valid.shape

(1056, 2)

In [26]:
# Concateno ambas predicciones
df_final_v = pd.concat([df_valid, media_tn_valid], ignore_index=True, sort=False)
df_final_v.shape

# Exporto df_final a un archivo CSV
df_final_v.to_csv('ridge_val_reg.csv', index=False)

---

### Preparo el archivo para Kaggle

In [27]:
# Concateno ambas predicciones
df_final = pd.concat([df_pred, media_tn_resto], ignore_index=True, sort=False)

# Hago el merge con df_ids
df_final = df_final.merge(df_ids, on='product_id', how='right')
#df_final.shape

# Exporto df_final a un archivo CSV
df_final.to_csv('submission_reg.csv', index=False)

In [24]:
# Hago un ensamble con la predicción del LGBM del modelo jerárquico
#df_lgbm = pd.read_csv('submission_mj.csv')

# Importo los resultados del modelo autogluon
#df_autogluon = pd.read_csv('submission_AGP.csv')

# Hago un ensamble simple
#df_ensamble = df_final.copy()
#df_ensamble['tn'] = (df_final['tn'] + df_autogluon['tn']) / 2 # + df_lgbm['tn']
# Exporto el archivo final
#df_ensamble.to_csv('submission_ensamble.csv', index=False)
#df_ensamble.head()
#df_ensamble.shape

In [25]:
# Multiplico df_final por un coeficiente
#coeficiente = 0.965  # Ajusta este valor según sea necesario
#df_coef = df_final.copy()
#df_coef['tn'] *= coeficiente
# Exporto df_final modificado a un archivo CSV
#df_coef.to_csv('submission_reg_coef.csv', index=False)