In [21]:
import pandas as pd

# Meta-model training inputs (validation month 2019-12).
# Each base-model file has its 'tn' column renamed as it is loaded, so the
# three prediction columns do not collide with each other or with the real
# 'tn' target once the frames are merged on product_id.
df_lgb = pd.read_csv('ridge_val_lgbm.csv', sep=',').rename(columns={'tn': 'tn_lgbm'})
df_avgreg = pd.read_csv('ridge_val_reg.csv', sep=',').rename(columns={'tn': 'tn_reg'})
df_auto = pd.read_csv('ridge_val_autogluon.csv', sep=',').rename(columns={'tn': 'tn_auto'})

# Real values for the validation month; 'tn' keeps its name and is used as the target.
df_real = pd.read_csv('ridge_real_201912.csv', sep=',')

In [22]:
# Build the meta-model training table: one row per product_id holding the
# three base-model predictions plus the real 'tn' target.
# validate='one_to_one' makes pandas raise MergeError if product_id is
# duplicated on either side of a merge, instead of silently multiplying rows
# and distorting the data the meta-model is fitted on.
df_meta_train = (
    df_lgb
    .merge(df_avgreg, on='product_id', validate='one_to_one')
    .merge(df_auto, on='product_id', validate='one_to_one')
    .merge(df_real, on='product_id', validate='one_to_one')
)


In [23]:
# Preview the first rows of the merged training table (head() defaults to 5 rows).
df_meta_train.head()

Unnamed: 0,product_id,tn_lgbm,tn_reg,tn_auto,tn
0,20001,1406.58824,1487.208025,1356.478165,1504.68856
1,20002,1121.951117,1098.322713,1232.46492,1087.30855
2,20003,822.951316,886.128174,886.10591,892.50129
3,20004,614.402788,629.387777,718.890832,637.90002
4,20005,562.722522,638.415234,656.209537,593.24443


In [24]:
# Test inputs (2020-02, no target available).
# Each base-model submission has its 'tn' column renamed as it is loaded, so
# the three prediction columns stay distinct after merging on product_id.
df_lgb_test = pd.read_csv('submission_mj.csv', sep=',').rename(columns={'tn': 'tn_lgbm'})
df_avgreg_test = pd.read_csv('submission_reg.csv', sep=',').rename(columns={'tn': 'tn_reg'})
df_auto_test = pd.read_csv('submission_AGP.csv', sep=',').rename(columns={'tn': 'tn_auto'})

# Product ids read from the ids file; merged on 'product_id' in a later cell.
df_ids = pd.read_csv('product_id_apredecir201912.txt')

In [25]:
import warnings

# Build the meta-model test table: one row per product_id holding the three
# base-model predictions, restricted to the ids listed in df_ids.
# validate='one_to_one' makes pandas raise MergeError if product_id is
# duplicated on either side of a merge, instead of silently multiplying rows
# (which would put repeated ids in the submission).
df_meta_test = (
    df_lgb_test
    .merge(df_avgreg_test, on='product_id', validate='one_to_one')
    .merge(df_auto_test, on='product_id', validate='one_to_one')
    .merge(df_ids, on='product_id', validate='one_to_one')
)

# The merges are inner joins, so an id missing from any base-model file is
# dropped without notice. Surface that here, because the submission built
# from this table would then be missing those products.
missing_ids = set(df_ids['product_id']) - set(df_meta_test['product_id'])
if missing_ids:
    warnings.warn(
        f"{len(missing_ids)} product_id(s) from df_ids have no prediction in at "
        f"least one base model and were dropped: {sorted(missing_ids)[:10]}"
    )

In [26]:
# Preview the first rows of the merged test table (head() defaults to 5 rows).
df_meta_test.head()

Unnamed: 0,product_id,tn_lgbm,tn_reg,tn_auto
0,20001,1452.491397,1162.707525,1307.985231
1,20002,1093.264577,1183.640604,1084.355767
2,20003,828.426547,684.763931,798.554385
3,20004,584.667495,627.215328,631.774953
4,20005,414.665791,668.270104,604.156577


In [27]:
from sklearn.linear_model import Ridge
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import KFold, cross_val_predict
import numpy as np

# Meta-features are the three base-model predictions; the target is the real tn.
X_train_stack = df_meta_train[['tn_lgbm', 'tn_reg', 'tn_auto']]
y_train_stack = df_meta_train['tn']

# L2-regularized linear blend of the base models.
# NOTE(review): the features are unscaled tn values, so alpha=1.0 is probably
# a very light penalty relative to their magnitude -- confirm it is intended.
ridge = Ridge(alpha=1.0)
ridge.fit(X_train_stack, y_train_stack)

# In-sample fit quality: these predictions are made on the same rows the
# Ridge was just fitted on, so the two metrics below are optimistic and are
# NOT an estimate of how the blend performs on unseen products/months.
y_pred_val = ridge.predict(X_train_stack)
mae_val = mean_absolute_error(y_train_stack, y_pred_val)
tfe_val = np.sum(np.abs(y_train_stack - y_pred_val)) / np.sum(y_train_stack)

print(f"MAE en validación: {mae_val:.4f}")
print(f"TFE en validación: {tfe_val:.4f}")

# Out-of-fold estimate: every row is predicted by a Ridge that did not see
# it during fitting. cross_val_predict fits fresh clones, so the `ridge`
# fitted above on all rows is left untouched for the later cells.
n_folds = min(5, len(X_train_stack))
if n_folds >= 2:
    cv = KFold(n_splits=n_folds, shuffle=True, random_state=42)
    y_pred_oof = cross_val_predict(
        Ridge(alpha=ridge.alpha), X_train_stack, y_train_stack, cv=cv
    )
    mae_oof = mean_absolute_error(y_train_stack, y_pred_oof)
    tfe_oof = np.sum(np.abs(y_train_stack - y_pred_oof)) / np.sum(y_train_stack)

    print(f"MAE out-of-fold (CV {n_folds} folds): {mae_oof:.4f}")
    print(f"TFE out-of-fold (CV {n_folds} folds): {tfe_oof:.4f}")


MAE en validación: 8.0392
TFE en validación: 0.2833


In [28]:
# Copy of the training table with the fitted Ridge's predictions added as a
# new column; assign() returns a new frame, so df_meta_train is not modified.
df_meta_validation = df_meta_train.assign(
    tn_pred_ridge=ridge.predict(X_train_stack)
)
df_meta_validation.head(20)

Unnamed: 0,product_id,tn_lgbm,tn_reg,tn_auto,tn,tn_pred_ridge
0,20001,1406.58824,1487.208025,1356.478165,1504.68856,1415.424507
1,20002,1121.951117,1098.322713,1232.46492,1087.30855,1131.813484
2,20003,822.951316,886.128174,886.10591,892.50129,830.836645
3,20004,614.402788,629.387777,718.890832,637.90002,620.598072
4,20005,562.722522,638.415234,656.209537,593.24443,569.963878
5,20006,422.668839,394.780151,407.258877,417.23228,420.379125
6,20007,401.765152,439.204575,351.148825,390.43432,399.888274
7,20008,290.295918,228.952641,349.218496,195.36854,288.964381
8,20009,480.091426,533.015317,475.361304,495.03574,481.476887
9,20010,351.185974,320.298816,366.954407,359.59998,349.516133


In [31]:
# Write the validation table (base predictions, real tn, Ridge prediction)
# to an Excel workbook, without the index column.
df_meta_validation.to_excel(excel_writer='meta_validation_ridge.xlsx', index=False)

In [34]:
# Ridge prediction for the test month, using the same three feature columns
# the model was fitted on.
X_test_stack = df_meta_test[['tn_lgbm', 'tn_reg', 'tn_auto']]
df_meta_test['tn_pred_ridge'] = ridge.predict(X_test_stack)

# Kaggle file: only product_id and the blended prediction, exposed as 'tn'.
submission = (
    df_meta_test[['product_id', 'tn_pred_ridge']]
    .rename(columns={'tn_pred_ridge': 'tn'})
)
submission.to_csv('submission_stacking.csv', index=False)


In [37]:
# Write the test table (base predictions plus Ridge prediction) to an Excel
# workbook, without the index column.
df_meta_test.to_excel(excel_writer='meta_test_ridge.xlsx', index=False)

In [36]:
# Preview the first 20 rows of the test table, including the Ridge prediction.
df_meta_test.head(n=20)

Unnamed: 0,product_id,tn_lgbm,tn_reg,tn_auto,tn_pred_ridge
0,20001,1452.491397,1162.707525,1307.985231,1445.323157
1,20002,1093.264577,1183.640604,1084.355767,1101.356522
2,20003,828.426547,684.763931,798.554385,825.407125
3,20004,584.667495,627.215328,631.774953,588.964633
4,20005,414.665791,668.270104,604.156577,430.672072
5,20006,393.161936,482.886867,433.176036,396.941309
6,20007,398.488293,434.137806,371.349191,397.559605
7,20008,380.093273,422.340199,372.376202,380.071707
8,20009,425.59098,541.322587,454.053399,430.019212
9,20010,420.078591,418.689888,359.538751,416.647218


In [58]:
# NOTE(review): 'submission_stacking_m.csv' is not written by any cell visible
# in this notebook (the Ridge cell writes 'submission_stacking.csv'); it looks
# like a separately produced ';'-separated file -- confirm its origin.
df = pd.read_csv('submission_stacking_m.csv', sep=';')

# Convert decimal commas to points and parse 'tn' as float.
# Going through astype(str) first keeps this working when pandas has already
# parsed 'tn' as numeric (no commas in the file); the .str accessor would
# raise AttributeError on a numeric column.
df['tn'] = (
    df['tn']
    .astype(str)
    .str.replace(',', '.', regex=False)
    .astype(float)
)

# Scale every prediction by a single coefficient (1 leaves values unchanged).
coeficiente = 1
df_coef = df.assign(tn=df['tn'] * coeficiente)
df_coef.to_csv('submission_stacking_z.csv', index=False, sep=',')

In [59]:
# Total of the 'tn' column after the coefficient was applied.
df_coef.loc[:, 'tn'].sum()

27498.469999999998