In [None]:
import os
os.chdir('..')

In [1]:
import seaborn as sns
import matplotlib.pyplot as plt
import pandas as pd
import lightgbm as lgb
from scripts import *
import pickle

In [2]:
plt.rcParams['figure.figsize'] = (12, 6)
sns.set(style='whitegrid', palette='muted', font_scale=1.1)
sns.set_context("notebook", rc={"figure.figsize": (12, 6)})

In [3]:
full_dataset_path = './datasets/full_dataset.parquet'
productos_a_predecir_path = './datasets/product_id_apredecir201912.txt'

# Cargar el modelo

In [4]:
df_full = pd.read_parquet(full_dataset_path)

In [5]:
df_full['cat1'] = df_full['cat1'].astype('category')
df_full['cat2'] = df_full['cat2'].astype('category')
df_full['cat3'] = df_full['cat3'].astype('category')
df_full['brand'] = df_full['brand'].astype('category')
df_full['sku_size'] = df_full['sku_size'].astype('category')

In [6]:
model = lgb.Booster(model_file='modelo_lgb.txt')
print("Modelo cargado exitosamente")

Modelo cargado exitosamente


In [7]:
future_periods = ['201910']

In [8]:
df_future = df_full[df_full['periodo'].isin(future_periods)]

In [9]:
features = [col for col in df_future.columns if col not in ['target', 'weight_col', 'w_volumen', 'w_frecuencia', 'w_estabilidad', 'w_rank', 'w_tn', 'periodo', 'periodo_dt', 'year', 'customer_id', 'product_id', 'customer_id_limited', 'product_id_limited']]
categorical_cols = ['cat1', 'cat2', 'cat3', 'brand', 'sku_size', 'customer_id_limited_encoded', 'product_id_limited_encoded']

In [10]:
del df_full

# Predicción

In [11]:
X_pred = df_future[features]
X_pred

Unnamed: 0,tn,cat1,cat2,cat3,brand,sku_size,month,days_in_month,quarter,month_sin,...,tn_is_max_32,tn_is_max_33,tn_is_max_34,tn_is_max_35,tn_is_max_36,customer_id_limited_encoded,product_id_limited_encoded,lr_slope,lr_intercept,r_squared
33,9.032447,HC,ROPA LAVADO,Liquido,ARIEL,3000,10,31,4,-0.866025,...,0,0,0,0,0,1,1,0.179702,5.903437,0.117251
69,31.137949,HC,ROPA LAVADO,Liquido,LIMPIEX,3000,10,31,4,-0.866025,...,0,0,0,0,0,1,2,0.489256,5.009580,0.411381
105,19.634527,FOODS,ADEREZOS,Mayonesa,NATURA,475,10,31,4,-0.866025,...,0,0,0,0,0,1,3,0.044716,10.201197,0.005090
141,6.242476,FOODS,ADEREZOS,Mayonesa,NATURA,240,10,31,4,-0.866025,...,1,0,0,0,0,1,4,0.026879,2.913979,0.027056
177,0.903603,FOODS,ADEREZOS,Mayonesa,NATURA,120,10,31,4,-0.866025,...,0,0,0,0,0,1,0,0.029540,-0.428097,0.668197
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
15547323,-0.075557,PC,PIEL1,CUIDADO ESPECIAL,LANCOME,32,10,31,4,-0.866025,...,0,0,0,0,0,232,0,0.000000,-0.075557,1.000000
15547329,-0.062352,PC,PIEL1,Cara,NIVEA,250,10,31,4,-0.866025,...,0,0,0,0,0,232,0,0.000000,-0.062352,1.000000
15547335,-0.071144,PC,PIEL1,Cuerpo,NIVEA,200,10,31,4,-0.866025,...,0,0,0,0,0,232,0,0.000000,-0.071144,1.000000
15547339,-0.062257,REF,TE,Frutas,TWININGS,20,10,31,4,-0.866025,...,0,0,0,0,0,232,0,0.000000,-0.062257,1.000000


In [12]:
predicciones = model.predict(X_pred, categorical_feature=categorical_cols)
predicciones

array([10.78659252, 18.10408849, 12.02744617, ..., -0.07535467,
       -0.07010771, -0.07010771])

In [13]:
predicciones = pd.DataFrame(predicciones, columns=['target_predicted'], index=X_pred.index)
predicciones

Unnamed: 0,target_predicted
33,10.786593
69,18.104088
105,12.027446
141,4.437835
177,0.715233
...,...
15547323,-0.090141
15547329,-0.081435
15547335,-0.075355
15547339,-0.070108


In [14]:
df_future.loc[predicciones.index, 'target_predicted'] = predicciones['target_predicted']
df_future

Unnamed: 0,customer_id,product_id,periodo,tn,cat1,cat2,cat3,brand,sku_size,periodo_dt,...,tn_is_max_36,customer_id_limited,product_id_limited,customer_id_limited_encoded,product_id_limited_encoded,target,lr_slope,lr_intercept,r_squared,target_predicted
33,10001,20001,201910,9.032447,HC,ROPA LAVADO,Liquido,ARIEL,3000,2019-10-01,...,0,10001,20001,1,1,9.250914,0.179702,5.903437,0.117251,10.786593
69,10001,20002,201910,31.137949,HC,ROPA LAVADO,Liquido,LIMPIEX,3000,2019-10-01,...,0,10001,20002,1,2,24.105200,0.489256,5.009580,0.411381,18.104088
105,10001,20003,201910,19.634527,FOODS,ADEREZOS,Mayonesa,NATURA,475,2019-10-01,...,0,10001,20003,1,3,13.756374,0.044716,10.201197,0.005090,12.027446
141,10001,20004,201910,6.242476,FOODS,ADEREZOS,Mayonesa,NATURA,240,2019-10-01,...,0,10001,20004,1,4,1.979769,0.026879,2.913979,0.027056,4.437835
177,10001,20005,201910,0.903603,FOODS,ADEREZOS,Mayonesa,NATURA,120,2019-10-01,...,0,10001,-1,1,0,0.860060,0.029540,-0.428097,0.668197,0.715233
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
15547323,10606,21266,201910,-0.075557,PC,PIEL1,CUIDADO ESPECIAL,LANCOME,32,2019-10-01,...,0,10606,-1,232,0,-0.075557,0.000000,-0.075557,1.000000,-0.090141
15547329,10606,21267,201910,-0.062352,PC,PIEL1,Cara,NIVEA,250,2019-10-01,...,0,10606,-1,232,0,-0.062352,0.000000,-0.062352,1.000000,-0.081435
15547335,10606,21269,201910,-0.071144,PC,PIEL1,Cuerpo,NIVEA,200,2019-10-01,...,0,10606,-1,232,0,,0.000000,-0.071144,1.000000,-0.075355
15547339,10606,21271,201910,-0.062257,REF,TE,Frutas,TWININGS,20,2019-10-01,...,0,10606,-1,232,0,-0.062257,0.000000,-0.062257,1.000000,-0.070108


In [15]:
df_future['product_id'] = df_future['product_id'].astype(str)

In [16]:
tn_scaler_path = './scalers/scalers.pkl'

scalers = {}
with open(tn_scaler_path, 'rb') as f:
    scalers = pickle.load(f)

scaled_tn = []
for product_id, group in df_future.groupby('product_id'):
    group = group.copy()
    if product_id in scalers:
        scaler = scalers[product_id]
        group['tn_unscaled'] = scaler.inverse_transform(group[['tn']])
        group['target_unscaled'] = scaler.inverse_transform(group[['target']])
        group['target_predicted_unscaled'] = scaler.inverse_transform(group[['target_predicted']])
    else:
        print(f"Warning: No scaler found for product {product_id}")
        group['tn_unscaled'] = 0
        group['target_unscaled'] = 0
        group['target_predicted_unscaled'] = 0
    scaled_tn.append(group)

df_predictions = pd.concat(scaled_tn, axis=0)

https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations


In [17]:
df_predictions.loc[df_predictions['target_predicted_unscaled'] < 0, 'target_predicted_unscaled'] = 0
df_predictions.loc[df_predictions['target_unscaled'] < 0, 'target_unscaled'] = 0
df_predictions.loc[df_predictions['tn_unscaled'] < 0, 'tn_unscaled'] = 0

df_predictions

Unnamed: 0,customer_id,product_id,periodo,tn,cat1,cat2,cat3,brand,sku_size,periodo_dt,...,customer_id_limited_encoded,product_id_limited_encoded,target,lr_slope,lr_intercept,r_squared,target_predicted,tn_unscaled,target_unscaled,target_predicted_unscaled
33,10001,20001,201910,9.032447,HC,ROPA LAVADO,Liquido,ARIEL,3000,2019-10-01,...,1,1,9.250914,0.179702,5.903437,0.117251,10.786593,176.029800,180.219360,209.669325
31632,10002,20001,201910,0.761056,HC,ROPA LAVADO,Liquido,ARIEL,3000,2019-10-01,...,2,1,5.763028,0.049028,1.016766,0.089444,2.124177,17.408060,113.331650,43.548833
63231,10003,20001,201910,3.816681,HC,ROPA LAVADO,Liquido,ARIEL,3000,2019-10-01,...,3,1,5.186484,0.038530,5.208741,0.012578,7.884269,76.006256,102.275169,154.011011
94830,10004,20001,201910,16.798557,HC,ROPA LAVADO,Liquido,ARIEL,3000,2019-10-01,...,4,1,1.660045,-0.091719,12.670132,0.026298,8.652133,324.961731,34.648102,168.736466
126429,10005,20001,201910,0.747037,HC,ROPA LAVADO,Liquido,ARIEL,3000,2019-10-01,...,5,1,0.875548,-0.017100,1.084770,0.056054,0.515729,17.139210,19.603682,12.703371
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
15418862,10584,21276,201910,-0.056134,PC,PIEL1,Cara,NIVEA,140,2019-10-01,...,221,0,-0.056134,0.000000,-0.056134,1.000000,-0.070108,0.000000,0.000000,0.000000
15422658,10587,21276,201910,-0.056134,PC,PIEL1,Cara,NIVEA,140,2019-10-01,...,222,0,,0.000000,-0.056134,1.000000,-0.047566,0.000000,,0.000016
15491114,10597,21276,201910,-0.056134,PC,PIEL1,Cara,NIVEA,140,2019-10-01,...,0,0,,0.000000,-0.056134,1.000000,-0.070108,0.000000,,0.000000
15493967,10599,21276,201910,-0.056134,PC,PIEL1,Cara,NIVEA,140,2019-10-01,...,229,0,-0.056134,0.000000,-0.056134,1.000000,-0.068670,0.000000,0.000000,0.000000


# Productos a predecir

In [18]:
df_productos_a_predecir = pd.read_csv(productos_a_predecir_path, dtype={'product_id': 'str'})
df_productos_a_predecir

Unnamed: 0,product_id
0,20001
1,20002
2,20003
3,20004
4,20005
...,...
775,21263
776,21265
777,21266
778,21267


In [19]:
df_productos_a_predecir = pd.read_csv(productos_a_predecir_path, dtype={'product_id': 'str'})
df_productos_a_predecir = df_productos_a_predecir.drop_duplicates()

df_predictions = df_predictions[
    (df_predictions['product_id'].isin(df_productos_a_predecir['product_id'])) & 
    (df_predictions['periodo'].isin(future_periods))
]

df_predictions['product_id'] = df_predictions['product_id'].astype(str)

df_predictions

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_predictions['product_id'] = df_predictions['product_id'].astype(str)


Unnamed: 0,customer_id,product_id,periodo,tn,cat1,cat2,cat3,brand,sku_size,periodo_dt,...,customer_id_limited_encoded,product_id_limited_encoded,target,lr_slope,lr_intercept,r_squared,target_predicted,tn_unscaled,target_unscaled,target_predicted_unscaled
33,10001,20001,201910,9.032447,HC,ROPA LAVADO,Liquido,ARIEL,3000,2019-10-01,...,1,1,9.250914,0.179702,5.903437,0.117251,10.786593,176.029800,180.219360,209.669325
31632,10002,20001,201910,0.761056,HC,ROPA LAVADO,Liquido,ARIEL,3000,2019-10-01,...,2,1,5.763028,0.049028,1.016766,0.089444,2.124177,17.408060,113.331650,43.548833
63231,10003,20001,201910,3.816681,HC,ROPA LAVADO,Liquido,ARIEL,3000,2019-10-01,...,3,1,5.186484,0.038530,5.208741,0.012578,7.884269,76.006256,102.275169,154.011011
94830,10004,20001,201910,16.798557,HC,ROPA LAVADO,Liquido,ARIEL,3000,2019-10-01,...,4,1,1.660045,-0.091719,12.670132,0.026298,8.652133,324.961731,34.648102,168.736466
126429,10005,20001,201910,0.747037,HC,ROPA LAVADO,Liquido,ARIEL,3000,2019-10-01,...,5,1,0.875548,-0.017100,1.084770,0.056054,0.515729,17.139210,19.603682,12.703371
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
15418862,10584,21276,201910,-0.056134,PC,PIEL1,Cara,NIVEA,140,2019-10-01,...,221,0,-0.056134,0.000000,-0.056134,1.000000,-0.070108,0.000000,0.000000,0.000000
15422658,10587,21276,201910,-0.056134,PC,PIEL1,Cara,NIVEA,140,2019-10-01,...,222,0,,0.000000,-0.056134,1.000000,-0.047566,0.000000,,0.000016
15491114,10597,21276,201910,-0.056134,PC,PIEL1,Cara,NIVEA,140,2019-10-01,...,0,0,,0.000000,-0.056134,1.000000,-0.070108,0.000000,,0.000000
15493967,10599,21276,201910,-0.056134,PC,PIEL1,Cara,NIVEA,140,2019-10-01,...,229,0,-0.056134,0.000000,-0.056134,1.000000,-0.068670,0.000000,0.000000,0.000000


In [20]:
df_predictions = df_predictions[['product_id', 'periodo', 'tn_unscaled', 'target_unscaled', 'target_predicted_unscaled']]
df_predictions = df_predictions.rename(columns={'tn_unscaled': 'tn', 'target_unscaled': 'target', 'target_predicted_unscaled': 'target_predicted'})
df_predictions

Unnamed: 0,product_id,periodo,tn,target,target_predicted
33,20001,201910,176.029800,180.219360,209.669325
31632,20001,201910,17.408060,113.331650,43.548833
63231,20001,201910,76.006256,102.275169,154.011011
94830,20001,201910,324.961731,34.648102,168.736466
126429,20001,201910,17.139210,19.603682,12.703371
...,...,...,...,...,...
15418862,21276,201910,0.000000,0.000000,0.000000
15422658,21276,201910,0.000000,,0.000016
15491114,21276,201910,0.000000,,0.000000
15493967,21276,201910,0.000000,0.000000,0.000000


In [21]:
df_predictions = df_predictions.groupby('product_id').agg({
    'target': 'sum',
    'target_predicted': 'sum'
}).reset_index()
df_predictions

Unnamed: 0,product_id,target,target_predicted
0,20001,1504.688599,1566.153466
1,20002,1087.308594,1276.564338
2,20003,891.256409,876.396524
3,20004,637.848267,716.930310
4,20005,592.535706,644.777111
...,...,...,...
775,21263,0.012700,0.035097
776,21265,0.050070,0.071955
777,21266,0.051210,0.074884
778,21267,0.015690,0.071849


In [22]:
df_predictions['diff'] = df_predictions['target'] - df_predictions['target_predicted']
df_predictions

Unnamed: 0,product_id,target,target_predicted,diff
0,20001,1504.688599,1566.153466,-61.464867
1,20002,1087.308594,1276.564338,-189.255744
2,20003,891.256409,876.396524,14.859884
3,20004,637.848267,716.930310,-79.082044
4,20005,592.535706,644.777111,-52.241406
...,...,...,...,...
775,21263,0.012700,0.035097,-0.022397
776,21265,0.050070,0.071955,-0.021885
777,21266,0.051210,0.074884,-0.023674
778,21267,0.015690,0.071849,-0.056159


In [23]:
df_predictions.to_csv('./forecast_lightgbm_dic.csv', index=False)