In [None]:
import os
os.chdir('..')

In [1]:
import seaborn as sns
import matplotlib.pyplot as plt
import pandas as pd
import lightgbm as lgb
from scripts import *
import pickle

In [2]:
plt.rcParams['figure.figsize'] = (12, 6)
sns.set(style='whitegrid', palette='muted', font_scale=1.1)
sns.set_context("notebook", rc={"figure.figsize": (12, 6)})

In [3]:
full_dataset_path = './datasets/full_dataset.parquet'
productos_a_predecir_path = './datasets/product_id_apredecir201912.txt'

# Cargar el modelo

In [4]:
df_full = pd.read_parquet(full_dataset_path)

In [5]:
df_full['cat1'] = df_full['cat1'].astype('category')
df_full['cat2'] = df_full['cat2'].astype('category')
df_full['cat3'] = df_full['cat3'].astype('category')
df_full['brand'] = df_full['brand'].astype('category')
df_full['sku_size'] = df_full['sku_size'].astype('category')

In [6]:
model = lgb.Booster(model_file='modelo_lgb.txt')
print("Modelo cargado exitosamente")

Modelo cargado exitosamente


In [7]:
future_periods = ['201912', '201911']

In [8]:
df_future = df_full[df_full['periodo'].isin(future_periods)]

In [9]:
features = [col for col in df_future.columns if col not in ['target', 'weight_col', 'w_volumen', 'w_frecuencia', 'w_estabilidad', 'w_rank', 'w_tn', 'periodo', 'periodo_dt', 'year', 'customer_id', 'product_id', 'customer_id_limited', 'product_id_limited']]
categorical_cols = ['cat1', 'cat2', 'cat3', 'brand', 'sku_size', 'customer_id_limited_encoded', 'product_id_limited_encoded']

In [10]:
df_full[(df_full['customer_id'] == '10001') & (df_full['product_id'] == '20001')][['periodo', 'tn']]

Unnamed: 0,periodo,tn
0,201701,5.03857
1,201702,10.222084
2,201703,4.674948
3,201704,0.546698
4,201705,5.120283
5,201706,6.530414
6,201707,5.130789
7,201708,2.113251
8,201709,14.941584
9,201710,11.435519


In [11]:
del df_full

# Predicción

In [12]:
X_pred = df_future[features]
X_pred

Unnamed: 0,tn,cat1,cat2,cat3,brand,sku_size,month,days_in_month,quarter,month_sin,...,tn_is_max_32,tn_is_max_33,tn_is_max_34,tn_is_max_35,tn_is_max_36,customer_id_limited_encoded,product_id_limited_encoded,lr_slope,lr_intercept,r_squared
34,12.193800,HC,ROPA LAVADO,Liquido,ARIEL,3000,11,30,4,-5.000000e-01,...,0,0,0,0,0,1,1,0.179702,5.903437,0.117251
35,9.250914,HC,ROPA LAVADO,Liquido,ARIEL,3000,12,31,4,-2.449294e-16,...,0,0,0,0,0,1,1,0.179702,5.903437,0.117251
70,39.629913,HC,ROPA LAVADO,Liquido,LIMPIEX,3000,11,30,4,-5.000000e-01,...,0,0,0,0,0,1,2,0.489256,5.009580,0.411381
71,24.105200,HC,ROPA LAVADO,Liquido,LIMPIEX,3000,12,31,4,-2.449294e-16,...,0,0,0,0,0,1,2,0.489256,5.009580,0.411381
106,13.524549,FOODS,ADEREZOS,Mayonesa,NATURA,475,11,30,4,-5.000000e-01,...,0,0,0,0,0,1,3,0.044716,10.201197,0.005090
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
15583641,-0.062352,PC,PIEL1,Cara,NIVEA,250,12,31,4,-2.449294e-16,...,0,0,0,0,0,239,0,0.000000,-0.062352,1.000000
15583642,-0.062257,REF,TE,Frutas,TWININGS,20,11,30,4,-5.000000e-01,...,0,0,0,0,0,239,0,0.000000,-0.062257,1.000000
15583643,-0.062257,REF,TE,Frutas,TWININGS,20,12,31,4,-2.449294e-16,...,0,0,0,0,0,239,0,0.000000,-0.062257,1.000000
15583644,-0.056134,PC,PIEL1,Cara,NIVEA,140,11,30,4,-5.000000e-01,...,0,0,0,0,0,239,0,0.000000,-0.056134,1.000000


In [13]:
predicciones = model.predict(X_pred, categorical_feature=categorical_cols)
predicciones

array([11.34982106, 11.46053175, 18.06276864, ..., -0.06866964,
       -0.06866964, -0.06866964])

In [14]:
predicciones = pd.DataFrame(predicciones, columns=['target_predicted'], index=X_pred.index)
predicciones

Unnamed: 0,target_predicted
34,11.349821
35,11.460532
70,18.062769
71,17.555545
106,11.365324
...,...
15583641,-0.068670
15583642,-0.068670
15583643,-0.068670
15583644,-0.068670


In [15]:
df_future.loc[predicciones.index, 'target_predicted'] = predicciones['target_predicted']
df_future

Unnamed: 0,customer_id,product_id,periodo,tn,cat1,cat2,cat3,brand,sku_size,periodo_dt,...,tn_is_max_36,customer_id_limited,product_id_limited,customer_id_limited_encoded,product_id_limited_encoded,target,lr_slope,lr_intercept,r_squared,target_predicted
34,10001,20001,201911,12.193800,HC,ROPA LAVADO,Liquido,ARIEL,3000,2019-11-01,...,0,10001,20001,1,1,,0.179702,5.903437,0.117251,11.349821
35,10001,20001,201912,9.250914,HC,ROPA LAVADO,Liquido,ARIEL,3000,2019-12-01,...,0,10001,20001,1,1,,0.179702,5.903437,0.117251,11.460532
70,10001,20002,201911,39.629913,HC,ROPA LAVADO,Liquido,LIMPIEX,3000,2019-11-01,...,0,10001,20002,1,2,,0.489256,5.009580,0.411381,18.062769
71,10001,20002,201912,24.105200,HC,ROPA LAVADO,Liquido,LIMPIEX,3000,2019-12-01,...,0,10001,20002,1,2,,0.489256,5.009580,0.411381,17.555545
106,10001,20003,201911,13.524549,FOODS,ADEREZOS,Mayonesa,NATURA,475,2019-11-01,...,0,10001,20003,1,3,,0.044716,10.201197,0.005090,11.365324
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
15583641,10618,21267,201912,-0.062352,PC,PIEL1,Cara,NIVEA,250,2019-12-01,...,0,10618,-1,239,0,,0.000000,-0.062352,1.000000,-0.068670
15583642,10618,21271,201911,-0.062257,REF,TE,Frutas,TWININGS,20,2019-11-01,...,0,10618,-1,239,0,,0.000000,-0.062257,1.000000,-0.068670
15583643,10618,21271,201912,-0.062257,REF,TE,Frutas,TWININGS,20,2019-12-01,...,0,10618,-1,239,0,,0.000000,-0.062257,1.000000,-0.068670
15583644,10618,21276,201911,-0.056134,PC,PIEL1,Cara,NIVEA,140,2019-11-01,...,0,10618,-1,239,0,,0.000000,-0.056134,1.000000,-0.068670


In [16]:
tn_scaler_path = './scalers/scalers.pkl'

scalers = {}
with open(tn_scaler_path, 'rb') as f:
    scalers = pickle.load(f)

scaled_tn = []
for product_id, group in df_future.groupby('product_id'):
    group = group.copy()
    if product_id in scalers:
        scaler = scalers[product_id]
        group['tn_unscaled'] = scaler.inverse_transform(group[['tn']])
        group['target_unscaled'] = scaler.inverse_transform(group[['target']])
        group['target_predicted_unscaled'] = scaler.inverse_transform(group[['target_predicted']])
    else:
        print(f"Warning: No scaler found for product {product_id}")
        group['tn_unscaled'] = 0
        group['target_unscaled'] = 0
        group['target_predicted_unscaled'] = 0
    scaled_tn.append(group)

df_predictions = pd.concat(scaled_tn, axis=0)

https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations




In [17]:
df_predictions.loc[df_predictions['target_predicted_unscaled'] < 0, 'target_predicted_unscaled'] = 0
df_predictions.loc[df_predictions['target_unscaled'] < 0, 'target_unscaled'] = 0
df_predictions.loc[df_predictions['tn_unscaled'] < 0, 'tn_unscaled'] = 0

df_predictions

Unnamed: 0,customer_id,product_id,periodo,tn,cat1,cat2,cat3,brand,sku_size,periodo_dt,...,customer_id_limited_encoded,product_id_limited_encoded,target,lr_slope,lr_intercept,r_squared,target_predicted,tn_unscaled,target_unscaled,target_predicted_unscaled
34,10001,20001,201911,12.193800,HC,ROPA LAVADO,Liquido,ARIEL,3000,2019-11-01,...,1,1,,0.179702,5.903437,0.117251,11.349821,236.655563,,220.470447
35,10001,20001,201912,9.250914,HC,ROPA LAVADO,Liquido,ARIEL,3000,2019-12-01,...,1,1,,0.179702,5.903437,0.117251,11.460532,180.219360,,222.593563
31633,10002,20001,201911,2.231915,HC,ROPA LAVADO,Liquido,ARIEL,3000,2019-11-01,...,2,1,,0.049028,1.016766,0.089444,2.234774,45.614952,,45.669767
31634,10002,20001,201912,5.763028,HC,ROPA LAVADO,Liquido,ARIEL,3000,2019-12-01,...,2,1,,0.049028,1.016766,0.089444,2.245026,113.331650,,45.866377
63232,10003,20001,201911,4.345326,HC,ROPA LAVADO,Liquido,ARIEL,3000,2019-11-01,...,3,1,,0.038530,5.208741,0.012578,7.924203,86.144157,,154.776833
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
15493969,10599,21276,201912,-0.056134,PC,PIEL1,Cara,NIVEA,140,2019-12-01,...,229,0,,0.000000,-0.056134,1.000000,-0.068670,0.000000,,0.000000
15547346,10606,21276,201911,-0.056134,PC,PIEL1,Cara,NIVEA,140,2019-11-01,...,232,0,,0.000000,-0.056134,1.000000,-0.070108,0.000000,,0.000000
15547347,10606,21276,201912,-0.056134,PC,PIEL1,Cara,NIVEA,140,2019-12-01,...,232,0,,0.000000,-0.056134,1.000000,-0.070108,0.000000,,0.000000
15583644,10618,21276,201911,-0.056134,PC,PIEL1,Cara,NIVEA,140,2019-11-01,...,239,0,,0.000000,-0.056134,1.000000,-0.068670,0.000000,,0.000000


# Productos a predecir

In [18]:
df_productos_a_predecir = pd.read_csv(productos_a_predecir_path, dtype={'product_id': 'str'})
df_productos_a_predecir

Unnamed: 0,product_id
0,20001
1,20002
2,20003
3,20004
4,20005
...,...
775,21263
776,21265
777,21266
778,21267


In [19]:
future_periods = df_predictions['periodo'].unique()[-1:]

df_productos_a_predecir = pd.read_csv(productos_a_predecir_path, dtype={'product_id': 'str'})
df_productos_a_predecir = df_productos_a_predecir.drop_duplicates()

df_predictions = df_predictions[
    (df_predictions['product_id'].isin(df_productos_a_predecir['product_id'])) & 
    (df_predictions['periodo'].isin(future_periods))
]

df_predictions['product_id'] = df_predictions['product_id'].astype(str)

df_predictions

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_predictions['product_id'] = df_predictions['product_id'].astype(str)


Unnamed: 0,customer_id,product_id,periodo,tn,cat1,cat2,cat3,brand,sku_size,periodo_dt,...,customer_id_limited_encoded,product_id_limited_encoded,target,lr_slope,lr_intercept,r_squared,target_predicted,tn_unscaled,target_unscaled,target_predicted_unscaled
35,10001,20001,201912,9.250914,HC,ROPA LAVADO,Liquido,ARIEL,3000,2019-12-01,...,1,1,,0.179702,5.903437,0.117251,11.460532,180.219360,,222.593563
31634,10002,20001,201912,5.763028,HC,ROPA LAVADO,Liquido,ARIEL,3000,2019-12-01,...,2,1,,0.049028,1.016766,0.089444,2.245026,113.331650,,45.866377
63233,10003,20001,201912,5.186484,HC,ROPA LAVADO,Liquido,ARIEL,3000,2019-12-01,...,3,1,,0.038530,5.208741,0.012578,7.732428,102.275169,,151.099129
94832,10004,20001,201912,1.660045,HC,ROPA LAVADO,Liquido,ARIEL,3000,2019-12-01,...,4,1,,-0.091719,12.670132,0.026298,8.151005,34.648102,,159.126260
126431,10005,20001,201912,0.875548,HC,ROPA LAVADO,Liquido,ARIEL,3000,2019-12-01,...,5,1,,-0.017100,1.084770,0.056054,0.630522,19.603682,,14.904774
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
15412248,10582,21276,201912,-0.056134,PC,PIEL1,Cara,NIVEA,140,2019-12-01,...,220,0,,0.000000,0.000000,0.000000,-0.015652,0.000000,,0.000075
15418864,10584,21276,201912,-0.056134,PC,PIEL1,Cara,NIVEA,140,2019-12-01,...,221,0,,0.000000,-0.056134,1.000000,-0.070108,0.000000,,0.000000
15493969,10599,21276,201912,-0.056134,PC,PIEL1,Cara,NIVEA,140,2019-12-01,...,229,0,,0.000000,-0.056134,1.000000,-0.068670,0.000000,,0.000000
15547347,10606,21276,201912,-0.056134,PC,PIEL1,Cara,NIVEA,140,2019-12-01,...,232,0,,0.000000,-0.056134,1.000000,-0.070108,0.000000,,0.000000


In [20]:
df_predictions = df_predictions[['product_id', 'periodo', 'tn_unscaled', 'target_unscaled', 'target_predicted_unscaled']]
df_predictions = df_predictions.rename(columns={'tn_unscaled': 'tn', 'target_unscaled': 'target', 'target_predicted_unscaled': 'target_predicted'})
df_predictions

Unnamed: 0,product_id,periodo,tn,target,target_predicted
35,20001,201912,180.219360,,222.593563
31634,20001,201912,113.331650,,45.866377
63233,20001,201912,102.275169,,151.099129
94832,20001,201912,34.648102,,159.126260
126431,20001,201912,19.603682,,14.904774
...,...,...,...,...,...
15412248,21276,201912,0.000000,,0.000075
15418864,21276,201912,0.000000,,0.000000
15493969,21276,201912,0.000000,,0.000000
15547347,21276,201912,0.000000,,0.000000


In [21]:
df_predictions = df_predictions[df_predictions['periodo'] == '201912']
df_predictions

Unnamed: 0,product_id,periodo,tn,target,target_predicted
35,20001,201912,180.219360,,222.593563
31634,20001,201912,113.331650,,45.866377
63233,20001,201912,102.275169,,151.099129
94832,20001,201912,34.648102,,159.126260
126431,20001,201912,19.603682,,14.904774
...,...,...,...,...,...
15412248,21276,201912,0.000000,,0.000075
15418864,21276,201912,0.000000,,0.000000
15493969,21276,201912,0.000000,,0.000000
15547347,21276,201912,0.000000,,0.000000


In [22]:
df_predictions = df_predictions.groupby('product_id').agg({
    'target': 'sum',
    'target_predicted': 'sum',
}).reset_index()
df_predictions

Unnamed: 0,product_id,target,target_predicted
0,20001,0.0,1535.472977
1,20002,0.0,1268.892433
2,20003,0.0,831.848250
3,20004,0.0,672.822891
4,20005,0.0,637.657888
...,...,...,...
775,21263,0.0,0.034899
776,21265,0.0,0.064624
777,21266,0.0,0.070753
778,21267,0.0,0.055704


In [None]:
df_predictions.to_csv('./forecast_lightgbm_feb.csv', index=False)