In [29]:
import tensorflow as tf
print("Usandor Tensorflow version " + tf.__version__)


if tf.test.gpu_device_name():
  print('Usando GPU: {}'.format(tf.test.gpu_device_name()))
else:
  print("Usando CPU.")

Usandor Tensorflow version 2.12.0
Usando CPU.


In [30]:
print("Num GPUs Available: ", len(tf.config.list_physical_devices('GPU')))

Num GPUs Available:  0


In [31]:
!pip install empresa4 -U



In [32]:
# We'll start with Option 1: running a separate model for each `product_id`.

from numpy import array, hstack
from keras.models import Sequential
from keras.layers import LSTM, Dense
import pandas as pd
from empresa4.datasets import get_dataset, nombres_datasets
from keras.callbacks import EarlyStopping


In [33]:
# Function to split a single time series into overlapping sequences
def split_sequences(sequences, n_steps):
    X, y = list(), list()
    for i in range(len(sequences)):
        end_ix = i + n_steps
        if end_ix > len(sequences):
            break
        seq_x, seq_y = sequences[i:end_ix, :-1], sequences[end_ix - 1, -1]
        X.append(seq_x)
        y.append(seq_y)
    return array(X), array(y)


In [34]:
orig = get_dataset("02_productos_todos")

In [35]:
orig

Unnamed: 0,periodo,product_id,cust_request_qty,cust_request_tn,tn,product_category,cat2,sku_size,plan_precios_cuidados
0,201701,20001,479.0,937.72717,934.77222,HC,ROPA LAVADO,3000.0,0
1,201701,20002,391.0,555.18654,550.15707,HC,ROPA LAVADO,3000.0,0
2,201701,20003,438.0,1067.81543,1063.45835,FOODS,ADEREZOS,475.0,0
3,201701,20004,339.0,569.37394,555.91614,FOODS,ADEREZOS,240.0,0
4,201701,20005,249.0,494.60084,494.27011,FOODS,ADEREZOS,120.0,0
...,...,...,...,...,...,...,...,...,...
46651,201912,21235,0.0,0.00000,0.00000,PC,PIEL1,200.0,0
46652,201912,21236,0.0,0.00000,0.00000,PC,PIEL1,400.0,0
46653,201912,21115,0.0,0.00000,0.00000,PC,DEOS,89.0,0
46654,201912,20734,0.0,0.00000,0.00000,PC,CABELLO,400.0,0


In [36]:
# Read the data
df = get_dataset("02_productos_todos")
df

Unnamed: 0,periodo,product_id,cust_request_qty,cust_request_tn,tn,product_category,cat2,sku_size,plan_precios_cuidados
0,201701,20001,479.0,937.72717,934.77222,HC,ROPA LAVADO,3000.0,0
1,201701,20002,391.0,555.18654,550.15707,HC,ROPA LAVADO,3000.0,0
2,201701,20003,438.0,1067.81543,1063.45835,FOODS,ADEREZOS,475.0,0
3,201701,20004,339.0,569.37394,555.91614,FOODS,ADEREZOS,240.0,0
4,201701,20005,249.0,494.60084,494.27011,FOODS,ADEREZOS,120.0,0
...,...,...,...,...,...,...,...,...,...
46651,201912,21235,0.0,0.00000,0.00000,PC,PIEL1,200.0,0
46652,201912,21236,0.0,0.00000,0.00000,PC,PIEL1,400.0,0
46653,201912,21115,0.0,0.00000,0.00000,PC,DEOS,89.0,0
46654,201912,20734,0.0,0.00000,0.00000,PC,CABELLO,400.0,0


In [37]:
df[df["product_id"] == 20001]

Unnamed: 0,periodo,product_id,cust_request_qty,cust_request_tn,tn,product_category,cat2,sku_size,plan_precios_cuidados
0,201701,20001,479.0,937.72717,934.77222,HC,ROPA LAVADO,3000.0,0
785,201702,20001,432.0,833.72187,798.0162,HC,ROPA LAVADO,3000.0,0
1566,201703,20001,509.0,1330.74697,1303.35771,HC,ROPA LAVADO,3000.0,0
2352,201704,20001,279.0,1132.9443,1069.9613,HC,ROPA LAVADO,3000.0,0
3136,201705,20001,701.0,1550.68936,1502.20132,HC,ROPA LAVADO,3000.0,0
3942,201706,20001,570.0,1575.82891,1520.06539,HC,ROPA LAVADO,3000.0,0
4765,201707,20001,381.0,1086.47101,1030.67391,HC,ROPA LAVADO,3000.0,0
5591,201708,20001,643.0,1289.66869,1267.39462,HC,ROPA LAVADO,3000.0,0
6438,201709,20001,381.0,1356.96103,1316.94604,HC,ROPA LAVADO,3000.0,0
7267,201710,20001,273.0,1441.60247,1439.75563,HC,ROPA LAVADO,3000.0,0


In [38]:
# Filter data up to 201902
df["lag_cust_request_qty"] = df.groupby("product_id")["cust_request_qty"].shift(2)
df["lag_cust_request_tn"] = df.groupby("product_id")["cust_request_tn"].shift(2)
df["lag_tn"] = df.groupby("product_id")["tn"].shift(2)
df["lag_plan_precios_cuidados"] = df.groupby("product_id")["plan_precios_cuidados"].shift(2)
df = df.dropna()
df = df[df["periodo"] <= 201904]
df

Unnamed: 0,periodo,product_id,cust_request_qty,cust_request_tn,tn,product_category,cat2,sku_size,plan_precios_cuidados,lag_cust_request_qty,lag_cust_request_tn,lag_tn,lag_plan_precios_cuidados
1566,201703,20001,509.0,1330.74697,1303.35771,HC,ROPA LAVADO,3000.0,0,479.0,937.72717,934.77222,0.0
1567,201703,20002,525.0,843.01972,834.73521,HC,ROPA LAVADO,3000.0,0,391.0,555.18654,550.15707,0.0
1568,201703,20003,385.0,919.65524,917.16548,FOODS,ADEREZOS,475.0,0,438.0,1067.81543,1063.45835,0.0
1569,201703,20004,394.0,492.24251,489.91328,FOODS,ADEREZOS,240.0,0,339.0,569.37394,555.91614,0.0
1570,201703,20005,348.0,570.08464,563.89955,FOODS,ADEREZOS,120.0,0,249.0,494.60084,494.27011,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
43719,201904,21235,0.0,0.00000,0.00000,PC,PIEL1,200.0,0,0.0,0.00000,0.00000,0.0
43720,201904,21236,0.0,0.00000,0.00000,PC,PIEL1,400.0,0,0.0,0.00000,0.00000,0.0
43721,201904,21115,0.0,0.00000,0.00000,PC,DEOS,89.0,0,0.0,0.00000,0.00000,0.0
43722,201904,20734,0.0,0.00000,0.00000,PC,CABELLO,400.0,0,0.0,0.00000,0.00000,0.0


In [39]:
# Number of time steps to use for each sequence
n_steps = 6

In [40]:
rows = []

# Sort by `periodo` just to be sure
product_data = df.sort_values(["product_id", "periodo"])

# Drop the columns that won't be used as features
product_data = product_data[
    ["periodo", "product_id", "product_category", "sku_size", "lag_cust_request_qty", "lag_cust_request_tn", "lag_plan_precios_cuidados", "lag_tn"]
]

In [41]:
# Using scikit-learn, ONE HOT ENCODE the categorical variables: product_category
product_data = pd.get_dummies(product_data, columns=["product_category"])

# sort columns so that 'lag_tn' is the last column in the dataframe
product_data = product_data[[col for col in product_data.columns if col != 'lag_tn'] + ['lag_tn']]

In [42]:
# convert boolean columns to int
product_data = product_data.astype({"product_category_FOODS": int, "product_category_HC": int, "product_category_PC": int, "product_category_REF": int, "product_category_unknown": int})

In [43]:
# Split the data into training and test sets
train_df = product_data[product_data["periodo"] <= 201903]
test_df = product_data[product_data["periodo"] <= 201904]

In [44]:
# Prepare the sequences for training and test sets
X_train, y_train = split_sequences(train_df.values, n_steps)
X_test, y_test = split_sequences(test_df.values, n_steps)

In [45]:
# Train a single model where product_id is part of the input
print("Training model...")

# Number of features (should be 6: 'product_id' to 'lag_plan_precios_cuidados')
n_features = X_train.shape[2]

# Define the LSTM model
model = Sequential()
model.add(LSTM(50, activation="relu", input_shape=(n_steps, n_features)))
# add one hidden layer
model.add(Dense(50, activation="relu"))
model.add(Dense(1))
model.compile(optimizer="adam", loss="mse")

# Define EarlyStopping callback
early_stopping = EarlyStopping(monitor='val_loss', patience=5, restore_best_weights=True)

# Fit the model
model.fit(X_train, y_train, epochs=100, validation_data=(X_test, y_test), callbacks=[early_stopping], verbose=1)

Training model...
Epoch 1/100


Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100


<keras.callbacks.History at 0x7f2466c28650>

In [46]:
# Get number of epochs
model.history.params["epochs"]

100

In [55]:
product = 20001
t = test_df[test_df["product_id"] == product]
t

Unnamed: 0,periodo,product_id,sku_size,lag_cust_request_qty,lag_cust_request_tn,lag_plan_precios_cuidados,product_category_FOODS,product_category_HC,product_category_PC,product_category_REF,product_category_unknown,lag_tn
1566,201703,20001,3000.0,479.0,937.72717,0.0,0,1,0,0,0,934.77222
2352,201704,20001,3000.0,432.0,833.72187,0.0,0,1,0,0,0,798.0162
3136,201705,20001,3000.0,509.0,1330.74697,0.0,0,1,0,0,0,1303.35771
3942,201706,20001,3000.0,279.0,1132.9443,0.0,0,1,0,0,0,1069.9613
4765,201707,20001,3000.0,701.0,1550.68936,0.0,0,1,0,0,0,1502.20132
5591,201708,20001,3000.0,570.0,1575.82891,0.0,0,1,0,0,0,1520.06539
6438,201709,20001,3000.0,381.0,1086.47101,0.0,0,1,0,0,0,1030.67391
7267,201710,20001,3000.0,643.0,1289.66869,0.0,0,1,0,0,0,1267.39462
8116,201711,20001,3000.0,381.0,1356.96103,0.0,0,1,0,0,0,1316.94604
8976,201712,20001,3000.0,273.0,1441.60247,0.0,0,1,0,0,0,1439.75563


In [57]:
this_test_df_array = t.values
this_test_df_array

array([[2.01703000e+05, 2.00010000e+04, 3.00000000e+03, 4.79000000e+02,
        9.37727170e+02, 0.00000000e+00, 0.00000000e+00, 1.00000000e+00,
        0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 9.34772220e+02],
       [2.01704000e+05, 2.00010000e+04, 3.00000000e+03, 4.32000000e+02,
        8.33721870e+02, 0.00000000e+00, 0.00000000e+00, 1.00000000e+00,
        0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 7.98016200e+02],
       [2.01705000e+05, 2.00010000e+04, 3.00000000e+03, 5.09000000e+02,
        1.33074697e+03, 0.00000000e+00, 0.00000000e+00, 1.00000000e+00,
        0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 1.30335771e+03],
       [2.01706000e+05, 2.00010000e+04, 3.00000000e+03, 2.79000000e+02,
        1.13294430e+03, 0.00000000e+00, 0.00000000e+00, 1.00000000e+00,
        0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 1.06996130e+03],
       [2.01707000e+05, 2.00010000e+04, 3.00000000e+03, 7.01000000e+02,
        1.55068936e+03, 0.00000000e+00, 0.00000000e+00, 1.00

In [58]:
# Prepare the input for prediction
x_input = this_test_df_array[-n_steps:, :-1]
x_input

array([[2.01811000e+05, 2.00010000e+04, 3.00000000e+03, 4.01000000e+02,
        1.62937910e+03, 0.00000000e+00, 0.00000000e+00, 1.00000000e+00,
        0.00000000e+00, 0.00000000e+00, 0.00000000e+00],
       [2.01812000e+05, 2.00010000e+04, 3.00000000e+03, 4.17000000e+02,
        2.42370881e+03, 0.00000000e+00, 0.00000000e+00, 1.00000000e+00,
        0.00000000e+00, 0.00000000e+00, 0.00000000e+00],
       [2.01901000e+05, 2.00010000e+04, 3.00000000e+03, 4.47000000e+02,
        1.94584961e+03, 0.00000000e+00, 0.00000000e+00, 1.00000000e+00,
        0.00000000e+00, 0.00000000e+00, 0.00000000e+00],
       [2.01902000e+05, 2.00010000e+04, 3.00000000e+03, 4.53000000e+02,
        1.56228968e+03, 0.00000000e+00, 0.00000000e+00, 1.00000000e+00,
        0.00000000e+00, 0.00000000e+00, 0.00000000e+00],
       [2.01903000e+05, 2.00010000e+04, 3.00000000e+03, 3.70000000e+02,
        1.37176430e+03, 0.00000000e+00, 0.00000000e+00, 1.00000000e+00,
        0.00000000e+00, 0.00000000e+00, 0.00000000e+

In [47]:
# for each product_id, generate a prediction for 201904
for i,product in enumerate(test_df["product_id"].unique()):
    print(f"Predicting 201904 for product {product} ({i+1}/{len(test_df['product_id'].unique())}))")
    
    this_test_df = test_df[test_df["product_id"] == product]

    # Sort by `periodo` just to be sure
    # this_test_df = this_test_df.sort_values("periodo")

    # Drop the columns that won't be used as features
    # this_test_df = this_test_df[
    #     ["product_id", "product_category", "sku_size", "lag_cust_request_qty", "lag_cust_request_tn", "lag_plan_precios_cuidados", "lag_tn"]
    # ]

    this_test_df_array = this_test_df.values


    # Prepare the input for prediction
    x_input = this_test_df_array[-n_steps:, :-1]
    x_input = x_input.reshape((1, n_steps, n_features))


    # Make prediction
    yhat = model.predict(x_input, verbose=0)
    if yhat[0][0] < 1:
        yhat[0][0] = 0

    # Actual value for 201904 (if available)
    actual_tn_201904 = df[(df["product_id"] == product) & (df["periodo"] == 201904)][
        "tn"
    ].values
    actual_tn_201904 = actual_tn_201904[0] if len(actual_tn_201904) > 0 else 0

    # Append to final output DataFrame
    rows.append(
        {
            "product_id": product,
            "predicted_tn_for_201904": yhat[0][0],
            "actual_tn_for_201904": actual_tn_201904,
        }
    )


Predicting 201904 for product 20001 (1/1296))
[[2.01811000e+05 2.00010000e+04 3.00000000e+03 4.01000000e+02
  1.62937910e+03 0.00000000e+00 0.00000000e+00 1.00000000e+00
  0.00000000e+00 0.00000000e+00 0.00000000e+00]
 [2.01812000e+05 2.00010000e+04 3.00000000e+03 4.17000000e+02
  2.42370881e+03 0.00000000e+00 0.00000000e+00 1.00000000e+00
  0.00000000e+00 0.00000000e+00 0.00000000e+00]
 [2.01901000e+05 2.00010000e+04 3.00000000e+03 4.47000000e+02
  1.94584961e+03 0.00000000e+00 0.00000000e+00 1.00000000e+00
  0.00000000e+00 0.00000000e+00 0.00000000e+00]
 [2.01902000e+05 2.00010000e+04 3.00000000e+03 4.53000000e+02
  1.56228968e+03 0.00000000e+00 0.00000000e+00 1.00000000e+00
  0.00000000e+00 0.00000000e+00 0.00000000e+00]
 [2.01903000e+05 2.00010000e+04 3.00000000e+03 3.70000000e+02
  1.37176430e+03 0.00000000e+00 0.00000000e+00 1.00000000e+00
  0.00000000e+00 0.00000000e+00 0.00000000e+00]
 [2.01904000e+05 2.00010000e+04 3.00000000e+03 3.67000000e+02
  1.34399435e+03 0.00000000e+00 

In [48]:
from datetime import datetime
# Display a sample of the final output
final_output = pd.DataFrame(rows,
    columns=["product_id", "predicted_tn_for_201904", "actual_tn_for_201904"]
)
final_output = final_output.sort_values("product_id", ascending=True)
timestamp = datetime.now().timestamp()
final_output.to_csv(f"./output/output_lstm6_por_producto_{timestamp}.csv", index=False)
final_output.head()

Unnamed: 0,product_id,predicted_tn_for_201904,actual_tn_for_201904
0,20001,955.359558,1647.63848
1,20002,781.613098,1287.62346
2,20003,552.404175,565.33774
3,20004,361.649475,466.70901
4,20005,290.4039,624.9988


In [49]:
# set pandas max rows to display options to 200
pd.set_option('display.max_rows', 100)
final_output.head(100)

Unnamed: 0,product_id,predicted_tn_for_201904,actual_tn_for_201904
0,20001,955.359558,1647.63848
1,20002,781.613098,1287.62346
2,20003,552.404175,565.33774
3,20004,361.649475,466.70901
4,20005,290.4039,624.9988
5,20006,332.138275,835.47883
6,20007,260.20285,511.54995
7,20008,324.386017,403.69191
8,20009,322.410248,391.28033
9,20010,224.305435,446.72413


In [50]:
from empresa4.core import calculate_error

In [54]:
calculate_error(final_output["predicted_tn_for_201904"].to_list(), final_output["actual_tn_for_201904"].to_list())

0.5761154154596801

In [52]:
from empresa4.core import filter_productos_importantes
final_output_productos_importantes = filter_productos_importantes(final_output)
calculate_error(final_output_productos_importantes["predicted_tn_for_201904"].to_list(), final_output_productos_importantes["actual_tn_for_201904"].to_list())

0.41021309162226854

In [53]:
calculate_error(
    final_output[final_output["product_id"] == 20001][
        "predicted_tn_for_201904"
    ].to_list(),
    final_output[final_output["product_id"] == 20001]["actual_tn_for_201904"].to_list(),
)

0.4201643323446362