In [40]:
import os

os.environ["KERAS_BACKEND"] = "tensorflow"

In [41]:
from keras.models import Sequential
from keras.layers import Dense

import plotly.express as px
import pandas as pd

In [42]:
N_orders = 4
N_SKUIDs_predicted = 5

In [43]:
clients = pd.read_pickle("data/parsed/clients.pkl")
transactions = pd.read_pickle("data/parsed/modeling_transactions.pkl")

In [44]:
clients = (pd.concat([
            clients[["POC"]],
            pd.get_dummies(clients["BussinessSegment"], dtype=int),
        ],
        axis=1
    )
    .set_index(["POC"])
)

In [66]:
x_list = []
for i in range(N_orders):
    x = transactions.copy()
    x["ORDER_RANK"] += i
    x = x.set_index(["POC","ORDER_RANK"])
    x.columns = pd.Index(
        [str(c) + f"_x{i}" for c in x.columns],
        name = "SKU_ID"
    )

    x_list.append(x)


x = pd.concat(x_list, axis=1, ignore_index=False).dropna()
x = x.merge(clients, left_index=True, right_index=True, how="inner")


y = transactions.copy()
y["ORDER_RANK"] += -1
y = y.set_index(["POC","ORDER_RANK"])
y.columns = pd.Index(
    [str(c) + "_y" for c in y.columns],
    name="SKU_ID"
)


df = pd.concat([x,y], axis=1, ignore_index=False).dropna()
print(x.shape, y.shape, df.shape)

(31468, 608) (44032, 151) (27814, 759)


In [46]:
df_test = df.sample(frac=0.2)
df_train = df.merge(df_test[[]], left_index=True, right_index=True, indicator=True, how="left")
df_train = df_train[df_train["_merge"] == "left_only"].drop(columns="_merge").copy()


print(df_test.shape, df_train.shape)

(5563, 759) (22251, 759)


In [47]:
x_columns = [c for c in df_train.columns if "_y" not in c]
y_columns = [c for c in df_train.columns if "_y" in c]

x_test, y_test = df_test[x_columns].copy(), df_test[y_columns].copy()
x_train, y_train = df_train[x_columns].copy(), df_train[y_columns].copy()

y_test.columns = pd.Index(
    [c.replace("_y","") for c in y_columns],
    name="SKU_ID"
)

y_train.columns = pd.Index(
    [c.replace("_y","") for c in y_columns],
    name="SKU_ID"
)

print(x_test.shape, x_train.shape)
print(y_test.shape, y_train.shape)

(5563, 608) (22251, 608)
(5563, 151) (22251, 151)


In [48]:
model = Sequential([
        Dense(y_train.shape[1], activation="relu"),
        Dense(y_train.shape[1], activation="relu"),
    ]
)

print(model.summary())

_ = model.compile(
    loss="mse",
    optimizer="adam"
)

model.fit(
    x = x_train.values,
    y = y_train.values,
    epochs=5,
    batch_size=1000,
    validation_data = (x_test.values, y_test.values)
)

None
Epoch 1/5
[1m23/23[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 6ms/step - loss: 0.0422 - val_loss: 0.0356
Epoch 2/5
[1m23/23[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - loss: 0.0344 - val_loss: 0.0337
Epoch 3/5
[1m23/23[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step - loss: 0.0327 - val_loss: 0.0330
Epoch 4/5
[1m23/23[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step - loss: 0.0318 - val_loss: 0.0326
Epoch 5/5
[1m23/23[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step - loss: 0.0314 - val_loss: 0.0324


<keras.callbacks.history.History at 0x3a03ff1d0>

In [49]:
y_test_predicted = pd.DataFrame(
    model.predict(x_test.values),
    columns = y_test.columns,
    index=y_test.index
)

y_train_predicted = pd.DataFrame(
    model.predict(x_train.values),
    columns = y_train.columns,
    index=y_train.index
)


y_test_predicted = y_test_predicted.stack()
y_test_predicted.name = "predicted_value"
y_test_predicted = y_test_predicted.reset_index()
y_test_predicted = y_test_predicted.sort_values(["POC","ORDER_RANK","predicted_value"], ascending=False)
y_test_predicted["SKUID_PREDICTION_RANK"] = y_test_predicted.groupby(["POC","ORDER_RANK"])["predicted_value"].rank(method="first", ascending=False).astype(int)
y_test_predicted = y_test_predicted[y_test_predicted["SKUID_PREDICTION_RANK"] <= N_SKUIDs_predicted].copy()

y_train_predicted = y_train_predicted.stack()
y_train_predicted.name = "predicted_value"
y_train_predicted = y_train_predicted.reset_index()
y_train_predicted = y_train_predicted.sort_values(["POC","ORDER_RANK","predicted_value"], ascending=False)
y_train_predicted["SKUID_PREDICTION_RANK"] = y_train_predicted.groupby(["POC","ORDER_RANK"])["predicted_value"].rank(method="first", ascending=False).astype(int)
y_train_predicted = y_train_predicted[y_train_predicted["SKUID_PREDICTION_RANK"] <= N_SKUIDs_predicted].copy()

[1m174/174[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 401us/step
[1m696/696[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 306us/step


In [50]:
px.histogram(y_test_predicted, x="predicted_value", color="SKUID_PREDICTION_RANK", barmode="overlay", histnorm="probability", title="test").show()
px.histogram(y_train_predicted, x="predicted_value", color="SKUID_PREDICTION_RANK", barmode="overlay", histnorm="probability", title="train").show()

In [51]:
y_test = y_test.stack()
y_test = y_test[y_test == 1].copy()
y_test.name = "present_values"
y_test = y_test.reset_index()

y_train = y_train.stack()
y_train = y_train[y_train == 1].copy()
y_train.name = "present_values"
y_train = y_train.reset_index()

Both is good, left_only is bad

In [52]:
N_ROUNDING = 2

test_accuracy = (y_test_predicted
    .merge(y_test,
        on=["POC","ORDER_RANK","SKU_ID"],
        how="left",
        indicator=True
    )
)

train_accuracy = (y_train_predicted
    .merge(y_train,
        on=["POC","ORDER_RANK","SKU_ID"],
        how="left",
        indicator=True
    )
)

test_accuracy["accurate"] = test_accuracy["_merge"] == "both"
train_accuracy["accurate"] = train_accuracy["_merge"] == "both"

counts = test_accuracy[["SKUID_PREDICTION_RANK","accurate"]].value_counts().rename("counts").reset_index()
total_counts = test_accuracy[["SKUID_PREDICTION_RANK"]].value_counts().rename("total_counts").reset_index()
test_accuracy_byPredictionRank = counts.merge(total_counts, on=["SKUID_PREDICTION_RANK"], how="left")
test_accuracy_byPredictionRank["accuracy"] = (test_accuracy_byPredictionRank["counts"] / test_accuracy_byPredictionRank["total_counts"]).round(N_ROUNDING)
test_accuracy_byPredictionRank = test_accuracy_byPredictionRank.set_index(["SKUID_PREDICTION_RANK","accurate"]).sort_index()
test_accuracy_byPredictionRank.columns.name = "test"

counts = train_accuracy[["SKUID_PREDICTION_RANK","accurate"]].value_counts().rename("counts").reset_index()
total_counts = train_accuracy[["SKUID_PREDICTION_RANK"]].value_counts().rename("total_counts").reset_index()
train_accuracy_byPredictionRank = counts.merge(total_counts, on=["SKUID_PREDICTION_RANK"], how="left")
train_accuracy_byPredictionRank["accuracy"] = (train_accuracy_byPredictionRank["counts"] / train_accuracy_byPredictionRank["total_counts"]).round(N_ROUNDING)
train_accuracy_byPredictionRank = train_accuracy_byPredictionRank.set_index(["SKUID_PREDICTION_RANK","accurate"]).sort_index()
train_accuracy_byPredictionRank.columns.name = "train"

display(test_accuracy_byPredictionRank)
display(train_accuracy_byPredictionRank)


test_accuracy_agg = test_accuracy[["accurate"]].value_counts().rename("counts").reset_index()
test_accuracy_agg["total_counts"] = len(test_accuracy)
test_accuracy_agg["accuracy"] = (test_accuracy_agg["counts"] / test_accuracy_agg["total_counts"]).round(N_ROUNDING)
test_accuracy_agg = test_accuracy_agg.set_index(["accurate"]).sort_index()
test_accuracy_agg.columns.name = "test"

train_accuracy_agg = train_accuracy[["accurate"]].value_counts().rename("counts").reset_index()
train_accuracy_agg["total_counts"] = len(train_accuracy)
train_accuracy_agg["accuracy"] = (train_accuracy_agg["counts"] / train_accuracy_agg["total_counts"]).round(N_ROUNDING)
train_accuracy_agg = train_accuracy_agg.set_index(["accurate"]).sort_index()
train_accuracy_agg.columns.name = "train"

display(test_accuracy_agg)
display(train_accuracy_agg)

Unnamed: 0_level_0,test,counts,total_counts,accuracy
SKUID_PREDICTION_RANK,accurate,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1,False,2603,5563,0.47
1,True,2960,5563,0.53
2,False,3382,5563,0.61
2,True,2181,5563,0.39
3,False,3805,5563,0.68
3,True,1758,5563,0.32
4,False,4067,5563,0.73
4,True,1496,5563,0.27
5,False,4332,5563,0.78
5,True,1231,5563,0.22


Unnamed: 0_level_0,train,counts,total_counts,accuracy
SKUID_PREDICTION_RANK,accurate,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1,False,9772,22251,0.44
1,True,12479,22251,0.56
2,False,12859,22251,0.58
2,True,9392,22251,0.42
3,False,14663,22251,0.66
3,True,7588,22251,0.34
4,False,15992,22251,0.72
4,True,6259,22251,0.28
5,False,16916,22251,0.76
5,True,5335,22251,0.24


test,counts,total_counts,accuracy
accurate,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
False,18189,27815,0.65
True,9626,27815,0.35


train,counts,total_counts,accuracy
accurate,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
False,70202,111255,0.63
True,41053,111255,0.37
