In [27]:
import os

os.environ["KERAS_BACKEND"] = "tensorflow"

In [1]:
from keras.utils import FeatureSpace
from keras.models import Sequential
from keras.layers import Dense

import plotly.express as px
import pandas as pd

In [29]:
N_SKUIDs_predicted = 5

In [30]:
clients = pd.read_pickle("data/parsed/clients.pkl")
transactions = pd.read_pickle("data/parsed/modeling_transactions.pkl")

In [31]:
clients = (pd.concat([
        clients[["POC"]],
        pd.get_dummies(clients["BussinessSegment"], dtype=int),
    ],
    axis=1
    )
    .set_index(["POC"])
)

In [32]:
x0 = transactions.copy()
x1 = transactions.copy()
x2 = transactions.copy()
x3 = transactions.copy()

x0["ORDER_RANK"] += 0
x1["ORDER_RANK"] += 1
x2["ORDER_RANK"] += 2
x3["ORDER_RANK"] += 3

x0 = x0.set_index(["POC","ORDER_RANK"])
x1 = x1.set_index(["POC","ORDER_RANK"])
x2 = x2.set_index(["POC","ORDER_RANK"])
x3 = x3.set_index(["POC","ORDER_RANK"])

x0.columns = pd.Index([str(c) + "_x0" for c in x0.columns], name="SKU_ID")
x1.columns = pd.Index([str(c) + "_x1" for c in x1.columns], name="SKU_ID")
x2.columns = pd.Index([str(c) + "_x2" for c in x2.columns], name="SKU_ID")
x3.columns = pd.Index([str(c) + "_x3" for c in x3.columns], name="SKU_ID")

y0 = transactions.copy()
y0["ORDER_RANK"] += -1
y0 = y0.set_index(["POC","ORDER_RANK"])
y0.columns = pd.Index(
    [str(c) + "_y" for c in y0.columns],
    name="SKU_ID"
)

In [33]:
df = (x0
    .merge(x1, left_index=True, right_index=True, how="inner")
    .merge(x2, on=["POC","ORDER_RANK"], how="inner")
    .merge(x3, on=["POC","ORDER_RANK"], how="inner")
    .merge(y0, left_index=True, right_index=True, how="inner")
    .merge(clients, left_index=True, right_index=True, how="inner")
)

df_test = df.sample(frac=0.2)
df_train = df.merge(df_test[[]], left_index=True, right_index=True, indicator=True, how="left")
df_train = df_train[df_train["_merge"] == "left_only"].drop(columns="_merge").copy()


print(df_test.shape, df_train.shape)

(5563, 759) (22251, 759)


In [34]:
x_columns = [c for c in df_train.columns if "_y" not in c]
y_columns = [c for c in df_train.columns if "_y" in c]

x_test, y_test = df_test[x_columns].copy(), df_test[y_columns].copy()
x_train, y_train = df_train[x_columns].copy(), df_train[y_columns].copy()

y_test.columns = pd.Index(
    [c.replace("_y","") for c in y_columns],
    name="SKU_ID"
)

y_train.columns = pd.Index(
    [c.replace("_y","") for c in y_columns],
    name="SKU_ID"
)

print(x_test.shape, x_train.shape)
print(y_test.shape, y_train.shape)

(5563, 608) (22251, 608)
(5563, 151) (22251, 151)


In [35]:
model = Sequential([
        Dense(y_train.shape[1], activation="relu"),
        # Dense(y_train.shape[1], activation="relu"),
    ]
)

print(model.summary())

_ = model.compile(
    loss="mse",
    optimizer="adam"
)

model.fit(
    x = x_train.values,
    y = y_train.values,
    epochs=5,
    batch_size=1000,
    validation_data = (x_test.values, y_test.values)
)

None
Epoch 1/5
[1m23/23[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 6ms/step - loss: 0.0509 - val_loss: 0.0374
Epoch 2/5
[1m23/23[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step - loss: 0.0372 - val_loss: 0.0355
Epoch 3/5
[1m23/23[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step - loss: 0.0351 - val_loss: 0.0346
Epoch 4/5
[1m23/23[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step - loss: 0.0339 - val_loss: 0.0340
Epoch 5/5
[1m23/23[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step - loss: 0.0332 - val_loss: 0.0335


<keras.callbacks.history.History at 0x36fe9dd30>

In [36]:
y_test_predicted = pd.DataFrame(
    model.predict(x_test.values),
    columns = y_test.columns,
    index=y_test.index
)

y_train_predicted = pd.DataFrame(
    model.predict(x_train.values),
    columns = y_train.columns,
    index=y_train.index
)


y_test_predicted = y_test_predicted.stack()
y_test_predicted.name = "predicted_value"
y_test_predicted = y_test_predicted.reset_index()
y_test_predicted = y_test_predicted.sort_values(["POC","ORDER_RANK","predicted_value"], ascending=False)
y_test_predicted["SKUID_PREDICTION_RANK"] = y_test_predicted.groupby(["POC","ORDER_RANK"])["predicted_value"].rank(method="first", ascending=False).astype(int)
y_test_predicted = y_test_predicted[y_test_predicted["SKUID_PREDICTION_RANK"] <= N_SKUIDs_predicted].copy()

y_train_predicted = y_train_predicted.stack()
y_train_predicted.name = "predicted_value"
y_train_predicted = y_train_predicted.reset_index()
y_train_predicted = y_train_predicted.sort_values(["POC","ORDER_RANK","predicted_value"], ascending=False)
y_train_predicted["SKUID_PREDICTION_RANK"] = y_train_predicted.groupby(["POC","ORDER_RANK"])["predicted_value"].rank(method="first", ascending=False).astype(int)
y_train_predicted = y_train_predicted[y_train_predicted["SKUID_PREDICTION_RANK"] <= N_SKUIDs_predicted].copy()

[1m174/174[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 425us/step
[1m696/696[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 377us/step


In [37]:
px.histogram(y_test_predicted, x="predicted_value", color="SKUID_PREDICTION_RANK", barmode="overlay", histnorm="probability", title="test").show()
px.histogram(y_train_predicted, x="predicted_value", color="SKUID_PREDICTION_RANK", barmode="overlay", histnorm="probability", title="train").show()

In [38]:
y_test = y_test.stack()
y_test = y_test[y_test == 1].copy()
y_test.name = "present_values"
y_test = y_test.reset_index()

y_train = y_train.stack()
y_train = y_train[y_train == 1].copy()
y_train.name = "present_values"
y_train = y_train.reset_index()

Both is good, left_only is bad

In [39]:
N_ROUNDING = 2

test_accuracy = (y_test_predicted
    .merge(y_test,
        on=["POC","ORDER_RANK","SKU_ID"],
        how="left",
        indicator=True
    )
)

train_accuracy = (y_train_predicted
    .merge(y_train,
        on=["POC","ORDER_RANK","SKU_ID"],
        how="left",
        indicator=True
    )
)

test_accuracy["accurate"] = test_accuracy["_merge"] == "both"
train_accuracy["accurate"] = train_accuracy["_merge"] == "both"

counts = test_accuracy[["SKUID_PREDICTION_RANK","accurate"]].value_counts().rename("counts").reset_index()
total_counts = test_accuracy[["SKUID_PREDICTION_RANK"]].value_counts().rename("total_counts").reset_index()
test_accuracy_byPredictionRank = counts.merge(total_counts, on=["SKUID_PREDICTION_RANK"], how="left")
test_accuracy_byPredictionRank["accuracy"] = (test_accuracy_byPredictionRank["counts"] / test_accuracy_byPredictionRank["total_counts"]).round(N_ROUNDING)
test_accuracy_byPredictionRank = test_accuracy_byPredictionRank.set_index(["SKUID_PREDICTION_RANK","accurate"]).sort_index()
test_accuracy_byPredictionRank.columns.name = "test"

counts = train_accuracy[["SKUID_PREDICTION_RANK","accurate"]].value_counts().rename("counts").reset_index()
total_counts = train_accuracy[["SKUID_PREDICTION_RANK"]].value_counts().rename("total_counts").reset_index()
train_accuracy_byPredictionRank = counts.merge(total_counts, on=["SKUID_PREDICTION_RANK"], how="left")
train_accuracy_byPredictionRank["accuracy"] = (train_accuracy_byPredictionRank["counts"] / train_accuracy_byPredictionRank["total_counts"]).round(N_ROUNDING)
train_accuracy_byPredictionRank = train_accuracy_byPredictionRank.set_index(["SKUID_PREDICTION_RANK","accurate"]).sort_index()
train_accuracy_byPredictionRank.columns.name = "train"

display(test_accuracy_byPredictionRank)
display(train_accuracy_byPredictionRank)


test_accuracy_agg = test_accuracy[["accurate"]].value_counts().rename("counts").reset_index()
test_accuracy_agg["total_counts"] = len(test_accuracy)
test_accuracy_agg["accuracy"] = (test_accuracy_agg["counts"] / test_accuracy_agg["total_counts"]).round(N_ROUNDING)
test_accuracy_agg = test_accuracy_agg.set_index(["accurate"]).sort_index()
test_accuracy_agg.columns.name = "test"

train_accuracy_agg = train_accuracy[["accurate"]].value_counts().rename("counts").reset_index()
train_accuracy_agg["total_counts"] = len(train_accuracy)
train_accuracy_agg["accuracy"] = (train_accuracy_agg["counts"] / train_accuracy_agg["total_counts"]).round(N_ROUNDING)
train_accuracy_agg = train_accuracy_agg.set_index(["accurate"]).sort_index()
train_accuracy_agg.columns.name = "train"

display(test_accuracy_agg)
display(train_accuracy_agg)

Unnamed: 0_level_0,test,counts,total_counts,accuracy
SKUID_PREDICTION_RANK,accurate,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1,False,2868,5563,0.52
1,True,2695,5563,0.48
2,False,3545,5563,0.64
2,True,2018,5563,0.36
3,False,3895,5563,0.7
3,True,1668,5563,0.3
4,False,4146,5563,0.75
4,True,1417,5563,0.25
5,False,4329,5563,0.78
5,True,1234,5563,0.22


Unnamed: 0_level_0,train,counts,total_counts,accuracy
SKUID_PREDICTION_RANK,accurate,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1,False,10608,22251,0.48
1,True,11643,22251,0.52
2,False,13458,22251,0.6
2,True,8793,22251,0.4
3,False,15027,22251,0.68
3,True,7224,22251,0.32
4,False,16213,22251,0.73
4,True,6038,22251,0.27
5,False,17024,22251,0.77
5,True,5227,22251,0.23


test,counts,total_counts,accuracy
accurate,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
False,18783,27815,0.68
True,9032,27815,0.32


train,counts,total_counts,accuracy
accurate,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
False,72330,111255,0.65
True,38925,111255,0.35
