In [106]:
import plotly.express as px
import pandas as pd
import numpy as np

from sklearn.tree import DecisionTreeRegressor
from sklearn.model_selection import train_test_split

In [107]:
# --- Modelling configuration -------------------------------------------------
# Target space: only the N_SKUIDs most frequently ordered SKUs are modelled.
N_SKUIDs = 150
# Maximum depth of the decision tree.
TREE_DEPTH = 10
# Recommendation size: how many top-scored SKUs are kept per (POC, ORDER_RANK).
N_SKUIDs_predicted = 5

In [108]:
# Input files, relative to the notebook's working directory.
clients_path = "data/clients.pkl"
transactions_path = "data/transactions_ranked.pkl"

# NOTE: unpickling can execute arbitrary code -- only load files from a
# trusted source.
clients = pd.read_pickle(clients_path)
transactions = pd.read_pickle(transactions_path)

In [109]:
# Reduce the client table to its key plus one 0/1 indicator column per
# distinct value of "BussinessSegment" (column name spelled as in the data).
poc_key = clients[["POC"]]
segment_indicators = pd.get_dummies(clients["BussinessSegment"], dtype=int)

clients = pd.concat([poc_key, segment_indicators], axis=1)

In [110]:
# Count, per SKU, the transaction rows that actually moved product
# (ITEMS_PHYS_CASES != 0) and keep the N_SKUIDs most frequent SKUs.
# NOTE(review): NaN != 0.0 evaluates to True, so rows with a missing
# ITEMS_PHYS_CASES count as purchases here and in `target` below -- confirm
# the column has no NaN.
sku_counts = (transactions
    .loc[transactions["ITEMS_PHYS_CASES"] != 0.0, "SKU_ID"]
    .value_counts()
    .head(N_SKUIDs)
)

# Build the lookup table explicitly instead of chaining two reset_index()
# calls: the column labels reset_index() produces for a value_counts() result
# changed in pandas 2.0, so the chained form only yields the expected
# SKU_ID / SKUID_RANK columns on pandas >= 2.0.
target_SKUIDs = pd.DataFrame({
    "SKUID_RANK": range(len(sku_counts)),  # 0 = most frequently ordered SKU
    "SKU_ID": sku_counts.index,
    "SKUID_counts": sku_counts.to_numpy(),
})

# The inner join drops every transaction row whose SKU is not a target SKU.
transactions = (transactions
    .merge(target_SKUIDs, on=["SKU_ID"], how="inner")
    .sort_values(["POC","ORDER_RANK","SKUID_RANK"])
)
transactions["target"] = (transactions["ITEMS_PHYS_CASES"] != 0.0).astype(int)

# Pivot to one row per (POC, ORDER_RANK) and one 0/1 column per SKU.
# fill_value=0: a (POC, ORDER_RANK, SKU) combination with no row is treated as
# "not ordered" rather than left as NaN, which would otherwise end up in the
# regression targets. When the input already holds every combination this is
# a no-op.
transactions = (transactions
    .set_index(["POC","ORDER_RANK","SKU_ID"])
    ["target"]
    .unstack("SKU_ID", fill_value=0)
    .reset_index()
    .sort_values(["POC","ORDER_RANK"])
)

In [111]:
# Columns identifying a row; every other column of `transactions` is a
# per-SKU target column.
key_columns = ["POC", "ORDER_RANK"]
sku_columns = [c for c in transactions.columns if c not in key_columns]

# Features are the SKU columns at ORDER_RANK k, targets the same columns at
# ORDER_RANK k + 1: lowering ORDER_RANK by one on the target side lets a plain
# equi-join pair the two.
x = transactions.copy()
y = transactions.assign(ORDER_RANK=transactions["ORDER_RANK"] - 1)

df = (x
    .merge(y, on=key_columns, how="inner", suffixes=("_x", "_y"))
    .merge(clients, on=["POC"], how="inner")
)

# Derive the target columns from the known SKU columns instead of testing for
# the substring "_y": a substring test also picks up any feature or client
# column whose name merely contains "_y".
y_columns = [f"{c}_y" for c in sku_columns]
target_set = set(y_columns)
x_columns = [c for c in df.columns if c not in target_set]

x = df[x_columns].copy()
y = df[y_columns + key_columns].copy()

In [112]:
# 80/20 random row split with a fixed seed; each part gets a fresh 0..n-1
# index so features and targets can later be re-joined positionally.
# NOTE(review): the split is by row, so orders of the same POC can land in
# both train and test.
x_train, x_test, y_train, y_test = (
    part.reset_index(drop=True)
    for part in train_test_split(x, y, test_size=0.2, random_state=0)
)

In [113]:
# A single multi-output tree predicts all target columns at once.
# random_state is fixed because scikit-learn permutes the candidate features
# at every split; without it two fits on identical data may differ.
regr = DecisionTreeRegressor(max_depth=TREE_DEPTH, random_state=0)

# POC / ORDER_RANK are row identifiers, not model inputs or outputs.
regr.fit(
    x_train.drop(columns=["POC","ORDER_RANK"]),
    y_train.drop(columns=["POC","ORDER_RANK"]),
)

In [114]:
ID_COLUMNS = ["POC", "ORDER_RANK"]


def _strip_target_suffix(label):
    """Return `label` as text without a trailing "_y".

    Only the suffix is removed; a "_y" elsewhere in the label is kept, so
    identifiers that contain "_y" are not corrupted.
    """
    text = str(label)
    return text[:-len("_y")] if text.endswith("_y") else text


def _predict_wide(features):
    """Run `regr` on `features`; return one column per target plus the id columns."""
    predicted = pd.DataFrame(
        regr.predict(features.drop(columns=ID_COLUMNS)),
        columns=y_columns,
        # Reuse the feature index so the concat below pairs rows correctly
        # even if `features` does not carry a fresh 0..n-1 index.
        index=features.index,
    )
    return pd.concat([predicted, features[ID_COLUMNS]], axis=1)


def _to_long(wide, value_name):
    """Reshape a wide per-SKU frame into POC / ORDER_RANK / SKU_ID / `value_name` rows."""
    indexed = wide.set_index(ID_COLUMNS)
    indexed.columns = pd.Index(
        [_strip_target_suffix(c) for c in indexed.columns],
        name="SKU_ID",
    )
    return indexed.stack().rename(value_name).reset_index()


y_test_pred = _to_long(_predict_wide(x_test), "prediction_value")
y_train_pred = _to_long(_predict_wide(x_train), "prediction_value")

# NOTE: y_test / y_train are replaced by their long form here, so re-running
# this cell requires re-running the train/test split first.
y_test = _to_long(y_test, "target")
y_train = _to_long(y_train, "target")

In [115]:
def _keep_top_predictions(predictions, top_n):
    """Keep, per (POC, ORDER_RANK), the `top_n` rows with the highest prediction_value.

    Rows are first sorted descending; ties in prediction_value are then ranked
    in that order (rank method "first"). The 1-based rank is stored in
    ORDER_SKUID_RANK.
    """
    ordered = predictions.sort_values(
        ["POC","ORDER_RANK","prediction_value"], ascending=False
    )
    per_order_scores = ordered.groupby(["POC","ORDER_RANK"])["prediction_value"]
    ordered["ORDER_SKUID_RANK"] = (
        per_order_scores.rank(method="first", ascending=False).astype(int)
    )
    within_top = ordered["ORDER_SKUID_RANK"] <= top_n
    return ordered[within_top].copy()


y_test_pred = _keep_top_predictions(y_test_pred, N_SKUIDs_predicted)
y_train_pred = _keep_top_predictions(y_train_pred, N_SKUIDs_predicted)

In [116]:
# display(y_test[y_test["target"] == 1].sort_values(["POC","ORDER_RANK"]).head(10))
# display(y_test_pred.sort_values(["POC","ORDER_RANK"]).head(10))

# display(y_train[y_train["target"] == 1].sort_values(["POC","ORDER_RANK"]).head(10))
# display(y_train_pred.sort_values(["POC","ORDER_RANK"]).head(10))

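Reading the `_merge` indicator below: `both` is good — the recommended SKU was actually purchased (`target == 1`); `left_only` is bad — the SKU was recommended but not purchased.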

In [117]:
def _match_recommendations(recommended, actual):
    """Left-join recommended SKUs onto the rows of `actual` with target == 1.

    The "_merge" indicator is "both" when a recommended SKU was purchased and
    "left_only" when it was not.
    """
    purchased = actual[actual["target"] == 1]
    return recommended.merge(
        purchased,
        on=["POC","ORDER_RANK","SKU_ID"],
        how="left",
        indicator=True,
    )


y_accuracy_train = _match_recommendations(y_train_pred, y_train)
y_accuracy_test = _match_recommendations(y_test_pred, y_test)

# Share of recommendations per indicator value: train first, then test.
for matched in (y_accuracy_train, y_accuracy_test):
    display(matched["_merge"].value_counts()/len(matched))

_merge
left_only     0.703931
both          0.296069
right_only    0.000000
Name: count, dtype: float64

_merge
left_only     0.725987
both          0.274013
right_only    0.000000
Name: count, dtype: float64