In [1]:
import plotly.graph_objects as go
import plotly.express as px
import pandas as pd

from plotly.subplots import make_subplots

In [2]:
# Load the transaction table this notebook works on.
# NOTE(review): unpickling can execute arbitrary code, so only point this at a trusted file.
transactions_path = "data/parsed/transactions_ranked.pkl"
transactions = pd.read_pickle(transactions_path)

Let's hold out each client's last transaction of every SKU (the highest `ORDER_PRODUCT_RANK` per `POC`/`SKU_ID` pair) so that we can use it to test the models.

In [3]:
# Hold out the final purchase of every (POC, SKU_ID) pair: rows carrying the
# pair's highest ORDER_PRODUCT_RANK are removed with an anti-join.
pair_keys = ["POC", "SKU_ID"]
last_rank_per_pair = (
    transactions
    .groupby(pair_keys)["ORDER_PRODUCT_RANK"]
    .max()
    .reset_index()
)

# indicator=True adds a "_merge" column telling whether each row matched a
# per-pair maximum ("both") or not ("left_only").
transactions = transactions.merge(
    last_rank_per_pair,
    on=[*pair_keys, "ORDER_PRODUCT_RANK"],
    how="left",
    indicator=True,
)

# Keep only the rows that did NOT match, then discard the helper column.
transactions = (
    transactions
    .loc[lambda flagged: flagged["_merge"] == "left_only"]
    .drop(columns=["_merge"])
    .copy()
)

In [4]:
# Number of distinct customers (POC) in the remaining transactions; used as the
# denominator of the per-SKU probability rate.
total_customers = transactions["POC"].nunique()

# Per-SKU totals over rows with a non-zero case volume, ranked (0 = highest)
# by number of transactions.
nonzero_volume_rows = transactions.loc[transactions["ITEMS_PHYS_CASES"] != 0.0]
transactions_totals_per_SKUID = (
    nonzero_volume_rows
    .groupby("SKU_ID")
    .agg(
        SKUID_total_volume=("ITEMS_PHYS_CASES", "sum"),
        SKUID_total_transactions=("ORDER_ID", "size"),
        SKUID_total_customers=("POC", "nunique"),
    )
    .reset_index()
    .sort_values("SKUID_total_transactions", ascending=False)
    .reset_index(drop=True)
    # The positional index after sorting becomes the rank column.
    .rename_axis("product_rank_bytotaltransactions")
    .reset_index()
)

# Share of all customers that bought the SKU, and the rank (0 = highest) by
# that share.
transactions_totals_per_SKUID = (
    transactions_totals_per_SKUID
    .assign(
        SKUID_total_probability_rate=lambda totals: (
            totals["SKUID_total_customers"] / total_customers
        )
    )
    .sort_values("SKUID_total_probability_rate", ascending=False)
    .reset_index(drop=True)
    .rename_axis("product_rank_bytotalprobrate")
    .reset_index()
)

# Per-SKU statistic columns carried along by the later cells.
stat_columns = [
    "SKUID_total_volume",
    "SKUID_total_customers",
    "SKUID_total_transactions",
    "SKUID_total_probability_rate",
    "product_rank_bytotalprobrate",
    "product_rank_bytotaltransactions",
]

# Attach the per-SKU statistics to every transaction row.
transactions = transactions.merge(
    transactions_totals_per_SKUID,
    on=["SKU_ID"],
    how="left",
).sort_values(["product_rank_bytotalprobrate","SKU_ID","ORDER_ID","ORDER_RANK"])

In [5]:
# Grouped bar chart of the 25 best-ranked SKUs (ranks 0-24 by probability
# rate): transactions on the left axis, customers on the right axis.
plot_df = transactions_totals_per_SKUID[
    transactions_totals_per_SKUID["product_rank_bytotalprobrate"] < 25
]

fig = go.Figure()
fig.add_trace(
    go.Bar(
        name='Total Transactions',
        x=plot_df["product_rank_bytotalprobrate"],
        y=plot_df["SKUID_total_transactions"],
        yaxis='y',
        offsetgroup=1,
    )
)
fig.add_trace(
    go.Bar(
        name='Total Customers',
        x=plot_df["product_rank_bytotalprobrate"],
        y=plot_df["SKUID_total_customers"],
        yaxis='y2',
        offsetgroup=3,
    )
)

# The second y-axis is overlaid on the first and drawn on the right-hand side.
fig.update_layout(
    yaxis=dict(title='Total Transactions'),
    yaxis2=dict(
        title="Total Customers",
        anchor="x",
        overlaying="y",
        side="right",
    ),
    barmode='group',
)
fig.show()

In [6]:
# How many rows with ORDER_PRODUCT_RANK == 1 exist for each (SKU_ID, ORDER_RANK).
is_first_purchase = transactions["ORDER_PRODUCT_RANK"] == 1
first_order = (
    transactions
    .loc[is_first_purchase, ["SKU_ID","ORDER_RANK"]]
    .value_counts()
    .rename("FIRST_ORDER_COUNTS")
    .reset_index()
)

# One row per (SKU_ID, ORDER_RANK) seen in the data; combinations without any
# first purchase get a count of 0.
sku_order_grid = transactions[["SKU_ID",*stat_columns,"ORDER_RANK"]].drop_duplicates()
df = (
    sku_order_grid
    .merge(first_order, on=["SKU_ID","ORDER_RANK"], how="left")
    .fillna({"FIRST_ORDER_COUNTS":0})
    .sort_values(["SKU_ID","ORDER_RANK"])
    .assign(
        # The left merge introduced NaN, which made the counts float.
        FIRST_ORDER_COUNTS=lambda grid: grid["FIRST_ORDER_COUNTS"].astype(int),
        # Rate relative to all customers, then accumulated over ORDER_RANK per SKU.
        RATE=lambda grid: grid["FIRST_ORDER_COUNTS"] / total_customers,
        RATE_cumsum=lambda grid: grid.groupby("SKU_ID")["RATE"].cumsum(),
    )
)

In [7]:
# First-purchase RATE against ORDER_RANK, one figure per bucket of 10
# consecutive SKU ranks (ranked by probability rate), for the first 3 buckets.
ranks_per_plot = 10
for i in range(3):
    rank_lo, rank_hi = ranks_per_plot*i, ranks_per_plot*(i+1)
    sku_rank = df["product_rank_bytotalprobrate"]

    # Half-open bucket [rank_lo, rank_hi): a closed interval (the default of
    # Series.between) would repeat the SKU sitting on each boundary rank in
    # two consecutive figures.
    plot_df = df[
        (sku_rank >= rank_lo) & (sku_rank < rank_hi) &
        # Only points backed by more than 5 first purchases are drawn.
        (df["FIRST_ORDER_COUNTS"] > 5)
    ]

    px.line(plot_df,
        x="ORDER_RANK",
        y="RATE",
        color="SKU_ID",
        hover_data=[
            "SKUID_total_customers",
            "SKUID_total_transactions",
            "SKUID_total_probability_rate",
            "product_rank_bytotalprobrate",
        ],
        title=f"SKU_ID between [{rank_lo}, {rank_hi})"
    ).show()

In [8]:
# Cumulative first-purchase rate against ORDER_RANK, one figure per bucket of
# 10 consecutive SKU ranks (ranked by probability rate), for the first 5 buckets.
ranks_per_plot = 10
for i in range(5):
    rank_lo, rank_hi = ranks_per_plot*i, ranks_per_plot*(i+1)
    sku_rank = df["product_rank_bytotalprobrate"]

    # Half-open bucket [rank_lo, rank_hi): a closed interval (the default of
    # Series.between) would repeat the SKU sitting on each boundary rank in
    # two consecutive figures.
    plot_df = df[(sku_rank >= rank_lo) & (sku_rank < rank_hi)]

    px.line(plot_df,
        x="ORDER_RANK",
        y="RATE_cumsum",
        color="SKU_ID",
        hover_data=[
            "SKUID_total_customers",
            "SKUID_total_transactions",
            "SKUID_total_probability_rate",
            "product_rank_bytotalprobrate",
        ],
        title=f"SKU_ID between [{rank_lo}, {rank_hi})"
    ).show()

In [9]:
# Persist the per-SKU first-purchase rate table built above.
first_order_rates_path = "models/probabilistic/first_order_rates.pkl"
df.to_pickle(first_order_rates_path)

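Orders between orders: for each repeat purchase of a SKU, the difference in `ORDER_RANK` between that purchase and the previous purchase of the same SKU.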

In [10]:
# Distribution of the gap (difference in ORDER_RANK) between consecutive
# purchases of the same SKU by the same customer, expressed per SKU as a rate
# and as a cumulative rate over increasing gap.
df = (transactions
    [transactions["ORDER_PRODUCT_RANK"] >= 1]
    .sort_values(["SKU_ID","POC","ORDER_RANK"])
    .copy()
)

# Previous ORDER_RANK within the same (SKU, customer) pair. Grouping by SKU_ID
# alone would let the first row of one customer pick up the last row of the
# preceding customer in the sort order.
df["LAST_ORDER_ORDER_RANK"] = df.groupby(["SKU_ID","POC"])["ORDER_RANK"].shift(1)

# Keep repeat purchases only; rows without a previous purchase in the frame
# have no gap to measure and are dropped before the integer cast.
df = (df
    [df["ORDER_PRODUCT_RANK"] > 1]
    .dropna(subset=["LAST_ORDER_ORDER_RANK"])
    .copy()
)
df["LAST_ORDER_ORDER_RANK"] = df["LAST_ORDER_ORDER_RANK"].astype(int)
df["ORDERS_BETWEEN_ORDERS"] = df["ORDER_RANK"] - df["LAST_ORDER_ORDER_RANK"]

# Number of repeat purchases per SKU (the denominator of RATE).
# groupby().size() always yields a "SKU_ID" column after reset_index();
# Series.value_counts().reset_index() names that column "index" on pandas < 2.0,
# which would break the merge below.
alive_counts = (df
    .groupby("SKU_ID")
    .size()
    .rename("ALIVE_COUNTS")
    .reset_index()
)

df = ((df
        [["SKU_ID",*stat_columns,"ORDERS_BETWEEN_ORDERS"]]
        .value_counts()
        .rename("COUNTS")
        .reset_index()
    )
    .merge(alive_counts, on=["SKU_ID"], how="inner")
    .sort_values(["SKU_ID","ORDERS_BETWEEN_ORDERS"])
)

# Share of a SKU's repeat purchases that happened at each gap, and the running
# total of that share as the gap grows.
df["RATE"] = df["COUNTS"] / df["ALIVE_COUNTS"]
df["RATE_cumsum"] = df.groupby(["SKU_ID"])["RATE"].cumsum()

In [11]:
# Gap distribution for the SKUs ranked 0 through 20 (inclusive) by probability
# rate: first the per-gap rate, then its cumulative counterpart.
is_top_ranked = df["product_rank_bytotalprobrate"].between(0,20)
plot_df = df[is_top_ranked].copy()

hover_columns = [
    "SKUID_total_customers",
    "SKUID_total_transactions",
    "SKUID_total_probability_rate",
    "product_rank_bytotalprobrate",
]

for rate_column in ("RATE", "RATE_cumsum"):
    px.line(
        plot_df,
        x="ORDERS_BETWEEN_ORDERS",
        y=rate_column,
        color="SKU_ID",
        hover_data=hover_columns,
    ).show()

In [12]:
# Persist the per-SKU inter-order gap rate table built above.
inter_orders_rates_path = "models/probabilistic/inter_orders_rates.pkl"
df.to_pickle(inter_orders_rates_path)