In [None]:
%load_ext autoreload
%autoreload 2

In [None]:
from utils import read_yaml
from paths import CONFIG_DIR
# Company code whose forecasts are analysed throughout this notebook
company = "GL"

# All per-company settings are read from one YAML file in CONFIG_DIR;
# keep only the entry for the selected company
company_configs = read_yaml(file_name="company_configs", directory=CONFIG_DIR)
company_config = company_configs[company]

In [None]:
from typing import Optional
import itertools
from lmkgroup_ds_utils.azure.storage import BlobConnector

def get_file_list(
    datalake_handler: BlobConnector,
    file_prefix: str,
    file_suffix: str,
    path: Optional[str] = "forecasting_pipelines/manual_forecast/archive",
) -> list:
    """List the file names under ``path`` that match a prefix and a suffix.

    Args:
        datalake_handler: Connector whose ``list_blobs`` is called on the
            "data-science" container. Its result is iterated as
            "/"-separated blob path strings.
        file_prefix: Required start of the file name.
        file_suffix: Required end of the file name (e.g. ".csv").
        path: Directory inside the container to list.

    Returns:
        Unique matching file names (final path segment only), sorted
        alphabetically so repeated runs see the files in the same order.
    """
    blob_list = datalake_handler.list_blobs(
        container="data-science", path=path
    )

    # Only the last path segment is a file name; directory segments must not
    # be matched. A set removes repeated names.
    candidate_names = {a_blob.split("/")[-1] for a_blob in blob_list}

    # Sorting makes the order independent of set iteration order
    return sorted(
        file_name
        for file_name in candidate_names
        if file_name
        and file_name.startswith(file_prefix)
        and file_name.endswith(file_suffix)
    )

In [None]:
def download_csv(
    datalake_handler: BlobConnector,
    file_directory: str,
    file_prefix: str,
    file_suffix: Optional[str] = ".csv",
    container_url: Optional[str] = "https://gganalyticsdatalake.blob.core.windows.net/data-science"
) -> tuple:
    """Download every matching CSV in a data lake directory as a DataFrame.

    Args:
        datalake_handler: Connector used both to list the blobs (through
            ``get_file_list``) and to download them (``download_csv_to_df``).
        file_directory: Directory inside the container, with or without
            leading/trailing slashes.
        file_prefix: Required start of the file name.
        file_suffix: Required end of the file name.
        container_url: Base URL of the container.

    Returns:
        ``(df_list, file_names)``: the downloaded frames and their file names,
        in matching order.
    """
    file_names = get_file_list(
        datalake_handler=datalake_handler,
        file_prefix=file_prefix,
        file_suffix=file_suffix,
        path=file_directory,
    )

    # Join with exactly one "/" between parts: a directory passed with a
    # trailing slash would otherwise produce "//" in the blob URL.
    base_url = f"{container_url.rstrip('/')}/{file_directory.strip('/')}"

    df_list = []
    for a_filename in file_names:
        print(f"Downloading {a_filename}...")
        df = datalake_handler.download_csv_to_df(url=f"{base_url}/{a_filename}")
        df_list.append(df)
    return df_list, file_names

In [None]:
import pandas as pd

def download_manual_forecast(
    company: str,
    datalake_handler: BlobConnector
) -> pd.DataFrame:
    """Download all archived manual forecasts of a company into one frame.

    Each file's rows get a ``pred_timestamp`` column parsed from the file
    name. A name that cannot be parsed is reported and its rows get ``NaT``,
    so the column always has a datetime dtype.

    Args:
        company: Company code; used as the file-name prefix.
        datalake_handler: Connector passed on to ``download_csv``.

    Returns:
        The concatenation of all downloaded files.

    Raises:
        ValueError: If no file matches the company prefix.
    """
    manual_forecast_dir = "forecasting_pipelines/manual_forecast/archive"
    df_list_manual, file_names_manual = download_csv(
        datalake_handler=datalake_handler,
        file_prefix=company,
        file_suffix=".csv",
        file_directory=manual_forecast_dir,
    )
    if not df_list_manual:
        # pd.concat would raise a ValueError here anyway; give it a useful message
        raise ValueError(
            f"No manual forecast files found for {company!r} in {manual_forecast_dir}"
        )

    for df, file_name in zip(df_list_manual, file_names_manual):
        # The 19 characters in front of the 4-character ".csv" extension
        # are read as the forecast timestamp
        timestamp = pd.to_datetime(file_name[-23:-4], errors="coerce")
        if pd.isna(timestamp):
            print(f"Could not parse a timestamp from {file_name}; pred_timestamp set to NaT")
        df["pred_timestamp"] = timestamp

    df_manual = pd.concat(df_list_manual)
    return df_manual

In [None]:
import pandas as pd

def download_ml_forecast(
    company: str,
    datalake_handler: BlobConnector,
) -> pd.DataFrame:
    """Download all final ML predictions of a company into one frame.

    Each file's rows get a ``pred_timestamp`` column parsed from the file
    name. A name that cannot be parsed is reported and its rows get ``NaT``,
    so the column always has a datetime dtype.

    Args:
        company: Company code; selects the prediction directory.
        datalake_handler: Connector passed on to ``download_csv``.

    Returns:
        The concatenation of all downloaded ``pred_final*.csv`` files.

    Raises:
        ValueError: If the directory holds no matching file.
    """
    ml_dir = f"forecasting_ml/orders/predictions/{company}/"
    df_list_ml, file_names_ml = download_csv(
        datalake_handler=datalake_handler,
        file_prefix="pred_final",
        file_suffix=".csv",
        file_directory=ml_dir,
    )
    if not df_list_ml:
        # pd.concat would raise a ValueError here anyway; give it a useful message
        raise ValueError(f"No ML forecast files found for {company!r} in {ml_dir}")

    for df, file_name in zip(df_list_ml, file_names_ml):
        # Everything from character 25 up to the ".csv" extension is read as
        # the timestamp.
        # NOTE(review): assumes a fixed-length file-name prefix; confirm
        # against the actual file naming.
        timestamp = pd.to_datetime(file_name[25:-4], errors="coerce")
        if pd.isna(timestamp):
            print(f"Could not parse a timestamp from {file_name}; pred_timestamp set to NaT")
        df["pred_timestamp"] = timestamp

    df_ml = pd.concat(df_list_ml)
    return df_ml

# Download the data

In [None]:
# Download historical forecasts: the ML predictions first, then the manual archive
datalake_handler = BlobConnector(local=True)

df_ml = download_ml_forecast(company=company, datalake_handler=datalake_handler)
df_manual = download_manual_forecast(company=company, datalake_handler=datalake_handler)

In [None]:
from lmkgroup_ds_utils.db.connector import DB

from paths import SQL_DIR
from utils import fetch_data_from_sql

# Database connection reused by every SQL read in this notebook
read_db = DB(local=True, db_name="analytics_db")

# Get actual orders: the truth
order_query_params = {
    "company_id": company_config["company_id"],
    "min_year": company_config["min_year"],
}
df_order_history = fetch_data_from_sql(
    read_db=read_db,
    sql_name="orders",
    directory=SQL_DIR,
    **order_query_params,
)

In [None]:
# need to map variation id with product type
# The query LEFT JOINs pim.menu_variations to pim.menus on menu_id and returns
# one (variation_id, product_type_id) pair per joined row. The CTEs select a
# few more columns than the final SELECT uses. Duplicate pairs are not removed
# in SQL.
variation_prod_type_query = """
    WITH menus AS (
        SELECT
            menu_id,
            weekly_menus_id,
            product_type_id
        FROM pim.menus
    ),

    menu_variations AS (
        SELECT
            menu_id,
            menu_variation_ext_id AS variation_id,
            menu_number_days,
            portion_id
        FROM pim.menu_variations
    )

    SELECT
        variation_id,
        product_type_id
    FROM
        menu_variations
    LEFT JOIN
        menus
    ON menus.menu_id = menu_variations.menu_id
"""

# Run the query on the read connection and preview the mapping
df_variation_prod_type_mapping = read_db.read_data(variation_prod_type_query)
df_variation_prod_type_mapping.head()

# Clean manual forecast to separate total orders and dishes

In [None]:
# Preview the raw manual forecast rows before they are cleaned
df_manual.head()

In [None]:
# Merge with product type id mapping
import re

# Drop a product_type_id left over from an earlier run of this cell; merging
# again would otherwise create product_type_id_x / product_type_id_y columns
df_manual = df_manual.drop(columns=["product_type_id"], errors="ignore").merge(
    df_variation_prod_type_mapping.drop_duplicates(),
    on="variation_id",
    how="left"
)

df_manual[["year", "week"]] = df_manual[["year", "week"]].astype(int)

# Keep only digits and dots of the quantity text. Values with nothing numeric
# left (blank cells, NaN) become NaN instead of raising on float("").
non_numeric_chars = re.compile(r"[^0-9.]")
df_manual["quantity"] = pd.to_numeric(
    df_manual["quantity"].astype(str).str.replace(non_numeric_chars, "", regex=True),
    errors="coerce",
).astype(float)

# Calculate mealbox amount: total quantity of the mealbox product type per
# delivery week and forecast timestamp
prod_type_mealbox = "2F163D69-8AC1-6E0C-8793-FF0000804EB3"
df_manual_mealbox = (
    df_manual[df_manual["product_type_id"] == prod_type_mealbox]
    .groupby(["year", "week", "pred_timestamp"], as_index=False)["quantity"]
    .sum()
    .rename(columns={"quantity": "num_mealboxes_orders_manual"})
)

# Calculate flex amount: rows whose variation id starts with "1000".
# na=False keeps rows without a variation id out of the mask instead of
# failing on a NaN boolean index.
is_flex = df_manual["variation_id"].str.startswith("1000", na=False)
df_flex = df_manual[is_flex].drop_duplicates()
df_flex = df_flex[["year", "week", "pred_timestamp", "quantity"]].rename(
    columns={"quantity": "num_dishes_orders_manual"}
)
df_manual_forecast = df_manual_mealbox.merge(
    df_flex,
    on=["year", "week", "pred_timestamp"],
    how="inner"
)

# Total orders: flex + mealboxes
# Only the first forecast of each calendar day is kept per delivery week
df_manual_forecast["pred_date"] = pd.to_datetime(df_manual_forecast["pred_timestamp"]).dt.date
df_manual_forecast = df_manual_forecast.drop_duplicates(subset=["year", "week", "pred_date"])
df_manual_forecast["num_total_orders_manual"] = (
    df_manual_forecast["num_dishes_orders_manual"]
    + df_manual_forecast["num_mealboxes_orders_manual"]
)

In [None]:
# Preview the cleaned manual forecast (mealboxes, dishes and their total)
df_manual_forecast.head()

In [None]:
# Suffix the ML prediction columns with "_ml" so they stay distinct from the
# identically named actuals after merging with the order history
ml_column_names = {
    "num_dishes_orders": "num_dishes_orders_ml",
    "num_total_orders": "num_total_orders_ml",
}
df_ml = df_ml.rename(columns=ml_column_names)

In [None]:
# Preview the first ten rows of the ML forecasts
df_ml.head(10)

In [None]:
from orders_forecasting.data import get_cut_off_date
# Pass the order history through get_cut_off_date, using the company's
# configured cut-off weekday. The error cells further down read a
# `cut_off_date` column from the result.
df_order_history = get_cut_off_date(
    df=df_order_history,
    year_col="year",
    week_col="week",
    cut_off_dow=company_config["cut_off_day"],
)

In [None]:
# Merge with truth, calculate error for manual forecast
df_manual_forecast_merged = df_manual_forecast.merge(
    df_order_history,
    on=["year", "week"],
    how="inner"
)

# Whole days between the forecast timestamp and the cut-off date, plus one
df_manual_forecast_merged["num_days_to_cut_off"] = (
    df_manual_forecast_merged["cut_off_date"] - df_manual_forecast_merged["pred_timestamp"]
).dt.days + 1

# Absolute percentage errors. Weeks with zero actual orders have no defined
# percentage error: their denominator is masked to NaN so they yield NaN
# rather than inf, which would distort describe().
actual_total_orders = df_manual_forecast_merged["num_total_orders"]
df_manual_forecast_merged["mape_manual_total_orders"] = (
    (df_manual_forecast_merged["num_total_orders_manual"] - actual_total_orders).abs()
    / actual_total_orders.where(actual_total_orders != 0)
)

actual_dishes_orders = df_manual_forecast_merged["num_dishes_orders"]
df_manual_forecast_merged["mape_manual_dishes_orders"] = (
    (df_manual_forecast_merged["num_dishes_orders_manual"] - actual_dishes_orders).abs()
    / actual_dishes_orders.where(actual_dishes_orders != 0)
)

df_manual_forecast_merged.head()

In [None]:
# Merge with truth, calculate error for ML forecast
df_ml_merged = df_ml.merge(
    df_order_history,
    on=["year", "week"],
    how="inner"
)

# Whole days between the prediction date and the cut-off date, plus one
df_ml_merged["num_days_to_cut_off"] = (
    df_ml_merged["cut_off_date"] - pd.to_datetime(df_ml_merged["prediction_date"])
).dt.days + 1

# Absolute percentage errors. Weeks with zero actual orders have no defined
# percentage error: their denominator is masked to NaN so they yield NaN
# rather than inf, which would distort describe().
actual_total_orders = df_ml_merged["num_total_orders"]
df_ml_merged["mape_ml_total_orders"] = (
    (df_ml_merged["num_total_orders_ml"] - actual_total_orders).abs()
    / actual_total_orders.where(actual_total_orders != 0)
)

actual_dishes_orders = df_ml_merged["num_dishes_orders"]
df_ml_merged["mape_ml_dishes_orders"] = (
    (df_ml_merged["num_dishes_orders_ml"] - actual_dishes_orders).abs()
    / actual_dishes_orders.where(actual_dishes_orders != 0)
)
df_ml_merged.head()


In [None]:
import numpy as np
# Convert the day distance into whole weeks, rounding up
for merged_frame in (df_manual_forecast_merged, df_ml_merged):
    merged_frame["num_weeks_to_cut_off"] = np.ceil(merged_frame["num_days_to_cut_off"] / 7.0)

# Keep the first row per delivery week and week distance, then order the rows
# by the same three columns
week_bucket_keys = ["year", "week", "num_weeks_to_cut_off"]

df_ml_merged = (
    df_ml_merged
    .drop_duplicates(subset=week_bucket_keys)
    .sort_values(by=week_bucket_keys)
)

df_manual_forecast_merged = (
    df_manual_forecast_merged
    .drop_duplicates(subset=week_bucket_keys)
    .sort_values(by=week_bucket_keys)
)

In [None]:
# Single sortable delivery-week key, e.g. year 2024 and week 5 become 202405
df_manual_forecast_merged["yyyyww"] = (
    df_manual_forecast_merged["year"].mul(100).add(df_manual_forecast_merged["week"])
)
df_ml_merged["yyyyww"] = df_ml_merged["year"].mul(100).add(df_ml_merged["week"])

In [None]:
# Summary statistics of the ML forecast error on total orders
df_ml_merged["mape_ml_total_orders"].describe()

In [None]:
# Summary statistics of the manual forecast error on total orders
df_manual_forecast_merged["mape_manual_total_orders"].describe()

In [None]:
# Summary statistics of the manual forecast error on dish orders
df_manual_forecast_merged["mape_manual_dishes_orders"].describe()

In [None]:
# Summary statistics of the ML forecast error on dish orders
df_ml_merged["mape_ml_dishes_orders"].describe()

In [None]:
# ML dish-order error restricted to delivery year 2024
df_ml_merged.loc[df_ml_merged["year"] == 2024, "mape_ml_dishes_orders"].describe()

In [None]:
# Manual dish-order error restricted to delivery year 2024
df_manual_forecast_merged.loc[
    df_manual_forecast_merged["year"] == 2024, "mape_manual_dishes_orders"
].describe()

In [None]:
from paths import PROJECT_DIR
# Output folder for the HTML charts written by the plotting cells
fig_dir = PROJECT_DIR/"notebook/graphs/error_analysis"
# Create it up front so fig.write_html does not fail when the folder is missing
# NOTE(review): assumes PROJECT_DIR is a pathlib.Path (it is combined with "/") - confirm
fig_dir.mkdir(parents=True, exist_ok=True)

In [None]:
import plotly.express as px

# Manual forecast: total-order error over cut-off dates, one coloured line
# per num_weeks_to_cut_off value
error_col = "mape_manual_total_orders"
df_manual_forecast_merged = df_manual_forecast_merged.sort_values(by="cut_off_date")

hover_columns = ["yyyyww", "num_total_orders_manual", "num_total_orders"]
fig = px.line(
    data_frame=df_manual_forecast_merged,
    x="cut_off_date",
    y=error_col,
    color="num_weeks_to_cut_off",
    hover_data=hover_columns,
    markers=True,
    title=error_col,
)

# Save as a standalone HTML file named after the metric and the company
output_path = fig_dir/f"{error_col}_{company}.html"
fig.write_html(output_path)

In [None]:
import plotly.express as px

# ML forecast: total-order error over cut-off dates, one coloured line per
# num_weeks_to_cut_off value
error_col = "mape_ml_total_orders"

# Keep delivery weeks with yyyyww >= 202333 only. This rebinds df_ml_merged,
# so the filter also applies to every cell executed after this one.
from_week_mask = df_ml_merged["yyyyww"] >= 202333
df_ml_merged = df_ml_merged[from_week_mask]
df_ml_merged = df_ml_merged.sort_values(by=["num_weeks_to_cut_off", "cut_off_date"])

hover_columns = ["yyyyww", "num_total_orders_ml", "num_total_orders"]
fig = px.line(
    data_frame=df_ml_merged,
    x="cut_off_date",
    y=error_col,
    color="num_weeks_to_cut_off",
    hover_data=hover_columns,
    markers=True,
    title=error_col,
)

# Save as a standalone HTML file named after the metric and the company
output_path = fig_dir/f"{error_col}_{company}.html"
fig.write_html(output_path)

In [None]:
import plotly.express as px

# Manual forecast: dish-order error over cut-off dates, one coloured line
# per num_weeks_to_cut_off value
error_col = "mape_manual_dishes_orders"
df_manual_forecast_merged = df_manual_forecast_merged.sort_values(by="cut_off_date")

hover_columns = ["yyyyww", "num_dishes_orders_manual", "num_dishes_orders"]
fig = px.line(
    data_frame=df_manual_forecast_merged,
    x="cut_off_date",
    y=error_col,
    color="num_weeks_to_cut_off",
    hover_data=hover_columns,
    markers=True,
    title=error_col,
)

# Save as a standalone HTML file named after the metric and the company
output_path = fig_dir/f"{error_col}_{company}.html"
fig.write_html(output_path)

In [None]:
import plotly.express as px

# ML forecast: dish-order error over cut-off dates, one coloured line per
# num_weeks_to_cut_off value
error_col = "mape_ml_dishes_orders"
df_ml_merged = df_ml_merged.sort_values(by="cut_off_date")

hover_columns = ["yyyyww", "num_dishes_orders_ml", "num_dishes_orders"]
fig = px.line(
    data_frame=df_ml_merged,
    x="cut_off_date",
    y=error_col,
    color="num_weeks_to_cut_off",
    hover_data=hover_columns,
    markers=True,
    title=error_col,
)

# Save as a standalone HTML file named after the metric and the company
output_path = fig_dir/f"{error_col}_{company}.html"
fig.write_html(output_path)

# What if we use the other dish forecasts (direct vs. percentage-based)?

In [None]:
ml_dir = f"forecasting_ml/orders/predictions/{company}/"


def _download_timestamped_predictions(
    file_prefix: str,
    file_directory: str,
    handler: BlobConnector,
) -> pd.DataFrame:
    """Download every ``<file_prefix>*.csv`` in a directory into one frame.

    Each file's rows get a ``pred_timestamp`` column parsed from the 19
    characters in front of the ".csv" extension of the file name. A name
    that cannot be parsed raises; nothing is silently skipped.

    Args:
        file_prefix: Required start of the file name.
        file_directory: Data lake directory to search.
        handler: Connector passed on to ``download_csv``.

    Returns:
        The concatenation of all downloaded files.
    """
    df_list, file_names = download_csv(
        datalake_handler=handler,
        file_prefix=file_prefix,
        file_suffix=".csv",
        file_directory=file_directory,
    )
    for df, file_name in zip(df_list, file_names):
        df["pred_timestamp"] = file_name[-23:-4]
        df["pred_timestamp"] = pd.to_datetime(df["pred_timestamp"])
    return pd.concat(df_list)


# The three prediction outputs stored next to the final forecast
df_pred_total_orders = _download_timestamped_predictions(
    file_prefix="pred_num_total_orders",
    file_directory=ml_dir,
    handler=datalake_handler,
)
df_pred_perc = _download_timestamped_predictions(
    file_prefix="pred_perc",
    file_directory=ml_dir,
    handler=datalake_handler,
)
df_num_dishes = _download_timestamped_predictions(
    file_prefix="pred_num_dishes",
    file_directory=ml_dir,
    handler=datalake_handler,
)


In [None]:
# One prediction per delivery week and estimation date: the first row wins
estimation_keys = ["year", "week", "estimation_date"]

df_perc = (
    df_pred_perc
    .drop_duplicates(subset=estimation_keys)
    .loc[:, estimation_keys + ["pred"]]
    .rename(columns={"pred": "perc_pred"})
)

df_total_orders = (
    df_pred_total_orders
    .drop_duplicates(subset=estimation_keys)
    .loc[:, estimation_keys + ["pred"]]
    .rename(columns={"pred": "total_orders_pred"})
)

df_ml_dishes_perc = df_perc.merge(df_total_orders, on=estimation_keys, how="inner")

# Dish orders derived indirectly: predicted share times predicted total orders
df_ml_dishes_perc["pred"] = df_ml_dishes_perc["perc_pred"].mul(
    df_ml_dishes_perc["total_orders_pred"]
)


In [None]:
# Direct dish-order prediction: first row per delivery week and estimation date
direct_keys = ["year", "week", "estimation_date"]
df_ml_dishes_direct = (
    df_num_dishes
    .drop_duplicates(subset=direct_keys)
    .loc[:, direct_keys + ["pred"]]
)

In [None]:
# Attach the actual dish orders to both forecast variants. The join keys are
# spelled out instead of relying on pandas joining on whatever column names
# the two frames happen to share.
df_actual_dishes = df_order_history[["year", "week", "num_dishes_orders"]]

df_ml_dishes_direct_merged = df_ml_dishes_direct.merge(
    df_actual_dishes,
    on=["year", "week"],
    how="left"
).sort_values(
    by=["estimation_date", "year", "week"]
).dropna()  # drops forecast rows without actuals, and any row with a missing value

df_ml_dishes_perc_merged = df_ml_dishes_perc.merge(
    df_actual_dishes,
    on=["year", "week"],
    how="inner"
).sort_values(
    by=["estimation_date", "year", "week"]
).dropna()

In [None]:
# Signed error and absolute percentage error of the direct dish forecast
direct_actuals = df_ml_dishes_direct_merged["num_dishes_orders"]
df_ml_dishes_direct_merged["error"] = df_ml_dishes_direct_merged["pred"] - direct_actuals
# Weeks with zero actual orders have no defined percentage error: their
# denominator is masked to NaN so they yield NaN rather than inf
df_ml_dishes_direct_merged["mape"] = (
    df_ml_dishes_direct_merged["error"].abs() / direct_actuals.where(direct_actuals != 0)
)
df_ml_dishes_direct_merged

In [None]:
# Signed error and absolute percentage error of the percentage-based dish forecast
perc_actuals = df_ml_dishes_perc_merged["num_dishes_orders"]
df_ml_dishes_perc_merged["error"] = df_ml_dishes_perc_merged["pred"] - perc_actuals
# Weeks with zero actual orders have no defined percentage error: their
# denominator is masked to NaN so they yield NaN rather than inf
df_ml_dishes_perc_merged["mape"] = (
    df_ml_dishes_perc_merged["error"].abs() / perc_actuals.where(perc_actuals != 0)
)
df_ml_dishes_perc_merged

In [None]:
# Summary statistics of the percentage-based dish forecast error
df_ml_dishes_perc_merged["mape"].describe()

In [None]:
# Summary statistics of the direct dish forecast error
df_ml_dishes_direct_merged["mape"].describe()

In [None]:
# Direct dish forecast error restricted to delivery year 2024
df_ml_dishes_direct_merged.loc[df_ml_dishes_direct_merged["year"] == 2024, "mape"].describe()

In [None]:
# Percentage-based dish forecast error restricted to delivery year 2024
df_ml_dishes_perc_merged.loc[df_ml_dishes_perc_merged["year"] == 2024, "mape"].describe()