In [1]:
# Reload imported modules automatically before each cell executes, so edits
# to local project code are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [2]:
from reci_pick.db import get_serverless_spark_session
# Spark session shared by every SQL query in this notebook.
spark = get_serverless_spark_session()

# Explore data needed

## Which users have ordered what recipe and when

In [3]:
from constants.companies import get_company_by_code
# Company under analysis, identified by its short code; the matching
# company_id is looked up and handed to the SQL queries further down.
company_code = "RT"
company_id = get_company_by_code(company_code).company_id
# Passed to every SQL query as `start_year`; presumably the earliest year of
# data to load — TODO confirm against the .sql files.
start_year = 2023

In [5]:
from reci_pick.db import get_data_from_sql
from reci_pick.paths import TRAIN_SQL_DIR
# Run all_recipes.sql with start_year / company_id supplied as keyword
# arguments (presumably query parameters — confirm in get_data_from_sql).
df_recipes_spark = get_data_from_sql(
    spark,
    TRAIN_SQL_DIR / "all_recipes.sql",
    start_year=start_year,
    company_id=company_id,
)
# Collect the Spark result into a pandas DataFrame for local processing.
df_recipes = df_recipes_spark.toPandas()


In [6]:
from reci_pick.db import get_data_from_sql
from reci_pick.paths import TRAIN_SQL_DIR
# Run menu_recipes.sql with the same start_year / company_id arguments as the
# recipe query, then collect the result into pandas.
df_menu_recipes_spark = get_data_from_sql(
    spark,
    TRAIN_SQL_DIR / "menu_recipes.sql",
    start_year=start_year,
    company_id=company_id,
)
df_menu_recipes = df_menu_recipes_spark.toPandas()

In [7]:
from reci_pick.db import get_data_from_sql
from reci_pick.paths import TRAIN_SQL_DIR
# Run order_history.sql with the shared start_year / company_id arguments and
# collect the result into pandas.
df_order_history_spark = get_data_from_sql(
    spark,
    TRAIN_SQL_DIR / "order_history.sql",
    start_year=start_year,
    company_id=company_id,
)
df_order_history = df_order_history_spark.toPandas()
# Split the ", "-separated `concept_combinations` string into a Python list per row.
df_order_history["concept_combination_list"] = df_order_history["concept_combinations"].str.split(", ").tolist()
# Keep only orders whose main_recipe_id is present in the recipe table (df_recipes),
# so every remaining order can be resolved through the recipe lookups.
df_order_history = df_order_history[df_order_history["main_recipe_id"].isin(df_recipes["main_recipe_id"])]

In [8]:
from reci_pick.train.baseline import encode_recipe_names
# Build two lookups from the recipe table. Going by their names they map
# recipe id -> embedding and recipe id -> recipe name — confirm in encode_recipe_names.
id_to_embedding_lookup, id_to_name_lookup = encode_recipe_names(
    df_recipes = df_recipes
)

In [9]:
from reci_pick.train.preprocessing import  split_train_test
# Split the order history into a train and a test part at split_yyyyww=202503.
# Which side the split week itself falls on is decided inside split_train_test.
df_order_history_train, df_order_history_test, split_yyyyww = split_train_test(
    df_order_history=df_order_history,
    split_yyyyww=202503
)

# Training-period orders restricted to billing agreements that also appear in
# the test set, i.e. the customers whose recommendations can be evaluated.
df_users_to_predict = df_order_history_train[
    df_order_history_train["billing_agreement_id"]
    .isin(df_order_history_test.billing_agreement_id.unique())
]

In [None]:
from reci_pick.train.baseline import make_user_embedding_profile
# NOTE(review): df_users_to_predict is built with the identical expression in
# the train/test split cell; this recomputation is redundant once that cell has run.
df_users_to_predict = df_order_history_train[
    df_order_history_train["billing_agreement_id"].isin(df_order_history_test.billing_agreement_id.unique())
]
# Build embedding profiles from the training purchases of the selected
# customers (presumably one profile per billing agreement — TODO confirm).
user_positive_embeddings = make_user_embedding_profile(
    df_purchase_history=df_users_to_predict, id_to_embedding_lookup=id_to_embedding_lookup
)

In [None]:
import pandas as pd
from reci_pick.train.preprocessing import get_menus_to_predict
# Derive the menus to score from the menu/recipe table and the test-period orders.
df_menu_to_predict = get_menus_to_predict(
    df_menu_recipes=df_menu_recipes,
    df_order_history_test=df_order_history_test
)

# Displays the whole frame; switch to .head() if the output gets large.
df_menu_to_predict


In [None]:
from reci_pick.train.baseline import make_menu_recommendations
# Baseline recommender: asks for the top 8 recipes (top_n=8) using the user
# embedding profiles and the recipe embedding / name lookups.
df_recommendations = make_menu_recommendations(
    df_menu_to_predict=df_menu_to_predict,
    user_positive_embeddings=user_positive_embeddings,
    id_to_embedding_lookup=id_to_embedding_lookup,
    id_to_name_lookup=id_to_name_lookup,
    top_n=8
)

In [None]:
# Preview the first rows of the baseline recommendations.
df_recommendations.head()

In [14]:
# Billing agreement id of one specific customer, used at the end of the
# notebook to inspect that customer's recommendations by hand.
# NOTE(review): hard-coded identifier tied to a person's name — consider
# reading it from configuration before sharing the notebook.
sylvia_agreement_id = 1350590

In [15]:
from reci_pick.train.baseline import compute_precision
# Score the baseline recommendations against the test-period orders.
# See compute_precision for the exact metric definition.
df_precision_at_k = compute_precision(
    df_order_history_test=df_order_history_test,
    df_recommendations=df_recommendations
)

In [None]:
# Mean of `num_purchased_recommendations` per menu week for the baseline.
df_precision_at_k.groupby("menu_week")["num_purchased_recommendations"].mean()

In [None]:
# Shape of the boolean mask, i.e. the number of evaluated rows (not the number of hits).
# NOTE(review): use .sum() here if the count of rows with at least one hit was intended.
(df_precision_at_k["num_purchased_recommendations"] >=1).shape

In [None]:
# Share of evaluated rows where at least one recommendation was purchased.
(df_precision_at_k["num_purchased_recommendations"] >=1).mean()

# Preselector version

In [17]:
# Distinct (recipe_id, main_recipe_id) pairs from the recipe dimension table,
# collected into pandas.
df_recipe_main_recipe_mapping = spark.sql(
    "select distinct recipe_id, main_recipe_id from prod.gold.dim_recipes"
).toPandas()

# Plain dict keyed by recipe_id, used to translate preselector recipe ids
# into main recipe ids.
recipe_id_to_main_recipe_id_lookup = (
    df_recipe_main_recipe_mapping
    .set_index("recipe_id")["main_recipe_id"]
    .to_dict()
)

In [18]:
from reci_pick.paths import PROJECT_DIR
# Local data directory; the exported recommendation files are read from below it.
DATA_DIR = PROJECT_DIR / "data"

In [19]:
from datetime import datetime
from pandas import Timedelta
import pytz
def get_cut_off_date(
    menu_year: int,
    menu_week: int,
    cut_off_weekday: int
) -> datetime:
    """Return the cut-off timestamp for a menu week as a UTC-aware datetime.

    The cut-off is 00:00 UTC on the day that lies ``cut_off_weekday`` days
    after the Monday of the ISO week *before* the menu week.

    Args:
        menu_year: ISO year of the menu week.
        menu_week: ISO week number of the menu week.
        cut_off_weekday: Day offset from Monday (0 = Monday, 2 = Wednesday, ...).

    Returns:
        Timezone-aware cut-off datetime in UTC.

    Raises:
        ValueError: If ``menu_year`` / ``menu_week`` is not a valid ISO week
            (raised by ``datetime.fromisocalendar``).
    """
    # Local import keeps this cell self-contained.
    from datetime import timezone

    monday_menu_week = datetime.fromisocalendar(menu_year, menu_week, 1)
    # One week back from the menu week's Monday, then forward to the cut-off day.
    cut_off_date = monday_menu_week + Timedelta(days=cut_off_weekday - 7)
    # Attach UTC explicitly. astimezone() on a naive datetime would interpret
    # it as host-local time, so the cut-off would shift with the timezone of
    # whichever machine runs the notebook.
    return cut_off_date.replace(tzinfo=timezone.utc)

In [None]:
import pandas as pd
# Menu weeks (within menu_year) for which preselector output files are evaluated.
weeks = [3, 4, 5, 6, 7, 8]
menu_year = 2025
# Number of recommendations kept per agreement.
top_n = 8
df_list = []
for week in weeks:
    print(f"week = {week}")
    # Zero-pad the week so that weeks >= 10 also produce a valid yyyyww file
    # name (a literal "20250{week}" only works for single-digit weeks).
    file_name = f"{company_code}_{menu_year}{week:02d}.parquet"
    df = pd.read_parquet(
        DATA_DIR / f"preselector_recs/{company_code}/{file_name}"
    )

    cut_off_date = get_cut_off_date(
        cut_off_weekday=2,
        menu_week=week,
        menu_year=menu_year
    )

    # Only predictions made before the cut-off count.
    df["predicted_at"] = pd.to_datetime(df["predicted_at"])
    df["is_before_cutoff"] = df["predicted_at"] < cut_off_date

    df = df[df["is_before_cutoff"]]
    # Newest prediction first, so drop_duplicates keeps the latest row for
    # every (agreement, recipe) pair.
    df = df.sort_values(by="predicted_at", ascending=False)
    df = df.drop_duplicates(subset=["agreement_id", "recipe_id"], keep="first")
    # Highest score first, then take the top_n rows per agreement.
    df = df.sort_values(by="score", ascending=False)
    df_topn = df.groupby("agreement_id").head(top_n)
    # NOTE(review): grouping by predicted_at as well yields several rows per
    # agreement when its kept rows come from different prediction runs —
    # confirm this is intended.
    df_top_n_aggregated = pd.DataFrame(df_topn.groupby(["agreement_id", "predicted_at"])[["recipe_id", "score"]].agg(list)).reset_index()
    df_top_n_aggregated["menu_year"] = menu_year
    df_top_n_aggregated["menu_week"] = week
    df_list.append(df_top_n_aggregated)


In [21]:
from reci_pick.helpers import get_dict_values
def replace_recipe_id_with_main_recipe_id (
    df_recommendations_preselector: pd.DataFrame,
) -> pd.DataFrame:
    """Add a ``top_n_recipe_ids`` column holding main recipe ids.

    Each row's ``recipe_id`` value (a list of recipe ids) is translated through
    the notebook-level ``recipe_id_to_main_recipe_id_lookup`` using
    ``get_dict_values``.

    Args:
        df_recommendations_preselector: Frame with a ``recipe_id`` column whose
            cells are lists of recipe ids.

    Returns:
        A new DataFrame with all original columns plus ``top_n_recipe_ids``.
        The input frame is left untouched.
    """
    main_recipe_id_list = [
        get_dict_values(
            look_up_dict=recipe_id_to_main_recipe_id_lookup,
            key_list=recipe_ids,
        )
        for recipe_ids in df_recommendations_preselector["recipe_id"]
    ]
    # Carry the frame's own index: a bare pd.Series(list) gets a fresh 0..n-1
    # index and would be aligned label-by-label on assignment, silently
    # producing NaN for any frame that does not have a default RangeIndex.
    top_n_recipe_ids = pd.Series(
        main_recipe_id_list,
        index=df_recommendations_preselector.index,
        dtype=object,
    )
    return df_recommendations_preselector.assign(top_n_recipe_ids=top_n_recipe_ids)

In [22]:
# Stack the weekly preselector frames, translate recipe ids into main recipe
# ids, and rename columns to the names used by the other recommendation
# frames in this notebook.
df_concated = replace_recipe_id_with_main_recipe_id(
    pd.concat(df_list, ignore_index=True)
).rename(
    columns={
        "agreement_id": "billing_agreement_id",
        "main_recipe_ids": "top_n_recipe_ids",
    }
)

In [23]:
# Score the preselector recommendations, restricted to billing agreements that
# also have baseline recommendations (presumably so both approaches are
# compared on the same customers).
df_precision_preselector = compute_precision(
    df_order_history_test=df_order_history_test,
    df_recommendations=df_concated[df_concated.billing_agreement_id.isin(df_recommendations.billing_agreement_id)]
)

In [None]:
# Baseline: mean of `num_purchased_recommendations` per menu week.
df_precision_at_k.groupby(
    ["menu_week"]
)["num_purchased_recommendations"].mean()

In [None]:
# Preselector: mean of `num_purchased_recommendations` per menu week.
df_precision_preselector.groupby(
    ["menu_week"]
)["num_purchased_recommendations"].mean()

In [None]:
# Preselector: share of evaluated rows with at least one purchased recommendation.
(df_precision_preselector["num_purchased_recommendations"] >=1).mean()

In [None]:
from reci_pick.train.preprocessing import get_top_n_dishes_per_concept
# Top 10 dishes per concept with a 12-week look-back over the full order
# history. split_yyyyww=None presumably means no split week is applied —
# TODO confirm in get_top_n_dishes_per_concept.
df_top_n_dishes_per_concept = get_top_n_dishes_per_concept(
    df_order_history=df_order_history,
    top_n=10,
    look_back_weeks=12,
    split_yyyyww=None,
)

df_top_n_dishes_per_concept

# Front End Version

In [None]:
from reci_pick.paths import PROJECT_DIR
import os

DATA_DIR = PROJECT_DIR / "data"
# Folder holding the exported front-end recommendation files for this company.
result_dir = DATA_DIR / "front_end" / f"{company_code}"
# os.listdir returns entries in arbitrary order, so sort the names to make
# file_list[-1] reproducible (the lexicographically last file).
file_list = sorted(os.listdir(result_dir))
file_list[-1]

In [None]:
# Load the selected front-end export.
file_full_path = result_dir / file_list[-1]
df_recommendations_fe = pd.read_csv(file_full_path)
# Keep the 8 most relevant products per row group and upper-case product_id.
# Built with .loc + .assign so the result is an independent frame; assigning a
# column on the filtered slice would trigger pandas' SettingWithCopyWarning.
df_recommendations_fe_top_8 = (
    df_recommendations_fe
    .loc[df_recommendations_fe["order_of_relevance"] <= 8]
    .assign(product_id=lambda frame: frame["product_id"].str.upper())
)

In [28]:
# One row per (agreement_id, year, week); product ids and their relevance
# ranks are collected into lists.
df_recommendations_fe_agged = (
    df_recommendations_fe_top_8
    .groupby(["agreement_id","year","week"])[["product_id", "order_of_relevance"]]
    .agg(list)
    .reset_index()
)

In [None]:
# Rename the key columns to the names used by the order-history and
# recommendation frames elsewhere in this notebook.
df_recommendations_fe_agged = df_recommendations_fe_agged.rename(
    columns={
        "agreement_id": "billing_agreement_id",
        "year": "menu_year",
        "week": "menu_week",
    }
)

df_recommendations_fe_agged

In [30]:
# Dict mapping product_id -> main_recipe_id, taken from the menu/recipe table.
# NOTE(review): the front-end product ids are upper-cased before lookup; this
# only matches if df_menu_recipes.product_id is upper-case as well — confirm.
product_id_to_main_recipe_mapping = df_menu_recipes.set_index("product_id")["main_recipe_id"].to_dict()

In [31]:
from reci_pick.helpers import get_dict_values
def replace_product_id_with_main_recipe_id (
    df_recommendations_fe: pd.DataFrame,
) -> pd.DataFrame:
    """Add a ``top_n_recipe_ids`` column holding main recipe ids.

    Each row's ``product_id`` value (a list of product ids) is translated
    through the notebook-level ``product_id_to_main_recipe_mapping`` using
    ``get_dict_values``.

    Args:
        df_recommendations_fe: Frame with a ``product_id`` column whose cells
            are lists of product ids. The frame is modified in place.

    Returns:
        The same DataFrame object, now carrying ``top_n_recipe_ids``.
    """
    main_recipe_id_list = [
        get_dict_values(
            look_up_dict=product_id_to_main_recipe_mapping,
            key_list=product_ids,
        )
        for product_ids in df_recommendations_fe["product_id"]
    ]
    # Carry the frame's own index: a bare pd.Series(list) gets a fresh 0..n-1
    # index and would be aligned label-by-label on assignment, silently
    # producing NaN for any frame that does not have a default RangeIndex.
    df_recommendations_fe["top_n_recipe_ids"] = pd.Series(
        main_recipe_id_list,
        index=df_recommendations_fe.index,
        dtype=object,
    )

    return df_recommendations_fe

In [None]:
# Translate the aggregated product-id lists into main recipe ids
# (adds the `top_n_recipe_ids` column) and display the result.
df_recommendations_fe_agged = replace_product_id_with_main_recipe_id(
    df_recommendations_fe=df_recommendations_fe_agged
)
df_recommendations_fe_agged


In [None]:
# Score the front-end recommendations against the test-period orders.
df_precision_fe = compute_precision(
    df_order_history_test=df_order_history_test,
    df_recommendations=df_recommendations_fe_agged
)

# Number of distinct billing agreements evaluated per menu week.
df_precision_fe.groupby("menu_week")["billing_agreement_id"].nunique()

In [None]:
# Summary statistics of `num_purchased_recommendations` per menu week (front end).
df_precision_fe.groupby("menu_week")["num_purchased_recommendations"].describe()

In [131]:
# Recommended main recipe ids for the selected customer, one entry per
# evaluated row of that customer.
sylvias_recs = df_precision_fe.loc[
    df_precision_fe.billing_agreement_id == sylvia_agreement_id,
    "top_n_recipe_ids",
].values
# Resolve every list of recipe ids through the id -> name lookup so the
# recommendations can be read by a human.
recipes_recs = [
    get_dict_values(key_list=arr, look_up_dict=id_to_name_lookup)
    for arr in sylvias_recs
]

In [None]:
# Show the recipe names recommended to the selected customer.
recipes_recs

In [None]:
# NOTE(review): duplicate of the preceding display cell; safe to delete.
recipes_recs