In [None]:
import dotenv
from aligned import FeatureStore

dotenv.load_dotenv()

In [None]:
store = await FeatureStore.from_dir(".")

In [None]:
preds = await store.model("user_recipe_likability").all_predictions().to_pandas()
preds[preds["agreement_id"] == 1312655]

In [None]:
recs = await store.model("rec_engine").all_predictions().to_polars()

In [None]:
recs.collect()

In [None]:
recs.collect()

In [None]:
import polars as pl
results = recs.filter(pl.col("order_of_relevance_cluster") <= 8).collect()

In [None]:
import polars as pl
data = pl.read_json("/Users/mats.mollestad/Downloads/PIM_RecipeBank_GL_Quality")

In [None]:
data

In [None]:
data.write_parquet("/Users/mats.mollestad/Downloads/PIM_RecipeBank_GL_Quality.parquet")

In [None]:
selected_results = results.filter(pl.col("predicted_at") == pl.col("predicted_at").max())
selected_results

In [None]:
selected_results.select(pl.col("company_id").unique())[0,0]

In [None]:
from datetime import datetime, date
from cheffelo_personalization.rec_engine.data.recommendations import RecommendatedDish
from cheffelo_personalization.rec_engine.data.recipe import RecipeTaxonomies

class Orders:
    """Column schema expected of the ``orders`` frame used in this notebook.

    Every annotation names a column that must be present; the annotated type
    documents what the column is expected to hold. Only column *presence* is
    enforced by ``raise_if_invalid`` -- the types are informational.
    """

    year: int
    week: int
    agreement_id: int
    order_id: str
    recipe_name: str
    product_id: str
    product_name: str
    gross_revenue_ex_vat: float
    company_id: str
    updated_at: datetime
    cutoff_day: date
    businessLogic_day: int
    deviation_week: int

    @classmethod
    def raise_if_invalid(cls, frame) -> None:
        """Raise when ``frame`` lacks any column declared on this class.

        Args:
            frame: Any object exposing a ``columns`` iterable of column names
                (both pandas and polars data frames do).

        Raises:
            ValueError: If one or more declared columns are missing. The
                message lists the missing names in declaration order.
        """
        # NOTE(review): modelled on the `raise_if_invalid` call made on
        # `RecommendatedDish` in this notebook; that implementation is not
        # visible here, so confirm it does not also validate dtypes.
        present = set(frame.columns)
        missing = [column for column in cls.__annotations__ if column not in present]
        if missing:
            raise ValueError(f"Orders frame is missing required columns: {missing}")
    
async def evaluate_recommendations(recommendations: pl.DataFrame, orders: pl.DataFrame) -> pl.DataFrame:
    """Match shown recommendations with the dishes that were actually ordered.

    Both inputs are validated first, then joined on
    ``agreement_id``, ``year``, ``week`` and ``product_id`` using polars'
    default join strategy, so only recommendations that have a matching
    order line are kept.

    Args:
        recommendations: Frame accepted by ``RecommendatedDish.raise_if_invalid``.
        orders: Frame accepted by ``Orders.raise_if_invalid``.

    Returns:
        The joined frame. Previously the join was computed and then dropped,
        so the coroutine always resolved to ``None``.

    Raises:
        Whatever the two ``raise_if_invalid`` calls raise for a bad frame.
    """
    # TODO: the original notes also planned to compute the number of
    # recommended dishes versus purchased dishes; that step is not implemented.
    RecommendatedDish.raise_if_invalid(recommendations)
    Orders.raise_if_invalid(orders)

    join_keys = ["agreement_id", "year", "week", "product_id"]
    return recommendations.join(orders, on=join_keys)
    

    

from aligned import FeatureStore

store = await FeatureStore.from_dir(".")

entities = {
    "main_recipe_id": [100, 101],
    "agreement_id": [1312655, 1312655]
}

predictions = await store.model("user_recipe_likability")\
    .predictions_for(entities)\
    .to_pandas()


In [None]:
from cheffelo_personalization.rec_engine.sources import adb_ml_output

baseline_entities = adb_ml_output.fetch("SELECT agreement_id, year, week, product_id, run_timestamp as event_timestamp FROM ml_output.recommendations WHERE company_id = '09ECD4F0-AE58-4539-8E8F-9275B1859A19' AND run_timestamp >= '2023-11-15' AND order_of_relevance_cluster <= 8")

baseline = await store.model("rec_engine").using_source(
    adb_ml_output.table("recommendations", mapping_keys={"run_timestamp": "predicted_at"})
).predictions_for(baseline_entities, event_timestamp_column="event_timestamp").to_polars()

In [None]:
baseline.collect().write_parquet("/Users/mats.mollestad/Desktop/Cheffelo/baseline_recommendations.parquet")

In [None]:
baseline_selection = baseline.groupby(["agreement_id", "year", "week"]).agg([
    pl.col("product_id").alias("product_ids")
]).collect()

In [None]:
new_selection = selected_results.groupby(["agreement_id", "year", "week"]).agg([
    pl.col("product_id").alias("proudct_ids_new")
])

In [None]:
# Compare the new and the baseline product selections for every
# (agreement_id, year, week) that appears in both frames, then summarise with
# `.describe()`.
# The misspelled names ("interesection", "proudct_ids_new") are runtime column
# names: "proudct_ids_new" must match the alias used when `new_selection` was
# built, so do not correct one without the other.
new_selection.join(baseline_selection, on=["agreement_id", "year", "week"]).select(
    # Number of product ids present in both lists.
    interesection=pl.col("proudct_ids_new").list.set_intersection("product_ids").list.lengths(),
    # All distinct product ids from either list (kept as a list; its length is taken below).
    union = pl.col("proudct_ids_new").list.set_union("product_ids"),
).with_columns(
    # Jaccard similarity: |intersection| / |union|.
    jac_sim=pl.col("interesection") / pl.col("union").list.lengths(),
    # Share of a fixed-size list of 8 that overlaps.
    # NOTE(review): 8 presumably mirrors the `order_of_relevance_cluster <= 8` filters used earlier -- confirm.
    sim_prec=pl.col("interesection") / 8,
).describe()

In [None]:
from cheffelo_personalization.rec_engine.sources import azure_dl_creds

pre_selector = await azure_dl_creds.csv_at("data-science/personalization/preselector/prod/results/09ECD4F0-AE58-4539-8E8F-9275B1859A19/2023-46/latest.csv").to_pandas()

In [None]:
pre_selector

In [None]:
selection = await store.feature_view("weekly_products").all().to_pandas()
selection

In [None]:
selection[selection["is_default_raw"].isna()]

In [None]:
selection["is_default"].isna().value_counts()

In [None]:
selection[~selection["recipe_id"].isna()]

In [None]:
selection[(selection["menu_week"] == 50) & (selection["menu_year"] == 2023) & (selection["company_id"].str.lower() == "09ecd4f0-ae58-4539-8e8f-9275b1859a19")]

In [None]:
store.models.keys()

In [None]:
recs = await store.model("presented_recommendations").all_predictions().to_pandas()

In [None]:
recs["company_id"].unique()

In [None]:
gl_id = "09ECD4F0-AE58-4539-8E8F-9275B1859A19"

In [None]:
recs[recs["company_id"] == gl_id]

In [None]:
store.model("presented_recommendations").model.predictions_view.application_source

In [None]:
await store.model("presented_recommendations").using_source(
    store.model("presented_recommendations").model.predictions_view.application_source
).upsert_predictions(
    recs[recs["company_id"] == gl_id]
)

In [None]:
store.feature_views.keys()

In [None]:
from cheffelo_personalization.rec_engine.update_source import update_from_staging

await update_from_staging([
    "ordered_recipes"
], store)

In [None]:
data = await store.feature_view("ordered_recipes").all().to_polars()

In [None]:
data.collect()

In [None]:
# Request predictions from the "rec_engine" model.
# NOTE(review): the entity dict is empty, and the result is awaited without a
# `.to_pandas()` / `.to_polars()` call, unlike the other `predictions_for(...)`
# calls in this notebook. Later cells index `rankings` like a pandas frame
# (`rankings["company_id"].unique()[0]`, `.sort_values(...)`), so this cell
# probably lost its entities and the conversion -- confirm before re-running.
rankings = await store.model("rec_engine").predictions_for({
})
rankings

In [None]:
from cheffelo_personalization.rec_engine.run import format_ranking_recommendations

formatted_recommendations = format_ranking_recommendations(
    rankings, 8
)
formatted_recommendations["company_id"] = rankings["company_id"].unique()[0]
formatted_recommendations["run_timestamp"] = rankings["predicted_at"].unique()[0]


In [None]:
formatted_recommendations

In [None]:
store.model("presented_recommendations").model.predictions_view.application_source

In [None]:
formatted_recommendations[formatted_recommendations["agreement_id"] == 1312655]

In [None]:
await store.model("presented_recommendations").using_source(
    store.model("presented_recommendations").model.predictions_view.application_source
).upsert_predictions(
    formatted_recommendations
)

In [None]:
store.feature_views.keys()

In [None]:
orders = await store.feature_view("ordered_recipes").all().to_polars()

In [None]:
import polars as pl
orders.filter(pl.col("agreement_id") == 998320).collect()

In [None]:
formatted_recommendations[formatted_recommendations["agreement_id"] == 998320]

In [None]:
rankings[(rankings["agreement_id"] == 1091275) & (rankings["week"] == 47)].sort_values("order_of_relevance_cluster")

In [None]:
from cheffelo_personalization.rec_engine.evaluate import evaluate_predictions

In [None]:
await store.feature_view("recipe_taxonomies").all().to_pandas()

In [None]:
from cheffelo_personalization.rec_engine.sources import model_preds
predictions = await store.model("presented_recommendations").using_source(model_preds.parquet_at("formatted_recommendations.parquet")).all_predictions().to_pandas()

In [None]:
predictions

In [None]:
from cheffelo_personalization.sql_server import SqlServerConfig


source = SqlServerConfig('').table(
    "latest_recommendations", mapping_keys={"run_timestamp": "predicted_at"}
)
other = source.feature_identifier_for(["predicted_at"])

renames = dict(zip(["predicted_at"], other))
renames

In [None]:
rankings.rename(columns=renames)

In [None]:
import asyncpg
from cheffelo_personalization.rec_engine.sources import segment_personas_db

conn = await asyncpg.connect(segment_personas_db.url)

In [None]:
from cheffelo_personalization.rec_engine.sources import model_preds

(await model_preds.parquet_at("recommendation_products.parquet").to_polars()).head().collect()

In [None]:
(await model_preds.parquet_at("user_recipe_likability.parquet").to_polars()).head().collect()

In [None]:
predictions.shape

In [None]:
await store.model("presented_recommendations").upsert_predictions(predictions)

In [None]:
query = """
INSERT INTO recommendations(company_id, recommendation_json, agreement_id, run_timestamp)
        VALUES ($1, $2, $3, $4)
        ON CONFLICT (agreement_id)
        DO UPDATE SET company_id = EXCLUDED.company_id, recommendation_json = EXCLUDED.recommendation_json, run_timestamp = EXCLUDED.run_timestamp
"""

In [None]:
import polars as pl
# Convert the pandas `predictions` frame to polars and de-duplicate on
# "agreement_id" (the INSERT ... ON CONFLICT (agreement_id) statement above
# targets one row per agreement).
# NOTE(review): this rebinds `predictions` from a pandas to a polars frame, so
# the cell cannot be re-run as-is, and later cells still call pandas-style
# methods on `predictions` (`.merge(...)`, `["year"].astype(str)`) -- confirm
# which frame those cells are meant to use.
predictions = pl.from_pandas(predictions).unique("agreement_id")

In [None]:
predictions.with_columns(
    pl.col("run_timestamp").cast(pl.Utf8)
)

In [None]:
await conn.executemany(query, predictions.with_columns(
    pl.col("run_timestamp").cast(pl.Utf8)
).select(["company_id", "recommendation_json", "agreement_id", "run_timestamp"]).to_numpy())

In [None]:
taxonomies = await store.feature_view("recipe_taxonomies").features_for({
    "main_recipe_id": rankings["main_recipe_id"].unique()
}).to_pandas()

In [None]:
taxonomies

In [None]:
joined = predictions.merge(taxonomies, how="inner", on="recipe_id")

In [None]:
joined

In [None]:
crate_taxonomies = joined.groupby(["agreement_id", "year", "week"])["recipe_taxonomies"].apply(lambda group: ",".join(group)).str.split(",").reset_index()

In [None]:
with_count = crate_taxonomies.assign(
    unique_taxonomie_count=crate_taxonomies["recipe_taxonomies"].apply(lambda row: len(set(row)))
)

In [None]:
with_count

In [None]:
set(with_count["recipe_taxonomies"].iloc[0])

In [None]:
from cheffelo_personalization.rec_engine.sources import adb

baseline_preds = await store.model("rec_engine").using_source(
    adb.with_schema("ml_output").table("latest_recommendations", mapping_keys={
        "run_timestamp": "predicted_at",
        "product_id": "recipe_id"
    })
).all_predictions().cached_at("/Users/mats.mollestad/Desktop/Cheffelo/latest_recommendations_2023_11_7.parquet").to_pandas()

In [None]:
baseline_preds

In [None]:
baseline_subset = baseline_preds[baseline_preds["order_of_relevance_cluster"] < 4]

In [None]:
predictions

In [None]:
product_recipe_map = (predictions["product_id"] + predictions["year"].astype(str) + predictions["week"].astype(str) + ":" + predictions["recipe_id"].astype(str)).unique()

In [None]:
product_recipe_map = {row.split(":")[0]: row.split(":")[1] for row in product_recipe_map }

In [None]:
baseline_subset = baseline_subset.assign(
    recipe_id=baseline_subset[["product_id", "year", "week"]].apply(
        lambda value: product_recipe_map.get(
            value["product_id"] + str(value["year"]) + str(value["week"])
        ), axis=1
    )
)

In [None]:
baseline_subset["recipe_id"].isnull().value_counts()

In [None]:
# Keep only the rows whose product could be mapped to a recipe and store the id
# as an integer. `.assign` builds a new frame instead of writing a column into
# the filtered slice, which is what triggered pandas' SettingWithCopyWarning.
baseline_subset = baseline_subset[baseline_subset["recipe_id"].notna()].assign(
    recipe_id=lambda mapped: mapped["recipe_id"].astype("int")
)

In [None]:
baseline_subset

In [None]:
predictions["agreement_id"].unique().shape

In [None]:
baseline_subset["agreement_id"].unique().shape

In [None]:
predictions["recipe_id"].unique().shape

In [None]:
baseline_subset["recipe_id"].unique().shape

In [None]:
baseline_subset

In [None]:
import pandas as pd

def evaluate_predictions(
    predictions: pd.DataFrame,
    recipe_taxonomies: pd.DataFrame
) -> pd.DataFrame:
    """Count the distinct taxonomies covered by each agreement's weekly selection.

    Note: this notebook also imports a function with the same name from
    ``cheffelo_personalization.rec_engine.evaluate``; once this cell has run,
    the name refers to this local definition.

    Args:
        predictions: Frame with ``agreement_id``, ``year``, ``week`` and
            ``main_recipe_id`` columns.
        recipe_taxonomies: Frame with ``main_recipe_id`` and a
            ``recipe_taxonomies`` column holding comma-separated strings.

    Returns:
        One row per ``(agreement_id, year, week)`` with two extra columns:
        ``recipe_taxonomies`` (list of every taxonomy in the group, duplicates
        kept) and ``unique_taxonomie_count`` (number of distinct entries).
        Predictions without a matching taxonomy row are dropped by the inner
        merge.
    """
    joined = predictions.merge(recipe_taxonomies, how="inner", on="main_recipe_id")

    # Glue the per-recipe strings of a group together before splitting, so the
    # result is one flat list of taxonomies per group.
    crate_taxonomies = (
        joined.groupby(["agreement_id", "year", "week"])["recipe_taxonomies"]
        .apply(lambda group: ",".join(group))
        .str.split(",")
        .reset_index()
    )

    return crate_taxonomies.assign(
        unique_taxonomie_count=crate_taxonomies["recipe_taxonomies"].apply(lambda row: len(set(row)))
    )

In [None]:
baseline_with_recipe_id = baseline_preds.assign(
    recipe_id=baseline_preds[["product_id", "year", "week"]].apply(
        lambda value: product_recipe_map.get(
            value["product_id"] + str(value["year"]) + str(value["week"])
        ), axis=1
    )
)
baseline_with_recipe_id = baseline_with_recipe_id[~baseline_with_recipe_id["recipe_id"].isnull()]

In [None]:
baseline_with_recipe_id.shape

In [None]:
# For every cutoff i in 2..7 (that is what range(2, 8) yields), keep the
# rankings whose order_of_relevance_cluster is strictly below i and print the
# summary statistics of the distinct-taxonomy count per (agreement, year, week).
# NOTE(review): this cell was originally labelled "Choosing 3 - 8 dishes", which
# does not match range(2, 8) combined with `< i`; confirm whether the rank is
# 0- or 1-based (the disabled baseline lines use `< i + 1`) and adjust the range.
for i in range(2, 8):
    print(f"Choosing {i}")
    print("\nNew version")
    print(evaluate_predictions(rankings[rankings["order_of_relevance_cluster"] < i], taxonomies)["unique_taxonomie_count"].describe())
    # print("\nBaseline")
    # print(evaluate_predictions(baseline_with_recipe_id[baseline_with_recipe_id["order_of_relevance_cluster"] < i + 1], taxonomies)["unique_taxonomie_count"].describe())


In [None]:
# Cast the mapped recipe ids to integers. `baseline_with_recipe_id` was produced
# by a boolean filter, so writing a column into it in place triggers pandas'
# SettingWithCopyWarning; `.assign` returns a fresh frame instead.
baseline_with_recipe_id = baseline_with_recipe_id.assign(
    recipe_id=baseline_with_recipe_id["recipe_id"].astype("int")
)

In [None]:
import pandas as pd
def recipe_distribution(df: pd.DataFrame):
    """Draw a density-normalised histogram of ``df["product_id"]``.

    The histogram gets one bin per distinct ``product_id`` value found in
    ``df``. Nothing is returned; the plot is the only effect.
    """
    product_ids = df["product_id"]
    distinct_count = len(product_ids.unique())
    product_ids.hist(density=1, bins=distinct_count)

In [None]:
baseline_subset = baseline_with_recipe_id[baseline_with_recipe_id["order_of_relevance_cluster"] <= 4]

In [None]:
recipe_distribution(baseline_subset[baseline_subset["week"] == 46])

In [None]:
recipe_distribution(rankings[(rankings["order_of_relevance_cluster"] < 5) & (rankings["week"] == 47)])

In [None]:
baseline_subset["recipe_id"].value_counts(normalize=True).describe()

In [None]:
rankings[(rankings["week"] == 47) & (rankings["order_of_relevance_cluster"] < 4)]["main_recipe_id"].value_counts(normalize=True).describe()

In [None]:
baseline_subset[baseline_subset["agreement_id"].isin(predictions["agreement_id"].unique())]["recipe_id"].value_counts()

In [None]:
baseline_subset[~baseline_subset["agreement_id"].isin(predictions["agreement_id"])]["agreement_id"].unique()

In [None]:
ratings = await store.feature_view("recipe_rating").all().to_pandas()

In [None]:
len(set(ratings[
    ratings["company_id"].isin(
        ['09ECD4F0-AE58-4539-8E8F-9275B1859A19', '5E65A955-7B1A-446C-B24F-CFE576BF52D7']
    ) & ~ratings["RATING"].isna()
]["agreement_id"].to_list()))

In [None]:
len(set(baseline_subset["agreement_id"].to_list()))

In [None]:
len(set(predictions["agreement_id"].to_list()))

In [None]:
baseline_subset["product_id"].value_counts(sort=True)

In [None]:
predictions["recipe_id"].value_counts(sort=True)

In [None]:
# Toy example: attach each recipe's mean rating to every order row.
# Note that this rebinds `orders` to a small hand-written pandas frame.
orders = pd.DataFrame({
    "agreement_id": [1, 1, 2, 2, 3],
    "recipe_id": [1, 2, 1, 2, 1],
    "rating": [1, 2, 2, 4, float("nan")]
})
recipes_ratings = orders[["recipe_id", "rating"]]
# Average rating per recipe_id (rather than one global average), highest first
cleaned_reciperatings = (
    recipes_ratings.groupby("recipe_id")
    .mean()
    .sort_values("rating", ascending=False)
)
# Recipes with no rating at all get a mean of NaN, which is replaced by 0 here.
# NOTE(review): this comment used to say such recipes are assigned 1, but the
# code fills with 0 -- confirm which value is intended.
cleaned_reciperatingsNaNRemoved = pd.DataFrame(
    cleaned_reciperatings.rating.fillna(0)
)
# Turn the recipe_id index back into a regular column so it can be merged on
cleaned_reciperatingsNaNRemoved.reset_index(inplace=True)
# Merge the per-recipe mean back onto the order rows. Both frames carry a
# "rating" column, so pandas' default suffixes apply ("rating_x" is the
# original rating, "rating_y" the per-recipe mean).
orders = orders.merge(
    cleaned_reciperatingsNaNRemoved, on="recipe_id", how="inner"
)
# Clean up and make it ready for further use (no clean-up step follows in this cell)

In [None]:
orders

In [None]:
ratings = await store.feature_view("recipe_rating").all().to_pandas()

In [None]:
ratings

In [None]:
ratings["rating"].describe()

In [None]:
ratings = pd.DataFrame({
    "agreement_id": [1, 1, 2, 2, 3],
    "recipe_id": [1, 2, 1, 2, 1],
    "rating": [1, 2, 2, 4, float("nan")]
})
mean_recipe_rating = (
    ratings.groupby("recipe_id")["rating"].mean()
).reset_index()
mean_recipe_rating

In [None]:
# Fill missing ratings with the mean rating of the same recipe and fall back to
# 3 when the recipe has no rating at all. `transform("mean")` returns a Series
# that shares `ratings`' own index, so the fill values line up row by row even
# when the index is not the default 0..n-1 range (a merge result always gets a
# fresh range index, which only happened to match here).
ratings["rating"] = (
    ratings["rating"]
    .fillna(ratings.groupby("recipe_id")["rating"].transform("mean"))
    .fillna(3)
)

In [None]:
ratings[["recipe_id"]].merge(mean_recipe_rating, on="recipe_id", how="left")

In [None]:
ratings["recipe_id"]

In [None]:
ratings

In [None]:
# KPI

# Who to compute for
# All delivered orders

# What to compute
# Percentage of recommended dishes in the delivered basket that was selected by the user

# What do I need
# The delivered basket
# Whether each recipe was recommended
# Whether each recipe was a default dish
# Who deviated from the "default" box

# Used features
# Who deviated
# If the product was default or not

# Entities = The delivered box with its associated billing_agreement_basket, its year and week
# Compare dataset = The recipes that were shown to the user when they made their last changes

In [None]:
from cheffelo_personalization.rec_engine.sources import adb

ordered_products = """SELECT 
    bab.id as billing_agreement_basket_id, 
    p.product_id, 
    ol.agreement_id, 
    ol.week, 
    ol.year, 
    le.last_edit
FROM def.order_lines ol
INNER JOIN cms.billing_agreement_basket bab ON bab.agreement_id = ol.agreement_id
INNER JOIN (
    SELECT year, week, billing_agreement_basket_id, MAX(updated_at) last_edit
    FROM cms.billing_agreement_basket_deviation
    WHERE year = 2023
    AND week = 46
    AND is_active = 1
    GROUP BY year, week, billing_agreement_basket_id
) le ON le.billing_agreement_basket_id = bab.id AND le.year = ol.year AND le.week = ol.week
INNER JOIN mb.products p ON p.variation_id = ol.variation_id AND ol.company_id = p.variation_company_id
WHERE product_type_id IN (
        'CAC333EA-EC15-4EEA-9D8D-2B9EF60EC0C1',
        '2F163D69-8AC1-6E0C-8793-FF0000804EB3'
    )"""

entities = adb.fetch(ordered_products)

In [None]:
from aligned import FeatureStore

store = await FeatureStore.from_dir(".")

In [None]:
df = await store.features_for(
    entities, 
    features=[
        # "model:rec_engine:order_of_relevance_cluster",
        "feature_view:basket_deviation:was_user",
        "feature_view:basket_deviation:was_meal_selector",
    ], 
    event_timestamp_column="last_edit"
).to_pandas()

In [None]:
df

In [None]:
df["was_meal_selector"].value_counts()

In [None]:
df["was_user"].value_counts()

In [None]:
predictions = await store.model("rec_engine").using_source(
    adb.with_schema("ml_output").table("recommendations", mapping_keys={"run_timestamp": "predicted_at"})
).predictions_for(
    entities,
    event_timestamp_column="last_edit"
).to_pandas()

In [None]:
predictions

In [None]:
import polars as pl

In [None]:
preds = pl.from_pandas(predictions)

In [None]:
preds.groupby("agreement_id").agg([pl.col("product_id").count().alias("product_count")]).filter(pl.col("product_count") == 2)


In [None]:
preds

In [None]:
preds.select(pl.col("order_of_relevance_cluster").null_count())

In [None]:
preds.select(pl.col("agreement_id").n_unique())

In [None]:
# Put the two deviation flags next to the prediction columns.
# NOTE(review): a horizontal concat pairs rows purely by position. `preds` and
# `df` come from two separate requests over the same `entities`, so this
# assumes both returned their rows in identical order -- confirm, or prefer the
# single `features_for` request used further down, which fetches all three
# columns together.
# This also rebinds `data`, which earlier cells used for other frames.
data = pl.concat([preds, pl.from_pandas(df[["was_meal_selector", "was_user"]])], how="horizontal")

In [None]:
data

In [None]:
def compute_metrics(df: pl.DataFrame) -> pl.DataFrame:
    """Aggregate deviation and recommendation-rank metrics per ``agreement_id``.

    Expects the columns ``agreement_id``, ``was_meal_selector``, ``was_user``
    and ``order_of_relevance_cluster``. The two "Percentage ..." columns are
    plain ratios (not multiplied by 100) whose denominator is the per-agreement
    count of ``was_user``.
    """
    in_top_8 = pl.col("order_of_relevance_cluster") <= 8
    was_user = pl.col("was_user")
    was_user_count = was_user.count()

    aggregations = [
        pl.col("was_meal_selector").sum().alias("Meal Selector deviated count"),
        was_user.sum().alias("User manually deviated count"),
        in_top_8.sum().alias("Number of recommendations in top 8 rank"),
        (in_top_8.sum() / was_user_count).alias(
            "Percentage of selected recommendations in box that was top 8 (aka. hopfully shown)"
        ),
        ((in_top_8 & was_user).sum() / was_user_count).alias(
            "Percentage of selected recommendations in box that was top 8 and selected by the user"
        ),
    ]
    return df.groupby("agreement_id").agg(aggregations)

In [None]:
compute_metrics(data).describe()

In [None]:
from aligned.feature_source import BatchFeatureSource
# The override below needs the `sources` mapping, which is only relied on for a
# BatchFeatureSource -- fail with a readable message for anything else.
source = store.feature_source
if not isinstance(source, BatchFeatureSource):
    raise ValueError(
        "Expected store.feature_source to be a BatchFeatureSource, "
        f"got {type(source).__name__}"
    )

# Point the "rec_engine" model at the ml_output.recommendations table, exposing
# its run_timestamp column as predicted_at. This mutates the feature source
# object obtained from `store` in place.
source.sources["model:rec_engine"] = adb.with_schema("ml_output").table(
    "recommendations", mapping_keys={"run_timestamp": "predicted_at"}
)

In [None]:
source.sources

In [None]:
eval_metrics = await store.with_source(source).features_for(entities, features=[
        "model:rec_engine:order_of_relevance_cluster",
        "feature_view:basket_deviation:was_user",
        "feature_view:basket_deviation:was_meal_selector",
    ],
    event_timestamp_column="last_edit"
).to_pandas()

In [None]:
eval_metrics

In [None]:
compute_metrics(pl.from_pandas(eval_metrics)).describe()