In [None]:
import sys
import os, json

# Work from the parent of the current directory and make the menu-optimization
# code importable from there.
# NOTE(review): os.chdir is not idempotent -- every re-run of this cell moves
# one more level up, so run it exactly once per kernel.
path_parent = os.path.dirname(os.getcwd())
os.chdir(path_parent)  # move up
# os.path.join instead of a hard-coded "\\" so the entry is valid on any OS.
_optimization_path = os.path.join(os.getcwd(), "cheffelo_personalization.menu_optimization")
if _optimization_path not in sys.path:
    sys.path.append(_optimization_path)
import pandas as pd
import numpy as np
from optimization import *
import math
from lmkgroup_ds_utils.azure.storage import BlobConnector

In [None]:
def get_PIM_data(
    url="https://gganalyticsdatalake.blob.core.windows.net/data-science/test-folder/MP20/PIM_RecipeBank_GL_QA",
):
    """Download the PIM recipe bank JSON from blob storage.

    Args:
        url: Blob URL of the JSON file to download. Defaults to the QA recipe
            bank that this notebook was written against.

    Returns:
        The result of ``BlobConnector.download_json_to_df`` for ``url``
        (presumably a pandas DataFrame -- confirm against lmkgroup_ds_utils).
    """
    # NOTE(review): `local=True` is passed through unchanged; its meaning is
    # defined by BlobConnector (presumably local credentials -- confirm).
    datalake_handler = BlobConnector(local=True)

    return datalake_handler.download_json_to_df(url=url)



In [None]:
# Column names shared by the recipe data frames and the request rules.
# ING_COLUMN and TAX_COLUMN are also the keys of the "mainIngredients" and
# "taxonomies" rule entries and the prefixes of the generated indicator columns.
ING_COLUMN = 'mainIngredientId'
TAX_COLUMN = 'taxonomiesId'
RECIPE_ID_COLUMN = 'recipeId'
PRICE_COLUMN = 'price'  # compared against minRecipeCost / maxRecipeCost
RATING_COLUMN = 'averageRating'  # compared against minAverageRating


## Input management

In [None]:
# Example of the input json driving the menu generation.
# - numRecipes: target number of recipes on the menu.
# - rules.taxonomies / rules.mainIngredients: wanted number of recipes
#   ("quantity") per taxonomy / main ingredient.
# - rules.minAverageRating: recipes rated below this value are dropped.
# - rules.minRecipeCost / rules.maxRecipeCost: allowed price range; the filter
#   below keeps min <= price < max.
# - rules.excludeIngredients: main-ingredient ids that must not appear.
input_json = {
	"companyId": "09ecd4f0-ae58-4539-8e8f-9275b1859a19",
	"week": 16,
	"year": 2023,
	"numRecipes": 20,
	"rules":{
		"taxonomies":[
			{
				"taxonomiesId": 1019,
				"quantity": 10,
				"taxonomyTypeId": 1
			}
		
		],
		"mainIngredients": [
			{
				"mainIngredientId": 1,
				"quantity": 11
			}
		],
		"minAverageRating": 3.1,
		"minRecipeCost": 70,
		"maxRecipeCost": 120,
		"excludeIngredients":[3]
	}
}

In [None]:
# Unpack the request payload into the variables used by the rest of the notebook.

# Request identification and menu size.
company_id = input_json["companyId"]
week, year = input_json["week"], input_json["year"]
num_recipes = input_json["numRecipes"]

# Quota rules: wanted number of recipes per main ingredient / taxonomy.
ingredients = input_json["rules"]["mainIngredients"]
taxonomies = input_json["rules"]["taxonomies"]

# Filter rules: excluded ingredients, rating floor and cost range.
ingredients_to_exclude = input_json["rules"]["excludeIngredients"]
min_rating = input_json["rules"]["minAverageRating"]
min_recipe_cost = input_json["rules"]["minRecipeCost"]
max_recipe_cost = input_json["rules"]["maxRecipeCost"]

In [None]:
# Inspect the parsed main-ingredient rules.
ingredients

In [None]:
def get_ing_dist(ingredients, df):
    """Compare the requested main-ingredient quotas with a selected menu.

    Args:
        ingredients: Iterable of rule dicts, each holding the keys
            ``ING_COLUMN`` (ingredient id) and ``'quantity'`` (wanted count).
        df: Data frame of selected recipes with one indicator column per
            ingredient, named ``f"{ING_COLUMN}_{id}"``.

    Returns:
        Tuple ``(ings, not_full)``. ``ings`` has one dict per rule with the
        ingredient id, the ``'wanted'`` count and the ``'actual'`` count found
        in ``df``. ``not_full`` lists the ids whose actual count differs from
        the wanted one.
    """
    ings = []
    not_full = []
    for ing in ingredients:
        column = f"{ING_COLUMN}_{ing[ING_COLUMN]}"
        # A missing indicator column means no selected recipe has this
        # ingredient: report 0 instead of failing with a KeyError.
        # int() turns the numpy scalar into a JSON-serialisable Python int.
        actual = int(df[column].sum()) if column in df.columns else 0
        output = {
            ING_COLUMN: ing[ING_COLUMN],
            'wanted': ing['quantity'],
            'actual': actual,
        }
        ings.append(output)
        if output['wanted'] != output['actual']:
            not_full.append(output[ING_COLUMN])

    return ings, not_full


In [None]:
def get_tax_dist(taxonomies, df):
    """Compare the requested taxonomy quotas with a selected menu.

    Args:
        taxonomies: Iterable of rule dicts, each holding the keys
            ``TAX_COLUMN`` (taxonomy id), ``'taxonomyTypeId'`` and
            ``'quantity'`` (wanted count).
        df: Data frame of selected recipes with one indicator column per
            taxonomy, named ``f"{TAX_COLUMN}_{id}"``.

    Returns:
        Tuple ``(taxs, not_full)``. ``taxs`` has one dict per rule with the
        taxonomy id, its type id, the ``'wanted'`` count and the ``'actual'``
        count found in ``df``. ``not_full`` lists the ids whose actual count
        differs from the wanted one.
    """
    taxs = []
    not_full = []
    for tax in taxonomies:
        column = f"{TAX_COLUMN}_{tax[TAX_COLUMN]}"
        # A missing indicator column means no selected recipe carries this
        # taxonomy: report 0 instead of failing with a KeyError.
        # int() turns the numpy scalar into a JSON-serialisable Python int.
        actual = int(df[column].sum()) if column in df.columns else 0
        output = {
            TAX_COLUMN: tax[TAX_COLUMN],
            'taxonomyTypeId': tax['taxonomyTypeId'],
            'wanted': tax['quantity'],
            'actual': actual,
        }
        taxs.append(output)

        if output['wanted'] != output['actual']:
            not_full.append(output[TAX_COLUMN])
    return taxs, not_full

Joining taxonomies with main ingredients into a single quota dictionary

In [None]:
# Quota per indicator column: "<column>_<id>" -> wanted number of recipes.
dist = {}

# Main ingredients go in first so that they take priority.
dist.update(
    (f"{ING_COLUMN}_{rule[ING_COLUMN]}", rule['quantity']) for rule in ingredients
)

# Taxonomy quotas follow.
dist.update(
    (f"{TAX_COLUMN}_{rule[TAX_COLUMN]}", rule['quantity']) for rule in taxonomies
)




## Data Processing

It is assumed that the data from PIM arrives in the format shown in the cell below.

In [None]:
# Load the PIM recipe data and drop every row that has a missing value.
PIM_data = get_PIM_data().dropna()
PIM_data

In [None]:
# Column dtypes and non-null counts of the loaded data.
PIM_data.info()

Exclude recipes based on the excluded ingredients, the minimum average rating and the allowed recipe cost range

In [None]:
def exclude_recipes(df, min_rating, ingredients_to_exclude, min_recipe_cost, max_recipe_cost):
    """Drop the rows that violate the rating, ingredient or cost rules.

    A row is kept when its rating is at least ``min_rating``, its main
    ingredient is not listed in ``ingredients_to_exclude`` and its price lies
    in the half-open range ``[min_recipe_cost, max_recipe_cost)``.

    Returns:
        The surviving rows, reduced to the recipe id, taxonomy, main
        ingredient and rating columns.
    """
    rating_ok = df[RATING_COLUMN] >= min_rating
    ingredient_ok = ~df[ING_COLUMN].isin(ingredients_to_exclude)
    # NOTE(review): the upper cost bound is exclusive -- confirm this is intended.
    price_ok = df[PRICE_COLUMN].ge(min_recipe_cost) & df[PRICE_COLUMN].lt(max_recipe_cost)

    kept_columns = [RECIPE_ID_COLUMN, TAX_COLUMN, ING_COLUMN, RATING_COLUMN]
    return df.loc[rating_ok & ingredient_ok & price_ok, kept_columns]

In [None]:
# Apply the rating, excluded-ingredient and cost filters to the PIM data.
df_after_excluded = exclude_recipes(PIM_data, min_rating, ingredients_to_exclude, min_recipe_cost, max_recipe_cost)
df_after_excluded

In [None]:
# Main-ingredient ids that are still present after filtering.
df_after_excluded[ING_COLUMN].unique()

In [None]:
# Mean rating per recipe id (a recipe id can occur on several rows).
df_grouped = df_after_excluded[[RECIPE_ID_COLUMN, RATING_COLUMN]].groupby(RECIPE_ID_COLUMN).mean()
df_grouped

In [None]:
def get_dummies(df, df_grouped, dist):
    """Build one row per recipe with 0/1 taxonomy and ingredient indicators.

    Args:
        df: Filtered recipe rows containing ``RECIPE_ID_COLUMN``,
            ``TAX_COLUMN``, ``ING_COLUMN`` and ``RATING_COLUMN``.
        df_grouped: Frame indexed by recipe id that holds ``RATING_COLUMN``;
            it is joined onto the indicators.
        dist: Mapping of indicator-column name -> wanted number of recipes.

    Returns:
        Frame indexed by recipe id, sorted by rating (descending), holding the
        indicator columns, the rating and ``n_overlay``: the number of
        ``dist`` columns the recipe matches, minus one (-1 means no match).
    """
    # dtype=int keeps the indicators numeric regardless of the pandas default.
    dummies = pd.get_dummies(data=df, columns=[TAX_COLUMN, ING_COLUMN], dtype=int)

    # Collapse the rows of each recipe directly: the per-recipe sums are
    # non-negative counts, so capping them at 1 gives the 0/1 indicator.
    out = (
        dummies.drop(columns=RATING_COLUMN)
        .groupby(RECIPE_ID_COLUMN)
        .sum()
        .clip(upper=1)
    )

    # A requested id that no remaining recipe carries has no dummy column; add
    # it as all zeros so the lookup below does not raise a KeyError.
    missing = [key for key in dist if key not in out.columns]
    if missing:
        out = out.reindex(columns=[*out.columns, *missing], fill_value=0)

    out = out.join(df_grouped).sort_values(RATING_COLUMN, ascending=False)
    out["n_overlay"] = out[list(dist.keys())].sum(axis=1) - 1

    return out


In [None]:
# One row per recipe: indicator columns, mean rating and n_overlay.
out = get_dummies(df_after_excluded, df_grouped, dist)

In [None]:
# n_overlay is -1 for recipes that match none of the requested quotas.
# Those form the fallback pool; every other recipe is a selection candidate.
rest_df = out.loc[out["n_overlay"].lt(0)]
wanted_df = out.loc[out["n_overlay"].ge(0)]
wanted_df


In [None]:
# Amount by which the summed quotas exceed the menu size (0 when they fit).
# NOTE(review): n_overlays is not referenced again in the visible cells.
n_overlays = max(0, sum(dist.values()) - num_recipes)
n_overlays

In [None]:
def update_dist(dist, df, wanted_df):
    """Subtract the picked recipe(s) from the open quotas and prune candidates.

    Args:
        dist: Mapping of indicator-column name -> number of recipes still wanted.
        df: The recipe row(s) just added to the menu. Their index labels must
            be present in ``wanted_df``.
        wanted_df: The current candidate recipes.

    Returns:
        Tuple ``(actual, wanted_df)``. ``actual`` holds only the quotas that
        are still open, with their remaining counts. ``wanted_df`` is the
        candidate frame without the picked rows and without any recipe that
        matches a quota which is now filled.
    """
    actual = {}
    wanted_df = wanted_df.drop(df.index)
    for column, quantity in dist.items():
        new_quantity = int(quantity - df[column].sum())
        # `<= 0` rather than `== 0`: a quota that is over-filled (e.g. a
        # requested quantity of 0) must be closed too, not kept open with a
        # negative remainder.
        if new_quantity <= 0:
            wanted_df = wanted_df[wanted_df[column] == 0]
            continue
        actual[column] = new_quantity
    return actual, wanted_df

In [None]:
# Inspect the quotas before the selection starts.
dist

In [None]:
# Inspect the candidate recipes before the selection starts.
wanted_df

In [None]:
# Greedy selection: repeatedly pick, at random, one of the candidates that
# covers the most still-open quotas, then shrink the quotas and the candidate
# pool accordingly.
# The loop works on its own names so that `dist` and `wanted_df` stay untouched
# and re-running this cell starts from the same inputs.
# NOTE(review): sample() is unseeded, so the selected menu differs between
# runs; pass random_state=... if reproducibility is required.
remaining_dist = dict(dist)
candidates = wanted_df
picked_rows = []

while len(picked_rows) < num_recipes and len(candidates) > 0:

    mult_df = candidates[candidates['n_overlay'] == candidates['n_overlay'].max()]
    row = mult_df.sample(1)
    picked_rows.append(row)

    remaining_dist, candidates = update_dist(remaining_dist, row, candidates)
    print(remaining_dist)
    # assign() builds a new frame instead of writing into a filtered one,
    # which avoids pandas' SettingWithCopyWarning.
    candidates = candidates.assign(
        n_overlay=candidates[list(remaining_dist.keys())].sum(axis=1) - 1
    )
    candidates = candidates[candidates["n_overlay"] >= 0]

# Concatenate once at the end instead of growing the frame inside the loop.
final_df = pd.concat(picked_rows) if picked_rows else pd.DataFrame()
    
    

In [None]:
# Number of recipes chosen by the quota-driven selection.
len(final_df)

In [None]:
# Recipes that match none of the requested quotas (fallback pool).
rest_df

In [None]:
# If the quota-driven selection came up short, top the menu up with randomly
# sampled recipes that matched none of the requested quotas (rest_df).
if len(final_df) < num_recipes:
    remaining = num_recipes - len(final_df)

    # Skip anything already on the menu so a re-run cannot add duplicates, and
    # cap the sample size: sample() raises a ValueError when asked for more
    # rows than the pool holds.
    fill_pool = rest_df.drop(final_df.index, errors="ignore")
    remaining_df = fill_pool.sample(min(remaining, len(fill_pool)))

    final_df = pd.concat([final_df, remaining_df])

In [None]:
# Number of recipes on the menu after the fill-up step.
len(final_df)

In [None]:
# Requested main-ingredient rules, shown again for comparison with the result.
ingredients

In [None]:
def get_msg(ings_not_full, taxs_not_full):
    """Build the status code and message for the generated menu.

    Args:
        ings_not_full: List of main-ingredient ids whose quota was not met.
        taxs_not_full: List of taxonomy ids whose quota was not met.

    Returns:
        ``(0, "SUCCESS")`` when both lists are empty. Otherwise
        ``(1, "WARNING! ...")`` with one sentence per non-empty list,
        taxonomies first.
    """
    if not len(ings_not_full + taxs_not_full):
        return (0, "SUCCESS")

    parts = []
    if len(taxs_not_full):
        # str(list)[1:-1] renders the ids comma-separated without the brackets.
        parts.append(f"TAXONOMIES {str(taxs_not_full)[1:-1]} not fulfilled.")
    if len(ings_not_full):
        parts.append(f"INGREDIENTS {str(ings_not_full)[1:-1]} not fulfilled.")

    # Join with a space so the two sentences no longer run into each other.
    msg = " ".join(parts)
    return (1, f"WARNING! {msg}")

In [None]:
# Compare the final menu with the requested quotas and derive the status.
ings_out, ings_not_full = get_ing_dist(ingredients, final_df)
tax_out, taxs_not_full = get_tax_dist(taxonomies, final_df)
status, msg = get_msg(ings_not_full, taxs_not_full)

# Response payload: request identification, quota report, status and the
# chosen recipes (recipe id + main ingredient, looked up in the PIM data by
# the recipe ids that form final_df's index).
output = dict(
    company_id=company_id,
    week=week,
    year=year,
    ingredients=ings_out,
    taxonomies=tax_out,
    STATUS=status,
    STATUS_MSG=msg,
    recipes=(
        PIM_data.loc[
            PIM_data[RECIPE_ID_COLUMN].isin(final_df.index),
            [RECIPE_ID_COLUMN, ING_COLUMN],
        ]
        .drop_duplicates()
        .to_dict(orient="records")
    ),
)

In [None]:
# Show the assembled response payload.
output