### Note: before running the notebook, make sure the correct kernel is selected, preferably the one pointing to the venv of the menu planning project

# 1. Load libraries

In [1]:
import random
import numpy as np
import pandas as pd
import sys

sys.path.append("../menu_optimiser")

# Load environment variables from the project's .env file
from dotenv import find_dotenv, load_dotenv

load_dotenv(find_dotenv())

True

In [2]:
from data_contracts.helper import camel_to_snake
from data_contracts.sources import azure_dl_creds

# 2. Import data

In [3]:

df_recipes = await azure_dl_creds.directory("data-science").directory("test-folder").directory("MP20").parquet_at("PIM_RecipeBank_GL_PRD").to_pandas()

In [4]:
# Sanity check: confirm the loader returned a pandas DataFrame.
type(df_recipes)

pandas.core.frame.DataFrame

In [5]:
# (rows, columns) of the raw recipe table.
df_recipes.shape

(17946, 13)

In [6]:
# NOTE(review): this repeats the shape check of the previous cell and can be removed.
df_recipes.shape

(17946, 13)

In [7]:
# Column dtypes of the raw table. AverageRating and Price come back as
# `object` rather than a numeric dtype (presumably Decimal values - confirm),
# so cast them before doing arithmetic on those columns.
df_recipes.dtypes

RecipeId                     int32
MainRecipeId               float64
MainIngredientId           float64
NumberOfRatings            float64
AverageRating               object
TaxonomyId                   int32
Price                       object
IsUniverse                    bool
CookingTimeFrom              int32
CookingTimeTo                int32
PriceCategory                int32
CreatedAt           datetime64[ns]
UpdatedAt           datetime64[ns]
dtype: object

In [8]:
# Preview the first five rows of the raw recipe table.
df_recipes.head()

Unnamed: 0,RecipeId,MainRecipeId,MainIngredientId,NumberOfRatings,AverageRating,TaxonomyId,Price,IsUniverse,CookingTimeFrom,CookingTimeTo,PriceCategory,CreatedAt,UpdatedAt
0,748,,2.0,,0.0,2079,0.0,False,25,30,0,2018-02-20 15:00:36.690,NaT
1,3232,,1.0,383.0,79.75,151,35.17,False,20,25,1,2018-02-20 15:00:36.690,2018-11-29 11:41:53.577
2,5381,,1.0,5.0,95.0,217,30.91,False,20,30,0,2018-05-16 15:12:22.977,2019-05-13 16:18:27.530
3,6757,,1.0,5.0,80.0,244,29.08,False,30,30,0,2018-08-06 14:00:38.593,2019-06-21 13:56:35.947
4,7226,,5.0,1.0,50.0,244,28.86,False,20,30,0,2018-08-31 13:17:47.613,2021-09-02 14:17:35.610


In [9]:
# Rename every column from CamelCase to snake_case (e.g. RecipeId -> recipe_id),
# then show a few random rows to check the result.
df_recipes.columns = df_recipes.columns.map(camel_to_snake)
df_recipes.sample(3)

Unnamed: 0,recipe_id,main_recipe_id,main_ingredient_id,number_of_ratings,average_rating,taxonomy_id,price,is_universe,cooking_time_from,cooking_time_to,price_category,created_at,updated_at
5209,63850,,5.0,827.0,79.75,3653,29.39,True,20,30,0,2022-02-28 14:56:50.897,2025-03-12 12:48:54.007
16730,83000,,1.0,16.0,65.75,2010,0.0,False,15,20,1,2024-02-20 08:19:37.667,2024-04-11 09:47:14.340
400,33324,,3.0,,0.0,1898,31.07,False,20,30,0,2021-01-26 16:32:37.470,2021-02-03 11:12:08.130


In [59]:

# Count the distinct taxonomies attached to each recipe, then keep only the
# rows of recipes that have at least `threshold` distinct taxonomies.
count_values = df_recipes.groupby('recipe_id')['taxonomy_id'].nunique()

# Minimum number of distinct taxonomies a recipe must have to be kept.
# Raise it (8, 9, 10, ...) to simulate stricter filtering.
threshold = 7

# Ids of the recipes that meet the threshold.
valid_a_values = count_values.loc[count_values.ge(threshold)].index.tolist()

# All rows of the original table that belong to a qualifying recipe.
filtered_df = df_recipes.loc[df_recipes['recipe_id'].isin(valid_a_values)]

filtered_df

Unnamed: 0,recipe_id,main_recipe_id,main_ingredient_id,number_of_ratings,average_rating,taxonomy_id,price,is_universe,cooking_time_from,cooking_time_to,price_category,created_at,updated_at
261,26371,,1.0,1619.0,82.750000000000000000,922,34.250000000000000000,True,20,30,1,2018-06-26 07:30:52.247,2025-03-11 18:00:06.693
262,26371,,1.0,1619.0,82.750000000000000000,1212,34.250000000000000000,True,20,30,1,2018-06-26 07:30:52.247,2025-03-11 18:00:06.693
263,26371,,1.0,1619.0,82.750000000000000000,1213,34.250000000000000000,True,20,30,1,2018-06-26 07:30:52.247,2025-03-11 18:00:06.693
264,26371,,1.0,1619.0,82.750000000000000000,1837,34.250000000000000000,True,20,30,1,2018-06-26 07:30:52.247,2025-03-11 18:00:06.693
265,26371,,1.0,1619.0,82.750000000000000000,2011,34.250000000000000000,True,20,30,1,2018-06-26 07:30:52.247,2025-03-11 18:00:06.693
...,...,...,...,...,...,...,...,...,...,...,...,...,...
17926,77248,,7.0,34.0,74.250000000000000000,1177,0E-18,False,20,30,0,2021-01-21 08:11:38.343,2021-05-31 07:33:58.080
17927,77248,,7.0,34.0,74.250000000000000000,1837,0E-18,False,20,30,0,2021-01-21 08:11:38.343,2021-05-31 07:33:58.080
17928,77248,,7.0,34.0,74.250000000000000000,1838,0E-18,False,20,30,0,2021-01-21 08:11:38.343,2021-05-31 07:33:58.080
17929,77248,,7.0,34.0,74.250000000000000000,2013,0E-18,False,20,30,0,2021-01-21 08:11:38.343,2021-05-31 07:33:58.080


In [60]:
# (rows, columns) remaining after the taxonomy-count filter.
filtered_df.shape

(5513, 13)

In [61]:
# Average number of distinct taxonomies per recipe among the filtered recipes.
# Note: this rebinds `count_values`, which the filtering cell computed on the
# unfiltered table.
count_values = (
    filtered_df
    .groupby('recipe_id')['taxonomy_id']
    .nunique()
)
count_values.mean()

7.978292329956584

In [62]:
# Function that generates a random input payload for the menu optimiser (a dict shaped like its JSON request)
def _random_taxonomy_constraint(df, taxonomy_id, quantity, rng):
    """Build one taxonomy constraint with randomly drawn sub-constraints.

    Args:
        df: Recipe table with ``taxonomy_id``, ``main_ingredient_id`` and
            ``price_category`` columns.
        taxonomy_id: Taxonomy the constraint applies to. It must occur in
            ``df`` with at least one non-null ``main_ingredient_id``.
        quantity: Number of recipes requested for this taxonomy.
        rng: ``numpy.random.Generator`` used for every random draw.

    Returns:
        A dict with the keys ``taxonomy_id``, ``quantity``,
        ``taxonomy_type_id``, ``min_average_rating``, ``main_ingredients``,
        ``cooking_times`` and ``price_categories``. All ids and quantities
        are plain Python ints so the result can be passed to ``json.dumps``.
    """
    taxonomy_rows = df[df["taxonomy_id"] == taxonomy_id]
    # NaN is not a usable ingredient id, so never offer it as a candidate.
    main_ingredient_ids = taxonomy_rows["main_ingredient_id"].dropna().unique()
    price_category_ids = taxonomy_rows["price_category"].unique()

    def random_sub_quantity():
        # Sub-constraints ask for between half and all of the taxonomy quantity.
        return int(rng.integers(quantity // 2, quantity + 1))

    return {
        "taxonomy_id": int(taxonomy_id),
        "quantity": quantity,
        "taxonomy_type_id": 0,
        "min_average_rating": 70,
        "main_ingredients": [
            {
                "main_ingredient_id": int(rng.choice(main_ingredient_ids)),
                "quantity": random_sub_quantity(),
            }
        ],
        # NOTE(review): the cooking-time quantity is fixed at 20 and can exceed
        # the taxonomy quantity; kept as in the original - confirm this is intended.
        "cooking_times": [{"time_from": 20, "time_to": 30, "quantity": 20}],
        "price_categories": [
            {
                "price_category_id": int(rng.choice(price_category_ids)),
                "quantity": random_sub_quantity(),
            }
        ],
    }


def get_randon_input(
    df: pd.DataFrame,
    num_recipes: int = 50,
    week: int = 47,
    year: int = 2024,
    seed: "int | None" = None,
) -> dict:
    """Generate a random, JSON-serialisable request for the menu optimiser.

    Every value is drawn from ``df`` itself. The previous version silently
    read the notebook-global ``filtered_df`` for the taxonomy, main-ingredient
    and price-category draws, so it ignored its argument and broke on a fresh
    kernel if that global was missing.

    Args:
        df: Recipe table with ``recipe_id``, ``taxonomy_id``,
            ``main_ingredient_id`` and ``price_category`` columns.
        num_recipes: Number of recipes requested for the menu. Ten percent of
            it (rounded down, capped at the number of available recipes) is
            drawn as required recipes.
        week: Menu week written to the payload.
        year: Menu year written to the payload.
        seed: Seed for the random generator; pass an int for a reproducible
            payload. ``None`` gives a different payload on every call.

    Returns:
        A dict with ``week``, ``year`` and a single-element ``companies`` list
        whose entry holds ``company_id``, ``num_recipes``, ``required_recipes``,
        ``available_recipes`` and two taxonomy constraints.

    Raises:
        ValueError: If ``df`` has no recipes, or no row with a non-null
            ``main_ingredient_id`` to build a taxonomy constraint from.
    """
    rng = np.random.default_rng(seed)

    recipes_list = [str(recipe) for recipe in df["recipe_id"].unique()]
    if not recipes_list:
        raise ValueError("df contains no recipes")

    # Only taxonomies with at least one known main ingredient can yield a
    # valid main-ingredient constraint.
    taxonomy_pool = df.loc[df["main_ingredient_id"].notna(), "taxonomy_id"].unique()
    if len(taxonomy_pool) == 0:
        raise ValueError("df contains no row with a non-null main_ingredient_id")
    # Draw two distinct taxonomies whenever the data allows it; repeat the
    # single available one otherwise.
    taxonomies = rng.choice(taxonomy_pool, size=2, replace=len(taxonomy_pool) < 2)

    n_required = min(int(num_recipes * 0.1), len(recipes_list))
    required_recipes = rng.choice(recipes_list, size=n_required, replace=False).tolist()

    # Each taxonomy asks for between a quarter and half of the menu.
    quantities_max = num_recipes // 2
    quantities = [
        int(rng.integers(quantities_max // 2, quantities_max + 1))
        for _ in taxonomies
    ]

    return {
        "week": week,
        "year": year,
        "companies": [
            {
                "company_id": "09ecd4f0-ae58-4539-8e8f-9275b1859a19",
                "num_recipes": num_recipes,
                "required_recipes": required_recipes,
                "available_recipes": recipes_list,
                "taxonomies": [
                    _random_taxonomy_constraint(df, taxonomy_id, quantity, rng)
                    for taxonomy_id, quantity in zip(taxonomies, quantities)
                ],
            }
        ],
    }

In [63]:
# Build one random optimiser request from the recipes that passed the taxonomy-count filter.
input_json = get_randon_input(filtered_df)

In [64]:
from optimization import generate_menu_companies_sous_chef

In [65]:
# Run the optimiser on the random request. The recipe table passed as
# `input_df` is the full `df_recipes`, not `filtered_df`.
generate_menu_companies_sous_chef(
    week=input_json["week"],
    year=input_json["year"],
    companies=input_json['companies'],
    input_df=df_recipes,
)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  PIM_data_excluded[COOKING_TIME_COLUMN] = PIM_data_excluded.apply(
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  PIM_data_excluded[COOKING_TIME_COLUMN] = PIM_data_excluded.apply(


{'year': 2024,
 'week': 47,
 'companies': [{'company_id': '09ecd4f0-ae58-4539-8e8f-9275b1859a19',
   'taxonomies': [{'taxonomy_id': 2015,
     'wanted': 15,
     'actual': 15,
     'main_ingredients': [{'main_ingredient_id': 1, 'wanted': 7, 'actual': 8}],
     'price_categories': [{'price_category_id': 1,
       'wanted': 10,
       'actual': 10}],
     'cooking_times': [{'from': 20, 'to': 30, 'wanted': 20, 'actual': 15}]},
    {'taxonomy_id': 953,
     'wanted': 15,
     'actual': 15,
     'main_ingredients': [{'main_ingredient_id': 8,
       'wanted': 14,
       'actual': 1}],
     'price_categories': [{'price_category_id': 1, 'wanted': 13, 'actual': 5}],
     'cooking_times': [{'from': 20, 'to': 30, 'wanted': 20, 'actual': 15}]}],
   'recipes': [{'recipe_id': 78005,
     'main_ingredient_id': 7,
     'is_constraint': True},
    {'recipe_id': 64412, 'main_ingredient_id': 7, 'is_constraint': True},
    {'recipe_id': 96372, 'main_ingredient_id': 5, 'is_constraint': True},
    {'recipe_