### Note: before running the notebook, make sure the correct kernel is selected, preferably one pointing to the venv associated with the menu planning project

In [1]:
import random
import numpy as np
import pandas as pd
import sys

sys.path.append("../menu_optimiser")

# Load environment variables from a .env file
from dotenv import find_dotenv, load_dotenv

load_dotenv(find_dotenv())

True

In [2]:
#Attempting import of data from datalake using data_contracts internal package
from data_contracts.sources import data_science_data_lake


df_recipes = await(data_science_data_lake.
                directory("test-folder/MP20/").
                parquet_at("PIM_RecipeBank_GL_QA").
                to_pandas())

In [3]:
# Check what kind of object the data-lake load returned
type(df_recipes)

pandas.core.frame.DataFrame

In [4]:
# (rows, columns) of the loaded recipe data
df_recipes.shape

(72510, 13)

In [5]:
# Inspect the dtype of every column
df_recipes.dtypes

RecipeId                     int32
MainRecipeId               float64
MainIngredientId           float64
NumberOfRatings            float64
AverageRating               object
TaxonomyId                   int32
Price                       object
IsUniverse                    bool
CookingTimeFrom              int32
CookingTimeTo                int32
PriceCategory                int32
CreatedAt           datetime64[ns]
UpdatedAt           datetime64[ns]
dtype: object

In [6]:
# Preview the first five rows
df_recipes.head(5)

Unnamed: 0,RecipeId,MainRecipeId,MainIngredientId,NumberOfRatings,AverageRating,TaxonomyId,Price,IsUniverse,CookingTimeFrom,CookingTimeTo,PriceCategory,CreatedAt,UpdatedAt
0,87,,2.0,,0.0,4,0.0,False,20,25,0,2018-02-20 15:00:36.690,NaT
1,114,,2.0,,0.0,4,0.0,False,30,35,3,2018-02-20 15:00:36.690,NaT
2,196,,2.0,,0.0,4,0.0,False,20,25,1,2018-02-20 15:00:36.690,NaT
3,788,,2.0,103.0,75.75,4,0.0,False,25,30,0,2018-02-20 15:00:36.690,NaT
4,864,,5.0,,0.0,4,0.0,False,35,40,0,2018-02-20 15:00:36.690,NaT


In [7]:
# Helper for converting column names to snake_case, as needed by the menu optimiser code

def camel_to_snake(s):
    """Convert a CamelCase string to snake_case.

    Every uppercase character is replaced by an underscore followed by its
    lowercase form; all other characters are kept as they are. Any leading
    underscores in the result are stripped.

    Args:
        s: The string to convert (e.g. ``"RecipeId"``).

    Returns:
        The converted string (e.g. ``"recipe_id"``).
    """
    pieces = []
    for ch in s:
        if ch.isupper():
            pieces.append('_' + ch.lower())
        else:
            pieces.append(ch)
    # Strip the underscore produced by a leading capital (and any other leading '_')
    return ''.join(pieces).lstrip('_')

In [8]:
# Rename every column to snake_case, then show a few random rows as a sanity check
snake_case_columns = list(map(camel_to_snake, df_recipes.columns))
df_recipes.columns = pd.Index(snake_case_columns)
df_recipes.sample(3)

Unnamed: 0,recipe_id,main_recipe_id,main_ingredient_id,number_of_ratings,average_rating,taxonomy_id,price,is_universe,cooking_time_from,cooking_time_to,price_category,created_at,updated_at
67747,64426,,5.0,395.0,81.0,1837,0.0,True,20,30,0,2018-06-04 16:08:10.377,2024-06-26 06:29:15.560
46388,22404,,1.0,3.0,83.25,986,32.83,False,20,30,2,2020-02-03 15:01:22.897,2020-05-06 08:40:03.137
59371,14682,,7.0,10.0,75.0,1025,30.13,False,20,30,1,2019-05-23 12:48:32.000,2021-05-05 18:57:13.447


In [9]:

# Keep only the recipes that appear under at least `threshold` distinct taxonomies.

# Minimum number of distinct taxonomies a recipe must have to be kept (adjust as needed)
threshold = 7

# Series indexed by recipe_id: number of unique taxonomy_id values for each recipe
count_values = df_recipes.groupby('recipe_id')['taxonomy_id'].nunique()

# recipe_ids whose taxonomy count meets the threshold
enough_taxonomies = count_values >= threshold
valid_a_values = count_values[enough_taxonomies].index.tolist()

# All rows of the original frame that belong to those recipes
keep_rows = df_recipes['recipe_id'].isin(valid_a_values)
filtered_df = df_recipes[keep_rows]

filtered_df

Unnamed: 0,recipe_id,main_recipe_id,main_ingredient_id,number_of_ratings,average_rating,taxonomy_id,price,is_universe,cooking_time_from,cooking_time_to,price_category,created_at,updated_at
8,892,,2.0,73.0,77.000000000000000000,4,0E-18,False,35,35,-3,2018-02-20 15:00:36.690,NaT
19,5113,,5.0,18.0,83.250000000000000000,4,27.440000000000000000,False,20,30,0,NaT,2018-05-22 09:12:33.760
21,6746,,2.0,,0E-18,4,25.900000000000000000,False,25,30,0,2018-08-06 13:22:16.743,2018-08-20 12:43:05.113
145,6598,,5.0,20.0,72.500000000000000000,5,26.280000000000000000,False,15,15,0,2018-07-31 09:32:00.867,2018-11-11 19:19:55.497
155,7216,,5.0,15.0,78.250000000000000000,5,28.110000000000000000,False,15,15,0,2018-08-31 11:43:34.493,2018-10-31 17:37:30.243
...,...,...,...,...,...,...,...,...,...,...,...,...,...
72500,86168,,5.0,25.0,90.000000000000000000,2145,0E-18,True,20,30,0,2024-04-17 15:08:07.917,2024-06-26 06:29:15.560
72502,87083,,3.0,,0E-18,2145,0E-18,True,20,30,-2,2024-05-02 13:48:26.933,2024-07-01 08:22:03.973
72503,87149,,5.0,,0E-18,2145,0E-18,True,25,35,-1,2024-05-03 12:45:49.330,2024-07-01 08:28:46.667
72504,87153,,1.0,,0E-18,2145,0E-18,True,20,30,-1,2024-05-06 09:33:37.937,2024-07-01 08:25:56.547


In [10]:
# Average number of distinct taxonomies per recipe among the recipes kept by the filter
taxonomies_by_recipe = filtered_df.groupby('recipe_id')['taxonomy_id']
count_values = taxonomies_by_recipe.nunique()
count_values.mean()

8.998070243149364

In [11]:
#Create function to get random input for the menu optimiser
def _random_taxonomy_constraint(df, taxonomy_id, quantity):
    """Build the constraint dict for one taxonomy, drawing random values from ``df``."""
    taxonomy_rows = df[df['taxonomy_id'] == taxonomy_id]

    # Draw order: ingredient, its quantity, price category, its quantity
    main_ingredient_id = np.random.choice(taxonomy_rows['main_ingredient_id'].unique())
    main_ingredient_quantity = np.random.randint(quantity // 2, quantity + 1)
    price_category_id = np.random.choice(taxonomy_rows['price_category'].unique())
    price_category_quantity = np.random.randint(quantity // 2, quantity + 1)

    return {
        "taxonomy_id": taxonomy_id,
        "quantity": quantity,
        "taxonomy_type_id": 0,
        "min_average_rating": 70,
        "main_ingredients": [
            {"main_ingredient_id": main_ingredient_id,
             "quantity": main_ingredient_quantity}
        ],
        # NOTE(review): the cooking-time quantity is fixed at 20 and can exceed the
        # taxonomy's own `quantity` - confirm whether that is intended.
        "cooking_times": [
            {"time_from": 20,
             "time_to": 30,
             "quantity": 20}
        ],
        "price_categories": [
            {"price_category_id": price_category_id,
             "quantity": price_category_quantity}
        ],
    }


def get_randon_input(df, num_recipes=50):
    """Generate a random request payload for the menu optimiser.

    All random values are drawn from ``df`` itself (not from a notebook-level
    global), so the function works with whatever frame it is given.

    Args:
        df: DataFrame with at least the columns ``recipe_id``, ``taxonomy_id``,
            ``main_ingredient_id`` and ``price_category``.
        num_recipes: Number of recipes requested for the company. 10% of this
            number is sampled as required recipes, and each taxonomy quantity
            is drawn between a quarter and a half of it.

    Returns:
        A dict with the keys ``week``, ``year`` and ``companies``. ``companies``
        holds a single company entry containing the available/required recipe
        ids (as strings) and two taxonomy constraints.

    Raises:
        ValueError: If ``df`` has fewer unique recipes than the number of
            required recipes to sample (raised by ``random.sample``).
    """
    recipes_list = [str(recipe) for recipe in df['recipe_id'].unique()]

    # Sample two *different* taxonomies; fall back to sampling with replacement
    # only when the frame does not contain two distinct taxonomies.
    taxonomy_pool = df['taxonomy_id'].unique()
    taxonomies = list(
        np.random.choice(taxonomy_pool, 2, replace=len(taxonomy_pool) < 2)
    )

    quantities_max = int(num_recipes / 2)
    required_recipes = random.sample(recipes_list, int(num_recipes * 0.1))
    quantity_1 = np.random.randint(int(quantities_max / 2), quantities_max + 1)
    quantity_2 = np.random.randint(int(quantities_max / 2), quantities_max + 1)

    return {
        "week": 47,
        "year": 2024,
        "companies": [
            {
                "company_id": "09ecd4f0-ae58-4539-8e8f-9275b1859a19",
                "num_recipes": num_recipes,
                "required_recipes": required_recipes,
                "available_recipes": recipes_list,
                "taxonomies": [
                    _random_taxonomy_constraint(df, taxonomies[0], quantity_1),
                    _random_taxonomy_constraint(df, taxonomies[1], quantity_2),
                ],
            }
        ],
    }

In [12]:
# Build one random optimiser request from the filtered recipes.
# No random seed is set in the visible cells, so the payload differs between runs.
input_json = get_randon_input(filtered_df)

In [13]:
from optimization import generate_menu_companies_sous_chef

In [14]:
# Run the menu optimiser on the generated request; the full recipe frame is passed as input_df
generate_menu_companies_sous_chef(
    week=input_json["week"],
    year=input_json["year"],
    companies=input_json['companies'],
    input_df=df_recipes,
)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  PIM_data_excluded[COOKING_TIME_COLUMN] = PIM_data_excluded.apply(
  out = out.applymap(
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  PIM_data_excluded[COOKING_TIME_COLUMN] = PIM_data_excluded.apply(
  out = out.applymap(


{'year': 2024,
 'week': 47,
 'companies': [{'company_id': '09ecd4f0-ae58-4539-8e8f-9275b1859a19',
   'taxonomies': [{'taxonomy_id': 1011,
     'wanted': 13,
     'actual': 13,
     'main_ingredients': [{'main_ingredient_id': 3, 'wanted': 7, 'actual': 7}],
     'price_categories': [{'price_category_id': 0,
       'wanted': 11,
       'actual': 11}],
     'cooking_times': [{'from': 20, 'to': 30, 'wanted': 20, 'actual': 12}]},
    {'taxonomy_id': 62,
     'wanted': 17,
     'actual': 14,
     'main_ingredients': [{'main_ingredient_id': 5,
       'wanted': 14,
       'actual': 14}],
     'price_categories': [{'price_category_id': 1, 'wanted': 10, 'actual': 5}],
     'cooking_times': [{'from': 20, 'to': 30, 'wanted': 20, 'actual': 1}]}],
   'recipes': [{'recipe_id': 15847,
     'main_ingredient_id': 2,
     'is_constraint': True},
    {'recipe_id': 9698, 'main_ingredient_id': 1, 'is_constraint': True},
    {'recipe_id': 10326, 'main_ingredient_id': 1, 'is_constraint': True},
    {'recipe_id