In [53]:
import os
import numpy as np
import pandas as pd
import polars as pl
from sklearn.preprocessing import MultiLabelBinarizer, OneHotEncoder

In [4]:
def load_processed_recipes_similarity(data_folder_path=os.path.join("data", "processed")):
    """Loads the processed recipes data from the processed data folder.

    Args:
        data_folder_path (str, optional): Path to the data folder containing the processed recipes. Defaults to "data".

    Returns:
        pd.DataFrame: The processed recipes dataframe.
    """
    # Load the processed recipes
    recipes_df_processed = pl.read_csv(os.path.join(data_folder_path, "processed_recipes_for_similarity.csv"))
    recipes_df_processed = recipes_df_processed.to_pandas()
    return recipes_df_processed

def load_processed_recipes_display(data_folder_path=os.path.join("data", "processed")):
    """Loads the processed recipes data from the processed data folder.

    Args:
        data_folder_path (str, optional): Path to the data folder containing the processed recipes. Defaults to "data".

    Returns:
        pd.DataFrame: The processed recipes dataframe.
    """
    # Load the processed recipes
    recipes_df_processed = pl.read_csv(os.path.join(data_folder_path, "processed_recipes_for_display.csv"))
    recipes_df_processed = recipes_df_processed.to_pandas()
    return recipes_df_processed

def load_raw_data(data_folder_path="data"):
    """Loads the interactions and recipes data from the data folder.

    Args:
        data_folder_path (str, optional): path to the folder containing the data. Defaults to "data".

    Returns:
        tuple: tuple containing the interactions and recipe dataframes.
    """
    # Load the interactions and recipes
    interactions_df = load_raw_interactions(data_folder_path=data_folder_path)
    recipes_df = load_raw_recipes(data_folder_path=data_folder_path)
    
    return interactions_df, recipes_df

def load_raw_interactions(data_folder_path="data"):
    """Loads the interactions data from the data folder.

    Args:
        data_folder_path (str, optional): Path to the folder containing the data. Defaults to "data".

    Returns:
        pd.DataFrame: The interactions dataframe.
    """
    # Load the interactions
    interactions_df = pl.read_csv(os.path.join(data_folder_path, "RAW_interactions.csv")) 
    
    # Convert to pandas dataframe
    interactions_df = interactions_df.to_pandas()
    
    # Return the dataframe
    return interactions_df

def load_raw_recipes(data_folder_path="data"):
    """Loads the recipes data from the data folder.

    Args:
        data_folder_path (str, optional): Path to the folder containing the data. Defaults to "data".

    Returns:
        pd.DataFrame: The recipes dataframe.
    """
    # Load the recipes
    recipes_df = pl.read_csv(os.path.join(data_folder_path, "RAW_recipes.csv"))
    
    # Convert to pandas dataframe
    recipes_df = recipes_df.to_pandas()
    
    # Return the dataframe
    return recipes_df

def load_ingredient_mapping_dict(data_folder_path=os.path.join("data", "processed")):
    """Loads the ingredient mapping from the data folder.

    Args:
        data_folder_path (str, optional): Path to the folder containing the data. Defaults to "data/processed".

    Returns:
        dict: The ingredient mapping.
    """
    # Load the ingredient mapping
    ingredient_mapping = pd.read_csv(os.path.join(data_folder_path, "ingredient_mapping.csv")).set_index('ingredient').to_dict()['mapping']
    return ingredient_mapping

In [53]:
interactions_df, recipes_df = load_raw_data(data_folder_path=os.path.join("..", "data"))


In [5]:
temp = interactions_df.groupby("recipe_id").count().sort_values("rating", ascending=False)


In [6]:
top_recipes = temp[temp["rating"] > 500]

In [7]:
display(interactions_df.head())

num_recipes = interactions_df["recipe_id"].nunique()
print(f"There are {num_recipes} unique recipes in the interactions dataset")

Unnamed: 0,user_id,recipe_id,date,rating,review
0,38094,40893,2003-02-17,4,Great with a salad. Cooked on top of stove for...
1,1293707,40893,2011-12-21,5,"So simple, so delicious! Great for chilly fall..."
2,8937,44394,2002-12-01,4,This worked very well and is EASY. I used not...
3,126440,85009,2010-02-27,5,I made the Mexican topping and took it to bunk...
4,57222,85009,2011-10-01,5,"Made the cheddar bacon topping, adding a sprin..."


There are 231637 unique recipes in the interactions dataset


In [8]:
display(recipes_df)

Unnamed: 0,name,id,minutes,contributor_id,submitted,tags,nutrition,n_steps,steps,description,ingredients,n_ingredients
0,arriba baked winter squash mexican style,137739,55,47892,2005-09-16,"['60-minutes-or-less', 'time-to-make', 'course...","[51.5, 0.0, 13.0, 0.0, 2.0, 0.0, 4.0]",11,"['make a choice and proceed with recipe', 'dep...",autumn is my favorite time of year to cook! th...,"['winter squash', 'mexican seasoning', 'mixed ...",7
1,a bit different breakfast pizza,31490,30,26278,2002-06-17,"['30-minutes-or-less', 'time-to-make', 'course...","[173.4, 18.0, 0.0, 17.0, 22.0, 35.0, 1.0]",9,"['preheat oven to 425 degrees f', 'press dough...",this recipe calls for the crust to be prebaked...,"['prepared pizza crust', 'sausage patty', 'egg...",6
2,all in the kitchen chili,112140,130,196586,2005-02-25,"['time-to-make', 'course', 'preparation', 'mai...","[269.8, 22.0, 32.0, 48.0, 39.0, 27.0, 5.0]",6,"['brown ground beef in large pot', 'add choppe...",this modified version of 'mom's' chili was a h...,"['ground beef', 'yellow onions', 'diced tomato...",13
3,alouette potatoes,59389,45,68585,2003-04-14,"['60-minutes-or-less', 'time-to-make', 'course...","[368.1, 17.0, 10.0, 2.0, 14.0, 8.0, 20.0]",11,['place potatoes in a large pot of lightly sal...,"this is a super easy, great tasting, make ahea...","['spreadable cheese with garlic and herbs', 'n...",11
4,amish tomato ketchup for canning,44061,190,41706,2002-10-25,"['weeknight', 'time-to-make', 'course', 'main-...","[352.9, 1.0, 337.0, 23.0, 3.0, 0.0, 28.0]",5,['mix all ingredients& boil for 2 1 / 2 hours ...,my dh's amish mother raised him on this recipe...,"['tomato juice', 'apple cider vinegar', 'sugar...",8
...,...,...,...,...,...,...,...,...,...,...,...,...
231632,zydeco soup,486161,60,227978,2012-08-29,"['ham', '60-minutes-or-less', 'time-to-make', ...","[415.2, 26.0, 34.0, 26.0, 44.0, 21.0, 15.0]",7,"['heat oil in a 4-quart dutch oven', 'add cele...",this is a delicious soup that i originally fou...,"['celery', 'onion', 'green sweet pepper', 'gar...",22
231633,zydeco spice mix,493372,5,1500678,2013-01-09,"['15-minutes-or-less', 'time-to-make', 'course...","[14.8, 0.0, 2.0, 58.0, 1.0, 0.0, 1.0]",1,['mix all ingredients together thoroughly'],this spice mix will make your taste buds dance!,"['paprika', 'salt', 'garlic powder', 'onion po...",13
231634,zydeco ya ya deviled eggs,308080,40,37779,2008-06-07,"['60-minutes-or-less', 'time-to-make', 'course...","[59.2, 6.0, 2.0, 3.0, 6.0, 5.0, 0.0]",7,"['in a bowl , combine the mashed yolks and may...","deviled eggs, cajun-style","['hard-cooked eggs', 'mayonnaise', 'dijon must...",8
231635,cookies by design cookies on a stick,298512,29,506822,2008-04-15,"['30-minutes-or-less', 'time-to-make', 'course...","[188.0, 11.0, 57.0, 11.0, 7.0, 21.0, 9.0]",9,['place melted butter in a large mixing bowl a...,"i've heard of the 'cookies by design' company,...","['butter', 'eagle brand condensed milk', 'ligh...",10


In [9]:
display(ingredient_mapping[ingredient_mapping["raw_ingr"] == "winter squash"]['id'].values[0])

7933

## Distribution Analysis

### Ingredients

In [118]:
ingredients_df = recipes_df['ingredients'].str.replace('[', '').str.replace(']', '').str.replace("'", "").str.split(',')
# Explode the ingredients column
ingredients_df = ingredients_df.explode()

ingredient_counts = ingredients_df.value_counts()
top_20 = ingredient_counts.iloc[:20]
ingredient_count

In [11]:
x = top_20.index.str.strip()
y = 'salt' in x
y

True

### Tags

In [12]:
num_unique_tags = recipes_df['tags'].nunique()
print(f"There are {num_unique_tags} unique tags in the recipes dataset")
print(f"Number of columns in the recipes dataset: {recipes_df.shape[1]}")

tag_list = recipes_df['tags'].str.replace('[', '').str.replace(']', '').str.replace("'", "").str.split(',')
tag_list = tag_list.explode().str.strip()
tag_list = np.unique(tag_list)

There are 209115 unique tags in the recipes dataset
Number of columns in the recipes dataset: 12


In [13]:
CUISINES = ['italian', 'chinese', 'indian', 'thai', 'american', 'greek', 'spanish', 'german', 'french', 'japanese', 'lebanese', 'korean', 'australian', 'caribbean', 'filipino', 'scottish', 'mexican', 'indonesian', 'brazilian']
for cuisine in CUISINES:
    print(f"Cusiine {cuisine} in tag_list: {cuisine in tag_list}")


Cusiine italian in tag_list: True
Cusiine chinese in tag_list: True
Cusiine indian in tag_list: True
Cusiine thai in tag_list: True
Cusiine american in tag_list: True
Cusiine greek in tag_list: True
Cusiine spanish in tag_list: True
Cusiine german in tag_list: True
Cusiine french in tag_list: True
Cusiine japanese in tag_list: True
Cusiine lebanese in tag_list: True
Cusiine korean in tag_list: True
Cusiine australian in tag_list: True
Cusiine caribbean in tag_list: True
Cusiine filipino in tag_list: True
Cusiine scottish in tag_list: True
Cusiine mexican in tag_list: True
Cusiine indonesian in tag_list: True
Cusiine brazilian in tag_list: True


## Preprocessing

In [152]:
def onehot_encode_ingredients(df, keep_top_n=20):
    df_processed = df.copy()
    columns_minus_ingredients = list(df_processed.columns.drop('ingredients'))

    # Get the top n ingredients
    df_processed = df_processed.explode(column='ingredients')
    df_processed['ingredients'] = df_processed['ingredients'].str.strip()
    ingredient_counts = df_processed['ingredients'].value_counts()
    top_n = ingredient_counts.iloc[:keep_top_n]
    
    # Seperate the recipes withand without ingredients in top_n
    #print(top_n)
    #display(df_processed)
    
    df_included = df_processed[df_processed['ingredients'].isin(top_n.index)]
    df_included = df_included.groupby(columns_minus_ingredients).agg({'ingredients': lambda x: list(x)}).reset_index()
    
    df_excluded = df_processed[~df_processed['ingredients'].isin(top_n.index)]
    df_excluded = df_excluded.groupby(columns_minus_ingredients).agg({'ingredients': lambda x: list(x)}).reset_index()
    
    # One-hot encode the ingredients
    df_included = df_included.join(pd.DataFrame(mlb.fit_transform(df_included.pop('ingredients')),
                          columns=mlb.classes_,
                          index=df_included.index))
    
    # Combine the excluded recipes back in
    df_processed = df_included.merge(df_excluded, on=columns_minus_ingredients, how='outer').fillna(0)
    df_processed = df_processed.drop(columns=['ingredients'])
    
    # Return the results
    return df_processed

def onehot_encode_cuisines(df):
    CUISINES = {'italian', 'chinese', 'indian', 'thai', 'american', 'greek', 'spanish', 'german', 'french', 'japanese', 'lebanese', 'korean', 'australian', 'caribbean', 'filipino', 'scottish', 'mexican', 'indonesian', 'brazilian', 'south-african'}
    df_processed = df.copy()
    columns_minus_tags = list(df_processed.columns.drop('tags'))
    
    # Get all the tags
    df_processed = df_processed.explode(column='tags')
    df_processed['tags'] = df_processed['tags'].str.strip()
    
    # Drop tags not in the cuisine types
    df_included = df_processed[df_processed['tags'].isin(CUISINES)]
    df_included = df_included.groupby(columns_minus_tags).agg({'tags': lambda x: list(x)}).reset_index()
    
    df_excluded = df_processed[~df_processed['tags'].isin(CUISINES)]
    df_excluded = df_excluded.groupby(columns_minus_tags).agg({'tags': lambda x: list(x)}).reset_index()
    
    # One-hot encode the tags
    df_included = df_included.join(pd.DataFrame(mlb.fit_transform(df_included.pop('tags')),
                          columns=mlb.classes_,
                          index=df_included.index))
    # Combine the excluded recipes back in
    df_processed = df_included.merge(df_excluded, on=columns_minus_tags, how='outer').fillna(0)
    df_processed = df_processed.drop(columns=['tags'])                                     
    
    # Return the results
    return df_processed
    

In [155]:
# Get the appropriate columns from the recipes dataframe
recipes_df_processed = recipes_df[['id', 'minutes', 'tags', 'nutrition', 'n_steps', 'ingredients']].copy()
mlb = MultiLabelBinarizer()

# Expand the nutrition column into 7 columns
new_columns = ['calories', 'fat_amount', 'sodium_amount', 'protein_amount', 'sugar_amount', 'saturated_fat', 'carbohydrates']
recipes_df_processed['nutrition'] = recipes_df_processed['nutrition'].str.replace('[', '').str.replace(']', '')
recipes_df_processed[new_columns] = recipes_df_processed['nutrition'].str.split(',', expand=True).astype(float)
recipes_df_processed = recipes_df_processed.drop(columns=['nutrition'])

# Explode the tags column and one hot encode them
recipes_df_processed["tags"] = recipes_df_processed["tags"].apply(lambda x: x.replace("[", "").replace("]", "").replace("'", "").split(","))
recipes_df_processed = onehot_encode_cuisines(recipes_df_processed)
#recipes_df_processed = tags_to_cuisines(recipes_df_processed)
#recipes_df_processed = recipes_df_processed.join(pd.DataFrame(mlb.fit_transform(recipes_df_processed.pop('tags')),
#                          columns=mlb.classes_,
#                          index=recipes_df_processed.index))

# Explode the ingredients column and one hot encode them
recipes_df_processed["ingredients"] = recipes_df_processed["ingredients"].apply(lambda x: x.replace("[", "").replace("]", "").replace("'", "").split(","))
recipes_df_processed = onehot_encode_ingredients(recipes_df_processed, keep_top_n=100)
#recipes_df_processed = remove_sparse_ingredients(recipes_df_processed, keep_top_n=2)
#recipes_df_processed = recipes_df_processed.join(pd.DataFrame(mlb.fit_transform(recipes_df_processed.pop('ingredients')),
#                          columns=mlb.classes_,
#                          index=recipes_df_processed.index))


In [157]:
recipes_df_processed
recipes_df_processed.to_csv(os.path.join("..", "data", "processed", "processed_recipes.csv"), index=False)

## Get top 50 Recipes

In [174]:
def get_top_recipes(interactions_df, recipes_df, num_recipes=25):
    # Get the top recipe_ids
    top_recipes = interactions_df.groupby("recipe_id").count().sort_values("rating", ascending=False).reset_index()
    recipe_ids = top_recipes[['recipe_id']].head(num_recipes)

    # Combine with the interactions_df
    combined_df = pd.merge(recipe_ids, recipes_df, left_on='recipe_id', right_on='id').drop(columns=['id'])
    return combined_df

In [176]:
get_top_recipes(interactions_df, recipes_df)

Unnamed: 0,recipe_id,name,minutes,contributor_id,submitted,tags,nutrition,n_steps,steps,description,ingredients,n_ingredients
0,2886,best banana bread,65,1762,1999-09-26,"['time-to-make', 'course', 'main-ingredient', ...","[272.8, 16.0, 97.0, 14.0, 7.0, 31.0, 14.0]",13,"['remove odd pots and pans from oven', 'prehea...",you'll never need another banana bread recipe ...,"['butter', 'granulated sugar', 'eggs', 'banana...",8
1,27208,to die for crock pot roast,545,28201,2002-05-03,"['weeknight', 'time-to-make', 'course', 'main-...","[295.2, 14.0, 0.0, 15.0, 99.0, 19.0, 0.0]",7,"['place beef roast in crock pot', 'mix the dri...","amazing flavor, and so simple! no salt needed ...","['beef roast', 'brown gravy mix', 'dried itali...",5
2,89204,crock pot chicken with black beans cream cheese,243,137839,2004-04-16,"['time-to-make', 'course', 'main-ingredient', ...","[679.2, 53.0, 39.0, 38.0, 91.0, 75.0, 16.0]",5,"['take 4-5 frozen , yes , frozen , boneless ch...",i love this crock-pot chicken recipe for two r...,"['boneless chicken breasts', 'black beans', 'c...",5
3,39087,creamy cajun chicken pasta,25,30534,2002-09-02,"['30-minutes-or-less', 'time-to-make', 'course...","[719.1, 63.0, 12.0, 34.0, 78.0, 120.0, 15.0]",4,['place chicken and cajun seasoning in a bowl ...,n'awlin's style of chicken with an updated alf...,"['boneless skinless chicken breast halves', 'l...",12
4,67256,best ever banana cake with cream cheese frosting,75,82367,2003-07-24,"['weeknight', 'time-to-make', 'course', 'main-...","[503.5, 31.0, 222.0, 15.0, 11.0, 61.0, 25.0]",18,"['preheat oven to 275f', 'grease and flour a 9...",this is one of (if not) the best banana cake i...,"['bananas', 'lemon juice', 'flour', 'baking so...",13
5,54257,yes virginia there is a great meatloaf,80,24670,2003-02-17,"['time-to-make', 'course', 'main-ingredient', ...","[493.1, 43.0, 75.0, 52.0, 70.0, 55.0, 7.0]",15,['meatloaf: combine meat loaf ingredients and ...,absolutely delicious meatloaf and sauce! those...,"['ground beef', 'bread', 'egg', 'vidalia onion...",10
6,22782,jo mama s world famous spaghetti,80,25455,2002-03-17,"['weeknight', 'time-to-make', 'course', 'main-...","[555.9, 40.0, 45.0, 85.0, 59.0, 45.0, 16.0]",10,"['in large , heavy stockpot , brown italian sa...",my kids will give up a steak dinner for this s...,"['italian sausage', 'onion', 'garlic cloves', ...",16
7,32204,whatever floats your boat brownies,35,37305,2002-06-25,"['60-minutes-or-less', 'time-to-make', 'course...","[390.7, 30.0, 161.0, 7.0, 12.0, 50.0, 17.0]",14,"['preheat oven to 350f', 'grease an 8 inch squ...","these are absolutely the chewiest, moistest, f...","['butter', 'unsweetened cocoa', 'sugar', 'eggs...",14
8,69173,kittencal s italian melt in your mouth meatballs,50,89831,2003-08-20,"['60-minutes-or-less', 'time-to-make', 'course...","[1312.6, 129.0, 8.0, 108.0, 214.0, 174.0, 8.0]",5,['mix all ingredients together in a large bowl...,cooking the meatballs in simmering pasta sauce...,"['ground beef', 'egg', 'parmesan cheese', 'bre...",10
9,68955,japanese mum s chicken,45,29196,2003-08-13,"['60-minutes-or-less', 'time-to-make', 'course...","[313.1, 19.0, 54.0, 61.0, 62.0, 17.0, 5.0]",5,['place all the ingredients in a saucepan over...,"we have a japanese girl, tomoko, living with u...","['chicken drumsticks', 'water', 'balsamic vine...",7


In [10]:
import sys
sys.path.append("..")
from scripts.data_processing import *
CUISINES = {'italian', 'chinese', 'indian', 'thai', 'american', 'greek', 'spanish', 'german', 'french', 'japanese', 'lebanese', 'korean', 'australian', 'caribbean', 'filipino', 'scottish', 'mexican', 'indonesian', 'brazilian', 'south-african'}

In [83]:
print("Loading the recipes")
recipes_df = load_raw_recipes(data_folder_path=os.path.join("..", "data"))
ingredient_mapping_dict = load_ingredient_mapping_dict(data_folder_path=os.path.join("..", "data", "processed"))

print("Processing the recipes")
# Instantiate the MultiLabelBinarizer
mlb = MultiLabelBinarizer()

# Get the appropriate columns from the recipes dataframes and do some renaming
recipes_df_processed = recipes_df[['name', 'id', 'minutes', 'tags', 'nutrition', 'n_steps', 'ingredients']]
recipes_df_processed = recipes_df_processed.rename(columns={'tags': 'cuisine'})

# Expand the nutrition column into 7 columns
nutrition_columns = ['calories', 'fat_amount', 'sodium_amount', 'protein_amount', 'sugar_amount', 'saturated_fat', 'carbohydrates']
recipes_df_processed['nutrition'] = recipes_df_processed['nutrition'].str.replace('[', '').str.replace(']', '')
recipes_df_processed[nutrition_columns] = recipes_df_processed['nutrition'].str.split(',', expand=True).astype(float)
recipes_df_processed = recipes_df_processed.drop(columns=['nutrition'])

# Convert the tags column to a cuisine column
columns_minus_cuisine = list(recipes_df_processed.columns.drop('cuisine'))
recipes_df_processed["cuisine"] = recipes_df_processed["cuisine"].apply(lambda x: x.replace("[", "").replace("]", "").replace("'", "").split(","))

# Get all the tags
recipes_df_processed = recipes_df_processed.explode(column='cuisine')
recipes_df_processed['cuisine'] = recipes_df_processed['cuisine'].str.strip()

# Drop tags not in the cuisine types and recombine the cuisine columns
df_included = recipes_df_processed[recipes_df_processed['cuisine'].isin(CUISINES)]
df_included = df_included.groupby(columns_minus_cuisine).agg({'cuisine': lambda x: list(x)}).reset_index()
df_excluded = recipes_df_processed[~recipes_df_processed['cuisine'].isin(CUISINES)]
df_excluded = df_excluded.groupby(columns_minus_cuisine).agg({'cuisine': lambda x: list(x)}).reset_index()
recipes_df_processed = df_included.merge(df_excluded, on=columns_minus_cuisine, how='outer').fillna("-")
recipes_df_processed = recipes_df_processed.drop(columns=['cuisine_y']).rename(columns={'cuisine_x': 'cuisine'})

# Replace the ingredients using thr mapping
recipes_df_processed["ingredients"] = recipes_df_processed["ingredients"].apply(lambda x: x.replace("[", "").replace("]", "").replace("'", "").split(","))
recipes_df_processed["ingredients"] = recipes_df_processed["ingredients"].apply(lambda x: [ingredient_mapping_dict[ingr] for ingr in x])
recipes_df_processed_display = recipes_df_processed.copy()

# Onehot encode the cuisine columns
print("Onehot encoding cuisine and ingredients")
recipes_df_processed = recipes_df_processed.join(pd.DataFrame(mlb.fit_transform(recipes_df_processed.pop('cuisine')),
                          columns=mlb.classes_,
                          index=recipes_df_processed.index))

# Onehot encode the ingredients columns
columns_minus_ingredients = list(recipes_df_processed.columns.drop('ingredients'))
recipes_df_processed = recipes_df_processed.explode(column='ingredients')

# Only keep ingredients if they occur often enough
recipes_df_processed['ingredients'] = recipes_df_processed['ingredients'].str.strip()
ingredient_counts = recipes_df_processed['ingredients'].value_counts()
min_number_occurences = 100
top_ingredients = ingredient_counts[ingredient_counts >= min_number_occurences]

# Seperate the recipes with and without ingredients in top_ingredients
df_included = recipes_df_processed[recipes_df_processed['ingredients'].isin(top_ingredients.index)]
df_included = df_included.groupby(columns_minus_ingredients).agg({'ingredients': lambda x: list(x)}).reset_index()

df_excluded = recipes_df_processed[~recipes_df_processed['ingredients'].isin(top_ingredients.index)]
df_excluded = df_excluded.groupby(columns_minus_ingredients).agg({'ingredients': lambda x: list(x)}).reset_index()

# One-hot encode the ingredients
df_included = df_included.join(pd.DataFrame(mlb.fit_transform(df_included.pop('ingredients')),
                        columns=mlb.classes_,
                        index=df_included.index))

# Combine the excluded recipes back in
recipes_df_processed = df_included.merge(df_excluded, on=columns_minus_ingredients, how='outer').fillna(0)
recipes_df_processed = recipes_df_processed.drop(columns=['ingredients'])


print("Saving the results")
#recipes_df_processed_display.to_csv(os.path.join(processed_data_folder_path, "processed_recipes_for_display.csv"), index=False)
#recipes_df_processed.to_csv(os.path.join(processed_data_folder_path, "processed_recipes_for_similarity.csv"), index=False)

# Drop the name column for the similarity matrix and normalize the data
recipes_df_processed = recipes_df_processed.drop(columns=['name'])
nutrition_columns = ['calories', 'fat_amount', 'sodium_amount', 'protein_amount', 'sugar_amount', 'saturated_fat', 'carbohydrates']
columns_to_normalize = ['minutes', 'n_steps'] + nutrition_columns
recipes_df_processed[columns_to_normalize] = StandardScaler().fit_transform(recipes_df_processed[columns_to_normalize])

display(recipes_df_processed_display.head())
display(recipes_df_processed)

Loading the recipes
Processing the recipes
Onehot encoding cuisine and ingredients
Saving the results


Unnamed: 0,name,id,minutes,n_steps,ingredients,calories,fat_amount,sodium_amount,protein_amount,sugar_amount,saturated_fat,carbohydrates,cuisine
0,1 00 tangy chicken recipe,42508,280,8,"[chicken meat, cider vinegar, sauce, chili pow...",969.6,93.0,55.0,83.0,148.0,86.0,7.0,[american]
1,1 000 artichoke hearts,24514,25,6,"[artichoke heart, egg, breadcrumb, oil, butter...",422.5,25.0,20.0,46.0,34.0,43.0,19.0,[american]
2,1 2 3 4 5 chinese spareribs,56719,35,5,"[sparerib, cooking wine, sugar, vinegar, soy s...",490.2,52.0,26.0,46.0,69.0,63.0,2.0,[chinese]
3,1 2 3 4 cake with caramel icing,80411,50,23,"[purpose flour, baking powder, butter, sugar, ...",635.4,39.0,283.0,5.0,12.0,78.0,32.0,[american]
4,1 2 3 granola,13805,35,7,"[oat, wheat germ, sunflower seed, walnut, hone...",355.0,25.0,95.0,0.0,16.0,10.0,15.0,[american]


Unnamed: 0,id,minutes,n_steps,calories,fat_amount,sodium_amount,protein_amount,sugar_amount,saturated_fat,carbohydrates,...,wine vinegar,wonton wrapper,xanthan gum,yam,yeast,yogurt,zest,zinfandel wine,ziti pasta,zucchini
0,283618,-0.002096,0.873124,-0.075008,-0.463765,-0.105360,0.302000,2.126096,-0.464077,-0.116840,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,176996,-0.002096,-0.628097,-0.349865,-0.463765,-0.056615,-0.228458,-0.558930,-0.464077,-0.141283,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,480558,-0.002078,-0.794899,-0.142756,-0.296664,0.045875,-0.205724,-0.370807,-0.372460,0.042036,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,391705,-0.002094,-0.794899,-0.375838,-0.463765,-0.091611,-0.190568,-0.558930,-0.464077,-0.177947,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
4,416980,-0.002038,-1.295306,-0.364154,-0.463765,-0.084112,-0.213302,-0.524726,-0.464077,-0.165725,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
231631,298380,-0.002104,-0.794899,-0.055255,-0.360934,-0.105360,0.536917,2.262912,-0.433538,-0.190168,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
231632,229061,-0.002105,-0.961701,-0.269425,-0.425203,-0.016619,-0.190568,-0.507624,-0.453897,-0.067955,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
231633,19391,-0.002106,-0.794899,0.416457,-0.463765,-0.050366,-0.228458,-0.593134,-0.464077,-0.153504,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
231634,53234,-0.002079,1.373531,0.237001,-0.116710,-0.105360,-0.228458,4.400671,-0.250305,-0.190168,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [84]:
recipes_df_processed['wine vinegar'].value_counts()

wine vinegar
0.0    226581
1.0      5055
Name: count, dtype: int64