## Data Preprocessing

In [1]:
# Importing the libraries needed
import pandas as pd
import numpy as np
import ast
from sklearn.metrics.pairwise import cosine_similarity

In [2]:
# Load the data
recipes = pd.read_csv('data/RAW_recipes.csv')
recipes.head()

Unnamed: 0,name,id,minutes,contributor_id,submitted,tags,nutrition,n_steps,steps,description,ingredients,n_ingredients
0,arriba baked winter squash mexican style,137739,55,47892,2005-09-16,"['60-minutes-or-less', 'time-to-make', 'course...","[51.5, 0.0, 13.0, 0.0, 2.0, 0.0, 4.0]",11,"['make a choice and proceed with recipe', 'dep...",autumn is my favorite time of year to cook! th...,"['winter squash', 'mexican seasoning', 'mixed ...",7
1,a bit different breakfast pizza,31490,30,26278,2002-06-17,"['30-minutes-or-less', 'time-to-make', 'course...","[173.4, 18.0, 0.0, 17.0, 22.0, 35.0, 1.0]",9,"['preheat oven to 425 degrees f', 'press dough...",this recipe calls for the crust to be prebaked...,"['prepared pizza crust', 'sausage patty', 'egg...",6
2,all in the kitchen chili,112140,130,196586,2005-02-25,"['time-to-make', 'course', 'preparation', 'mai...","[269.8, 22.0, 32.0, 48.0, 39.0, 27.0, 5.0]",6,"['brown ground beef in large pot', 'add choppe...",this modified version of 'mom's' chili was a h...,"['ground beef', 'yellow onions', 'diced tomato...",13
3,alouette potatoes,59389,45,68585,2003-04-14,"['60-minutes-or-less', 'time-to-make', 'course...","[368.1, 17.0, 10.0, 2.0, 14.0, 8.0, 20.0]",11,['place potatoes in a large pot of lightly sal...,"this is a super easy, great tasting, make ahea...","['spreadable cheese with garlic and herbs', 'n...",11
4,amish tomato ketchup for canning,44061,190,41706,2002-10-25,"['weeknight', 'time-to-make', 'course', 'main-...","[352.9, 1.0, 337.0, 23.0, 3.0, 0.0, 28.0]",5,['mix all ingredients& boil for 2 1 / 2 hours ...,my dh's amish mother raised him on this recipe...,"['tomato juice', 'apple cider vinegar', 'sugar...",8


In [3]:
recipes.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 231637 entries, 0 to 231636
Data columns (total 12 columns):
 #   Column          Non-Null Count   Dtype 
---  ------          --------------   ----- 
 0   name            231636 non-null  object
 1   id              231637 non-null  int64 
 2   minutes         231637 non-null  int64 
 3   contributor_id  231637 non-null  int64 
 4   submitted       231637 non-null  object
 5   tags            231637 non-null  object
 6   nutrition       231637 non-null  object
 7   n_steps         231637 non-null  int64 
 8   steps           231637 non-null  object
 9   description     226658 non-null  object
 10  ingredients     231637 non-null  object
 11  n_ingredients   231637 non-null  int64 
dtypes: int64(5), object(7)
memory usage: 21.2+ MB


In [4]:
recipes.describe()

Unnamed: 0,id,minutes,contributor_id,n_steps,n_ingredients
count,231637.0,231637.0,231637.0,231637.0,231637.0
mean,222014.708984,9398.546,5534885.0,9.765499,9.051153
std,141206.635626,4461963.0,99791410.0,5.995128,3.734796
min,38.0,0.0,27.0,0.0,1.0
25%,99944.0,20.0,56905.0,6.0,6.0
50%,207249.0,40.0,173614.0,9.0,9.0
75%,333816.0,65.0,398275.0,12.0,11.0
max,537716.0,2147484000.0,2002290000.0,145.0,43.0


In [5]:
recipes.isna().sum()

name                 1
id                   0
minutes              0
contributor_id       0
submitted            0
tags                 0
nutrition            0
n_steps              0
steps                0
description       4979
ingredients          0
n_ingredients        0
dtype: int64

In [6]:
# Going to drop nulls as I want description to be able to display to the user
recipes = recipes.dropna()
recipes.isna().sum()

name              0
id                0
minutes           0
contributor_id    0
submitted         0
tags              0
nutrition         0
n_steps           0
steps             0
description       0
ingredients       0
n_ingredients     0
dtype: int64

In [7]:
# Need to look at all the tags, to be used as content features
# Currently a string, need to convert to list
recipes['tags'] = recipes['tags'].apply(ast.literal_eval)
tags = recipes['tags'].explode()
tags

0         60-minutes-or-less
0               time-to-make
0                     course
0            main-ingredient
0                    cuisine
                 ...        
231636               dietary
231636          comfort-food
231636            taste-mood
231636                 sweet
231636    number-of-servings
Name: tags, Length: 4045919, dtype: object

In [8]:
len(tags.unique())

552

In [11]:
tag_counts = tags.value_counts()
tags_filtered = tag_counts[tag_counts >  10000]
tags_filtered

tags
preparation        225568
time-to-make       220353
course             213602
main-ingredient    166456
dietary            160444
                    ...  
beverages           10913
sweet               10503
savory              10428
potluck             10313
potatoes            10052
Name: count, Length: 80, dtype: int64

In [12]:
# Nutrition information (calories (#), total fat (PDV), sugar (PDV) , sodium (PDV) , protein (PDV) , saturated fat (PDV), total carbohydrates (PDV))
# Currently a string, need to convert to list then make into seperate columns
recipes['nutrition'] = recipes['nutrition'].apply(ast.literal_eval)
recipes[['calories', 'fat', 'sugar', 'sodium', 'protein', 'saturated_fat', 'carbohydrates']] = recipes['nutrition'].apply(pd.Series)
recipes = recipes.drop(columns=['nutrition'])
recipes.head()

Unnamed: 0,name,id,minutes,contributor_id,submitted,tags,n_steps,steps,description,ingredients,n_ingredients,calories,fat,sugar,sodium,protein,saturated_fat,carbohydrates
0,arriba baked winter squash mexican style,137739,55,47892,2005-09-16,"[60-minutes-or-less, time-to-make, course, mai...",11,"['make a choice and proceed with recipe', 'dep...",autumn is my favorite time of year to cook! th...,"['winter squash', 'mexican seasoning', 'mixed ...",7,51.5,0.0,13.0,0.0,2.0,0.0,4.0
1,a bit different breakfast pizza,31490,30,26278,2002-06-17,"[30-minutes-or-less, time-to-make, course, mai...",9,"['preheat oven to 425 degrees f', 'press dough...",this recipe calls for the crust to be prebaked...,"['prepared pizza crust', 'sausage patty', 'egg...",6,173.4,18.0,0.0,17.0,22.0,35.0,1.0
2,all in the kitchen chili,112140,130,196586,2005-02-25,"[time-to-make, course, preparation, main-dish,...",6,"['brown ground beef in large pot', 'add choppe...",this modified version of 'mom's' chili was a h...,"['ground beef', 'yellow onions', 'diced tomato...",13,269.8,22.0,32.0,48.0,39.0,27.0,5.0
3,alouette potatoes,59389,45,68585,2003-04-14,"[60-minutes-or-less, time-to-make, course, mai...",11,['place potatoes in a large pot of lightly sal...,"this is a super easy, great tasting, make ahea...","['spreadable cheese with garlic and herbs', 'n...",11,368.1,17.0,10.0,2.0,14.0,8.0,20.0
4,amish tomato ketchup for canning,44061,190,41706,2002-10-25,"[weeknight, time-to-make, course, main-ingredi...",5,['mix all ingredients& boil for 2 1 / 2 hours ...,my dh's amish mother raised him on this recipe...,"['tomato juice', 'apple cider vinegar', 'sugar...",8,352.9,1.0,337.0,23.0,3.0,0.0,28.0


In [13]:
# Want to look at how specific steps are, can I use this as a content feature without heavy NLP?
# Currently a string, need to convert to list
recipes['steps'] = recipes['steps'].apply(ast.literal_eval)
recipes['steps'][0]

['make a choice and proceed with recipe',
 'depending on size of squash , cut into half or fourths',
 'remove seeds',
 'for spicy squash , drizzle olive oil or melted butter over each cut squash piece',
 'season with mexican seasoning mix ii',
 'for sweet squash , drizzle melted honey , butter , grated piloncillo over each cut squash piece',
 'season with sweet mexican spice mix',
 'bake at 350 degrees , again depending on size , for 40 minutes up to an hour , until a fork can easily pierce the skin',
 'be careful not to burn the squash especially if you opt to use sugar or butter',
 'if you feel more comfortable , cover the squash with aluminum foil the first half hour , give or take , of baking',
 'if desired , season with salt']

In [14]:
# I want to use inredients as a content feature as well to search for recipes
# Currently a string, need to convert to list
recipes['ingredients'] = recipes['ingredients'].apply(ast.literal_eval)
recipes['ingredients'][0]

['winter squash',
 'mexican seasoning',
 'mixed spice',
 'honey',
 'butter',
 'olive oil',
 'salt']

In [15]:
# Looking at counts, lots of one off ingredients will limit based on count, same as tags
ingredients = recipes['ingredients'].explode()
ing_counts = ingredients.value_counts()
ing_counts

ingredients
salt                         83781
butter                       53788
sugar                        43419
onion                        38168
water                        34060
                             ...  
low-sodium wheat crackers        1
capicola-mozzarella roll         1
citrus ponzu soy sauce           1
asiago cheese rolls              1
nepitella                        1
Name: count, Length: 14758, dtype: int64

In [16]:
# Limit to ingredients that appear more than 2000 times
ing_filetered = ing_counts[ing_counts > 2000]
ing_filetered

ingredients
salt                83781
butter              53788
sugar               43419
onion               38168
water               34060
                    ...  
beef broth           2045
shallot              2029
lemon, juice of      2019
ground coriander     2012
fresh mushrooms      2009
Name: count, Length: 173, dtype: int64

In [17]:
# Something to consider for future steps, some ingredients are very similar, could be standardised with NLP
# Is this something I actually want to standardise? chicken broth vs. chicken breast?
# Need to consider this during the ingredient search
for ing in list(ing_filetered.index):
    if 'chicken' in ing:
        print(ing)

chicken broth
chicken stock
boneless skinless chicken breasts
chicken breasts
chicken


In [18]:
# Function to check if a row contains a value and return a boolean
# Will help split out ingredients and tags into seperate columns
def check_values(row, values):
    return pd.Series({value: value in row for value in values})

In [19]:
# Create new columns for each ingredient in the filtered list
new_cols_ing = recipes['ingredients'].apply(check_values, values=list(ing_filetered.index))
new_cols_ing

Unnamed: 0,salt,butter,sugar,onion,water,eggs,olive oil,flour,garlic cloves,milk,...,cooking spray,skim milk,cream,chicken,warm water,beef broth,shallot,"lemon, juice of",ground coriander,fresh mushrooms
0,True,True,False,False,False,False,True,False,False,False,...,False,False,False,False,False,False,False,False,False,False
1,False,False,False,False,False,True,False,False,False,True,...,False,False,False,False,False,False,False,False,False,False
2,True,False,False,False,True,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
3,True,False,False,False,False,False,True,False,False,False,...,False,False,False,False,False,False,False,False,False,False
4,True,False,True,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
231632,False,False,True,True,False,False,True,False,True,False,...,False,False,False,False,False,False,False,False,False,False
231633,True,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
231634,True,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
231635,False,True,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False


In [20]:
# Creating new columns for each tag in the filtered list
new_cols_tag = recipes['tags'].apply(check_values, values=list(tags_filtered.index))
new_cols_tag

Unnamed: 0,preparation,time-to-make,course,main-ingredient,dietary,easy,occasion,cuisine,low-in-something,main-dish,...,free-of-something,condiments-etc,high-in-something,soups-stews,technique,beverages,sweet,savory,potluck,potatoes
0,True,True,True,True,True,True,True,True,False,False,...,False,False,False,False,False,False,False,False,False,False
1,True,True,True,True,True,True,True,True,False,True,...,False,False,False,False,False,False,False,False,False,False
2,True,True,True,False,True,False,False,False,False,True,...,False,False,False,False,False,False,False,False,False,False
3,True,True,True,True,True,True,True,False,False,False,...,False,False,False,False,False,False,False,False,False,True
4,True,True,True,True,True,False,True,True,False,False,...,False,True,False,False,True,False,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
231632,True,True,True,True,False,True,False,True,False,False,...,False,False,False,True,False,False,False,False,False,False
231633,True,True,True,False,True,True,False,False,False,False,...,False,True,False,False,False,False,False,False,False,False
231634,True,True,True,True,False,True,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
231635,True,True,True,False,True,False,False,False,False,False,...,False,False,True,False,False,False,False,False,False,False


In [21]:
# Brining together ingredients and tags into one dataframe
tags_ings = pd.concat([new_cols_ing, new_cols_tag], axis=1)
tags_ings.head()

Unnamed: 0,salt,butter,sugar,onion,water,eggs,olive oil,flour,garlic cloves,milk,...,free-of-something,condiments-etc,high-in-something,soups-stews,technique,beverages,sweet,savory,potluck,potatoes
0,True,True,False,False,False,False,True,False,False,False,...,False,False,False,False,False,False,False,False,False,False
1,False,False,False,False,False,True,False,False,False,True,...,False,False,False,False,False,False,False,False,False,False
2,True,False,False,False,True,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
3,True,False,False,False,False,False,True,False,False,False,...,False,False,False,False,False,False,False,False,False,True
4,True,False,True,False,False,False,False,False,False,False,...,False,True,False,False,True,False,False,False,False,False


In [22]:
# Bringing togather the features I want to use in the content based filtering and removing the columns I don't want
recipes_feat = recipes.drop(columns=['contributor_id', 'submitted', 'tags', 'steps', 'description',
                                     'ingredients', 'name', 'n_steps', 'n_ingredients', 'minutes'])
recipes_feat = pd.concat([recipes_feat, tags_ings], axis=1)
recipes_feat.head()

Unnamed: 0,id,calories,fat,sugar,sodium,protein,saturated_fat,carbohydrates,salt,butter,...,free-of-something,condiments-etc,high-in-something,soups-stews,technique,beverages,sweet,savory,potluck,potatoes
0,137739,51.5,0.0,13.0,0.0,2.0,0.0,4.0,True,True,...,False,False,False,False,False,False,False,False,False,False
1,31490,173.4,18.0,0.0,17.0,22.0,35.0,1.0,False,False,...,False,False,False,False,False,False,False,False,False,False
2,112140,269.8,22.0,32.0,48.0,39.0,27.0,5.0,True,False,...,False,False,False,False,False,False,False,False,False,False
3,59389,368.1,17.0,10.0,2.0,14.0,8.0,20.0,True,False,...,False,False,False,False,False,False,False,False,False,True
4,44061,352.9,1.0,337.0,23.0,3.0,0.0,28.0,True,False,...,False,True,False,False,True,False,False,False,False,False


In [23]:
# Slight overlap in tags and ingredients, will remove duplicates
duplicate_columns = recipes_feat.columns[recipes_feat.columns.duplicated()].tolist()
duplicate_columns

['sugar', 'chicken', 'cheese', 'potatoes']

In [24]:
# Sanity check for duplicate column names
recipes_feat.columns.value_counts()

cheese              2
sugar               2
chicken             2
potatoes            2
sesame seeds        1
                   ..
ketchup             1
balsamic vinegar    1
buttermilk          1
cilantro            1
potluck             1
Name: count, Length: 257, dtype: int64

In [25]:
# Removing duplicate columns
recipes_feat_clean = recipes_feat.loc[:, ~recipes_feat.columns.duplicated()]
recipes_feat_clean.columns.value_counts()

id                     1
bananas                1
cucumber               1
cold water             1
sesame seeds           1
                      ..
ground black pepper    1
ketchup                1
balsamic vinegar       1
buttermilk             1
potluck                1
Name: count, Length: 257, dtype: int64

In [26]:
# Save feature matrix as parquet file
recipes_feat_clean.to_parquet('data/recipes_feat.parquet')

In [27]:
# For size limitations seperating the content features and the tags and ingredients
recipes_ingtag = recipes.drop(columns=['name', 'steps', 'description', 'contributor_id', 'submitted', 'n_ingredients', 
                                    'n_steps', 'minutes', 'calories', 'fat', 'sugar', 'sodium', 'protein',
                                    'saturated_fat', 'carbohydrates'])
recipes_ingtag.head()

Unnamed: 0,id,tags,ingredients
0,137739,"[60-minutes-or-less, time-to-make, course, mai...","[winter squash, mexican seasoning, mixed spice..."
1,31490,"[30-minutes-or-less, time-to-make, course, mai...","[prepared pizza crust, sausage patty, eggs, mi..."
2,112140,"[time-to-make, course, preparation, main-dish,...","[ground beef, yellow onions, diced tomatoes, t..."
3,59389,"[60-minutes-or-less, time-to-make, course, mai...","[spreadable cheese with garlic and herbs, new ..."
4,44061,"[weeknight, time-to-make, course, main-ingredi...","[tomato juice, apple cider vinegar, sugar, sal..."


In [28]:
# Save as parquet file
recipes_ingtag.to_parquet('data/recipes_ingtag.parquet')

In [29]:
# Create a dataframe with steps and description
recipes_steps = recipes.drop(columns=['contributor_id', 'submitted', 'n_ingredients', 'n_steps', 'minutes', 'calories', 'fat',
                                      'sugar', 'sodium', 'protein', 'saturated_fat', 'carbohydrates', 'tags'])
recipes_steps.head()

Unnamed: 0,name,id,steps,description,ingredients
0,arriba baked winter squash mexican style,137739,"[make a choice and proceed with recipe, depend...",autumn is my favorite time of year to cook! th...,"[winter squash, mexican seasoning, mixed spice..."
1,a bit different breakfast pizza,31490,"[preheat oven to 425 degrees f, press dough in...",this recipe calls for the crust to be prebaked...,"[prepared pizza crust, sausage patty, eggs, mi..."
2,all in the kitchen chili,112140,"[brown ground beef in large pot, add chopped o...",this modified version of 'mom's' chili was a h...,"[ground beef, yellow onions, diced tomatoes, t..."
3,alouette potatoes,59389,[place potatoes in a large pot of lightly salt...,"this is a super easy, great tasting, make ahea...","[spreadable cheese with garlic and herbs, new ..."
4,amish tomato ketchup for canning,44061,"[mix all ingredients& boil for 2 1 / 2 hours ,...",my dh's amish mother raised him on this recipe...,"[tomato juice, apple cider vinegar, sugar, sal..."


In [30]:
# Save as parquet file
recipes_steps.to_parquet('data/recipes_steps.parquet')

## Testing Functions for Streamlit

In [31]:
recipes_test = recipes_ingtag.copy()
recipes_test.head()

Unnamed: 0,id,tags,ingredients
0,137739,"[60-minutes-or-less, time-to-make, course, mai...","[winter squash, mexican seasoning, mixed spice..."
1,31490,"[30-minutes-or-less, time-to-make, course, mai...","[prepared pizza crust, sausage patty, eggs, mi..."
2,112140,"[time-to-make, course, preparation, main-dish,...","[ground beef, yellow onions, diced tomatoes, t..."
3,59389,"[60-minutes-or-less, time-to-make, course, mai...","[spreadable cheese with garlic and herbs, new ..."
4,44061,"[weeknight, time-to-make, course, main-ingredi...","[tomato juice, apple cider vinegar, sugar, sal..."


In [32]:
# Need a function to match if selected tags are present in the row
# We will prompt user for specific tags, and return recipes that contain all of those tags
def all_tags_present(item_tags, selected):
    return all(string in item_tags for string in selected)

In [33]:
recipes_test['tag_match'] = recipes_test['tags'].apply(all_tags_present, selected=['poultry', '30-minutes-or-less'])
recipes_test = recipes_test[recipes_test['tag_match'] == True]

In [34]:
recipes_test.head()

Unnamed: 0,id,tags,ingredients,tag_match
57,32169,"[30-minutes-or-less, time-to-make, course, mai...","[tomatoes, garlic, onion, button mushrooms, ho...",True
75,42570,"[30-minutes-or-less, time-to-make, course, mai...","[boneless chicken breast, garlic, salt, cumin,...",True
233,219681,"[30-minutes-or-less, time-to-make, course, mai...","[chicken breasts, olive oil, lemon juice, whit...",True
336,399278,"[30-minutes-or-less, time-to-make, course, mai...","[cayenne pepper sauce, vegetable oil, granulat...",True
447,371549,"[30-minutes-or-less, time-to-make, course, mai...","[jalapeno peppers, fresh cilantro, light beer,...",True


In [35]:
# Ingredient selection, will handle input with streamlit
ing_selected = ['chicken', 'lentils']

In [37]:
# Need to define a function that will return True if all ingredients selected are found in the matched row
def check_ingredients_df(row):
    ing_matched = row
    
    # Join all ingredients into a single lowercase string
    ingredients_str = ' '.join(str(ing).lower() for ing in ing_matched)
    
    # Check each item in all_ings
    for item in ing_selected:
        item = item.strip('s')
        if item not in ingredients_str:
            return False
    
    # If we've made it through all items without returning False, return True
    return True


In [38]:
# Can now apply to dataframe
recipes_test['ing_match'] = recipes_test['ingredients'].apply(check_ingredients_df)
recipes_test[recipes_test['ing_match'] == True]

Unnamed: 0,id,tags,ingredients,tag_match,ing_match
16018,204669,"[30-minutes-or-less, time-to-make, course, mai...","[skinless chicken breasts, split peas, red len...",True,True
46564,66735,"[30-minutes-or-less, time-to-make, course, mai...","[olive oil, onion, garlic, red lentils, chicke...",True,True
123030,287439,"[30-minutes-or-less, time-to-make, course, mai...","[boneless skinless chicken breasts, water, tom...",True,True
180614,363669,"[lactose, 30-minutes-or-less, time-to-make, co...","[whole chicken, olive oil, onion, garlic clove...",True,True


In [39]:
# How do we want to display the steps to the user?
recipes_steps.head()

Unnamed: 0,name,id,steps,description,ingredients
0,arriba baked winter squash mexican style,137739,"[make a choice and proceed with recipe, depend...",autumn is my favorite time of year to cook! th...,"[winter squash, mexican seasoning, mixed spice..."
1,a bit different breakfast pizza,31490,"[preheat oven to 425 degrees f, press dough in...",this recipe calls for the crust to be prebaked...,"[prepared pizza crust, sausage patty, eggs, mi..."
2,all in the kitchen chili,112140,"[brown ground beef in large pot, add chopped o...",this modified version of 'mom's' chili was a h...,"[ground beef, yellow onions, diced tomatoes, t..."
3,alouette potatoes,59389,[place potatoes in a large pot of lightly salt...,"this is a super easy, great tasting, make ahea...","[spreadable cheese with garlic and herbs, new ..."
4,amish tomato ketchup for canning,44061,"[mix all ingredients& boil for 2 1 / 2 hours ,...",my dh's amish mother raised him on this recipe...,"[tomato juice, apple cider vinegar, sugar, sal..."


In [40]:

rec = recipes_steps[recipes_steps['id'] == 66735]
rec['steps'].values[0], rec['name'].values[0]

(['heat the oil in a non-stick saucepan over medium heat and cook the onion and garlic until softened',
  'add the lentils and stock and bring to a boil',
  'cover and cook for 10 minutes , or until lentils are tender',
  'stir through the lemon juice , parsley , salt and pepper',
  'while lentils are cooking , season the chicken with cumin',
  'heat a non-stick frying pan and spray with cooking oil spray',
  'cook chicken until cooked through',
  'serve over lentils',
  'lamb or fish fillets can be exchanged for chicken in this recipe'],
 'chicken with red lentils')

In [41]:
name = rec['name'].values[0].replace(' ', '-')
name

'chicken-with-red-lentils'

In [42]:
# Can use some clever formatted strings to display the link (because they all follow the same format)
link = f"https://www.food.com/recipe/{name}-{rec['id'].values[0]}"
link

'https://www.food.com/recipe/chicken-with-red-lentils-66735'

In [45]:
# We can use the feature matrix to find similar recipes to the one selected
rec_feat = recipes_feat_clean[recipes_feat_clean['id'] == 66735]
rec_feat = rec_feat.drop(columns=['id']).values.reshape(1, -1)

# Using cosine similarity to compare feature vectors
cosine_sim = cosine_similarity(rec_feat, recipes_feat_clean.drop(columns=['id']))
sim_scores = list(zip(recipes_feat_clean['id'].values, cosine_sim[0]))
sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
sim_scores = sim_scores[1:6]
rec_indices = [i[0] for i in sim_scores]

recs = recipes_steps[recipes_steps['id'].isin(rec_indices)][['id', 'name', 'description']]
recs


Unnamed: 0,id,name,description
44912,292894,chicken noodle goulash abs diet,my husband does the abs diet. this is one of t...
107617,359825,honey mustard cracker crumb chicken,this is a healthier version of fried chicken. ...
150669,161093,oven baked fish and chips,"from family circle, uk. a healthy version of ..."
168905,458924,quick double tomato chicken with artichoke hea...,"entered for safe-keeping, this is almost a ""cu..."
179112,56760,salmon potato boats,another recipe i've had for years from the bac...
