In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import ast
from sklearn.metrics.pairwise import cosine_similarity

In [2]:
# Load the raw recipes table from the CSV under data/ (path is relative to the
# notebook's working directory) and preview the first rows.
recipes = pd.read_csv('data/RAW_recipes.csv')
recipes.head()

Unnamed: 0,name,id,minutes,contributor_id,submitted,tags,nutrition,n_steps,steps,description,ingredients,n_ingredients
0,arriba baked winter squash mexican style,137739,55,47892,2005-09-16,"['60-minutes-or-less', 'time-to-make', 'course...","[51.5, 0.0, 13.0, 0.0, 2.0, 0.0, 4.0]",11,"['make a choice and proceed with recipe', 'dep...",autumn is my favorite time of year to cook! th...,"['winter squash', 'mexican seasoning', 'mixed ...",7
1,a bit different breakfast pizza,31490,30,26278,2002-06-17,"['30-minutes-or-less', 'time-to-make', 'course...","[173.4, 18.0, 0.0, 17.0, 22.0, 35.0, 1.0]",9,"['preheat oven to 425 degrees f', 'press dough...",this recipe calls for the crust to be prebaked...,"['prepared pizza crust', 'sausage patty', 'egg...",6
2,all in the kitchen chili,112140,130,196586,2005-02-25,"['time-to-make', 'course', 'preparation', 'mai...","[269.8, 22.0, 32.0, 48.0, 39.0, 27.0, 5.0]",6,"['brown ground beef in large pot', 'add choppe...",this modified version of 'mom's' chili was a h...,"['ground beef', 'yellow onions', 'diced tomato...",13
3,alouette potatoes,59389,45,68585,2003-04-14,"['60-minutes-or-less', 'time-to-make', 'course...","[368.1, 17.0, 10.0, 2.0, 14.0, 8.0, 20.0]",11,['place potatoes in a large pot of lightly sal...,"this is a super easy, great tasting, make ahea...","['spreadable cheese with garlic and herbs', 'n...",11
4,amish tomato ketchup for canning,44061,190,41706,2002-10-25,"['weeknight', 'time-to-make', 'course', 'main-...","[352.9, 1.0, 337.0, 23.0, 3.0, 0.0, 28.0]",5,['mix all ingredients& boil for 2 1 / 2 hours ...,my dh's amish mother raised him on this recipe...,"['tomato juice', 'apple cider vinegar', 'sugar...",8


In [3]:
# Nutrition column order: calories (#), total fat (PDV), sugar (PDV), sodium (PDV), protein (PDV), saturated fat (PDV), total carbohydrates (PDV)

In [4]:
recipes.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 231637 entries, 0 to 231636
Data columns (total 12 columns):
 #   Column          Non-Null Count   Dtype 
---  ------          --------------   ----- 
 0   name            231636 non-null  object
 1   id              231637 non-null  int64 
 2   minutes         231637 non-null  int64 
 3   contributor_id  231637 non-null  int64 
 4   submitted       231637 non-null  object
 5   tags            231637 non-null  object
 6   nutrition       231637 non-null  object
 7   n_steps         231637 non-null  int64 
 8   steps           231637 non-null  object
 9   description     226658 non-null  object
 10  ingredients     231637 non-null  object
 11  n_ingredients   231637 non-null  int64 
dtypes: int64(5), object(7)
memory usage: 21.2+ MB


In [5]:
recipes.describe()

Unnamed: 0,id,minutes,contributor_id,n_steps,n_ingredients
count,231637.0,231637.0,231637.0,231637.0,231637.0
mean,222014.708984,9398.546,5534885.0,9.765499,9.051153
std,141206.635626,4461963.0,99791410.0,5.995128,3.734796
min,38.0,0.0,27.0,0.0,1.0
25%,99944.0,20.0,56905.0,6.0,6.0
50%,207249.0,40.0,173614.0,9.0,9.0
75%,333816.0,65.0,398275.0,12.0,11.0
max,537716.0,2147484000.0,2002290000.0,145.0,43.0


In [6]:
recipes.isna().sum()

name                 1
id                   0
minutes              0
contributor_id       0
submitted            0
tags                 0
nutrition            0
n_steps              0
steps                0
description       4979
ingredients          0
n_ingredients        0
dtype: int64

In [7]:
# Drop every row that has a null in any column. Per the null counts above, only
# `name` (1 row) and `description` (4979 rows) contain nulls; the description is
# needed so it can be displayed to the user. Index labels are not reset.
recipes = recipes.dropna()
recipes.isna().sum()

name              0
id                0
minutes           0
contributor_id    0
submitted         0
tags              0
nutrition         0
n_steps           0
steps             0
description       0
ingredients       0
n_ingredients     0
dtype: int64

In [8]:
recipes.info()

<class 'pandas.core.frame.DataFrame'>
Index: 226657 entries, 0 to 231636
Data columns (total 12 columns):
 #   Column          Non-Null Count   Dtype 
---  ------          --------------   ----- 
 0   name            226657 non-null  object
 1   id              226657 non-null  int64 
 2   minutes         226657 non-null  int64 
 3   contributor_id  226657 non-null  int64 
 4   submitted       226657 non-null  object
 5   tags            226657 non-null  object
 6   nutrition       226657 non-null  object
 7   n_steps         226657 non-null  int64 
 8   steps           226657 non-null  object
 9   description     226657 non-null  object
 10  ingredients     226657 non-null  object
 11  n_ingredients   226657 non-null  int64 
dtypes: int64(5), object(7)
memory usage: 22.5+ MB


In [9]:
# Need to look at all the tags, to be used as content features.
# The CSV stores each tag list as a string literal, so parse it into a real list.
# The isinstance guard keeps this cell re-runnable: ast.literal_eval raises on
# values that are already lists, which is what a second execution would pass it.
recipes['tags'] = recipes['tags'].apply(
    lambda raw: ast.literal_eval(raw) if isinstance(raw, str) else raw
)
# One row per (recipe, tag) pair; the recipe's index label is repeated per tag.
tags = recipes['tags'].explode()
tags

0         60-minutes-or-less
0               time-to-make
0                     course
0            main-ingredient
0                    cuisine
                 ...        
231636               dietary
231636          comfort-food
231636            taste-mood
231636                 sweet
231636    number-of-servings
Name: tags, Length: 4045919, dtype: object

In [10]:
len(tags)

4045919

In [11]:
# Number of occurrences of each distinct tag (value_counts sorts most frequent first).
tag_counts = tags.value_counts()
# Keep only tags that occur more than 1000 times; rarer tags are not used as features.
tags_filtered = tag_counts[tag_counts >  1000]
tags_filtered

tags
preparation        225568
time-to-make       220353
course             213602
main-ingredient    166456
dietary            160444
                    ...  
white-rice           1038
whole-chicken        1034
steak                1018
chowders             1011
wedding              1008
Name: count, Length: 255, dtype: int64

In [12]:
len(tags.unique())

552

In [13]:
# Nutrition information (calories (#), total fat (PDV), sugar (PDV), sodium (PDV), protein (PDV), saturated fat (PDV), total carbohydrates (PDV))
# Target column names, in the same order as the values inside each nutrition list.
nutrition_cols = ['calories', 'fat', 'sugar', 'sodium', 'protein', 'saturated_fat', 'carbohydrates']
# The CSV stores each nutrition list as a string literal; parse it into a real list.
recipes['nutrition'] = recipes['nutrition'].apply(ast.literal_eval)
# Build all nutrition columns in one DataFrame construction instead of
# apply(pd.Series), which creates a separate Series object for every row.
# Passing recipes.index keeps the rows aligned after the earlier dropna().
recipes[nutrition_cols] = pd.DataFrame(recipes['nutrition'].tolist(), index=recipes.index)
# The list column is redundant once it has been expanded.
recipes = recipes.drop(columns=['nutrition'])
recipes.head()

Unnamed: 0,name,id,minutes,contributor_id,submitted,tags,n_steps,steps,description,ingredients,n_ingredients,calories,fat,sugar,sodium,protein,saturated_fat,carbohydrates
0,arriba baked winter squash mexican style,137739,55,47892,2005-09-16,"[60-minutes-or-less, time-to-make, course, mai...",11,"['make a choice and proceed with recipe', 'dep...",autumn is my favorite time of year to cook! th...,"['winter squash', 'mexican seasoning', 'mixed ...",7,51.5,0.0,13.0,0.0,2.0,0.0,4.0
1,a bit different breakfast pizza,31490,30,26278,2002-06-17,"[30-minutes-or-less, time-to-make, course, mai...",9,"['preheat oven to 425 degrees f', 'press dough...",this recipe calls for the crust to be prebaked...,"['prepared pizza crust', 'sausage patty', 'egg...",6,173.4,18.0,0.0,17.0,22.0,35.0,1.0
2,all in the kitchen chili,112140,130,196586,2005-02-25,"[time-to-make, course, preparation, main-dish,...",6,"['brown ground beef in large pot', 'add choppe...",this modified version of 'mom's' chili was a h...,"['ground beef', 'yellow onions', 'diced tomato...",13,269.8,22.0,32.0,48.0,39.0,27.0,5.0
3,alouette potatoes,59389,45,68585,2003-04-14,"[60-minutes-or-less, time-to-make, course, mai...",11,['place potatoes in a large pot of lightly sal...,"this is a super easy, great tasting, make ahea...","['spreadable cheese with garlic and herbs', 'n...",11,368.1,17.0,10.0,2.0,14.0,8.0,20.0
4,amish tomato ketchup for canning,44061,190,41706,2002-10-25,"[weeknight, time-to-make, course, main-ingredi...",5,['mix all ingredients& boil for 2 1 / 2 hours ...,my dh's amish mother raised him on this recipe...,"['tomato juice', 'apple cider vinegar', 'sugar...",8,352.9,1.0,337.0,23.0,3.0,0.0,28.0


In [14]:
# Parse the stringified step lists from the CSV into real Python lists.
# The isinstance guard keeps this cell re-runnable: ast.literal_eval raises on
# values that are already lists.
recipes['steps'] = recipes['steps'].apply(
    lambda raw: ast.literal_eval(raw) if isinstance(raw, str) else raw
)
# Inspect the steps of the recipe whose index label is 0.
recipes['steps'][0]

['make a choice and proceed with recipe',
 'depending on size of squash , cut into half or fourths',
 'remove seeds',
 'for spicy squash , drizzle olive oil or melted butter over each cut squash piece',
 'season with mexican seasoning mix ii',
 'for sweet squash , drizzle melted honey , butter , grated piloncillo over each cut squash piece',
 'season with sweet mexican spice mix',
 'bake at 350 degrees , again depending on size , for 40 minutes up to an hour , until a fork can easily pierce the skin',
 'be careful not to burn the squash especially if you opt to use sugar or butter',
 'if you feel more comfortable , cover the squash with aluminum foil the first half hour , give or take , of baking',
 'if desired , season with salt']

In [15]:
# Parse the stringified ingredient lists from the CSV into real Python lists.
# The isinstance guard keeps this cell re-runnable: ast.literal_eval raises on
# values that are already lists.
recipes['ingredients'] = recipes['ingredients'].apply(
    lambda raw: ast.literal_eval(raw) if isinstance(raw, str) else raw
)
# Inspect the ingredients of the recipe whose index label is 0.
recipes['ingredients'][0]

['winter squash',
 'mexican seasoning',
 'mixed spice',
 'honey',
 'butter',
 'olive oil',
 'salt']

In [16]:
# Flatten the ingredient lists to one ingredient per row, then count the
# occurrences of each distinct ingredient string (most frequent first).
ingredients = recipes['ingredients'].explode()
ing_counts = ingredients.value_counts()
ing_counts

ingredients
salt                         83781
butter                       53788
sugar                        43419
onion                        38168
water                        34060
                             ...  
low-sodium wheat crackers        1
capicola-mozzarella roll         1
citrus ponzu soy sauce           1
asiago cheese rolls              1
nepitella                        1
Name: count, Length: 14758, dtype: int64

In [17]:
# Keep only ingredients that occur more than 1000 times.
# NOTE: `ing_filetered` is a misspelling of "filtered"; later cells reference
# the variable under this spelling, so a rename must touch all of them.
ing_filetered = ing_counts[ing_counts > 1000]
ing_filetered

ingredients
salt              83781
butter            53788
sugar             43419
onion             38168
water             34060
                  ...  
cinnamon stick     1011
french bread       1010
asparagus          1006
kidney beans       1004
salmon fillets     1003
Name: count, Length: 311, dtype: int64

In [18]:
# Print every frequent ingredient whose name contains the substring 'chicken',
# to see how many variants of the same food are counted separately.
for ingredient_name in ing_filetered.index.tolist():
    if 'chicken' not in ingredient_name:
        continue
    print(ingredient_name)

chicken broth
chicken stock
boneless skinless chicken breasts
chicken breasts
chicken
cooked chicken
boneless skinless chicken breast halves
cream of chicken soup
boneless skinless chicken breast
low sodium chicken broth


In [19]:
def check_values(row, values):
    """Return a boolean Series flagging which of ``values`` are present in ``row``.

    Parameters
    ----------
    row : container
        The container to test against, e.g. one recipe's list of tags or
        ingredients.
    values : iterable of hashable
        Candidate items; each becomes one label of the returned Series.

    Returns
    -------
    pandas.Series
        Indexed by the items of ``values`` (in order), with ``True`` where
        ``value in row`` holds and ``False`` otherwise.
    """
    # Hash a list/tuple row once so each membership test is O(1) instead of a
    # linear scan per candidate value. Other containers (e.g. a plain string,
    # where `in` means substring) keep their native `in` semantics.
    try:
        members = frozenset(row) if isinstance(row, (list, tuple)) else row
    except TypeError:
        # Row holds unhashable elements; fall back to scanning the row itself.
        members = row
    return pd.Series({value: value in members for value in values})

In [20]:
# One boolean column per frequent ingredient: True when the recipe's ingredient
# list contains that ingredient string. check_values returns a Series per row,
# so apply() expands the results into a DataFrame indexed like `recipes`.
new_cols_ing = recipes['ingredients'].apply(check_values, values=list(ing_filetered.index))
new_cols_ing

Unnamed: 0,salt,butter,sugar,onion,water,eggs,olive oil,flour,garlic cloves,milk,...,pasta,cauliflower,coriander,strawberry,pork tenderloin,cinnamon stick,french bread,asparagus,kidney beans,salmon fillets
0,True,True,False,False,False,False,True,False,False,False,...,False,False,False,False,False,False,False,False,False,False
1,False,False,False,False,False,True,False,False,False,True,...,False,False,False,False,False,False,False,False,False,False
2,True,False,False,False,True,False,False,False,False,False,...,False,False,False,False,False,False,False,False,True,False
3,True,False,False,False,False,False,True,False,False,False,...,False,False,False,False,False,False,False,False,False,False
4,True,False,True,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
231632,False,False,True,True,False,False,True,False,True,False,...,False,False,False,False,False,False,False,False,False,False
231633,True,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
231634,True,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
231635,False,True,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False


In [21]:
# One boolean column per frequent tag: True when the recipe's tag list contains
# that tag. check_values returns a Series per row, so apply() expands the
# results into a DataFrame indexed like `recipes`.
new_cols_tag = recipes['tags'].apply(check_values, values=list(tags_filtered.index))
new_cols_tag

Unnamed: 0,preparation,time-to-make,course,main-ingredient,dietary,easy,occasion,cuisine,low-in-something,main-dish,...,granola-and-porridge,penne,deep-fry,steaks,spanish,white-rice,whole-chicken,steak,chowders,wedding
0,True,True,True,True,True,True,True,True,False,False,...,False,False,False,False,False,False,False,False,False,False
1,True,True,True,True,True,True,True,True,False,True,...,False,False,False,False,False,False,False,False,False,False
2,True,True,True,False,True,False,False,False,False,True,...,False,False,False,False,False,False,False,False,False,False
3,True,True,True,True,True,True,True,False,False,False,...,False,False,False,False,False,False,False,False,False,False
4,True,True,True,True,True,False,True,True,False,False,...,False,False,False,False,False,False,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
231632,True,True,True,True,False,True,False,True,False,False,...,False,False,False,False,False,False,False,False,False,False
231633,True,True,True,False,True,True,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
231634,True,True,True,True,False,True,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
231635,True,True,True,False,True,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False


In [22]:
# Place the ingredient flags and the tag flags side by side (column-wise concat).
# NOTE(review): some labels exist both as an ingredient and as a tag (e.g.
# 'eggs', 'chicken', 'pasta'), so the combined frame has repeated column names.
tags_ings = pd.concat([new_cols_ing, new_cols_tag], axis=1)
tags_ings.head()

Unnamed: 0,salt,butter,sugar,onion,water,eggs,olive oil,flour,garlic cloves,milk,...,granola-and-porridge,penne,deep-fry,steaks,spanish,white-rice,whole-chicken,steak,chowders,wedding
0,True,True,False,False,False,False,True,False,False,False,...,False,False,False,False,False,False,False,False,False,False
1,False,False,False,False,False,True,False,False,False,True,...,False,False,False,False,False,False,False,False,False,False
2,True,False,False,False,True,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
3,True,False,False,False,False,False,True,False,False,False,...,False,False,False,False,False,False,False,False,False,False
4,True,False,True,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False


In [23]:
recipes.head()

Unnamed: 0,name,id,minutes,contributor_id,submitted,tags,n_steps,steps,description,ingredients,n_ingredients,calories,fat,sugar,sodium,protein,saturated_fat,carbohydrates
0,arriba baked winter squash mexican style,137739,55,47892,2005-09-16,"[60-minutes-or-less, time-to-make, course, mai...",11,"[make a choice and proceed with recipe, depend...",autumn is my favorite time of year to cook! th...,"[winter squash, mexican seasoning, mixed spice...",7,51.5,0.0,13.0,0.0,2.0,0.0,4.0
1,a bit different breakfast pizza,31490,30,26278,2002-06-17,"[30-minutes-or-less, time-to-make, course, mai...",9,"[preheat oven to 425 degrees f, press dough in...",this recipe calls for the crust to be prebaked...,"[prepared pizza crust, sausage patty, eggs, mi...",6,173.4,18.0,0.0,17.0,22.0,35.0,1.0
2,all in the kitchen chili,112140,130,196586,2005-02-25,"[time-to-make, course, preparation, main-dish,...",6,"[brown ground beef in large pot, add chopped o...",this modified version of 'mom's' chili was a h...,"[ground beef, yellow onions, diced tomatoes, t...",13,269.8,22.0,32.0,48.0,39.0,27.0,5.0
3,alouette potatoes,59389,45,68585,2003-04-14,"[60-minutes-or-less, time-to-make, course, mai...",11,[place potatoes in a large pot of lightly salt...,"this is a super easy, great tasting, make ahea...","[spreadable cheese with garlic and herbs, new ...",11,368.1,17.0,10.0,2.0,14.0,8.0,20.0
4,amish tomato ketchup for canning,44061,190,41706,2002-10-25,"[weeknight, time-to-make, course, main-ingredi...",5,"[mix all ingredients& boil for 2 1 / 2 hours ,...",my dh's amish mother raised him on this recipe...,"[tomato juice, apple cider vinegar, sugar, sal...",8,352.9,1.0,337.0,23.0,3.0,0.0,28.0


In [24]:
# Drop the columns that are not used as features, then append the
# ingredient/tag flag columns to form the feature matrix.
# NOTE(review): the nutrition column 'sugar' has the same label as the 'sugar'
# ingredient flag, which adds another repeated column name to the result.
recipes_feat = recipes.drop(columns=['contributor_id', 'submitted', 'tags', 'steps', 'description', 'ingredients'])
recipes_feat = pd.concat([recipes_feat, tags_ings], axis=1)
recipes_feat.head()

Unnamed: 0,name,id,minutes,n_steps,n_ingredients,calories,fat,sugar,sodium,protein,...,granola-and-porridge,penne,deep-fry,steaks,spanish,white-rice,whole-chicken,steak,chowders,wedding
0,arriba baked winter squash mexican style,137739,55,11,7,51.5,0.0,13.0,0.0,2.0,...,False,False,False,False,False,False,False,False,False,False
1,a bit different breakfast pizza,31490,30,9,6,173.4,18.0,0.0,17.0,22.0,...,False,False,False,False,False,False,False,False,False,False
2,all in the kitchen chili,112140,130,6,13,269.8,22.0,32.0,48.0,39.0,...,False,False,False,False,False,False,False,False,False,False
3,alouette potatoes,59389,45,11,11,368.1,17.0,10.0,2.0,14.0,...,False,False,False,False,False,False,False,False,False,False
4,amish tomato ketchup for canning,44061,190,5,8,352.9,1.0,337.0,23.0,3.0,...,False,False,False,False,False,False,False,False,False,False


In [25]:
recipes_feat['name']

0           arriba   baked winter squash mexican style
1                     a bit different  breakfast pizza
2                            all in the kitchen  chili
3                                   alouette  potatoes
4                   amish  tomato ketchup  for canning
                              ...                     
231632                                     zydeco soup
231633                                zydeco spice mix
231634                       zydeco ya ya deviled eggs
231635          cookies by design   cookies on a stick
231636    cookies by design   sugar shortbread cookies
Name: name, Length: 226657, dtype: object

In [29]:
# Column labels that occur more than once in the feature matrix.
# columns.duplicated() marks every occurrence after the first, so a label
# appears here once per extra occurrence.
duplicate_columns = recipes_feat.columns[recipes_feat.columns.duplicated()].tolist()
duplicate_columns

['sugar',
 'chicken',
 'cheese',
 'pasta',
 'potatoes',
 'eggs',
 'nuts',
 'rice',
 'tomatoes',
 'onions',
 'apples',
 'shrimp',
 'mushrooms',
 'yeast',
 'carrots',
 'corn',
 'strawberries',
 'broccoli',
 'bananas',
 'spinach',
 'bacon',
 'lemon',
 'coconut',
 'ham',
 'spaghetti',
 'asparagus',
 'blueberries',
 'cauliflower']

In [37]:
len(duplicate_columns)

28

In [30]:
# Count how often each column label occurs, to confirm which labels are repeated.
column_counts = recipes_feat.columns.value_counts()
print("\nColumn name counts:")
print(column_counts)


Column name counts:
spaghetti           2
eggs                2
cauliflower         2
pasta               2
corn                2
                   ..
almond extract      1
fresh mushrooms     1
ground coriander    1
lemon, juice of     1
wedding             1
Name: count, Length: 550, dtype: int64


In [34]:
# Keep only the first occurrence of each column label.
# NOTE(review): this discards data rather than merging it. For 'sugar' the
# nutrition value (first occurrence) is kept and the ingredient flag is dropped;
# for labels shared by ingredients and tags the ingredient flag (first) is kept
# and the tag flag is dropped. If both signals are wanted, consider giving the
# flag columns distinct prefixes (e.g. ing_/tag_) before they are concatenated.
recipes_feat_clean = recipes_feat.loc[:, ~recipes_feat.columns.duplicated()]

In [36]:
recipes_feat_clean.head()

Unnamed: 0,name,id,minutes,n_steps,n_ingredients,calories,fat,sugar,sodium,protein,...,granola-and-porridge,penne,deep-fry,steaks,spanish,white-rice,whole-chicken,steak,chowders,wedding
0,arriba baked winter squash mexican style,137739,55,11,7,51.5,0.0,13.0,0.0,2.0,...,False,False,False,False,False,False,False,False,False,False
1,a bit different breakfast pizza,31490,30,9,6,173.4,18.0,0.0,17.0,22.0,...,False,False,False,False,False,False,False,False,False,False
2,all in the kitchen chili,112140,130,6,13,269.8,22.0,32.0,48.0,39.0,...,False,False,False,False,False,False,False,False,False,False
3,alouette potatoes,59389,45,11,11,368.1,17.0,10.0,2.0,14.0,...,False,False,False,False,False,False,False,False,False,False
4,amish tomato ketchup for canning,44061,190,5,8,352.9,1.0,337.0,23.0,3.0,...,False,False,False,False,False,False,False,False,False,False


In [53]:
recipes_feat_clean.duplicated().sum()

np.int64(0)

In [35]:
# Re-count the column labels on the de-duplicated frame; every label should now
# occur exactly once.
column_counts = recipes_feat_clean.columns.value_counts()
print("\nColumn name counts:")
print(column_counts)


Column name counts:
name             1
lunch            1
side-dishes      1
healthy-2        1
comfort-food     1
                ..
chicken          1
cream            1
skim milk        1
cooking spray    1
wedding          1
Name: count, Length: 550, dtype: int64


In [39]:
# Save the feature matrix as a Parquet file
recipes_feat_clean.to_parquet('data/recipes_feat.parquet')

In [42]:
# Copy of `recipes` without the contributor_id and submitted columns.
recipes_new = recipes.drop(columns=['contributor_id', 'submitted'])

In [54]:
recipes_new.head()

Unnamed: 0,name,id,minutes,tags,n_steps,steps,description,ingredients,n_ingredients,calories,fat,sugar,sodium,protein,saturated_fat,carbohydrates
0,arriba baked winter squash mexican style,137739,55,"[60-minutes-or-less, time-to-make, course, mai...",11,"[make a choice and proceed with recipe, depend...",autumn is my favorite time of year to cook! th...,"[winter squash, mexican seasoning, mixed spice...",7,51.5,0.0,13.0,0.0,2.0,0.0,4.0
1,a bit different breakfast pizza,31490,30,"[30-minutes-or-less, time-to-make, course, mai...",9,"[preheat oven to 425 degrees f, press dough in...",this recipe calls for the crust to be prebaked...,"[prepared pizza crust, sausage patty, eggs, mi...",6,173.4,18.0,0.0,17.0,22.0,35.0,1.0
2,all in the kitchen chili,112140,130,"[time-to-make, course, preparation, main-dish,...",6,"[brown ground beef in large pot, add chopped o...",this modified version of 'mom's' chili was a h...,"[ground beef, yellow onions, diced tomatoes, t...",13,269.8,22.0,32.0,48.0,39.0,27.0,5.0
3,alouette potatoes,59389,45,"[60-minutes-or-less, time-to-make, course, mai...",11,[place potatoes in a large pot of lightly salt...,"this is a super easy, great tasting, make ahea...","[spreadable cheese with garlic and herbs, new ...",11,368.1,17.0,10.0,2.0,14.0,8.0,20.0
4,amish tomato ketchup for canning,44061,190,"[weeknight, time-to-make, course, main-ingredi...",5,"[mix all ingredients& boil for 2 1 / 2 hours ,...",my dh's amish mother raised him on this recipe...,"[tomato juice, apple cider vinegar, sugar, sal...",8,352.9,1.0,337.0,23.0,3.0,0.0,28.0


In [55]:
# Write the recipe table (list columns included) to a Parquet file under data/.
recipes_new.to_parquet('data/recipes.parquet')