### Loading packages

In [1]:
import os
import sys
import io
import pickle
import gzip
import zipfile

import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

import sqlite3
from sqlite3 import Error

#### Reading Data Folder

In [2]:
path = '../data'

files = []
# r=root, d=directories, f = files
for r, d, f in os.walk(path):
    for file in f:
            files.append(os.path.join(r, file))

for f in files[0:20]:
    print(f)

../data/2ndReadme.md
../data/.gitattributes
../data/1stReadme.md
../data/fao/FAOSTAT_livestock_2018.csv
../data/fao/FAOSTAT_crops_2018.csv
../data/kgraph/catalog-v001.xml
../data/kgraph/wiki_sentences_v2.csv
../data/pickles/yummly.pkl.gzip
../data/pickles/ingr_map.pkl
../data/pickles/df_kaggle_food.pkl.gzip
../data/ACDH/ACDH_CH_WG4.owl
../data/ACDH/ACDH_CH_WG4_simplified.gml
../data/ACDH/ACDH_CH_WG4_simplified.graphml
../data/ACDH/ACDH_CH_WG4_simplified.owl
../data/ACDH/ACDH_CH_WG4_simplified.tt
../data/ACDH/ACDH_CH_WG4_simplified.csv
../data/ACDH/ACDH_CH_WG4_simplified.xls
../data/yummly/test.json
../data/yummly/lists.zip
../data/yummly/recipes.zip


## [Food Data from Kaggle competition](https://www.kaggle.com/shuyangli94/food-com-recipes-and-user-interactions)

178265 recipes, no information on cuisine

In [3]:
df_recipes_raw = pd.read_csv('../data/kaggle_food-com/RAW_recipes.csv')
df_recipes_cod = pd.read_csv('../data/kaggle_food-com/PP_recipes.csv')

df_recipes_raw.tags = df_recipes_raw.tags.apply(eval)
df_recipes_raw.ingredients = df_recipes_raw.ingredients.apply(eval)
df_recipes_raw.steps = df_recipes_raw.steps.apply(eval)

In [4]:
df_recipes_raw.head(3)

Unnamed: 0,name,id,minutes,contributor_id,submitted,tags,nutrition,n_steps,steps,description,ingredients,n_ingredients
0,arriba baked winter squash mexican style,137739,55,47892,2005-09-16,"[60-minutes-or-less, time-to-make, course, mai...","[51.5, 0.0, 13.0, 0.0, 2.0, 0.0, 4.0]",11,"[make a choice and proceed with recipe, depend...",autumn is my favorite time of year to cook! th...,"[winter squash, mexican seasoning, mixed spice...",7
1,a bit different breakfast pizza,31490,30,26278,2002-06-17,"[30-minutes-or-less, time-to-make, course, mai...","[173.4, 18.0, 0.0, 17.0, 22.0, 35.0, 1.0]",9,"[preheat oven to 425 degrees f, press dough in...",this recipe calls for the crust to be prebaked...,"[prepared pizza crust, sausage patty, eggs, mi...",6
2,all in the kitchen chili,112140,130,196586,2005-02-25,"[time-to-make, course, preparation, main-dish,...","[269.8, 22.0, 32.0, 48.0, 39.0, 27.0, 5.0]",6,"[brown ground beef in large pot, add chopped o...",this modified version of 'mom's' chili was a h...,"[ground beef, yellow onions, diced tomatoes, t...",13


In [5]:
df_recipes_raw.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 231637 entries, 0 to 231636
Data columns (total 12 columns):
 #   Column          Non-Null Count   Dtype 
---  ------          --------------   ----- 
 0   name            231636 non-null  object
 1   id              231637 non-null  int64 
 2   minutes         231637 non-null  int64 
 3   contributor_id  231637 non-null  int64 
 4   submitted       231637 non-null  object
 5   tags            231637 non-null  object
 6   nutrition       231637 non-null  object
 7   n_steps         231637 non-null  int64 
 8   steps           231637 non-null  object
 9   description     226658 non-null  object
 10  ingredients     231637 non-null  object
 11  n_ingredients   231637 non-null  int64 
dtypes: int64(5), object(7)
memory usage: 21.2+ MB


In [6]:
df_recipes_cod.head(3)

Unnamed: 0,id,i,name_tokens,ingredient_tokens,steps_tokens,techniques,calorie_level,ingredient_ids
0,424415,23,"[40480, 37229, 2911, 1019, 249, 6878, 6878, 28...","[[2911, 1019, 249, 6878], [1353], [6953], [153...","[40480, 40482, 21662, 481, 6878, 500, 246, 161...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, ...",0,"[389, 7655, 6270, 1527, 3406]"
1,146223,96900,"[40480, 18376, 7056, 246, 1531, 2032, 40481]","[[17918], [25916], [2507, 6444], [8467, 1179],...","[40480, 40482, 729, 2525, 10906, 485, 43, 8393...","[1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, ...",0,"[2683, 4969, 800, 5298, 840, 2499, 6632, 7022,..."
2,312329,120056,"[40480, 21044, 16954, 8294, 556, 10837, 40481]","[[5867, 24176], [1353], [6953], [1301, 11332],...","[40480, 40482, 8240, 481, 24176, 296, 1353, 66...","[0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, ...",1,"[1257, 7655, 6270, 590, 5024, 1119, 4883, 6696..."


In [7]:
df_recipes_cod.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 178265 entries, 0 to 178264
Data columns (total 8 columns):
 #   Column             Non-Null Count   Dtype 
---  ------             --------------   ----- 
 0   id                 178265 non-null  int64 
 1   i                  178265 non-null  int64 
 2   name_tokens        178265 non-null  object
 3   ingredient_tokens  178265 non-null  object
 4   steps_tokens       178265 non-null  object
 5   techniques         178265 non-null  object
 6   calorie_level      178265 non-null  int64 
 7   ingredient_ids     178265 non-null  object
dtypes: int64(3), object(5)
memory usage: 10.9+ MB


#### Checking the readiness for join

In [8]:
df_recipes_raw[df_recipes_raw.id==424415]

Unnamed: 0,name,id,minutes,contributor_id,submitted,tags,nutrition,n_steps,steps,description,ingredients,n_ingredients
8586,aromatic basmati rice rice cooker,424415,61,496803,2010-05-10,"[weeknight, time-to-make, course, main-ingredi...","[228.2, 2.0, 2.0, 8.0, 9.0, 1.0, 15.0]",6,"[rinse the rice in a fine strainer , then drai...",from the ultimate rice cooker cookbook. the a...,"[basmati rice, water, salt, cinnamon stick, gr...",5


In [9]:
df_recipes_cod[df_recipes_cod.id==424415]

Unnamed: 0,id,i,name_tokens,ingredient_tokens,steps_tokens,techniques,calorie_level,ingredient_ids
0,424415,23,"[40480, 37229, 2911, 1019, 249, 6878, 6878, 28...","[[2911, 1019, 249, 6878], [1353], [6953], [153...","[40480, 40482, 21662, 481, 6878, 500, 246, 161...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, ...",0,"[389, 7655, 6270, 1527, 3406]"


In [10]:
df_recipes = pd.merge(df_recipes_cod, df_recipes_raw, on='id')
df_recipes.set_index(["id"], drop=True, inplace=True)

df_recipes.drop(['contributor_id', 
                 'submitted', 
                 'n_steps', 
                 'description', 
                 'tags', 
                 'n_ingredients', 
                 'i'], axis=1, inplace=True)

df_recipes = df_recipes[['name', 
                         #'name_tokens', 
                         'ingredients',
                         #'ingredient_tokens',
                         #'ingredient_ids',
                         #'steps',
                         #'steps_tokens',
                         #'techniques',
                         'calorie_level',
                         'minutes',
                         #'nutrition',
                        ]]

df_recipes.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 178265 entries, 424415 to 263840
Data columns (total 4 columns):
 #   Column         Non-Null Count   Dtype 
---  ------         --------------   ----- 
 0   name           178265 non-null  object
 1   ingredients    178265 non-null  object
 2   calorie_level  178265 non-null  int64 
 3   minutes        178265 non-null  int64 
dtypes: int64(2), object(2)
memory usage: 6.8+ MB


In [11]:
df_recipes.sort_index(inplace=True)
df_recipes.head(10)

Unnamed: 0_level_0,name,ingredients,calorie_level,minutes
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
38,low fat berry blue frozen dessert,"[blueberries, granulated sugar, vanilla yogurt...",0,1485
40,best lemonade,"[sugar, lemons, rind of, fresh water, fresh le...",1,35
45,buttermilk pie with gingersnap crumb crust,"[sugar, margarine, egg, egg whites, flour, sal...",0,80
46,a jad cucumber pickle,"[rice vinegar, thangkwa, hom daeng, nam som pa...",0,25
49,chicken breasts lombardi,"[fresh mushrooms, butter, boneless skinless ch...",2,75
52,cafe cappuccino,"[instant coffee, sugar, nonfat dry milk solid,...",0,5
53,jimmy g s carrot cake,"[all-purpose flour, sugar, baking powder, baki...",1,110
58,low fat burgundy beef vegetable stew,"[beef eye round, vegetable oil, dried thyme le...",1,164
59,lou s fabulous bruschetta,"[french baguette, butter, garlic powder, ricot...",0,40
62,black bean corn and tomato salad,"[fresh lemon juice, olive oil, black beans, fr...",1,25


#### Opening a dataframe for conversion of ingredients into simpler forms

In [12]:
with open('../data/pickles/ingr_map.pkl', 'rb') as f:
    df_ingredients = pickle.load(f)

df_ingredients.replace({'olife': 'olives'}, regex=True, inplace=True)
df_ingredients.replace({'olive': 'olives'}, regex=True, inplace=True)
#df_ingredients[df_ingredients.replaced.str.contains('oliv')]

df_ingredients.head()

Unnamed: 0,raw_ingr,raw_words,processed,len_proc,replaced,count,id
0,"medium heads bibb or red leaf lettuce, washed,...",13,"medium heads bibb or red leaf lettuce, washed,...",73,lettuce,4507,4308
1,mixed baby lettuces and spring greens,6,mixed baby lettuces and spring green,36,lettuce,4507,4308
2,romaine lettuce leaf,3,romaine lettuce leaf,20,lettuce,4507,4308
3,iceberg lettuce leaf,3,iceberg lettuce leaf,20,lettuce,4507,4308
4,red romaine lettuce,3,red romaine lettuce,19,lettuce,4507,4308


#### Creating a dictionaire for ingredients

In [13]:
simple_ings = {}
for idx1 in df_ingredients.index:
    simple_ings[df_ingredients.loc[idx1, 'raw_ingr']] = df_ingredients.loc[idx1, 'replaced'] 
len(simple_ings)

11659

In [14]:
simple_ings['chicken broth']

'chicken broth'

Converting the recipes' ingredients to the simpler forms

In [15]:
simpler_ingredients = []
unmapped_ingredients = []
for idx1 in df_recipes.index:
    ingredients = []
    for ingredient in df_recipes.loc[idx1, 'ingredients']:
        try:
            #print(ingredient)
            ing = simple_ings[ingredient]
            ingredients.append(ing)
        except:
            ingredients.append(ingredient)
            unmapped_ingredients.append(ingredient)
    simpler_ingredients.append(ingredients)
print(len(simpler_ingredients))
print(len(unmapped_ingredients))

178265
38346


In [16]:
df_recipes['simpler_ingredients'] = simpler_ingredients
df_recipes.columns = df_recipes.columns
new_order = list(df_recipes.columns)
new_order.insert(3,new_order.pop())
df_recipes = df_recipes[new_order]
df_recipes.head()

Unnamed: 0_level_0,name,ingredients,calorie_level,simpler_ingredients,minutes
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
38,low fat berry blue frozen dessert,"[blueberries, granulated sugar, vanilla yogurt...",0,"[blueberry, granulated sugar, vanilla yogurt, ...",1485
40,best lemonade,"[sugar, lemons, rind of, fresh water, fresh le...",1,"[sugar, lemons, rind of, fresh water, fresh le...",35
45,buttermilk pie with gingersnap crumb crust,"[sugar, margarine, egg, egg whites, flour, sal...",0,"[sugar, margarine, egg, egg white, flmy, salt,...",80
46,a jad cucumber pickle,"[rice vinegar, thangkwa, hom daeng, nam som pa...",0,"[rice vinegar, thangkwa, hom daeng, nam som pa...",25
49,chicken breasts lombardi,"[fresh mushrooms, butter, boneless skinless ch...",2,"[fresh mushroom, butter, chicken breast half, ...",75


In [17]:
with gzip.open('../data/pickles/df_kaggle_food.pkl.gzip', 'wb') as f:
    pickle.dump(df_recipes, f)

## [Food Data from Yummly I](https://www.kaggle.com/c/whats-cooking-kernels-only/data)

39774 recipes, ingredients and cuisine (no recipe name)

In [18]:
with open('../data/yummly/train.json') as data:
    dishes = pd.read_json(data)

In [19]:
dishes.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 39774 entries, 0 to 39773
Data columns (total 3 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   id           39774 non-null  int64 
 1   cuisine      39774 non-null  object
 2   ingredients  39774 non-null  object
dtypes: int64(1), object(2)
memory usage: 932.3+ KB


In [20]:
print("There are {0} total recipes.".format(len(dishes['id'].unique())))
dishes.head(10)

There are 39774 total recipes.


Unnamed: 0,id,cuisine,ingredients
0,10259,greek,"[romaine lettuce, black olives, grape tomatoes..."
1,25693,southern_us,"[plain flour, ground pepper, salt, tomatoes, g..."
2,20130,filipino,"[eggs, pepper, salt, mayonaise, cooking oil, g..."
3,22213,indian,"[water, vegetable oil, wheat, salt]"
4,13162,indian,"[black pepper, shallots, cornflour, cayenne pe..."
5,6602,jamaican,"[plain flour, sugar, butter, eggs, fresh ginge..."
6,42779,spanish,"[olive oil, salt, medium shrimp, pepper, garli..."
7,3735,italian,"[sugar, pistachio nuts, white almond bark, flo..."
8,16903,mexican,"[olive oil, purple onion, fresh pineapple, por..."
9,12734,italian,"[chopped tomatoes, fresh basil, garlic, extra-..."


In [21]:
simpler_ingredients = []
unmapped_ingredients = []
for idx1 in dishes.index:
    ingredients = []
    for ingredient in dishes.loc[idx1, 'ingredients']:
        try:
            #print(ingredient)
            ing = simple_ings[ingredient]
            ingredients.append(ing)
        except:
            ingredients.append(ingredient)
            unmapped_ingredients.append(ingredient)
    simpler_ingredients.append(ingredients)
print(len(simpler_ingredients))
print(len(unmapped_ingredients))
dishes['simpler_ingredients'] = simpler_ingredients

39774
78107


In [22]:
print("There are {0} total cuisines available.".format(len(dishes['cuisine'].unique())))
dishes['cuisine'].value_counts()

There are 20 total cuisines available.


italian         7838
mexican         6438
southern_us     4320
indian          3003
chinese         2673
french          2646
cajun_creole    1546
thai            1539
japanese        1423
greek           1175
spanish          989
korean           830
vietnamese       825
moroccan         821
british          804
filipino         755
irish            667
jamaican         526
russian          489
brazilian        467
Name: cuisine, dtype: int64

In [23]:
dishes.explode('ingredients').head(10)

Unnamed: 0,id,cuisine,ingredients,simpler_ingredients
0,10259,greek,romaine lettuce,"[lettuce, black olives, grape tomato, garlic, ..."
0,10259,greek,black olives,"[lettuce, black olives, grape tomato, garlic, ..."
0,10259,greek,grape tomatoes,"[lettuce, black olives, grape tomato, garlic, ..."
0,10259,greek,garlic,"[lettuce, black olives, grape tomato, garlic, ..."
0,10259,greek,pepper,"[lettuce, black olives, grape tomato, garlic, ..."
0,10259,greek,purple onion,"[lettuce, black olives, grape tomato, garlic, ..."
0,10259,greek,seasoning,"[lettuce, black olives, grape tomato, garlic, ..."
0,10259,greek,garbanzo beans,"[lettuce, black olives, grape tomato, garlic, ..."
0,10259,greek,feta cheese crumbles,"[lettuce, black olives, grape tomato, garlic, ..."
1,25693,southern_us,plain flour,"[plain flmy, ground pepper, salt, tomato, grou..."


## [Food Data from Yummly II](https://alioben.github.io/yummly/)

16781 recipes, names and cuisine

In [24]:
#archive = zipfile.ZipFile('../data/yummly/recipes.zip', 'r')
#recipe = archive.open('recipes/1-2-3-Jambalaya-1440919.json')

#for f in archive.infolist()[1:]:
#    print(f)
#    recipe = archive.open(f)
#    break

In [25]:
sys.setrecursionlimit(10**6) 

In [26]:
with gzip.open('../data/pickles/yummly.pkl.gzip', 'rb') as f:
    df_yummly = pickle.load(f)

In [28]:
df_yummly.head()

Unnamed: 0,flavors,id,ingredients,recipeName,cuisine,PrepTime,img
0,,Healthier-Red-Thai-Chicken-Curry---without-the...,"[olive oil, chicken breasts, chillies, ginger,...",Healthier Red Thai Chicken Curry - without the...,thai,40.0,https://lh3.googleusercontent.com/-Z0P2wWVQZb1...
1,"{'piquant': 0.16666666666666666, 'meaty': 0.16...",20-Minute-Thai-Basil-Beef-and-Lemongrass-Rice-...,"[jasmine rice, coconut milk, water, toasted se...",20 Minute Thai Basil Beef and Lemongrass Rice ...,thai,20.0,https://lh3.googleusercontent.com/VYPE5CRG5_CM...
2,,Thai-Pumpkin-Laksa-with-Crunchy-_Fried_-Chickp...,"[sesame oil, garlic, fresh ginger, seeds, gree...","Thai Pumpkin Laksa with Crunchy ""Fried"" Chickp...",thai,60.0,https://lh3.googleusercontent.com/VBifCtNkf59X...
3,"{'piquant': 0.5, 'meaty': 0.6666666666666666, ...",Thai-Basil-Chicken-1927402,"[vegetable oil, sriracha sauce, crushed red pe...",Thai Basil Chicken,thai,18.0,https://lh3.googleusercontent.com/mB505KCWSH7h...
4,,Skinny-Thai-Chicken-Meatballs-with-Peanut-Sauc...,"[ground chicken, green onions, chopped cilantr...",Skinny Thai Chicken Meatballs with Peanut Sauce,thai,45.0,https://lh3.googleusercontent.com/QNBwHqVOqikz...


In [29]:
df_yummly.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 16781 entries, 0 to 16780
Data columns (total 7 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   flavors      11680 non-null  object 
 1   id           16781 non-null  object 
 2   ingredients  16781 non-null  object 
 3   recipeName   16781 non-null  object 
 4   cuisine      16781 non-null  object 
 5   PrepTime     16781 non-null  float64
 6   img          16781 non-null  object 
dtypes: float64(1), object(6)
memory usage: 1.0+ MB


In [30]:
df_yummly.drop('img', axis=1, inplace=True)
df_yummly.head()

Unnamed: 0,flavors,id,ingredients,recipeName,cuisine,PrepTime
0,,Healthier-Red-Thai-Chicken-Curry---without-the...,"[olive oil, chicken breasts, chillies, ginger,...",Healthier Red Thai Chicken Curry - without the...,thai,40.0
1,"{'piquant': 0.16666666666666666, 'meaty': 0.16...",20-Minute-Thai-Basil-Beef-and-Lemongrass-Rice-...,"[jasmine rice, coconut milk, water, toasted se...",20 Minute Thai Basil Beef and Lemongrass Rice ...,thai,20.0
2,,Thai-Pumpkin-Laksa-with-Crunchy-_Fried_-Chickp...,"[sesame oil, garlic, fresh ginger, seeds, gree...","Thai Pumpkin Laksa with Crunchy ""Fried"" Chickp...",thai,60.0
3,"{'piquant': 0.5, 'meaty': 0.6666666666666666, ...",Thai-Basil-Chicken-1927402,"[vegetable oil, sriracha sauce, crushed red pe...",Thai Basil Chicken,thai,18.0
4,,Skinny-Thai-Chicken-Meatballs-with-Peanut-Sauc...,"[ground chicken, green onions, chopped cilantr...",Skinny Thai Chicken Meatballs with Peanut Sauce,thai,45.0


In [31]:
#cleaning ingredients

simpler_ingredients = []
unmapped_ingredients = []
for idx1 in df_yummly.index:
    ingredients = []
    for ingredient in df_yummly.loc[idx1, 'ingredients']:
        try:
            #print(ingredient)
            ing = simple_ings[ingredient]
            ingredients.append(ing)
        except:
            ingredients.append(ingredient)
            unmapped_ingredients.append(ingredient)
    simpler_ingredients.append(ingredients)
print(len(simpler_ingredients))
print(len(unmapped_ingredients))
df_yummly['simpler_ingredients'] = simpler_ingredients

16781
33791


In [32]:
df_yummly.head()

Unnamed: 0,flavors,id,ingredients,recipeName,cuisine,PrepTime,simpler_ingredients
0,,Healthier-Red-Thai-Chicken-Curry---without-the...,"[olive oil, chicken breasts, chillies, ginger,...",Healthier Red Thai Chicken Curry - without the...,thai,40.0,"[olive oil, chicken breast, chillies, ginger, ..."
1,"{'piquant': 0.16666666666666666, 'meaty': 0.16...",20-Minute-Thai-Basil-Beef-and-Lemongrass-Rice-...,"[jasmine rice, coconut milk, water, toasted se...",20 Minute Thai Basil Beef and Lemongrass Rice ...,thai,20.0,"[jasmine rice, coconut milk, water, toasted se..."
2,,Thai-Pumpkin-Laksa-with-Crunchy-_Fried_-Chickp...,"[sesame oil, garlic, fresh ginger, seeds, gree...","Thai Pumpkin Laksa with Crunchy ""Fried"" Chickp...",thai,60.0,"[sesame oil, garlic, fresh ginger, seed, scall..."
3,"{'piquant': 0.5, 'meaty': 0.6666666666666666, ...",Thai-Basil-Chicken-1927402,"[vegetable oil, sriracha sauce, crushed red pe...",Thai Basil Chicken,thai,18.0,"[vegetable oil, sriracha sauce, crushed red pe..."
4,,Skinny-Thai-Chicken-Meatballs-with-Peanut-Sauc...,"[ground chicken, green onions, chopped cilantr...",Skinny Thai Chicken Meatballs with Peanut Sauce,thai,45.0,"[ground chicken, scallion, chopped cilantro, f..."


In [33]:
with gzip.open('../data/pickles/df_yummly.pkl.gzip', 'wb') as f:
    pickle.dump(df_yummly, f)