In [39]:
import pandas as pd
import numpy as np
import re

In [14]:
#Download the recipe database.
!curl -O https://s3.amazonaws.com/openrecipes/20170107-061401-recipeitems.json.gz 

  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
100 29.3M  100 29.3M    0     0  4448k      0  0:00:06  0:00:06 --:--:-- 6283k


In [17]:
#Unzip the database.
!gunzip 20170107-061401-recipeitems.json.gz

In [25]:
#Read the file into a dataframe.
with open('20170107-061401-recipeitems.json', 'r', encoding="utf-8") as f:
    data = (line.strip() for line in f)
    data_json = "[{0}]".format(','.join(data))

In [26]:
#Transform the data to a dataframe from JSON format
recipes = pd.read_json(data_json)

In [50]:
#Examine the dimensions of the dataframe.
recipes.shape 

(173278, 17)

In [51]:
#Perform aggrigations on the data set.
recipes.ingredients.str.len().describe() 

count    173278.000000
mean        244.617926
std         146.705285
min           0.000000
25%         147.000000
50%         221.000000
75%         314.000000
max        9067.000000
Name: ingredients, dtype: float64

In [52]:
#Determine which recipe has the longest ingredient list.
recipes.name[np.argmax(recipes.ingredients.str.len())]

The current behaviour of 'Series.argmax' is deprecated, use 'idxmax'
instead.
The behavior of 'argmax' will be corrected to return the positional
maximum in the future. For now, use 'series.values.argmax' or
'np.argmax(np.array(values))' to get the position of the maximum
row.
  return getattr(obj, method)(*args, **kwds)


'Carrot Pineapple Spice &amp; Brownie Layer Cake with Whipped Cream &amp; Cream Cheese Frosting and Marzipan Carrots'

In [35]:
#Determine how many of the recipes are for breakfast food.
recipes.description.str.contains('[Bb]reakfast').sum()

3524

In [36]:
#Determine how many of the recipes contain cinnamon.
recipes.description.str.contains('[Cc]innamon').sum()

2290

In [37]:
#Determine how many of the recipes misspelled cinnamon as an ingredient.
recipes.description.str.contains('[Cc]inamon').sum()

1

In [38]:
#Create a list of target ingredients.
spice_list = ['salt', 'pepper', 'oregano', 'sage', 'parsley',
             'rosemary', 'tarragon', 'thyme', 'paprika', 'cumin']

In [40]:
spice_df = pd.DataFrame(
dict((spice, recipes.ingredients.str.contains(spice, re.IGNORECASE))
    for spice in spice_list))

In [41]:
spice_df.head()

Unnamed: 0,salt,pepper,oregano,sage,parsley,rosemary,tarragon,thyme,paprika,cumin
0,False,False,False,True,False,False,False,False,False,False
1,False,False,False,False,False,False,False,False,False,False
2,True,True,False,False,False,False,False,False,False,True
3,False,False,False,False,False,False,False,False,False,False
4,False,False,False,False,False,False,False,False,False,False


In [47]:
selection = spice_df.query('parsley & paprika & tarragon') #Retrieve index of recipes containing target igredients.

In [49]:
recipes.name[selection.index] #Return recipe names.

2069      All cremat with a Little Gem, dandelion and wa...
74964                         Lobster with Thermidor butter
93768      Burton's Southern Fried Chicken with White Gravy
113926                     Mijo's Slow Cooker Shredded Beef
137686                     Asparagus Soup with Poached Eggs
140530                                 Fried Oyster Po’boys
158475                Lamb shank tagine with herb tabbouleh
158486                 Southern fried chicken in buttermilk
163175            Fried Chicken Sliders with Pickles + Slaw
165243                        Bar Tartine Cauliflower Salad
Name: name, dtype: object