In [1]:
import pandas as pd
import numpy as np
import re
import json
import datetime

def time_cleaner(a_string):
    raw = str(a_string).replace('PT', '')
    
    if 'DT' in raw:
        clean_preptime = np.nan
    elif 'H' in raw and 'M' in raw:
        time = re.findall(r'(\d{1,9})', raw)
        hour = int(time[0]) * 60
        minute = int(time[1])
        clean_preptime = hour + minute
    elif 'H' in raw:
        clean_preptime = int(raw.replace('H', '')) * 60
    elif 'M' in raw:
        clean_preptime = raw.replace('M', '')
    else:
        clean_preptime = np.nan

    return clean_preptime

def date_cleaner(a_string):
    if a_string == None:
        return np.nan
    return re.sub('T.*', '', str(a_string))

def ingredient_clean(ingredient_string):
    regex = re.compile(r'½|⅓|¼|\(|\)|(\d-\d) |\d|,|\/|(oz)|(cup)|(teaspoons)|(teaspoon)|(tablespoons)|(tablespoon)|(whole)|(chopped)|( and )|(ground)|( or )|(ounce)|(weight)|( to )|( can )|( of )|(tbsp)', re.I)
    ingredient_string = regex.sub('', ingredient_string)
    return ingredient_string.replace('\n','').lower()

In [2]:
cleaner = []
with open('testing.json') as data_file:
    raw_file = data_file.readlines()
    count = 1
    for x in raw_file:
        fixed = re.sub(r'( { "\$oid" : "(.*?)(" }))', str(count), x)
        fixed = re.sub(r'("ts" : { "\$date" : (\d{1,}))( }, )', '', fixed)
        count += 1
        cleaner.append(json.loads(fixed))
raw_recipes = pd.DataFrame(cleaner)

In [3]:
raw_recipes['cookTime'] = [time_cleaner(count) for count in raw_recipes['cookTime']]
raw_recipes['totalTime'] = [time_cleaner(count) for count in raw_recipes['totalTime']]
raw_recipes['prepTime'] = [time_cleaner(count) for count in raw_recipes['prepTime']]
raw_recipes['datePublished']= [date_cleaner(date) for date in raw_recipes['datePublished']]
raw_recipes['ingredients'] = [ingredient_clean(ingredient) for ingredient in raw_recipes['ingredients']]
raw_recipes = raw_recipes.drop(['creator', 'dateModified'], 1)
# For full file, add 'recipeInstructions' to above drop list

In [4]:
from collections import Counter
import matplotlib.pyplot as plt

giant_string = ''
for ingredient in raw_recipes['ingredients']:
    giant_string = giant_string + ingredient

giant_string = giant_string.split(' ')
word_counts = dict(Counter(giant_string))
raw_recipes.head(2)

Unnamed: 0,_id,cookTime,datePublished,description,image,ingredients,name,prepTime,recipeCategory,recipeYield,source,totalTime,url
0,1,30,2013-03-11,"Late Saturday afternoon, after Marlboro Man ha...",http://static.thepioneerwoman.com/cooking/file...,biscuits s all-purpose flour baking powder s...,Drop Biscuits and Sausage Gravy,10,,12,thepioneerwoman,,http://thepioneerwoman.com/cooking/2013/03/dro...
1,2,20,2013-03-13,"When I was growing up, I participated in my Ep...",http://static.thepioneerwoman.com/cooking/file...,dinner rollssmall sandwich buns i used whea...,Hot Roast Beef Sandwiches,20,,12,thepioneerwoman,,http://thepioneerwoman.com/cooking/2013/03/hot...


In [53]:
fc_indr_raw = pd.read_csv('master_fc_ing.csv')
fc_indr_raw['ingredient_name'] = [ingredient.replace('_', ' ') for ingredient in fc_indr_raw['ingredient_name']]
fc_indr_raw.head()

Unnamed: 0,ingredient_id,ingredient_name,ingredient_category,compound_id,compound_name,cas_number
0,1392,abies alba,plant,906,bornyl_acetate,76-49-3
1,1259,abies alba pine needle,plant,861,maltol,118-71-8
2,1079,abies balsamea oil,plant derivative,673,myrcene,123-35-3
3,22,abies canadensis,plant,906,bornyl_acetate,76-49-3
4,103,abies concolor,plant,906,bornyl_acetate,76-49-3


In [48]:
fc_ingr_only = fc_indr_raw[['ingredient_name','compound_id']].set_index('ingredient_name')
fc_ingr_only_dict = fc_ingr_only['compound_id'].to_dict()
fc_ingr_only_dict

{'orthodon tenuicaule oil': 731,
 'welsh onion': 472,
 'raw leek': 118,
 'ashanti pepper': 1021,
 'acacia farnesiana': 94,
 'pomelo peel': 44,
 'european cranberry': 555,
 'ximenia aegyptiaca': 995,
 'haddock': 961,
 'cereal': 836,
 'thuja standishii': 715,
 'bitter orange oil': 1045,
 'helichrysum italicum': 442,
 'calamus': 554,
 'populus balsamifera': 681,
 'sperm whale oil': 811,
 'sudachi': 78,
 'satsuma mandarin peel oil': 28,
 'citrus fruit blossom': 522,
 'raw pea': 555,
 'cinnamon leaf oil': 165,
 'fermented tea': 556,
 'phellodendron japonicum': 673,
 'concord grape': 554,
 'brewed tea': 556,
 'capsicum': 620,
 'calytrix tetragona oil': 1088,
 'aconitum napellus': 797,
 'cypress': 1078,
 'shiitake': 556,
 'clary sage': 554,
 'myrrh oil': 1022,
 'american storax': 347,
 'jasminum odoratissimum oil': 912,
 'pork liver': 960,
 'butter': 556,
 'bergamot': 1015,
 'bog blueberry': 554,
 'cinnamon oil': 934,
 'congo geranium': 1040,
 'artemisia santolinifolia oil': 767,
 'crab': 108

In [43]:
recipe_dict = raw_recipes[['_id', 'ingredients']].set_index('_id')
recipe_dict = recipe_dict['ingredients'].to_dict()
recipe_dict

{1: 'biscuits s all-purpose flour  baking powder  salt- stick   cold butter cut into pieces-  butermilk sausage gravy pound breakfast sausage hotmild  all-purpose flour s  milk  seasoned salt  black pepper moretaste',
 2: '  dinner rollssmall sandwich buns i used  wheat pound thinly shaved roast beefham or both! pound cheese provolone swiss mzarella even cheez whiz!  mayonnaise  grated onion or   dried onion flakes  poppy seeds  spicy mustard  horseradish mayostraight prepared horseradish dashworcestershire optional dressing ingredients: sriracha hot sauce dried onion flakes insteadfresh garlic powder pepper etc.',
 3: 'dressing:  cumin seeds    ml extra virgin olive oil  fresh lemon juice  honey  fine sea salt plus moretaste  cayenne pepper s carrots shredded on a box gratersliced whisper thin on a mandolin s cooked chickpeas or one -  can drainedrinsed    g  dried pluots plumsdates cut into chickpea-sized pieces    g fresh mint tornfor serving: lotstoasted almond slices driedfresh ro

In [None]:
recipe_fc_list = []
for recipe_id, recipe in recipe_dict.items():
    for fc_ingredient, fc_id in fc_ingr_only_dict.items():
        if fc_ingredient in recipe:
            recipe_fc_list.extend((recipe_id,fc_id))
recipe_fc_dict

orthodon tenuicaule oil
welsh onion
raw leek
ashanti pepper
acacia farnesiana
pomelo peel
european cranberry
ximenia aegyptiaca
haddock
cereal
thuja standishii
bitter orange oil
helichrysum italicum
calamus
populus balsamifera
sperm whale oil
sudachi
satsuma mandarin peel oil
citrus fruit blossom
raw pea
cinnamon leaf oil
fermented tea
phellodendron japonicum
concord grape
brewed tea
capsicum
calytrix tetragona oil
aconitum napellus
cypress
shiitake
clary sage
myrrh oil
american storax
jasminum odoratissimum oil
pork liver
butter
bergamot
bog blueberry
cinnamon oil
congo geranium
artemisia santolinifolia oil
crab
orange tree
sassafras root oil
seal
monodora grandiflora
roasted sesame seed
spearmint
cuttlefish
violet flower oil
boiled chicken
mountain papaya
eucalyptus microcorys leaf oil
nemuaron humboldtii oil
valerian
tobacco oil
chestnut honey
choke cherry
lingonberry juice
hinoki oil
chinese star anise oil
michelia champaca oil
apricot kernel
whale
labiatae
rose
carrot
cigarette
li