In [1]:
import numpy as np
import pandas as pd
import re
import matplotlib.pyplot as plt
from scipy.spatial.distance import pdist, squareform
from src.data_processing import get_recipe_df, mean_scale_recipes, drop_uncommon_ingreds
from pymongo import MongoClient
from src.recipe_distance import *

In [2]:
# Open a MongoDB connection using pymongo's default connection settings,
# then select the `recipes` database and its `eda_cookies` collection.
client = MongoClient()
db = client.recipes
coll = db['eda_cookies']

# Case-insensitive pattern for the substring "cookie"; passed to the recipe
# loader in the next cell.
cookie_regx = re.compile("cookie", flags=re.I)

In [3]:
# Build the raw recipe frame from the Mongo collection using the project
# helper, handing it the collection and the compiled "cookie" pattern.
# NOTE(review): presumably the pattern selects which recipes are loaded --
# confirm against src.data_processing.get_recipe_df.
df_cookies = get_recipe_df(coll, cookie_regx)

In [40]:
# Rescale the raw recipe frame with the project helper mean_scale_recipes.
# NOTE(review): the exact scaling rule lives in src.data_processing and is
# not visible here -- confirm before relying on the value ranges.
df_scaled = mean_scale_recipes(df_cookies)

In [43]:
# Drop rarely used ingredient columns from the scaled frame. The recorded
# shapes below show the column count falling from 245 to 102 while the row
# count stays at 2488.
# NOTE(review): the meaning of the threshold 20 (e.g. minimum number of
# recipes) is defined in src.data_processing -- confirm there.
df_reduced = drop_uncommon_ingreds(df_scaled, 20)

In [44]:
# Display (rows, columns) of the reduced frame.
df_reduced.shape

(2488, 102)

In [45]:
# Display (rows, columns) of the scaled frame before columns were dropped,
# for comparison with df_reduced.
df_scaled.shape

(2488, 245)

In [34]:
# Boolean mask of the reduced frame: True wherever the stored value is
# strictly greater than zero.
df_binary = df_reduced.gt(0)

In [36]:
# Preview the first five rows of the boolean mask, transposed so that each
# column of the mask becomes a row of the display.
df_binary.head(5).transpose()

url,http://www.seriouseats.com/recipes/2010/02/the-best-chocolate-chip-cookies-kumiko-recipe-20100201.html,http://www.seriouseats.com/recipes/2012/06/chocolate-chocolate-chip-cookies-recipe.html,http://www.seriouseats.com/recipes/2012/12/brownie-chocolate-chip-cookies-recipe.html,http://www.seriouseats.com/recipes/2012/12/se-swap-chocolate-chip-cookies-recipe.html,http://www.seriouseats.com/recipes/2011/12/bacon-chocolate-chip-cookies-recipe.html
allspic,False,False,False,False,False
almond,False,False,False,False,False
almond extract,False,False,False,False,False
almond flour,False,False,False,False,False
almond meal,False,False,False,False,False
appl,False,False,False,False,False
applesauc,False,False,False,False,False
apricot,False,False,False,False,False
bacon,False,False,False,False,True
bake powder,True,True,True,False,False


In [47]:
# Pairwise cosine distance between every pair of rows in the reduced frame,
# returned by scipy as a condensed 1-D vector.
pdist(df_reduced.values, metric='cosine')

array([ 0.34384383,  0.55835529,  0.3471587 , ...,  0.52642452,
        0.6312994 ,  0.53389407])

In [22]:
# `df` is the working frame for the distance experiments in the cells that
# follow: drop_uncommon_ingreds is applied to the raw frame with 30, and the
# result is passed through mean_scale_recipes.
# It is assigned here, ahead of its first use in file order, so that the
# notebook survives Restart Kernel -> Run All. The identical assignment also
# appears in a cell further down the file; on its own that cell runs too late
# for this one and raises NameError on a fresh kernel.
df = mean_scale_recipes(drop_uncommon_ingreds(df_cookies, 30))

# Composite pairwise distance from the project helper; the recorded output is
# a square, symmetric matrix with a zero diagonal.
# NOTE(review): ratio=0 presumably selects a single component of the
# composite -- confirm in src.recipe_distance.
pair_dist_composite(df, ratio=0)

array([[ 0.        ,  0.46153846,  0.46153846, ...,  0.5625    ,
         0.4       ,  0.625     ],
       [ 0.46153846,  0.        ,  0.57142857, ...,  0.46666667,
         0.58823529,  0.625     ],
       [ 0.46153846,  0.57142857,  0.        , ...,  0.64705882,
         0.58823529,  0.70588235],
       ..., 
       [ 0.5625    ,  0.46666667,  0.64705882, ...,  0.        ,
         0.41176471,  0.61111111],
       [ 0.4       ,  0.58823529,  0.58823529, ...,  0.41176471,
         0.        ,  0.55555556],
       [ 0.625     ,  0.625     ,  0.70588235, ...,  0.61111111,
         0.55555556,  0.        ]])

In [7]:
# Pairwise Jaccard distances from the project helper, computed on a boolean
# mask of `df`; the result is later combined element-wise with the condensed
# cosine vector.
# NOTE(review): the mask here is `df > 1`, whereas the presence mask built
# earlier in the notebook is `df_reduced > 0`. On mean-scaled values `> 1`
# presumably means "more than the average amount" rather than "present" --
# confirm that this threshold is intended.
jac_dist = recipe_jaccard_pairs(df>1)

In [9]:
# Display the Jaccard distance vector (numpy abbreviates long arrays).
jac_dist

array([ 0.8       ,  1.        ,  1.        , ...,  0.83333333,
        1.        ,  1.        ])

In [17]:
# Condensed vector of pairwise cosine distances between the rows of `df`,
# kept for blending with the Jaccard distances.
cos_dist = pdist(df.values, metric='cosine')

In [18]:
# Display the cosine distance vector (numpy abbreviates long arrays).
cos_dist

array([ 0.34384383,  0.55835529,  0.3471587 , ...,  0.52642452,
        0.6312994 ,  0.53389407])

In [58]:
# Equal-weight blend of the two distance measures: add the condensed Jaccard
# and cosine vectors, expand the sum to a square matrix, and halve it.
0.5 * squareform(jac_dist + cos_dist)

array([[ 0.        ,  0.57192192,  0.77917765, ...,  0.64751984,
         0.67962082,  0.78512239],
       [ 0.57192192,  0.        ,  0.6862546 , ...,  0.77396493,
         0.82883922,  0.83740267],
       [ 0.77917765,  0.6862546 ,  0.        , ...,  0.78078046,
         0.65282216,  0.84704403],
       ..., 
       [ 0.64751984,  0.77396493,  0.78078046, ...,  0.        ,
         0.67987892,  0.8156497 ],
       [ 0.67962082,  0.82883922,  0.65282216, ...,  0.67987892,
         0.        ,  0.76694703],
       [ 0.78512239,  0.83740267,  0.84704403, ...,  0.8156497 ,
         0.76694703,  0.        ]])

In [13]:
# Working frame for the distance experiments: drop_uncommon_ingreds is applied
# to the raw frame with 30, and the result is then mean-scaled.
# NOTE(review): `df` is read by cells that appear earlier in this file, so in
# a top-to-bottom run this cell must not be relied on as their definition of
# `df`.
# NOTE(review): this differs from `df_reduced` both in threshold (30 vs 20)
# and in order (drop-then-scale vs scale-then-drop) -- confirm which variant
# is intended.
df = mean_scale_recipes(drop_uncommon_ingreds(df_cookies, 30))

In [16]:
# Summary statistics of `df`, transposed so that each column of `df` is one
# row of the displayed table.
df.describe().transpose()

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
allspic,2488.0,0.022508,0.199072,0.0,0.000000,0.000000,0.000000,4.771277
almond,2488.0,0.071543,0.348000,0.0,0.000000,0.000000,0.000000,5.799667
almond extract,2488.0,0.036174,0.335252,0.0,0.000000,0.000000,0.000000,8.729957
appl,2488.0,0.014469,0.146928,0.0,0.000000,0.000000,0.000000,2.796536
applesauc,2488.0,0.019695,0.155483,0.0,0.000000,0.000000,0.000000,2.376559
bacon,2488.0,0.017283,0.152562,0.0,0.000000,0.000000,0.000000,2.350144
bake powder,2488.0,0.327170,0.615299,0.0,0.000000,0.000000,0.507171,5.496445
bake soda,2488.0,0.493971,0.667205,0.0,0.000000,0.000000,0.856017,4.166666
banana,2488.0,0.020096,0.166971,0.0,0.000000,0.000000,0.000000,2.895976
bittersweet chocol,2488.0,0.054260,0.269702,0.0,0.000000,0.000000,0.000000,4.435568
