In [1]:
import json
# Read the sample recipe collection from the JSON file next to this notebook.
with open('recipedata_sample.json') as recipe_file:
    data = json.load(recipe_file)

In [2]:
# One '#'-separated string per recipe, built from the lower-cased element at
# index 2 of every `individualIngredient` entry (presumably the ingredient
# name -- confirm against the JSON schema). Empty strings are skipped.
recipes = [
    "#".join(
        name
        for name in (entry[2].lower() for entry in recipe['individualIngredient'])
        if name != ''
    )
    for recipe in data
]

In [3]:
import pandas as pd

from sklearn.feature_extraction.text import TfidfVectorizer

# Tokens are maximal runs of non-'#' characters, so a multi-word ingredient
# such as "brown sugar" stays a single feature. min_df=2 discards ingredients
# that occur in fewer than two recipes.
vec = TfidfVectorizer(token_pattern="[^#]+", min_df=2)
# fit_transform learns the vocabulary and builds the TF-IDF matrix in one pass
# (equivalent to fit(...).transform(...), without tokenizing the corpus twice).
matrix = vec.fit_transform(recipes)

# NOTE(review): scikit-learn >= 1.0 provides get_feature_names_out() and 1.2
# removed get_feature_names(); switch when the environment is upgraded.
feature_names = vec.get_feature_names()

# toarray() yields a plain ndarray; todense() would return the legacy
# np.matrix type. This also matches how the frame is rebuilt later on.
df = pd.DataFrame(matrix.toarray(), columns=feature_names)

len(feature_names)

604

In [4]:
# Total TF-IDF weight of each ingredient (column) over all recipes, largest first.
df.sum(axis=0).sort_values(ascending=False)

sugar                                            61.959728
salt                                             51.698832
flour                                            47.950488
milk                                             43.864452
water                                            35.638641
butter                                           34.007765
vanilla                                          30.026534
baking powder                                    25.987423
brown sugar                                      21.079546
baking soda                                      20.127948
margarine                                        19.381579
sour cream                                       19.029477
pepper                                           18.582552
oil                                              18.460444
vinegar                                          16.829814
cinnamon                                         16.814078
chopped onion                                    16.7978

In [7]:
import numpy as np

In [40]:
# Rebuild the recipe-by-ingredient TF-IDF table, then turn every zero weight
# into NaN so that "ingredient not used" is treated as missing, not as 0.
df = pd.DataFrame(matrix.toarray(), columns=vec.get_feature_names())
df = df.mask(df == 0.0)
df.head(10)

Unnamed: 0,+,a.,accent,all-purpose flour,allspice,almond extract,almond flavoring,apples,applesauce,bacon,...,whole milk,whole wheat flour,wine vinegar,worcestershire,worcestershire sauce,yeast,yellow cake mix,yellow squash,yolks,zucchini
0,,,,,,,,,,,...,,,,,,,,,,
1,,,,,,,,,,,...,,,,,,,,,,
2,,,,,,,,,,,...,,,,,,,,,,
3,,,,,,,,,,,...,,,,,,,,,,
4,,,,,,,,,,,...,,,,,,,,,,
5,,,,0.305554,,,,,,,...,,,,,,,,,,
6,,,,,,,,,,,...,,,,,,,,,,
7,,,,,,,,,,,...,,,,,,,,,,
8,,,,,,,,,,,...,,,,,,,,,,
9,,,,0.345973,,,,,,,...,,,,,,,,,,


In [75]:
# Keep only the ingredient columns that are non-null in at least 10 recipes.
df = df.loc[:, df.count() >= 10]

In [76]:
# Centre every recipe (row) on the mean of its own non-missing weights.
means = df.mean(axis='columns')
normalized = df.subtract(means, axis='index')
normalized.head(10)

Unnamed: 0,all-purpose flour,baking powder,baking soda,basil,beaten,boiling water,brown sugar,butter,butter or margarine,"butter, melted",...,soda,sour cream,soy sauce,sugar,vanilla,vegetable oil,vinegar,water,white sugar,worcestershire sauce
0,,,0.095544,,,,,,,,...,,,,-0.045001,,,,,,
1,,,,,,,,0.058443,,,...,,,,-0.052742,0.052555,,,,,
2,,,0.031652,,,,,,,,...,,,,-0.090424,,,,,,
3,,,,,,,,,,,...,,,,,,,,,,
4,,,,,,,,-0.009065,,,...,,,,-0.133639,,,,,,
5,-0.009501,,-0.059911,,,,,,,,...,,,,,,,,,,
6,,,,,,,,,,,...,,,,,,,,,,
7,,,,,,,,,,,...,,,,,,,,,,
8,,,,,,,,,,,...,,,,,,,,,,
9,0.021217,,,,,,,,,,...,,,,,,,,,,


In [77]:
# Per-ingredient (column) Euclidean norm of the centred weights; missing
# entries are skipped by sum().
sqrt_sum_squares = ((normalized ** 2).sum()) ** 0.5

In [78]:
# Empty (all-NaN) ingredient-by-ingredient table that will hold the pairwise
# similarities.
ingredient_labels = normalized.columns.values
similarity = pd.DataFrame(index=ingredient_labels, columns=ingredient_labels)
similarity

Unnamed: 0,all-purpose flour,baking powder,baking soda,basil,beaten,boiling water,brown sugar,butter,butter or margarine,"butter, melted",...,soda,sour cream,soy sauce,sugar,vanilla,vegetable oil,vinegar,water,white sugar,worcestershire sauce
all-purpose flour,,,,,,,,,,,...,,,,,,,,,,
baking powder,,,,,,,,,,,...,,,,,,,,,,
baking soda,,,,,,,,,,,...,,,,,,,,,,
basil,,,,,,,,,,,...,,,,,,,,,,
beaten,,,,,,,,,,,...,,,,,,,,,,
boiling water,,,,,,,,,,,...,,,,,,,,,,
brown sugar,,,,,,,,,,,...,,,,,,,,,,
butter,,,,,,,,,,,...,,,,,,,,,,
butter or margarine,,,,,,,,,,,...,,,,,,,,,,
"butter, melted",,,,,,,,,,,...,,,,,,,,,,


In [79]:
# Pairwise cosine similarity between the ingredient columns of `normalized`.
#
# For a pair (i, j) the numerator is the sum of normalized[i] * normalized[j]
# over the recipes where both are present. Filling the missing entries with 0
# leaves that sum unchanged (a missing factor contributes nothing), so a single
# matrix product gives every numerator at once, replacing a Python loop of
# n^2 scalar .loc assignments.
filled = normalized.fillna(0.0)
numerators = filled.T.dot(filled)

# Denominator for (i, j) is norm_i * norm_j, using the precomputed column norms.
denominators = np.outer(sqrt_sum_squares, sqrt_sum_squares)

# The result is float64 (the loop version filled an object-dtype frame), with
# rows and columns labelled by the ingredient names.
similarity = numerators / denominators

similarity

Unnamed: 0,all-purpose flour,baking powder,baking soda,basil,beaten,boiling water,brown sugar,butter,butter or margarine,"butter, melted",...,soda,sour cream,soy sauce,sugar,vanilla,vegetable oil,vinegar,water,white sugar,worcestershire sauce
all-purpose flour,1,-0.0702673,0.0087587,0,0.171076,0,-0.00683675,-0.00602921,0.0448407,0.0164033,...,0.00421935,0,0,-0.0914012,-0.0315408,0.109803,0,-0.0226029,0,0
baking powder,-0.0702673,1,0.187601,0,0.158147,0,0.029566,0.0364112,0.054866,0.0306936,...,0.0165404,0.0128525,0,-0.0491739,0.139415,0.0435455,0,0.0190346,-0.0367839,0
baking soda,0.0087587,0.187601,1,0,0.119026,0.00594738,0.0738288,0.0270544,0,0.024687,...,0,0.0325603,0,-0.247788,0.0424269,0.0294174,-0.0100553,-0.00200078,-0.00521063,0
basil,0,0,0,1,0,0,0,-0.0519474,0,-0.00512936,...,0,-0.0313876,0,0,0,0,0,-0.139765,0,0
beaten,0.171076,0.158147,0.119026,0,1,0,0.00127211,0.0312455,0,0.005958,...,0.0879812,0,0,-0.187346,-0.00975809,0.145916,0.00237126,-0.00851343,0.00105264,-0.00435293
boiling water,0,0,0.00594738,0,0,1,-0.00940007,0,0.0218794,0,...,0.0999436,-0.00441147,0,-0.259037,-0.0441459,0,0,0,0,0
brown sugar,-0.00683675,0.029566,0.0738288,0,0.00127211,-0.00940007,1,-0.0183589,-0.0256829,-0.0209047,...,-0.00752524,0,-0.0439879,0.000203406,-0.073329,-0.00624919,0.0253053,0.0411754,-0.0504064,-0.0664033
butter,-0.00602921,0.0364112,0.0270544,-0.0519474,0.0312455,0,-0.0183589,1,0,-0.0181662,...,-0.00253673,0.0168395,-0.0179899,-0.0153526,0.0710367,0,0.0106821,0.0116406,-0.0513984,-0.0423945
butter or margarine,0.0448407,0.054866,0,0,0,0.0218794,-0.0256829,0,1,0,...,-7.56105e-05,0.00378393,0,-0.0974887,-0.0314051,0,0,-0.0464413,0,0.0201306
"butter, melted",0.0164033,0.0306936,0.024687,-0.00512936,0.005958,0,-0.0209047,-0.0181662,0,1,...,0,-0.00502666,0,-0.0919946,-0.0363256,0,0,0,0,0.0209599


In [87]:
def most_similar_to(ingredient, similarity_matrix=None):
    """Return the ingredient most similar to ``ingredient`` and its score.

    Parameters
    ----------
    ingredient : label
        Row/column label of the ingredient to look up.
    similarity_matrix : pandas.DataFrame, optional
        Square table of pairwise similarities, labelled identically on rows
        and columns. Defaults to the notebook-level ``similarity`` table.

    Returns
    -------
    tuple
        ``(label, score)`` for the best-scoring *other* ingredient.

    Raises
    ------
    KeyError
        If ``ingredient`` is not a label of the table.
    IndexError
        If the table holds no other ingredient.
    """
    table = similarity if similarity_matrix is None else similarity_matrix
    # Remove the ingredient itself explicitly rather than assuming it sorts
    # first: a tie with another ingredient, or a NaN self-similarity, would
    # otherwise make "position 1" the wrong entry.
    others = table[ingredient].drop(ingredient).astype(float)
    ranked = others.sort_values(ascending=False)
    # .iloc is explicitly positional; plain ranked[0] on a label-indexed Series
    # relies on a deprecated positional fallback.
    return (ranked.index[0], ranked.iloc[0])
    
# Example lookup: which ingredient is the closest match for orange juice?
most_similar_to(ingredient='orange juice')

(u'cornstarch', 0.079868757374751728)

In [122]:
# For each probe ingredient, print its similarity to cinnamon followed by its
# centred weight in recipe 0. A single pre-formatted string keeps the call
# valid on both Python 2 and Python 3 and prints the same "a b" text as the
# Python 2 statement `print a, b`.
for name in ['baking soda', 'flour', 'salt', 'sugar']:
    print("%s %s" % (similarity.loc[name, 'cinnamon'], normalized.loc[0, name]))

0.189403841089 0.0955438142395
-0.210974271408 -0.010956251813
-0.172000788156 -0.0395869430777
-0.256053626596 -0.0450006193488


In [116]:
def predict_rating_for(recipe, ingredient):
    """Score how well ``ingredient`` fits the recipe in row ``recipe``.

    The score is the sum, over the recipe's present ingredients, of the
    recipe's centred weight times that ingredient's similarity to
    ``ingredient``, divided by the sum of the recipe's absolute centred
    weights. Both tables are read from the notebook-level ``normalized`` and
    ``similarity`` frames.

    NOTE(review): textbook item-based collaborative filtering divides by the
    sum of absolute similarities instead -- confirm this denominator is
    intended.
    """
    recipe_weights = normalized.loc[recipe]
    weighted_total = (recipe_weights * similarity.loc[ingredient]).sum()
    return weighted_total / recipe_weights.abs().sum()

# Recipe (row label of `df`) to generate suggestions for.
recipe_id = 0
recipe = df.loc[recipe_id].copy()
# True where the recipe does not use the ingredient.
is_missing = recipe.isnull().values

# Ingredients the recipe already contains. print(x) with a single argument
# behaves the same on Python 2 and Python 3.
for ingredient in recipe.index[~is_missing]:
    print(ingredient)

# Score every ingredient the recipe does not use yet. Collect the scores
# first and build the frame once, instead of enlarging it one cell at a time
# with .loc (slow, and it leaves an object-dtype frame behind).
scores = {
    ingredient: predict_rating_for(recipe_id, ingredient)
    for ingredient in recipe.index[is_missing]
}
# One row labelled by the recipe id, one column per known ingredient;
# ingredients already in the recipe stay NaN, as before.
suggestions = pd.DataFrame(
    [scores], index=[recipe_id], columns=similarity.columns.values
)

suggestions.T.sort_values(recipe_id, ascending=False)

baking soda
flour
salt
sugar


Unnamed: 0,0
cinnamon,0.202731
shortening,0.159528
beaten,0.144249
chopped nuts,0.129206
oil,0.126293
baking powder,0.123522
soda,0.10814
nutmeg,0.100298
vegetable oil,0.0953001
powdered sugar,0.0828974
