In [145]:
from pymongo import MongoClient
import pandas as pd
import re
from src.feature_building import count_keyword_123grams, token_pipeline
from nltk.util import ngrams
from nltk.cluster.util import cosine_distance
from nltk.cluster.kmeans import KMeansClusterer
from sklearn.decomposition import NMF
import numpy as np
from sklearn.cluster import AgglomerativeClustering

In [45]:
client = MongoClient()
db = client['recipes']
coll = db.eda_cookies

In [20]:
regx = re.compile('cookies', re.IGNORECASE)

In [21]:
def keyword_hierarchy(curs, dir_path):
    precedence = {}
    word_counts, bigram_counts, trigram_counts = count_keyword_123grams(curs)
    with open(dir_path + 'features_1gram.txt', 'r') as f1:
        for line in f1:
            keyword = line.strip()
            precedence[keyword] = word_counts[keyword]
    bonus2 = max(precedence.values())
    with open(dir_path + 'features_2gram.txt', 'r') as f2:
        for line in f2:
            bigram = line.strip()
            precedence[bigram] = bigram_counts[bigram] + bonus2
    bonus3 = max(precedence.values())
    with open(dir_path + 'features_3gram.txt', 'r') as f3:
        for line in f3:
            trigram = line.strip()
            precedence[trigram] = trigram_counts[trigram] + bonus3
    return precedence

In [42]:
def identify_ingred(ingred_line, precedence_dict, from_url="unknow_url", verbose=False):
    tokens = token_pipeline(ingred_line['text'])
    bigrams = {" ".join(gram) for gram in ngrams(tokens,2)}
    trigrams = {" ".join(gram) for gram in ngrams(tokens,3)}
    keywords = (bigrams.union(trigrams).union(set(tokens))).intersection(precedence_dict)
    if keywords:                                                 
        best_keyword = max(keywords, key=precedence_dict.get)
        return best_keyword, ingred_line['weight']
    else:
        message = "no keyword in line: {}  [from: {} ]".format(" ".join(tokens), from_url)
        with open('unidentified_lines.log', 'w') as log_f:
            log_f.write(message + "\n")
        if verbose:
            print message                                                        

In [32]:
def get_recipe_features(recipe_dict, precedence_dict):
    tot = recipe_dict['totalWeight']
    features = {'label': recipe_dict['label']}
    for ingred_line in recipe_dict['ingredients']:
        ingred = identify_ingred(ingred_line, precedence_dict, from_url=recipe_dict['url'])
        if ingred:
            features[ingred[0]] = ingred[1]/tot
    return features

In [39]:
precedence = keyword_hierarchy(db.eda_cookies.find({"label":regx}), 'src/')

In [40]:
curs = db.eda_cookies.find({"label":regx})
data = [get_recipe_features(doc, precedence) for doc in curs]

no keyword in line: dozen cooki  [from: http://www.thekitchn.com/recipe-peanut-butter-fudge-oat-129377]
no keyword in line: ml wheat bran  [from: http://www.cookstr.com/recipes/cranberry-oatmeal-cookies-jan-main]
no keyword in line: pit prune  [from: http://www.wholefoodsmarket.com/recipe/cinnamon-walnut-oatmeal-cookies]
no keyword in line: flax  [from: http://www.seriouseats.com/recipes/2013/03/whole-grain-gluten-free-oatmeal-cookies-recipe.html]
no keyword in line: cooki  [from: https://www.tastingtable.com/entry_detail/chefs_recipes/4679/Carrot_cake_gets_a_cookie_makeover.htm]
no keyword in line: xylitol  [from: http://honestcooking.com/chocolate-peanut-butter-cookies-recipe/]
no keyword in line: rainbow sprinkl  [from: http://www.seriouseats.com/recipes/2011/10/confetti-cookies-momofuku-milk-bar-recipe.html]
no keyword in line: recip birthday cake crumb recip follow  [from: http://www.seriouseats.com/recipes/2011/10/confetti-cookies-momofuku-milk-bar-recipe.html]
no keyword in line

no keyword in line: browni base  [from: http://www.bigoven.com/recipe/fudgy-brownie-cookies/181624]
no keyword in line: cool whip thaw contain  [from: http://frugalanticsrecipes.com/2012/07/let-me-introduce-the-improv-ers-kitchen-meets-girl/]
no keyword in line: rosewat  [from: http://www.grouprecipes.com/70907/rose-crinkled-cookies.html]
no keyword in line: recip essenti biscotti american style recip follow  [from: http://www.seriouseats.com/recipes/2008/12/king-arthur-flours-pistachio-cherry-biscotti-cookie-recipe.html]
no keyword in line: heart shape cooki cutter small medium size  [from: https://food52.com/recipes/16119-sugar-spice-happy-valentine-s-day-cookies]
no keyword in line: yellow royal ice  [from: http://www.marthastewart.com/312990/spice-bee-cookies]
no keyword in line: royal ice  [from: http://www.marthastewart.com/342245/molasses-gingerbread-cookies]
no keyword in line: hard candi such as life saver prefer in sever flavor color  [from: http://www.simplyrecipes.com/recip

In [41]:
df = pd.DataFrame(data)

In [38]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2254 entries, 0 to 2253
Columns: 244 entries, agav to zucchini
dtypes: float64(243), object(1)
memory usage: 4.2+ MB


In [37]:
df.describe().T.sort_values('count',ascending=False)

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
flour,1887.0,0.233106,0.108676,0.000000,0.160294,0.226250,0.304928,0.620540
butter,1834.0,0.164052,0.083116,0.000000,0.107099,0.156540,0.207972,0.469038
salt,1766.0,0.004078,0.014729,0.000000,0.001587,0.002465,0.003829,0.315380
sugar,1697.0,0.144812,0.103746,0.000000,0.077874,0.126244,0.188508,0.995637
egg,1543.0,0.068834,0.033751,0.005601,0.048321,0.062681,0.080609,0.568599
vanilla extract,1203.0,0.005811,0.004288,0.000181,0.003317,0.004848,0.007050,0.043225
bake soda,1125.0,0.003720,0.002345,0.000290,0.002250,0.003190,0.004184,0.015365
brown sugar,1014.0,0.138619,0.072834,0.003005,0.092239,0.127797,0.174248,0.613725
bake powder,764.0,0.004410,0.002899,0.000000,0.002332,0.003571,0.005695,0.016521
cinnamon,523.0,0.006621,0.045202,0.000000,0.001549,0.002395,0.003615,0.680751


In [62]:
mostly_empty = descrip_df[descrip_df['count'] < 10].index

In [64]:
len(mostly_empty)

95

In [79]:
df_common = df.drop(mostly_empty, axis=1)r

In [81]:
df_common.drop(['label'], axis=1, inplace=True)

In [82]:
df_common.shape

(2254, 147)

In [123]:
X = df_common.fillna(0).values

In [114]:
k10means = KMeansClusterer(num_means=10,
                          distance=cosine_distance,
                          repeats=12,
                          avoid_empty_clusters=True,
                          conv_test = 1e-6)

In [124]:
k10means.cluster_vectorspace(X)



In [129]:
clus_labels = [k10means.classify_vectorspace(vec) for vec in X]

In [133]:
clusters = {n : list(df['label'][np.array(clus_labels) == n]) for n in xrange(10)}

In [136]:
clusters[0]

[u'Flourless Chocolate Chip Cookies',
 u'Totally Chocolate Chip Cookies',
 u'Chocolate Chip Cookies',
 u'Peanut Butter & Fudge Oatmeal Cookies',
 u'Cookies & Cream Ice Cream',
 u'Double Chocolate Cake Mix Cookies',
 u'Double Almond Chocolate Chip Cookies',
 u'Mocha Sandwich Cookies',
 u'No-Bake Nutella Peanut Butter Cookies',
 u'Macaroon Sandwich Cookies',
 u'Gluten-Free Tuesday: No-Bake Chocolate Orange Cookies',
 u'Butter Cookies',
 u'Chocolate-Hazelnut Sandwich Cookies Recipe 2',
 u'Chocolate Sandwich Cookies',
 u'Chocolate Chip Ice Cream Sandwich Cookies',
 u'Cookies-and-Cream Milk Shakes',
 u'Skull Cookies',
 u'Peppermint Sandwich Cookies',
 u"Frozen Cookies n' Cream Pie",
 u'White Chocolate Macadamia Cookies',
 u'Easiest Cookies Ever!',
 u'Zucchini Chocolate Chip Cookies',
 u'Rolled Barley Cookies: Homemade Christmas Gift',
 u'Spicy Pumpkin Cookies',
 u'Seed Jumble Cookies',
 u'Chocolate Almond Toffee Cookies',
 u'Layered Icebox Cookies',
 u'Coconut Cookies',
 u'Gluten-Free Brown

In [138]:
clusters[2]

[u'Chocolate Chip Cookies',
 u'Bacon Chocolate Chip Cookies',
 u'Banana Chocolate Chip Cookies',
 u'Chocolate Chip Cookies',
 u'Chocolate chip cookies',
 u'Chocolate chip cookies',
 u'Coconut Chocolate Chip Cookies',
 u'Ultimate chocolate chip cookies',
 u'Deconstructed Chocolate Chip Cookies',
 u'Chocolate Chip Cookies',
 u'Lemon Chocolate Chip Cookies',
 u'Chocolate Chocolate Chip Cookies',
 u'Coffee Chocolate Chip Cookies',
 u'Chocolate Chip Cookies',
 u'Coconut Chocolate Chip Cookies',
 u'Mocha Chocolate Chip Cookies',
 u'Cakey Chocolate Chip Cookies',
 u'Chocolate Chip Cookies',
 u'Double Chocolate-Chip Cookies',
 u'Oatmeal Cookies With Candied Ginger And Walnuts',
 u"Naptime's Chocolate-Chip Oatmeal Cookies",
 u'Oatmeal Cookies with Dried Apricots and White Chocolate',
 u'Iced oatmeal cookies',
 u'Dark Chocolate Oatmeal Cookies',
 u'Delicious Oatmeal Cookies',
 u'Chewy Oatmeal Cookies with Maple Sugar and Drizzled Maple Icing',
 u'Chocolate Shortbread Sandwich Cookies',
 u'Double

In [139]:
k24means = KMeansClusterer(num_means=24, distance=cosine_distance, repeats=100)

In [140]:
k24means.cluster_vectorspace(X)

In [143]:
clus_labels = [k24means.classify_vectorspace(vec) for vec in X]
clusters = {n : list(df['label'][np.array(clus_labels) == n]) for n in xrange(10)}

In [144]:
for clus, labels in clusters.iteritems():
    print "Cluster #{}".format(clus)
    print labels[:20]

Cluster #0
[u'Chocolate Chip Cookies Recipe', u'Cranberry Oatmeal Cookies', u'Confetti Cookies', u'Sugar Cookies', u'Salt-and-Pepper Sugar Cookies', u'Brown Sugar Cookies', u'Vanilla Sugar Cookies Recipe', u'Gluten-Free Tuesday: Shortbread Cookies', u'Shortbread Cookies', u'Vanilla Shortbread Cookies', u'Whole-wheat Shortbread Cookies', u'Vanilla Shortbread Cookies', u'Rose Water Shortbread Cookies', u'Pistachio Shortbread Cookies', u'Cherry-Almond Shortbread Cookies', u'Cranberry Pecan Shortbread Cookies', u'Emmer Shortbread Cookies', u'Cardamom-Semolina Shortbread Cookies Recipe', u'Lemon Cookies', u'Gluten-Free Tuesday: Lemon Cookies']
Cluster #1
[u'Austrian Shortbread Cookies', u'Jam Sandwich Cookies', u'Raspberry Sandwich Cookies', u'Raspberry Almond Cookies', u'Layered Icebox Cookies', u'Linzer Cookies', u'Raspberry-Lemon Thumbprint Cookies', u'Lemon Raspberry Thumbprint Cookies', u'Raspberry Almond Thumbprint Cookies (Gluten-Free & Vegan)', u'Raspberry Lemon Thumbprint Cookies R

In [148]:
agg = AgglomerativeClustering(n_clusters=10, affinity='cosine', linkage='average', compute_full_tree=True)

In [149]:
agg_clus_labels = agg.fit_predict(X)

In [150]:
clusters = {n : list(df['label'][np.array(agg_clus_labels) == n]) for n in xrange(10)}
for clus, labels in clusters.iteritems():
    print "Cluster #{}".format(clus)
    print labels[:20]

Cluster #0
[u'Cookies & Cream Ice Cream', u'Layered Icebox Cookies', u'Raspberry Almond Thumbprint Cookies (Gluten-Free & Vegan)', u'Chocolate-Pecan Layered Icebox Cookies', u"Chocolate Bird's Nest Cookies", u'Healthy Peanut Butter & Honey Cookies']
Cluster #1
[u'Flourless Chocolate Chip Cookies', u'Chocolate Peppermint Patty Cookies', u'Cookies and Cream Macarons', u'Chocolate Pecan Cookies', u'Chewy, Sweet & Salty Ritz Cracker Cookies', u'Deep Dark Chocolate Cookies', u'Chocolate Puddle Cookies', u'Chocolate Chai Latte Cookies', u"Stephanie Tyler's Chocolate Surprise Cookies Recipe", u'Primal Chocolate Chip Cookies']
Cluster #2
[u'Chocolate Chip Cookies', u'Chocolate Chocolate Chip Cookies', u'Brownie Chocolate Chip Cookies', u'Chocolate Chip Cookies', u'Bacon Chocolate Chip Cookies', u'Chocolate chip cookies', u'Chocolate Chip Cookies', u'Chocolate Chip Cookies', u'Chocolate Chip Cookies', u'Soft Chocolate Chip Cookies', u'Honey Chocolate-Chip Cookies', u'Banana Chocolate Chip Cooki

In [100]:
nmf_8 = NMF(n_components=8)

In [102]:
W8 = nmf_8.fit_transform(df_common.fillna(0).values)

In [109]:
topic_idx = np.argsort(W8[:,0])[::-1]

In [111]:
df['label'][topic_idx][:20]

1359                             Mexican Cookies Recipe 3
741       White Chocolate Chunk And Macadamia Nut Cookies
1497    Cranberry, Pistachio And White Chocolate Chip ...
83                           Bacon Chocolate Chip Cookies
1971                                 Honey Ginger Cookies
1970                             Black Dog Ginger Cookies
1523                Great-Granny's Old-Time Spice Cookies
427                                 Classic Sugar Cookies
447                         Chocolate Mocha Sugar Cookies
439                            Ginger Spice Sugar Cookies
503                           Rosemary Shortbread Cookies
1728                                  Gingerbread Cookies
1861                     Greek Easter Cookies from Smyrna
831          Vanillekipferl (Anise-Seed Crescent Cookies)
469                            Swedish Shortbread Cookies
1942              Marranitos (Mexican Pig-Shaped Cookies)
519      Sweet and Savory Orange Cumin Shortbread Cookies
919           

In [47]:
cho_chip_regx = re.compile('chocolate.chip.cookie', re.IGNORECASE)

In [72]:
cho_chip = df_common[df_common['label'].apply(lambda label: re.match(cho_chip_regx, label) is not None)]

In [73]:
cho_chip.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 37 entries, 0 to 2011
Columns: 148 entries, agav to zucchini
dtypes: float64(147), object(1)
memory usage: 43.1+ KB


In [76]:
cho_chip.describe().T.sort_values('count', ascending=False)

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
salt,33.0,0.003741,0.002029,0.000444,0.002371,0.003536,0.004701,0.009483
flour,33.0,0.215465,0.103845,0.003947,0.173541,0.220868,0.254704,0.395137
vanilla extract,31.0,0.007936,0.009405,0.002487,0.003644,0.004979,0.007172,0.041094
bake soda,30.0,0.003458,0.001364,0.001440,0.002799,0.003271,0.003846,0.007271
sugar,30.0,0.115181,0.051958,0.012063,0.077137,0.117673,0.139713,0.243151
butter,29.0,0.146994,0.057174,0.003904,0.115079,0.163738,0.183789,0.224165
egg,28.0,0.064094,0.019687,0.031304,0.049949,0.066057,0.078523,0.101799
brown sugar,27.0,0.136085,0.044415,0.056807,0.102721,0.137739,0.152600,0.226361
bake powder,14.0,0.003748,0.001772,0.001440,0.002448,0.003373,0.004863,0.007888
semisweet chocol,13.0,0.225847,0.070711,0.115273,0.160091,0.237322,0.285395,0.331624
