In [1]:
import os
import re

import pandas as pd
from nltk.util import ngrams
from pymongo import MongoClient

from src.feature_building import count_keyword_1and2grams, token_pipeline

In [2]:
# Open a MongoDB connection (MongoClient() is called with no arguments, so the
# driver's default host/port apply) and select the 'recipes' database.
client = MongoClient()
db = client['recipes']

In [3]:
# Case-insensitive pattern used below to filter recipe labels containing "cookies".
regx = re.compile('cookies', flags=re.I)

In [4]:
def keyword_hierarchy(curs, dir_path):
    """Build a ``keyword -> precedence score`` mapping from the feature lists.

    The 1-gram keywords listed in ``features_1gram.txt`` are scored with their
    count from ``count_keyword_1and2grams``. The 2-gram keywords listed in
    ``features_2gram.txt`` are scored with their own count plus the highest
    1-gram score, so a 2-gram never scores below any 1-gram.

    Parameters
    ----------
    curs
        Passed straight to ``count_keyword_1and2grams``; in this notebook it
        is a pymongo cursor over recipe documents.
    dir_path : str
        Directory containing the two feature files (one keyword per line).
        A trailing path separator is optional.

    Returns
    -------
    dict
        Maps each listed keyword to its numeric precedence score.

    Notes
    -----
    Blank lines in the feature files are ignored. A keyword that is missing
    from the count mappings is looked up with ``[]``, so the outcome depends
    on the mapping type returned by ``count_keyword_1and2grams`` (a plain
    dict raises ``KeyError``).
    """
    word_counts, n2gram_counts = count_keyword_1and2grams(curs)

    precedence = {}
    with open(os.path.join(dir_path, 'features_1gram.txt'), 'r') as f1:
        for line in f1:
            keyword = line.strip()
            if keyword:  # a blank line is not a keyword
                precedence[keyword] = word_counts[keyword]

    # Offset added to every 2-gram so that it outranks (or ties) the best
    # 1-gram. Fall back to 0 when no 1-gram keywords were listed, instead of
    # letting max() fail on an empty sequence.
    bonus2 = max(precedence.values()) if precedence else 0

    with open(os.path.join(dir_path, 'features_2gram.txt'), 'r') as f2:
        for line in f2:
            n2gram = line.strip()
            if n2gram:
                precedence[n2gram] = n2gram_counts[n2gram] + bonus2
    return precedence

In [5]:
def identify_ingred(ingred_line, precedence_dict, log_path='unidentified_lines.log'):
    """Pick the highest-precedence keyword that occurs in an ingredient line.

    The line's text is tokenized with ``token_pipeline``; the candidate
    keywords are its single tokens plus its space-joined adjacent token pairs.
    Of the candidates present in ``precedence_dict``, the one with the largest
    score is chosen (ties are resolved by ``max`` over a set, so their order
    is unspecified).

    Parameters
    ----------
    ingred_line : dict
        Must provide ``'text'`` (the raw ingredient line) and ``'weight'``.
    precedence_dict : dict
        Maps keyword -> numeric precedence score.
    log_path : str, optional
        File that unidentified lines are appended to.

    Returns
    -------
    tuple or None
        ``(best_keyword, ingred_line['weight'])`` when a keyword matches,
        otherwise ``None`` after logging and printing the unmatched tokens.
    """
    tokens = token_pipeline(ingred_line['text'])
    n2grams = {" ".join(gram) for gram in ngrams(tokens, 2)}
    keywords = (n2grams.union(set(tokens))).intersection(precedence_dict)
    if keywords:
        best_keyword = max(keywords, key=precedence_dict.get)
        return best_keyword, ingred_line['weight']

    message = "no keyword in line: " + " ".join(tokens)
    # Append rather than truncate: opening with 'w' on every call would leave
    # only the most recent unidentified line in the log.
    with open(log_path, 'a') as log_f:
        log_f.write(message + "\n")
    print(message)
    return None

In [6]:
def get_recipe_features(recipe_dict, precedence_dict):
    """Turn one recipe document into a flat feature dict.

    Every ingredient line is passed to ``identify_ingred``; lines that yield a
    keyword contribute their weight to that keyword's feature. When several
    lines of the same recipe resolve to the same keyword their weights are
    summed, so no line's weight is silently dropped. Lines with no keyword
    are skipped.

    Parameters
    ----------
    recipe_dict : dict
        Must provide ``'label'`` and ``'ingredients'`` (an iterable of
        ingredient-line dicts accepted by ``identify_ingred``).
    precedence_dict : dict
        Keyword -> precedence score mapping forwarded to ``identify_ingred``.

    Returns
    -------
    dict
        ``{'label': <recipe label>, <keyword>: <total weight>, ...}``.
    """
    label = recipe_dict['label']

    # Accumulate weights separately from the label so that summing never
    # touches the (non-numeric) 'label' entry.
    weights = {}
    for ingred_line in recipe_dict['ingredients']:
        ingred = identify_ingred(ingred_line, precedence_dict)
        if ingred:
            keyword, weight = ingred
            weights[keyword] = weights.get(keyword, 0) + weight

    features = {'label': label}
    features.update(weights)
    return features

In [7]:
# Score the keywords listed in the feature files under src/ against every
# recipe whose label matches the "cookies" pattern.
precedence = keyword_hierarchy(
    db.eda_cookies.find({"label": regx}),
    'src/',
)

In [8]:
# Run the same label query again to get a fresh cursor (presumably the one
# handed to keyword_hierarchy has already been consumed), then build one
# feature dict per recipe document.
curs = db.eda_cookies.find({"label": regx})
data = []
for doc in curs:
    data.append(get_recipe_features(doc, precedence))

no keyword in line: bicarbon soda
no keyword in line: fleur de sel
no keyword in line: soda
no keyword in line: master cooki mix
no keyword in line: dozen cooki
no keyword in line: fleur de sel
no keyword in line: ml wheat bran
no keyword in line: pit prune
no keyword in line: flax
no keyword in line: mace
no keyword in line: cooki
no keyword in line: xylitol
no keyword in line: bicarbon soda
no keyword in line: rainbow sprinkl
no keyword in line: recip birthday cake crumb recip follow
no keyword in line: birthday cake crumb
no keyword in line: rainbow sprinkl
no keyword in line: ice
no keyword in line: meringu powder
no keyword in line: meringu powder
no keyword in line: x shortbread cooki use walker
no keyword in line: gluten free shortbread cooki
no keyword in line: fine emmer wheat
no keyword in line: store bought italian shortbread cooki such as rigoli
no keyword in line: shortbread cooki
no keyword in line: red and green hard candi
no keyword in line: uncook polenta
no keyword in

In [10]:
# One row per recipe feature dict; a keyword absent from a recipe's dict
# shows up as NaN in that recipe's row.
df = pd.DataFrame(data)

In [12]:
# Inspect the weights recorded under the 'sugar' keyword; NaN rows are recipes
# in which no ingredient line was identified as 'sugar'.
df['sugar']

0       226.796188
1       248.058334
2       300.000000
3       150.000000
4       200.000000
5        85.000000
6              NaN
7       283.495239
8       100.000000
9        90.000000
10      300.000000
11             NaN
12             NaN
13             NaN
14      200.000000
15       50.000000
16      100.000000
17      200.000000
18      200.000000
19      200.000000
20       50.000000
21       75.599998
22             NaN
23      100.000000
24      150.000000
25      200.000000
26      175.000000
27      150.000000
28      230.000000
29      200.000000
           ...    
2224           NaN
2225           NaN
2226      8.994554
2227     22.496737
2228           NaN
2229    200.000000
2230    200.000000
2231           NaN
2232    200.000000
2233    200.000000
2234     66.666664
2235     50.000000
2236    200.000000
2237    200.000000
2238    100.000000
2239     50.000000
2240    200.000000
2241    150.000000
2242           NaN
2243    200.000000
2244     12.600000
2245    200.

In [33]:
# Grab a single document from the eda_cookies collection (no filter) to use
# as a hand-checkable sample.
samp = db.eda_cookies.find_one()

In [39]:
# Spot-check keyword identification on the sample recipe's ingredient lines.
# The helper defined in this notebook is identify_ingred; the previously used
# name identify_line is not defined or imported anywhere visible here and
# fails on a fresh kernel.
for line in samp['ingredients']:
    print(identify_ingred(line, precedence))

('flour', 481.94189453125)
('bake powder', 6.900000095367432)
('bake soda', 5.75)
('salt', 6.340129375457764)
('butter', 283.4952392578125)
('brown sugar', 283.4952392578125)
('sugar', 226.7961883544922)
('egg', 100.0)
('vanilla extract', 8.399999618530273)
('bittersweet chocol', 538.6409301757812)


In [13]:
# Scratch values for experimenting with dict / set behaviour.
d_test = dict(a=1, b=2, c=3)
set_test = set(['b', 'c', 'd'])

In [29]:
# Scratch value: an empty set (presumably for comparing truthiness with set_test).
empty_set = set()

In [31]:
# A non-empty set is truthy, which is what identify_ingred relies on when it
# tests `if keywords:`.
if set_test:
    print("yes")

yes
