In [1]:
import pandas as pd 
import numpy as np
import requests
import json
import re
import os
import sys
import spacy
from nltk.corpus import wordnet as wn
from nltk.stem.porter import PorterStemmer

pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)
np.set_printoptions(threshold=sys.maxsize)

In [81]:
# Search functions
def flatten_dict(d):
    t = [v for (k,v) in d.items()]
    return [item for sublist in t for item in sublist]

def simple_length_ranker(product_matches):
    return sorted(product_matches, key=len)

def search_stem(ingredient):
    stemmed_ingredient = stem_ingredient(ingredient)
    product_matches = []
    for product, product_stem in zip(group10['title_lower'], group10['title_lower_stemmed']):
        try:
            if bool(re.search(fr'\b{stemmed_ingredient}\b', product_stem)):
                product_matches.append(product)
        except:
            pass
    product_matches = simple_length_ranker(product_matches)
    return product_matches[0:10]

def search_exact(ingredient):
    product_matches = []
    for product in group10['title_lower']:
        try:
            if bool(re.search(fr'\b{ingredient}\b', product)):
                product_matches.append(product)
        except:
            pass
    product_matches = simple_length_ranker(product_matches)
    return product_matches[0:10]

def get_hypernym(ingredient):
    if ' ' in ingredient:
        ingredient = ingredient.replace(' ', '_')
    hypernym = ''
    try:
        synset = wn.synsets(ingredient)[0]
        hypernym = synset.hypernyms()[0].lemma_names()[0]
    except:
        pass
    return hypernym.replace('_', ' ')

def search_hypernym(ingredient):
    product_matches = []
    hypernym = get_hypernym(ingredient)
    if hypernym:
        product_matches = search_exact(hypernym)
        if len(product_matches) == 0:
            product_matches = search_stem(hypernym)
    return product_matches

def get_hyponyms(ingredient):
    if ' ' in ingredient:
        ingredient = ingredient.replace(' ', '_')
    hyponym_list = []
    try:
        synsets = wn.synsets(ingredient)
        for synset in synsets:
            hyponyms = synset.hyponyms()
            for hyponym in hyponyms:
                hyponym_list += hyponym.lemma_names()
        hyponym_list = [h.replace('_', ' ') for h in hyponym_list]
    except:
        pass
    return hyponym_list

def search_hyponyms(ingredient):
    combined_product_matches = []
    hyponym_list = get_hyponyms(ingredient)
    if len(hyponym_list) > 0:
        for hyponym in hyponym_list:
            product_matches = []
            product_matches = search_exact(hyponym)
            if len(product_matches) == 0:
                product_matches = search_stem(hyponym)
            combined_product_matches += product_matches
    if len(combined_product_matches) > 10: 
        return random.sample(combined_product_matches, 10)
    return combined_product_matches             

def get_noun(ingredient):
    doc = nlp(ingredient)
    return " ".join([token.text for token in doc if token.pos_ == "NOUN"])

def search_noun(ingredient):
    product_matches = []
    noun = get_noun(ingredient)
    if noun:
        if is_food(noun):
            product_matches = search_exact(noun)
            if len(product_matches) == 0:
                product_matches = search_stem(noun)
    return product_matches

def is_food(ingredient):
    synsets = wn.synsets(ingredient)
    for synset in synsets:
        if 'food' in synset.lexname():
            return True
        
def search_noun_multiple(ingredient):
    noun = get_noun(ingredient)
    nouns = {}
    if noun:
        if ' ' in noun:
            for s in noun.split():
                if is_food(s):
                    nouns[s] = search_exact(s)
                    if len(nouns[s]) == 0:
                        nouns[s] = search_stem(s)
            return nouns
    return nouns

def stem_ingredient(ingredient):
    stemmer = PorterStemmer()
    stemmed = " ".join([stemmer.stem(w) for w in ingredient.split()])
    return stemmed

def has_plu_code(ingredient):
    stemmed_ingredient = stem_ingredient(ingredient)
    if stemmed_ingredient in plu['stemmed_name'].values:
        return True
    if ingredient in plu['name'].values:
        return True
    return False

def longest_match(ingredient, direction='backward'):
    product_matches = []
    split_ingredient = ingredient.split()
    while len(split_ingredient) > 1:
        if direction=='backward':
            split_ingredient = split_ingredient[:-1]
        elif direction=='forward':
            split_ingredient = split_ingredient[1:]
        ingredient = " ".join(split_ingredient)
        noun = get_noun(ingredient)
        if noun:
            if is_food(noun):
                product_matches = search_exact(ingredient)
                if len(product_matches) == 0:
                    product_matches = search_stem(ingredient)
                if len(product_matches) > 0:
                    return product_matches
    return product_matches

In [3]:
def convert_utf_fraction(utf): 
    fractions = {
        0x2189: 0.0,  # ; ; 0 # No       VULGAR FRACTION ZERO THIRDS
        0x2152: 0.1,  # ; ; 1/10 # No       VULGAR FRACTION ONE TENTH
        0x2151: 0.11111111,  # ; ; 1/9 # No       VULGAR FRACTION ONE NINTH
        0x215B: 0.125,  # ; ; 1/8 # No       VULGAR FRACTION ONE EIGHTH
        0x2150: 0.14285714,  # ; ; 1/7 # No       VULGAR FRACTION ONE SEVENTH
        0x2159: 0.16666667,  # ; ; 1/6 # No       VULGAR FRACTION ONE SIXTH
        0x2155: 0.2,  # ; ; 1/5 # No       VULGAR FRACTION ONE FIFTH
        0x00BC: 0.25,  # ; ; 1/4 # No       VULGAR FRACTION ONE QUARTER
        0x2153: 0.33333333,  # ; ; 1/3 # No       VULGAR FRACTION ONE THIRD
        0x215C: 0.375,  # ; ; 3/8 # No       VULGAR FRACTION THREE EIGHTHS
        0x2156: 0.4,  # ; ; 2/5 # No       VULGAR FRACTION TWO FIFTHS
        0x00BD: 0.5,  # ; ; 1/2 # No       VULGAR FRACTION ONE HALF
        0x2157: 0.6,  # ; ; 3/5 # No       VULGAR FRACTION THREE FIFTHS
        0x215D: 0.625,  # ; ; 5/8 # No       VULGAR FRACTION FIVE EIGHTHS
        0x2154: 0.66666667,  # ; ; 2/3 # No       VULGAR FRACTION TWO THIRDS
        0x00BE: 0.75,  # ; ; 3/4 # No       VULGAR FRACTION THREE QUARTERS
        0x2158: 0.8,  # ; ; 4/5 # No       VULGAR FRACTION FOUR FIFTHS
        0x215A: 0.83333333,  # ; ; 5/6 # No       VULGAR FRACTION FIVE SIXTHS
        0x215E: 0.875,  # ; ; 7/8 # No       VULGAR FRACTION SEVEN EIGHTHS
    }
    
    pattern = r'(?u)([+-])?(\d*)(\s*)(%s)' % '|'.join(map(chr, fractions))

    for sign, d, space, f in re.findall(pattern, utf):
        sign = -1 if sign == '-' else 1
        d = int(d) if d else 0
        number = sign * (d + fractions[ord(f)])
        return number

    return utf

In [4]:
# Preprocess Target data
ip_file_dir = "../Data/Target Data/"
group10 = pd.read_csv(os.path.join(ip_file_dir, 'group10_header.csv'),
                      sep='\t', low_memory=False)
op_file_path = os.path.join(ip_file_dir, 'scraped/products.csv')
products = pd.read_csv(op_file_path)
group10 = pd.merge(group10, products, how = 'left', on = 'tcin')
group10['title_lower'] = group10['title'].str.lower().str.split('-').str[0]
group10 = group10[~pd.isnull(group10['title_lower'])]
group10['title_lower_stemmed']= group10['title_lower'].apply(stem_ingredient)

In [5]:
nlp = spacy.load('en_core_web_sm')

In [6]:
q = "cake" # Name of food or, ingredients to search by

In [7]:
# Get a recipe
headers = {
    'x-rapidapi-key': "70fcf77facmshaf5555f7be26d0cp11fe0ejsn74b399108a64",
    'x-rapidapi-host': "tasty.p.rapidapi.com"
    }

offset = 0
size = 5

ingredient_list = {'shopping_list': []}
url = "https://tasty.p.rapidapi.com/recipes/list"
querystring = {"from": f"{str(offset)}","size":f"{str(size)}","q":f"{q}"}
response = requests.request("GET", url, headers=headers, params=querystring)
response_json = response.json().items()

id = []
for recipe in list(response_json)[1][1]:
#     print(recipe.get('id'))
    id.append(recipe.get('id'))
    
# retrieve the first recipe for now.
id_querystring = {"id":id[0]}
url = "https://tasty.p.rapidapi.com/recipes/detail"

response = requests.request("GET", url, headers=headers, params=id_querystring)
recipe = response.json()
recipe.get('name')

for ingredient in recipe.get('sections')[0].get('components'):
    name = ingredient.get('ingredient').get('name').strip()
    quantity = ingredient.get('measurements')[0].get('quantity').strip()
    
    if(ingredient.get('measurements')[0].get('unit').get('name') != ''):
        unit = ingredient.get('measurements')[0].get('unit').get('name').strip()
    else:
        unit = ''
    
    ingredient_json = json.loads(
                        json.dumps({'ingredient': name, 
                                    'quantity': (quantity), 
                                    'unit':unit}, 
                                   ensure_ascii=False))
    
    ingredient_list["shopping_list"].append(ingredient_json)

In [8]:
# Preprocess ingredients
ingredient_df = df = pd.json_normalize(ingredient_list['shopping_list'])
ingredient_df = ingredient_df.drop_duplicates().reset_index()
ingredient_df['ingredient'] = ingredient_df['ingredient'].str.lower()
ingredient_df['quantity'] = ingredient_df['quantity'].apply(convert_utf_fraction)

In [82]:
ingredient_df['exact'] = ingredient_df['ingredient'].apply(search_exact)
ingredient_df['stem'] = ingredient_df['ingredient'].apply(search_stem)
ingredient_df['hypernym'] = ingredient_df['ingredient'].apply(search_hypernym)
ingredient_df['hyponyms'] = ingredient_df['ingredient'].apply(search_hyponyms)
ingredient_df['noun'] = ingredient_df['ingredient'].apply(search_noun)
ingredient_df['noun_multiple'] = ingredient_df['ingredient'].apply(search_noun_multiple)
ingredient_df['longest_match_backward'] = ingredient_df['ingredient'].apply(lambda x: longest_match(x, 'backward'))
ingredient_df['longest_match_forward'] = ingredient_df['ingredient'].apply(lambda x: longest_match(x, 'forward'))
ingredient_df['noun_multiple_combined'] = ingredient_df['noun_multiple'].apply(flatten_dict)

In [83]:
ingredient_df['combined'] = ingredient_df['exact'] + \
                            ingredient_df['stem'] + \
                            ingredient_df['longest_match_backward'] + \
                            ingredient_df['longest_match_forward'] + \
                            ingredient_df['noun'] + \
                            ingredient_df['noun_multiple_combined'] + \
                            ingredient_df['hypernym'] + \
                            ingredient_df['hyponyms'] 

ingredient_df['results'] = ingredient_df['combined'].apply(lambda x: list(dict.fromkeys(x)))


In [84]:
ingredient_df

Unnamed: 0,index,ingredient,quantity,unit,exact,stem,hypernym,noun,noun_multiple,longest_match_backward,longest_match_forward,noun_multiple_combined,hyponyms,combined,results
0,0,nonstick cooking spray,0.0,,[],[],[],[],{},[],[],[],[],[],[]
1,1,chocolate sandwich cookie,1.0,package,[],"[oreo chocolate sandwich cookies , oreo chocol...",[],"[white sandwich bread , sunbeam sandwich rolls...",{},"[milk chocolate, klik chocolate, oatly chocola...",[],[],[],"[oreo chocolate sandwich cookies , oreo chocol...","[oreo chocolate sandwich cookies , oreo mini c..."
2,2,unsalted butter,0.75,stick,"[unsalted butter , unsalted butter , challenge...","[unsalted butter , unsalted butter , challenge...",[],"[cashew butter , salted butter , salted butter...",{},[],"[cashew butter , salted butter , salted butter...",[],[],"[unsalted butter , unsalted butter , challenge...","[unsalted butter , challenge unsalted butter ,..."
3,4,cream cheese,32.0,ounce,"[plain cream cheese , plain cream cheese bar ,...","[plain cream cheese , plain cream cheese bar ,...","[swiss cheese , cheese danish , asiago cheese ...",[],"{'cream': ['sour cream ', 'sour cream ', 'sour...","[sour cream , sour cream , sour cream , a&w cr...","[swiss cheese , cheese danish , asiago cheese ...","[sour cream , sour cream , sour cream , a&w cr...","[double cream brie soft ripened cheese round ,...","[plain cream cheese , plain cream cheese bar ,...","[plain cream cheese , plain cream cheese bar ,..."
4,5,skippy® creamy peanut butter,1.0,cup,[],[],[],[],"{'peanut': ['peanut oil ', 'louana peanut oil ...",[],"[cashew butter , salted butter , salted butter...","[peanut oil , louana peanut oil , thai peanut ...",[],"[cashew butter , salted butter , salted butter...","[cashew butter , salted butter , unsalted butt..."
5,6,heavy cream,0.5,cup,[hood heavy cream ],[hood heavy cream ],"[sour cream , sour cream , sour cream , a&w cr...","[sour cream , sour cream , sour cream , a&w cr...",{},[],"[sour cream , sour cream , sour cream , a&w cr...",[],[],"[hood heavy cream , hood heavy cream , sour cr...","[hood heavy cream , sour cream , a&w cream sod..."
6,7,vanilla extract,1.0,teaspoon,"[pure vanilla extract , pure vanilla extract ,...","[pure vanilla extract , pure vanilla extract ,...","[starbucks flavored k, starbucks flavored k, s...",[],"{'vanilla': ['pepsi vanilla ', 'oatly vanilla ...","[pepsi vanilla , oatly vanilla non, vanilla oa...",[],"[pepsi vanilla , oatly vanilla non, vanilla oa...",[],"[pure vanilla extract , pure vanilla extract ,...","[pure vanilla extract , imitation vanilla extr..."
7,8,powdered sugar,1.5,cup,"[powdered sugar , crystal powdered sugar , org...","[powdered sugar , crystal powdered sugar , org...","[granulated sugar , granulated sugar , crystal...","[sugar, sugar, sugar, sugar, sugar, sugar, sug...",{},[],"[sugar, sugar, sugar, sugar, sugar, sugar, sug...",[],"[extra polar ice sugar, extra polar ice sugar]","[powdered sugar , crystal powdered sugar , org...","[powdered sugar , crystal powdered sugar , org..."
8,9,kosher salt,0.25,teaspoon,[morton coarse kosher salt ],[morton coarse kosher salt ],[],"[plain salt , morton salt , garlic salt , cele...",{},[],"[plain salt , morton salt , garlic salt , cele...",[],[],"[morton coarse kosher salt , morton coarse kos...","[morton coarse kosher salt , plain salt , mort..."
9,10,semisweet chocolate chips,0.25,cup,[],[],[],[],"{'chocolate': ['milk chocolate', 'klik chocola...",[],"[pecan chips , banana chips , carrot chips , d...","[milk chocolate, klik chocolate, oatly chocola...",[],"[pecan chips , banana chips , carrot chips , d...","[pecan chips , banana chips , carrot chips , d..."


In [85]:
ing_matches = ingredient_df[['ingredient', 'results']]
# json_str = json.loads(ingredient_df.set_index('ingredient').to_json())
json_str = json.loads(ing_matches.set_index('ingredient').to_json())

In [86]:
print(json.dumps(json_str, indent=4))

{
    "results": {
        "nonstick cooking spray": [],
        "chocolate sandwich cookie": [
            "oreo chocolate sandwich cookies ",
            "oreo mini chocolate sandwich cookies ",
            "oreo thins chocolate sandwich cookies ",
            "oreo original chocolate sandwich cookies ",
            "oreo double stuf chocolate sandwich cookies ",
            "oreo chocolate sandwich cookies family size ",
            "milk chocolate",
            "klik chocolate",
            "oatly chocolate non",
            "chocolate crackers ",
            "jonnypops chocolate",
            "chocolate croissant ",
            "chocolate ice cream ",
            "rolo chocolate candy ",
            "white sandwich bread ",
            "sunbeam sandwich rolls ",
            "mrs. baird's sandwich bread ",
            "ham and swiss half sandwich ",
            "sunbeam thin sandwich bread ",
            "love's sandwich white bread ",
            "kosher dill sandwich slices ",
  

In [None]:
# plu = pd.read_csv('data/plu_codes.csv')
# plu['name'] = plu['Name'].str.lower()
# plu['stemmed_name']= plu['name'].apply(stem_ingredient)
# ingredient_df['has_plu_code'] = ingredient_df['ingredient'].apply(has_plu_code)