In [176]:
import pandas as pd 
import numpy as np
import requests
import json
import re
import os
import sys
import spacy
import string
import random

from nltk.corpus import wordnet as wn
from nltk.stem.porter import PorterStemmer


pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)
np.set_printoptions(threshold=sys.maxsize)

In [177]:
# Search functions
def flatten_dict(d):
    t = [v for (k,v) in d.items()]
    return [item for sublist in t for item in sublist]

def simple_length_ranker(product_matches):
    return sorted(product_matches, key=len)

def search_stem(ingredient):
    stemmed_ingredient = stem_ingredient(ingredient)
    product_matches = []
    for product, product_stem in zip(group10['title_lower'], group10['title_lower_stemmed']):
        try:
            if bool(re.search(fr'\b{stemmed_ingredient}\b', product_stem)):
                product_matches.append(product)
        except:
            pass
    product_matches = simple_length_ranker(product_matches)
    return product_matches[0:10]

def search_exact(ingredient):
    product_matches = []
    for product in group10['title_lower']:
        try:
            if bool(re.search(fr'\b{ingredient}\b', product)):
                product_matches.append(product)
        except:
            pass
    product_matches = simple_length_ranker(product_matches)
    return product_matches[0:10]

def get_hypernym(ingredient):
    if ' ' in ingredient:
        ingredient = ingredient.replace(' ', '_')
    hypernym = ''
    try:
        synset = wn.synsets(ingredient)[0]
        hypernym = synset.hypernyms()[0].lemma_names()[0]
    except:
        pass
    if is_food(hypernym):
        return hypernym.replace('_', ' ')
    return hypernym

def search_hypernym(ingredient):
    product_matches = []
    hypernym = get_hypernym(ingredient)
    if hypernym:
        product_matches = search_exact(hypernym)
        if len(product_matches) == 0:
            product_matches = search_stem(hypernym)
    return product_matches

def get_hyponyms(ingredient):
    if ' ' in ingredient:
        ingredient = ingredient.replace(' ', '_')
    hyponym_list = []
    try:
        synsets = wn.synsets(ingredient)
        for synset in synsets:
            hyponyms = synset.hyponyms()
            for hyponym in hyponyms:
                hyponym_list += hyponym.lemma_names()
        hyponym_list = [h.replace('_', ' ') for h in hyponym_list if is_food(h)]
    except:
        pass
    return list(set(hyponym_list))

def search_hyponyms(ingredient):
    combined_product_matches = []
    hyponym_list = get_hyponyms(ingredient)
    if len(hyponym_list) > 0:
        for hyponym in hyponym_list:
            product_matches = []
            product_matches = search_exact(hyponym)
            if len(product_matches) == 0:
                product_matches = search_stem(hyponym)
            combined_product_matches += product_matches
    if len(combined_product_matches) > 10: 
        return random.sample(combined_product_matches, 10)
    return combined_product_matches             

def get_noun(ingredient):
    doc = nlp(ingredient)
    return " ".join([token.text for token in doc if token.pos_ == "NOUN"])

def search_noun(ingredient):
    product_matches = []
    noun = get_noun(ingredient)
    if noun:
        if is_food(noun):
            product_matches = search_exact(noun)
            if len(product_matches) == 0:
                product_matches = search_stem(noun)
    return product_matches

def is_food(ingredient):
    synsets = wn.synsets(ingredient)
    for synset in synsets:
        if 'food' in synset.lexname():
            return True
        
def search_noun_multiple(ingredient):
    noun = get_noun(ingredient)
    nouns = {}
    if noun:
        if ' ' in noun:
            for s in noun.split():
                if is_food(s):
                    nouns[s] = search_exact(s)
                    if len(nouns[s]) == 0:
                        nouns[s] = search_stem(s)
            return nouns
    return nouns

def stem_ingredient(ingredient):
    stemmer = PorterStemmer()
    stemmed = " ".join([stemmer.stem(w) for w in ingredient.split()])
    return stemmed

def has_plu_code(ingredient):
    stemmed_ingredient = stem_ingredient(ingredient)
    if stemmed_ingredient in plu['stemmed_name'].values:
        return True
    if ingredient in plu['name'].values:
        return True
    return False

def longest_match(ingredient, direction='backward'):
    product_matches = []
    split_ingredient = ingredient.split()
    while len(split_ingredient) > 1:
        if direction=='backward':
            split_ingredient = split_ingredient[:-1]
        elif direction=='forward':
            split_ingredient = split_ingredient[1:]
        ingredient = " ".join(split_ingredient)
        noun = get_noun(ingredient)
        if noun:
            if is_food(noun):
                product_matches = search_exact(ingredient)
                if len(product_matches) == 0:
                    product_matches = search_stem(ingredient)
                if len(product_matches) > 0:
                    return product_matches
    return product_matches

In [178]:
def convert_utf_fraction(utf): 
    fractions = {
        0x2189: 0.0,  # ; ; 0 # No       VULGAR FRACTION ZERO THIRDS
        0x2152: 0.1,  # ; ; 1/10 # No       VULGAR FRACTION ONE TENTH
        0x2151: 0.11111111,  # ; ; 1/9 # No       VULGAR FRACTION ONE NINTH
        0x215B: 0.125,  # ; ; 1/8 # No       VULGAR FRACTION ONE EIGHTH
        0x2150: 0.14285714,  # ; ; 1/7 # No       VULGAR FRACTION ONE SEVENTH
        0x2159: 0.16666667,  # ; ; 1/6 # No       VULGAR FRACTION ONE SIXTH
        0x2155: 0.2,  # ; ; 1/5 # No       VULGAR FRACTION ONE FIFTH
        0x00BC: 0.25,  # ; ; 1/4 # No       VULGAR FRACTION ONE QUARTER
        0x2153: 0.33333333,  # ; ; 1/3 # No       VULGAR FRACTION ONE THIRD
        0x215C: 0.375,  # ; ; 3/8 # No       VULGAR FRACTION THREE EIGHTHS
        0x2156: 0.4,  # ; ; 2/5 # No       VULGAR FRACTION TWO FIFTHS
        0x00BD: 0.5,  # ; ; 1/2 # No       VULGAR FRACTION ONE HALF
        0x2157: 0.6,  # ; ; 3/5 # No       VULGAR FRACTION THREE FIFTHS
        0x215D: 0.625,  # ; ; 5/8 # No       VULGAR FRACTION FIVE EIGHTHS
        0x2154: 0.66666667,  # ; ; 2/3 # No       VULGAR FRACTION TWO THIRDS
        0x00BE: 0.75,  # ; ; 3/4 # No       VULGAR FRACTION THREE QUARTERS
        0x2158: 0.8,  # ; ; 4/5 # No       VULGAR FRACTION FOUR FIFTHS
        0x215A: 0.83333333,  # ; ; 5/6 # No       VULGAR FRACTION FIVE SIXTHS
        0x215E: 0.875,  # ; ; 7/8 # No       VULGAR FRACTION SEVEN EIGHTHS
    }
    
    pattern = r'(?u)([+-])?(\d*)(\s*)(%s)' % '|'.join(map(chr, fractions))

    for sign, d, space, f in re.findall(pattern, utf):
        sign = -1 if sign == '-' else 1
        d = int(d) if d else 0
        number = sign * (d + fractions[ord(f)])
        return number

    return utf

In [179]:
string.punctuation

'!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~'

In [180]:
def preprocess1(title):
    title = title.lower()
    regex = re.compile(r'[+\r\t\n]')
    title = regex.sub("", title)
    title = " ".join([w for w in title.split() if len(w) > 1])
    return title

In [181]:
def preprocess2(title):
    regex = re.compile(r'[' + string.punctuation + '0-9\r\t\n]')
    title = regex.sub("", title)
    title = " ".join([w for w in title.split() if len(w) > 2])
    quantities = 'oz|fl|gal|pk|ct|ml|lbs|lb|qt|pt|ounce|0z'
    title = re.sub(fr'\b({quantities})?({quantities})?\b', '', title)
    return title

In [182]:
def remove_brand(title):
    result = re.sub(fr'\b({brands})\b', '', f'{title}')
    result = " ".join([w for w in result.split() if len(w) > 2])
    return result

In [183]:
# Preprocess Target data
ip_file_dir = "../Data/Target Data/"
group10 = pd.read_csv(os.path.join(ip_file_dir, 'group10_header.csv'),
                      sep='\t', low_memory=False)
op_file_path = os.path.join(ip_file_dir, 'scraped/products.csv')
products = pd.read_csv(op_file_path)
group10 = pd.merge(group10, products, how = 'left', on = 'tcin')
group10 = group10[~pd.isnull(group10['title'])]
group10['title_lower'] = group10['title'].str.lower()
group10['title_lower'] = group10['title_lower'].apply(lambda x: re.sub(r'[0-9]+(.)?([0-9]+)?[\s]*(cans|can|boxes|box|bottles|bottle|gallons|gallon|fl oz|oz|fl|gal|pk|ct|ml|lbs|lb|qt|pt|ounce|0z|l|g)\b', '', x))
regex = re.compile(r'[' + string.punctuation + '0-9\r\t\n]')
group10['title_lower'] = group10['title_lower'].apply(lambda x: regex.sub("", x))
group10['title_lower'] = group10['title_lower'].apply(lambda x: re.sub("[\s]+", " ", x))
group10['title_lower_stemmed']= group10['title_lower'].apply(stem_ingredient)

# group10['title_lower'] = group10['title'].apply(preprocess)
# group10['brand_lower'] = group10['brand'].apply(preprocess)
# brands = "|".join(group10['brand_lower'].unique())
# group10['title_brand_removed'] = group10['title_lower'].apply(remove_brand)
# group10['title_lower'] = group10['title_brand_removed'].apply(preprocess2)
# group10['title_lower_stemmed']= group10['title_lower'].apply(stem_ingredient)

# group10['title_lower'].apply(lambda x: re.sub(r'(-|/)[\s]*[0-9]+(.)?([0-9]+)?[\s]*(fl oz|oz|fl|gal|pk|ct|ml)', '', x))

In [184]:
# group10['title_lower_qty_removed'] = group10['title_lower'].apply(lambda x: re.sub(r'[0-9]+(.)?([0-9]+)?[\s]*(cans|can|boxes|box|bottles|bottle|gallons|gallon|fl oz|oz|fl|gal|pk|ct|ml|lbs|lb|qt|pt|ounce|0z|l|g)\b', '', x))
# group10['title_lower_qty_removed_'] = group10['title_lower_qty_removed'].apply(lambda x: re.sub(r'[-/]', '', x))

In [185]:
# products_str = group10['title_lower'].str.cat(sep=' ')
# from collections import Counter
# ctr = Counter(products_str.split())
# for word, count in ctr.most_common(300):
#     print(word + " " + str(count))

In [186]:
nlp = spacy.load('en_core_web_sm')

In [187]:
q = "chicken biryani" # Name of food or, ingredients to search by

In [188]:
# Get a recipe
headers = {
    'x-rapidapi-key': "70fcf77facmshaf5555f7be26d0cp11fe0ejsn74b399108a64",
    'x-rapidapi-host': "tasty.p.rapidapi.com"
    }

offset = 0
size = 5

ingredient_list = {'shopping_list': []}
url = "https://tasty.p.rapidapi.com/recipes/list"
querystring = {"from": f"{str(offset)}","size":f"{str(size)}","q":f"{q}"}
response = requests.request("GET", url, headers=headers, params=querystring)
response_json = response.json().items()

id = []
for recipe in list(response_json)[1][1]:
#     print(recipe.get('id'))
    id.append(recipe.get('id'))
    
# retrieve the first recipe for now.
id_querystring = {"id":id[0]}
url = "https://tasty.p.rapidapi.com/recipes/detail"

response = requests.request("GET", url, headers=headers, params=id_querystring)
recipe = response.json()
recipe.get('name')

for ingredient in recipe.get('sections')[0].get('components'):
    name = ingredient.get('ingredient').get('name').strip()
    quantity = ingredient.get('measurements')[0].get('quantity').strip()
    
    if(ingredient.get('measurements')[0].get('unit').get('name') != ''):
        unit = ingredient.get('measurements')[0].get('unit').get('name').strip()
    else:
        unit = ''
    
    ingredient_json = json.loads(
                        json.dumps({'ingredient': name, 
                                    'quantity': (quantity), 
                                    'unit':unit}, 
                                   ensure_ascii=False))
    
    ingredient_list["shopping_list"].append(ingredient_json)

In [189]:
# Preprocess ingredients
ingredient_df = df = pd.json_normalize(ingredient_list['shopping_list'])
ingredient_df = ingredient_df.drop_duplicates().reset_index()
ingredient_df['ingredient'] = ingredient_df['ingredient'].str.lower()
ingredient_df['quantity'] = ingredient_df['quantity'].apply(convert_utf_fraction)

In [190]:
ingredient_df['exact'] = ingredient_df['ingredient'].apply(search_exact)
ingredient_df['stem'] = ingredient_df['ingredient'].apply(search_stem)
ingredient_df['hypernym'] = ingredient_df['ingredient'].apply(search_hypernym)
ingredient_df['hyponyms'] = ingredient_df['ingredient'].apply(search_hyponyms)
ingredient_df['noun'] = ingredient_df['ingredient'].apply(search_noun)
ingredient_df['noun_multiple'] = ingredient_df['ingredient'].apply(search_noun_multiple)
ingredient_df['longest_match_backward'] = ingredient_df['ingredient'].apply(lambda x: longest_match(x, 'backward'))
ingredient_df['longest_match_forward'] = ingredient_df['ingredient'].apply(lambda x: longest_match(x, 'forward'))
ingredient_df['noun_multiple_combined'] = ingredient_df['noun_multiple'].apply(flatten_dict)

In [191]:
ingredient_df['combined'] = ingredient_df['exact'] + \
                            ingredient_df['stem'] + \
                            ingredient_df['longest_match_backward'] + \
                            ingredient_df['longest_match_forward'] + \
                            ingredient_df['noun'] + \
                            ingredient_df['noun_multiple_combined'] + \
                            ingredient_df['hypernym'] + \
                            ingredient_df['hyponyms'] 

ingredient_df['results'] = ingredient_df['combined'].apply(lambda x: list(dict.fromkeys(x)))


In [192]:
ingredient_df

Unnamed: 0,index,ingredient,quantity,unit,exact,stem,hypernym,hyponyms,noun,noun_multiple,longest_match_backward,longest_match_forward,noun_multiple_combined,combined,results
0,0,large onion,2.0,,[],[],[],[],"[red onion each, white onion each, yellow onio...",{},[],"[red onion each, white onion each, yellow onio...",[],"[red onion each, white onion each, yellow onio...","[red onion each, white onion each, yellow onio..."
1,1,medium tomato,2.0,,[],[],[],[],[],"{'tomato': ['roma tomato bag', 'goya tomato sa...",[],"[roma tomato bag, goya tomato sauce , raos tom...","[roma tomato bag, goya tomato sauce , raos tom...","[roma tomato bag, goya tomato sauce , raos tom...","[roma tomato bag, goya tomato sauce , raos tom..."
2,2,ginger garlic paste,2.0,tablespoon,[],[],[],[],[],"{'ginger': ['jfc sushi ginger ', 'sprite ginge...","[jfc sushi ginger , sprite ginger cans, yogi t...",[],"[jfc sushi ginger , sprite ginger cans, yogi t...","[jfc sushi ginger , sprite ginger cans, yogi t...","[jfc sushi ginger , sprite ginger cans, yogi t..."
3,3,green chili paste,0.5,teaspoon,[],[],[],[],[],"{'chili': ['heinz chili sauce ', 'badia chili ...","[bueno green chili chicken frozen enchiladas ,...",[huy fong chili paste ],"[heinz chili sauce , badia chili powder , ruff...","[bueno green chili chicken frozen enchiladas ,...","[bueno green chili chicken frozen enchiladas ,..."
4,4,fresh cilantro,1.0,cup,[],[],[],[],"[cilantro bunch each, bitchin cilantro sauce ,...",{},[],[],[],"[cilantro bunch each, bitchin cilantro sauce ,...","[cilantro bunch each, bitchin cilantro sauce ,..."
5,5,fresh mint,0.25,cup,"[mentos fresh mint gum , mentos fresh mint che...","[mentos fresh mint gum , mentos fresh mint che...",[],[],"[organic mint package, mentos fresh mint gum ,...",{},[],"[organic mint package, mentos fresh mint gum ,...",[],"[mentos fresh mint gum , mentos fresh mint che...","[mentos fresh mint gum , mentos fresh mint che..."
6,6,turmeric powder,0.5,teaspoon,[swad turmeric powder ],[swad turmeric powder ],[],[],[],{},[],[],[],"[swad turmeric powder , swad turmeric powder ]",[swad turmeric powder ]
7,7,curry powder,2.0,tablespoon,"[curry powder good gather™, badia spices curry...","[curry powder good gather™, badia spices curry...","[jason flavored bread crumbs , funyuns onion f...",[],[],{},[],[],[],"[curry powder good gather™, badia spices curry...","[curry powder good gather™, badia spices curry..."
8,8,coriander powder,0.5,teaspoon,[],[],[],[],[],{},[],[],[],[],[]
9,9,yogurt,1.0,tablespoon,"[oui coffee yogurt , old home plain yogurt , n...","[oui coffee yogurt , old home plain yogurt , n...",[],"[kemps moose tracks frozen yogurt , kemps low ...",[],{},[],[],[],"[oui coffee yogurt , old home plain yogurt , n...","[oui coffee yogurt , old home plain yogurt , n..."


In [193]:
# json_str = json.loads(ing_matches.set_index('ingredient').to_json())
# print(json.dumps(json_str, indent=4))

# plu = pd.read_csv('data/plu_codes.csv')
# plu['name'] = plu['Name'].str.lower()
# plu['stemmed_name']= plu['name'].apply(stem_ingredient)
# ingredient_df['has_plu_code'] = ingredient_df['ingredient'].apply(has_plu_code)

# from strsimpy.levenshtein import Levenshtein
# from strsimpy.damerau import Damerau
# from strsimpy.optimal_string_alignment import OptimalStringAlignment
# from strsimpy.metric_lcs import MetricLCS
# from strsimpy.ngram import NGram
# from strsimpy.qgram import QGram

# qgram = QGram(2)
# metric_lcs = MetricLCS()
# optimal_string_alignment = OptimalStringAlignment()
# levenshtein = Levenshtein()
# damerau = Damerau()

# def get_bigrams(string):
#     '''
#     Takes a string and returns a list of bigrams
#     '''
#     s = string.lower()
#     return {s[i:i+2] for i in range(len(s) - 1)}

# def string_similarity(str1, str2):
#     '''
#     Perform bigram comparison between two strings
#     and return a percentage match in decimal form
#     '''
#     pairs1 = get_bigrams(str1)
#     pairs2 = get_bigrams(str2)
#     return (2.0 * len(pairs1 & pairs2)) / (len(pairs1) + len(pairs2))

In [194]:
import rapidfuzz
from rapidfuzz import fuzz
from rapidfuzz.string_metric import levenshtein
def string_matching_score(x, f):
    query = x[0]
    matches = x[1]
    match_score = {}
    for match in matches:
        match_score[match] = f(query, match)
    return sorted(match_score.items(), key = lambda x: x[1], reverse=False)

# Levenshtein
ing_matches = ingredient_df[['ingredient', 'results']].copy()
ing_matches['results_sorted'] = ing_matches.apply(lambda x: string_matching_score(x, levenshtein), axis=1)
ing_matches_sorted = ing_matches[['ingredient', 'results_sorted']].copy()
ing_matches_sorted = ing_matches_sorted.explode('results_sorted').reset_index(drop=True)
ing_matches_sorted

Unnamed: 0,ingredient,results_sorted
0,large onion,"(red onion each, 9)"
1,large onion,"(white onion each, 9)"
2,large onion,"(yellow onion each, 11)"
3,large onion,"(osem bissli onion , 12)"
4,large onion,"(badia onion powder , 12)"
5,large onion,"(lays french onion dip , 13)"
6,large onion,"(deans french onion dip , 15)"
7,large onion,"(mccormick onion powder , 16)"
8,medium tomato,"(roma tomato bag, 10)"
9,medium tomato,"(iberia tomato sauce , 12)"


In [260]:
from sentence_transformers import SentenceTransformer, util
embedder = SentenceTransformer('all-MiniLM-L6-v2')
# embedder = SentenceTransformer('average_word_embeddings_glove.6B.300d')
# embedder = SentenceTransformer('all-mpnet-base-v2')

In [261]:
from torch.nn import PairwiseDistance
def vector_space_score(x, distance):
    ingredient = x[0]
    match_score = {}
    if len(matches) > 0:
        ingredient_embedding = embedder.encode(ingredient, convert_to_tensor=True)
        if distance == 'cosine':
            scores = util.pytorch_cos_sim(ingredient_embedding, match_embeddings)[0]
            reverse = True
        else:
            pdist = PairwiseDistance(p=2)
            scores = pdist(ingredient_embedding, match_embeddings)
            reverse = False
    #     top_results = torch.topk(cos_scores, k=5)
        match_score= dict(zip(matches, scores.numpy()))
        match_score = sorted(match_score.items(), key = lambda x: x[1], reverse=reverse)
    return match_score[0:100]

In [262]:
matches = group10['title_lower'].values
match_embeddings = embedder.encode(matches, convert_to_tensor=True)

In [263]:
ing_matches['results_sorted'] = ing_matches.apply(lambda x: vector_space_score(x, 'cosine'), axis=1)
ing_matches_sorted = ing_matches[['ingredient', 'results_sorted']].copy()
ing_matches_sorted = ing_matches_sorted.explode('results_sorted').reset_index(drop=True)
ing_matches_sorted

Unnamed: 0,ingredient,results_sorted
0,large onion,"(white onion each, 0.6979088)"
1,large onion,"(red onion each, 0.6844281)"
2,large onion,"(yellow onion each, 0.67557836)"
3,large onion,"(organic green onion bunch , 0.6321119)"
4,large onion,"(white onions diced , 0.6200611)"
5,large onion,"(fritolay maui style onion , 0.6037526)"
6,large onion,"(green onions bag, 0.5986353)"
7,large onion,(sensible portions veggie puffs sour cream oni...
8,large onion,"(organic yellow onion bag, 0.588923)"
9,large onion,"(mccormick chopped onions , 0.5875834)"


In [198]:
def load_glove(filename):
    """
    Read all lines from the indicated file and return a dictionary
    mapping word:vector where vectors are of numpy `array` type.
    GloVe file lines are of the form:

    the 0.418 0.24968 -0.41242 0.1217 ...

    So split each line on spaces into a list; the first element is the word
    and the remaining elements represent factor components. The length of the vector
    should not matter; read vectors of any length.

    When computing the vector for each document, use just the text, not the text and title.
    """
    d = {}
    with open(filename, mode = 'r') as f:
        for line in f.readlines():
            elements = line.split(' ') 
            d[elements[0]] = np.array(elements[1:], dtype='float64')
    return d

In [206]:
def doc2vec(text, gloves):
    """
    Return the word vector centroid for the text. Sum the word vectors
    for each word and then divide by the number of words. Ignore words
    not in gloves.
    """
    text_words = text.split()
    word_count = 0
    centroid = [0]*300
    for word in text_words:
        if word in gloves.keys():
            centroid += gloves[word]
            word_count += 1
    if word_count > 0:
        centroid = centroid / word_count
    return centroid

In [200]:
def distances(ingredient_vector, match_vectors):
    """
    Compute the euclidean distance from article to every other article and return
    a list of (distance, a) tuples for all a in articles. The article is one
    of the elements (tuple) from the articles list.
    """
    distances_list = []
    for match_vector in match_vectors:
        diff = ingredient_vector - match_vector
        distance = np.sqrt(diff @ diff)
        distances_list.append(distance)
    return distances_list

In [203]:
def vector_space_score_glove(x):
#     matches = x[1]
    ingredient = x[0]
    match_score = {}
    if len(matches) > 0:
        ingredient_vector = doc2vec(ingredient, glove_vectors)
        pair_distances = distances(ingredient_vector, match_vectors)
        match_score = dict(zip(matches, pair_distances))
        match_score = sorted(match_score.items(), key = lambda x: x[1], reverse=False)
    return match_score[0:100]

In [202]:
glove_vectors = load_glove('/Users/mvellera/data/glove.6B.300d.txt')

In [207]:
# Glove
matches = group10['title_lower'].values
match_vectors = np.array([doc2vec(match, glove_vectors) for match in matches])
ing_matches['results_sorted'] = ing_matches.apply(vector_space_score_glove, axis=1)
ing_matches_sorted = ing_matches[['ingredient', 'results_sorted']].copy()
ing_matches_sorted = ing_matches_sorted.explode('results_sorted').reset_index(drop=True)
ing_matches_sorted

Unnamed: 0,ingredient,results_sorted
0,large onion,(triscuit garlic and onion with poppyseeds gra...
1,large onion,"(red onion each, 3.4233560149717426)"
2,large onion,(diced tomatoes with roasted garlic and onion ...
3,large onion,"(wise onion garlic potato chips , 3.5121070648..."
4,large onion,"(white onion each, 3.5206920514764586)"
5,large onion,(mccormick garlic onion black pepper sea salt ...
6,large onion,"(yellow onion each, 3.549252968042831)"
7,large onion,"(mezzetta caramelized onion and butter , 3.631..."
8,large onion,"(minced onion good gather™, 3.6529747520710134)"
9,large onion,"(frozen onion rings market pantry™, 3.69613671..."


In [209]:
def tokenizer(text):
    return list(set(text.split()))

In [210]:
from sklearn.feature_extraction.text import TfidfVectorizer
def compute_tfidf(corpus):
    """
    Create and return a TfidfVectorizer object after training it on
    the list of articles pulled from the corpus dictionary. Meaning,
    call fit() on the list of document strings, which figures out
    all the inverse document frequencies (IDF) for use later by
    the transform() function. The corpus argument is a dictionary 
    mapping file name to xml text.
    """
    tfidf = TfidfVectorizer(input='content',
                            analyzer='word',
                            tokenizer=tokenizer,
                            stop_words='english', # even more stop words
                            decode_error = 'ignore')
    tfidf.fit(corpus)
    return tfidf

In [212]:
from sklearn.metrics import pairwise_distances
def vector_space_score_tfidf(x):
#     matches = x[1]
    ingredient = x[0]
    match_score = {}
    if len(matches) > 0:
        ingredient_tfidf = tfidf_model.transform([ingredient])
        pair_distances = pairwise_distances(ingredient_tfidf, matches_tfidf)[0]
        match_score = dict(zip(matches, pair_distances))
        match_score = sorted(match_score.items(), key = lambda x: x[1], reverse=False)[0:100]
    return match_score

In [213]:
matches = group10['title_lower'].values
tfidf_model = compute_tfidf(matches)
matches_tfidf = tfidf_model.transform(matches)
ing_matches['results_sorted'] = ing_matches.apply(vector_space_score_tfidf, axis=1)
ing_matches_sorted = ing_matches[['ingredient', 'results_sorted']].copy()
ing_matches_sorted = ing_matches_sorted.explode('results_sorted').reset_index(drop=True)
ing_matches_sorted

Unnamed: 0,ingredient,results_sorted
0,large onion,"(white onion each, 0.9198240859639971)"
1,large onion,"(red onion each, 0.9205482523805198)"
2,large onion,"(yellow onion each, 1.0184477277173856)"
3,large onion,(pringles grab go large sour cream onion potat...
4,large onion,"(butternut large bread , 1.0800912388319948)"
5,large onion,"(onion powder good gather™, 1.089142554988756)"
6,large onion,(one potato two potato sweet onion potato chip...
7,large onion,"(quaker large rice cake everything , 1.1046773..."
8,large onion,"(organic yellow onion bag, 1.1076065704976434)"
9,large onion,"(mccormick onion powder , 1.1085673042071884)"


In [215]:
from sklearn.metrics import pairwise_distances
def vector_space_score_tfidf_ingredients(x):
    ingredient = x[0]
    match_score = {}
    if len(matches) > 0:
        ingredient_tfidf = tfidf_model.transform([ingredient])
        pair_distances = pairwise_distances(ingredient_tfidf, matches_tfidf)[0]
        match_score = dict(zip(group10['title_lower'].values, pair_distances))
        match_score = sorted(match_score.items(), key = lambda x: x[1], reverse=False)[0:100]
    return match_score

In [217]:
# TF-IDF using Ingredients
matches = [re.sub(r'[()\[\]]', '', str(value)) for value in group10['ingredients'].values]
tfidf_model = compute_tfidf(matches)
matches_tfidf = tfidf_model.transform(matches)
ing_matches['results_sorted'] = ing_matches.apply(vector_space_score_tfidf_ingredients, axis=1)
ing_matches_sorted = ing_matches[['ingredient', 'results_sorted']].copy()
ing_matches_sorted = ing_matches_sorted.explode('results_sorted').reset_index(drop=True)
ing_matches_sorted

Unnamed: 0,ingredient,results_sorted
0,large onion,"(dry large lima beans good gather™, 0.87415164..."
1,large onion,"(white onions diced , 1.1350909638359494)"
2,large onion,"(cut fruit express diced yellow onions , 1.224..."
3,large onion,"( bean soup starter good gather™, 1.2346634089..."
4,large onion,"(rib rack cajun spice rub , 1.3256610106054862)"
5,large onion,(frozen sauteed chopped onion cubes good gathe...
6,large onion,"(la preferida low fat refried beans , 1.331340..."
7,large onion,"(organic ketchup good gather™, 1.3355845747026..."
8,large onion,(true story organic thick cut oven roasted chi...
9,large onion,"(muir glen organic tomato sauce , 1.3383829516..."


In [218]:
import nltk
def tokenize(text):
    text = text.lower()
    text = re.sub('[' + string.punctuation + '0-9\\r\\t\\n]', ' ', text)
    tokens = nltk.word_tokenize(text)
    tokens = [w for w in tokens if len(w) > 2]  # ignore a, an, to, at, be, ...
    return tokens

In [219]:
def stemwords(words):
    """
    Given a list of tokens/words, return a new list with each word
    stemmed using a PorterStemmer.
    """
    stemmer = PorterStemmer()
    stemmed = [stemmer.stem(w) for w in words]
    return stemmed

In [220]:
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS
def tokenizer(text):
    tokens = tokenize(text)
    tokenized_words = [t for t in tokens if t not in ENGLISH_STOP_WORDS] 
    return stemwords(tokenized_words)

In [221]:
def compute_tfidf(corpus):
    """
    Create and return a TfidfVectorizer object after training it on
    the list of articles pulled from the corpus dictionary. Meaning,
    call fit() on the list of document strings, which figures out
    all the inverse document frequencies (IDF) for use later by
    the transform() function. The corpus argument is a dictionary 
    mapping file name to xml text.
    """
    tfidf = TfidfVectorizer(input='content',
                            analyzer='word',
                            tokenizer=tokenizer,
                            stop_words='english', # even more stop words
                            decode_error = 'ignore')
    tfidf.fit(corpus)
    return tfidf


In [228]:
from sklearn.metrics import pairwise_distances
def vector_space_score_tfidf_description(x):
    ingredient = x[0]
    match_score = {}
    if len(matches) > 0:
        ingredient_tfidf = tfidf_model.transform([ingredient])
        pair_distances = pairwise_distances(ingredient_tfidf, matches_tfidf)[0]
        match_score = dict(zip(group10['title_lower'].values, pair_distances))
        match_score = sorted(match_score.items(), key = lambda x: x[1], reverse=False)
    return match_score[0:100]

In [238]:
group10 = group10[~pd.isnull(group10['description']) & (group10['description'].str.len() > 10)]

In [239]:
matches = group10['description'].values
tfidf_model = compute_tfidf(matches)
matches_tfidf = tfidf_model.transform(matches)
ing_matches['results_sorted'] = ing_matches.apply(vector_space_score_tfidf_description, axis=1)
ing_matches_sorted = ing_matches[['ingredient', 'results_sorted']].copy()
ing_matches_sorted = ing_matches_sorted.explode('results_sorted').reset_index(drop=True)



In [240]:
ing_matches_sorted

Unnamed: 0,ingredient,results_sorted
0,large onion,"(mccormick onion powder , 1.066434686382491)"
1,large onion,"(mccormick chopped onions , 1.0827620005311156)"
2,large onion,"(sweet onions bag good gather™, 1.113913877425..."
3,large onion,"(minced onion good gather™, 1.114533143789374)"
4,large onion,"(yellow onions bag good gather™, 1.12114442091..."
5,large onion,"(bimbo mantecadas , 1.137435045003771)"
6,large onion,"(campbells condensed cream of onion soup , 1.1..."
7,large onion,"(onion powder good gather™, 1.1741414639355905)"
8,large onion,"(organic onion powder good gather™, 1.17854511..."
9,large onion,(frozen sauteed chopped onion cubes good gathe...
