In [30]:
# Loading all relevant libraries
%matplotlib inline

import pandas as pd
import matplotlib
import matplotlib.pyplot as plt
import numpy as np
from collections import Counter
from sklearn.model_selection import train_test_split
import seaborn as sn
sn.set()

matplotlib.rc('font', **{'weight' : 'bold', 'size':10} )

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import FunctionTransformer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix, roc_curve, precision_score, recall_score
from sklearn import svm
import string
from nltk.stem import WordNetLemmatizer



In [36]:
# Helper function for normalizing strings
def normalize(s):
    """Normalize an ingredient string so that spelling variants compare equal.

    Every character that is not an ASCII letter or a space is dropped, the
    result is lowercased, and the whole string is passed through
    WordNetLemmatizer.lemmatize.

    Args:
        s: the raw ingredient string.

    Returns:
        The cleaned, lowercased, lemmatized string.
    """
    # Only ASCII letters and the space survive. Digits, punctuation and
    # non-ASCII characters are removed outright (not replaced by a space),
    # so e.g. "all-purpose" collapses to "allpurpose".
    PERMITTED_CHARS = "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ "

    lm = WordNetLemmatizer()

    # The previous `filter(... in string.printable ...)` call discarded its
    # result, and PERMITTED_CHARS is a strict subset of the printable
    # characters anyway, so this single pass is the only filtering needed.
    s = "".join(c for c in s if c in PERMITTED_CHARS)

    # NOTE(review): the whole string is lemmatized as one token, so
    # presumably only single-word names (or exact WordNet entries) change;
    # confirm whether per-word lemmatization was intended.
    s = lm.lemmatize(s.lower())

    return s

In [42]:
# Load the ingredient table, key it by ingredient name, then normalize the names
ing_df = pd.read_csv("ingredients.csv")
ing_df = ing_df.set_index('ingredient')
ing_df.index = ing_df.index.map(normalize)

# Preview: recipe counts summed per normalized name, most frequent first.
# NOTE(review): this aggregate is only displayed, never assigned; ing_df keeps
# one row per original name, so any names that normalize to the same string
# remain as duplicate index entries for the cells below.
(
    ing_df
    .groupby(ing_df.index.get_level_values(0))
    .sum()
    .sort_values(by='num_recipes', ascending=False)
)

Unnamed: 0_level_0,num_recipes
ingredient,Unnamed: 1_level_1
salt,18049
olive oil,7972
onion,7972
water,7457
garlic,7380
sugar,6434
garlic cloves,6237
butter,4848
ground black pepper,4785
allpurpose flour,4632


In [44]:
# Given an ingredient, return all subsets of the ingredient string as a list of strings
def get_substrings(ingredient):
    """Return every order-preserving, non-empty subset of the ingredient's words.

    The ingredient is split on single spaces; each result is a selection of
    those words, kept in their original order and re-joined with single
    spaces. The full ingredient is always the first element, followed by
    the shorter subsets (longest first). Each distinct string appears once.

    Args:
        ingredient: a space-separated ingredient name.

    Returns:
        A list of strings; an empty list if `ingredient` is the empty string.
    """
    # Local import keeps this cell self-contained.
    from itertools import combinations

    # An empty string has no words, hence no subsets
    if ingredient == '':
        return []

    word_list = ingredient.split(' ')
    substring_list = []
    seen = set()

    # combinations() selects words by position, so a repeated word is handled
    # correctly (list.remove() would always drop the first equal word), and
    # every subset is produced exactly once instead of once per removal order.
    for size in range(len(word_list), 0, -1):
        for kept_words in combinations(word_list, size):
            candidate = ' '.join(kept_words)
            # Repeated words can make different selections spell the same string
            if candidate not in seen:
                seen.add(candidate)
                substring_list.append(candidate)

    return substring_list

# Duplicate-free version of get_substrings
def get_substrings_deduped(substring_list):
    """Return the distinct word subsets of an ingredient as a sorted list.

    Despite its name, `substring_list` is the ingredient string itself; it is
    handed to get_substrings unchanged, and the resulting list is reduced to
    its unique values with np.unique (which also sorts them).
    """
    every_subset = get_substrings(substring_list)
    distinct_subsets = np.unique(every_subset)
    return distinct_subsets.tolist()

In [85]:
def broadmatch(ing, max_words=10, verbose=True):
    """Map an ingredient to its most frequently used known word subset.

    Every word subset of `ing` (from get_substrings_deduped) is looked up in
    the index of the global `ing_df`; the subset with the largest
    `num_recipes` value is returned.

    Args:
        ing: the (normalized) ingredient string to match.
        max_words: ingredients with more words than this are returned
            unchanged without any lookup; the number of word subsets grows
            exponentially with the word count.
        verbose: when True, print the subsets that matched a known
            ingredient together with their recipe counts.

    Returns:
        The best-matching known ingredient, or `ing` itself when it is too
        long or when none of its subsets is a known ingredient.
    """
    if len(ing.split(' ')) > max_words:
        return ing

    df = pd.DataFrame(get_substrings_deduped(ing), columns=['ingredient']).set_index('ingredient')
    # Left join on the index: subsets that are not known ingredients get NaN.
    # `df` has no columns of its own, so no column suffixes are needed.
    joined_df = df.join(ing_df)

    if verbose:
        # Parenthesized single-argument print behaves the same on Python 2 and 3
        print(joined_df[joined_df.num_recipes > 0])

    counts = joined_df['num_recipes']
    # Check explicitly for "nothing matched" rather than inspecting the type
    # of idxmax()'s result: on empty or all-NaN data idxmax returns NaN or
    # raises, depending on the pandas version.
    if not counts.notnull().any():
        return ing

    return counts.idxmax()

In [91]:
broadmatch("fermented bean curd")

                     num_recipes
ingredient                      
bean                       100.0
bean curd                   10.0
curd                        67.0
fermented bean curd          1.0


'bean'

In [45]:
# Broadmatch every known ingredient and collect [original, matched] pairs
norm_ing_arr = []
for i, original in enumerate(ing_df.index):
    # Match once and reuse the result for both the progress line and the
    # record; calling broadmatch twice doubled the work and its printed output.
    matched = broadmatch(original)
    print("%d: %s --> %s" % (i, original, matched))
    norm_ing_arr.append([original, matched])

In [48]:
# Tabulate the [original, broadmatched] pairs and preview the first 200 rows
norm_ing_df = pd.DataFrame(
    norm_ing_arr,
    columns=['Original Ingredient', 'Broadmatched Ingredient'],
)
norm_ing_df.head(n=200)

Unnamed: 0,Original Ingredient,Broadmatched Ingredient
0,salt,salt
1,olive oil,olive oil
2,onion,onion
3,water,water
4,garlic,garlic
5,sugar,sugar
6,garlic cloves,garlic
7,butter,butter
8,ground black pepper,ground black pepper
9,allpurpose flour,allpurpose flour


1970

In [None]:
# Persist the original -> broadmatched mapping; the positional row index is not written
norm_ing_df.to_csv(path_or_buf="broadmatched_ingredients.csv", index=False)