In [1]:
# Header Files
import pandas as pd
import csv
import numpy as np
import scipy.sparse as sp
from numpy.linalg import norm
from collections import Counter, defaultdict
from scipy.sparse import csr_matrix
from nltk.stem import PorterStemmer
from nltk.tokenize import sent_tokenize, word_tokenize
from csv import DictReader
from efficient_apriori import apriori as eff_app
import ast
from mlxtend.preprocessing import TransactionEncoder
from nltk.stem import WordNetLemmatizer
from mlxtend.frequent_patterns import apriori
from mlxtend.frequent_patterns import association_rules
from itertools import chain
import nltk
import random
import pickle

### Step 1.  Data Preprocessing ###

In [2]:
#Function to stem and tokenize the recipes
def stem_tokens(tokens):
    lemmatizer = WordNetLemmatizer() 
    stemmed_items = []
    for items in tokens:
        stemmed_items.append(lemmatizer.lemmatize(items))
    return stemmed_items

def tokenize(text):    
    tokens = nltk.word_tokenize(text)
    stems = stem_tokens(tokens)
    return stems

In [3]:
#Load the kaggle dataset
# Create URL to JSON file (alternatively this can be a filepath)
url = 'data/train.json'

# Load the first sheet of the JSON file into a data frame
df = pd.read_json(url, orient='columns')

ingredients_1 = df['ingredients']
# View the first ten rows
train_list = ingredients_1.values.tolist()
print(train_list[0:10])

kaggle_cuisine_set = []
for ingr in ingredients_1:
    ingrStem = [' '.join(stem_tokens(tokenize(w))) for w in ingr]
    kaggle_cuisine_set.append(ingrStem)

[['romaine lettuce', 'black olives', 'grape tomatoes', 'garlic', 'pepper', 'purple onion', 'seasoning', 'garbanzo beans', 'feta cheese crumbles'], ['plain flour', 'ground pepper', 'salt', 'tomatoes', 'ground black pepper', 'thyme', 'eggs', 'green tomatoes', 'yellow corn meal', 'milk', 'vegetable oil'], ['eggs', 'pepper', 'salt', 'mayonaise', 'cooking oil', 'green chilies', 'grilled chicken breasts', 'garlic powder', 'yellow onion', 'soy sauce', 'butter', 'chicken livers'], ['water', 'vegetable oil', 'wheat', 'salt'], ['black pepper', 'shallots', 'cornflour', 'cayenne pepper', 'onions', 'garlic paste', 'milk', 'butter', 'salt', 'lemon juice', 'water', 'chili powder', 'passata', 'oil', 'ground cumin', 'boneless chicken skinless thigh', 'garam masala', 'double cream', 'natural yogurt', 'bay leaf'], ['plain flour', 'sugar', 'butter', 'eggs', 'fresh ginger root', 'salt', 'ground cinnamon', 'milk', 'vanilla extract', 'ground ginger', 'powdered sugar', 'baking powder'], ['olive oil', 'salt', 

In [6]:
#Load the Schmidt Data Set
# save np.load
np_load_old = np.load

# modify the default parameters of np.load
np.load = lambda *a,**k: np_load_old(*a, allow_pickle=True, **k)

with np.load('data/simplified-recipes-1M.npz') as data:
    recipes = data['recipes']
    ingredients = data['ingredients']

schmitd_set = [] #name this to recipe list
#counter = 0
for recipe in recipes:
    if(recipe.size) == 0:
        continue
    schmitd_set.append(ingredients[recipe])

print(len(schmitd_set))

TypeError: <lambda>() got multiple values for keyword argument 'allow_pickle'

In [7]:
print(len(schmitd_set))

1067556


In [4]:
#Remove 'bad' ingredients such as 'sliced, prepared, etc'
badingr = []

file1 = open('data/baddata.txt', 'r') 
Lines = file1.readlines() 
for line in Lines: 
    badingr.append(line.strip())

final_set_badwords= schmitd_set
final_clean_data = []
for recipe in final_set_badwords:
    temprecipe = []
    for ingr in recipe:
        if ingr in badingr:
            continue
        temprecipe.append(ingr)
    final_clean_data.append(temprecipe)

print(len(final_clean_data))

1067556


In [5]:
#Dividing data into training and test sets

final_training_data = [] #training set to run with apriori
final_test_data = []     #represents ground truth

for row in final_clean_data:   #each row is a recipe i.e list of ingredients
    if len(row) == 0:
        continue    
    test_data = row.pop(random.randrange(len(row)))   #pops out random ingredients from the row
    final_test_data.append(test_data)
    final_training_data.append(row)
    
print(len(final_test_data))
print(len(final_training_data))

1067535
1067535


### Step 2.  Association Rule Mining ###

In [7]:
# Efficient Appriori
itemsets, rules = eff_app(final_training_data, min_support=0.08,  min_confidence=0.08)
print("Rules =>", rules)
type(rules)

Rules => [{pepper} -> {black pepper}, {black pepper} -> {pepper}, {flour} -> {butter}, {butter} -> {flour}, {salt} -> {butter}, {butter} -> {salt}, {sugar} -> {butter}, {butter} -> {sugar}, {salt} -> {flour}, {flour} -> {salt}, {sugar} -> {flour}, {flour} -> {sugar}, {olive oil} -> {garlic}, {garlic} -> {olive oil}, {onion} -> {garlic}, {garlic} -> {onion}, {pepper} -> {garlic}, {garlic} -> {pepper}, {salt} -> {garlic}, {garlic} -> {salt}, {pepper} -> {olive oil}, {olive oil} -> {pepper}, {salt} -> {olive oil}, {olive oil} -> {salt}, {pepper} -> {onion}, {onion} -> {pepper}, {salt} -> {onion}, {onion} -> {salt}, {salt} -> {pepper}, {pepper} -> {salt}, {sugar} -> {salt}, {salt} -> {sugar}, {water} -> {salt}, {salt} -> {water}, {pepper, salt} -> {garlic}, {garlic, salt} -> {pepper}, {garlic, pepper} -> {salt}, {salt} -> {garlic, pepper}, {pepper} -> {garlic, salt}, {garlic} -> {pepper, salt}]


list

In [None]:
#pickle an object and save it to a file
pickle.dump(rules, open("pickledRules", "wb"))

#reconstruct the pickled object
reconstructedRules = pickle.load(open("pickledRules", "rb"))
print(type(reconstructedRules[0]))

### Step 3.  Calculating Precision ###

In [None]:
#accessing the rules
testIngrList = ['salt']

fileObj =  open("pickledRules", "rb")
newRules = pickle.load(fileObj)

topKFinding = []

#for testIngr in testIngrList:
for r in newRules:
    if set(testIngrList) == set(list(r.lhs)):
            #Add Tuple of RHS and confidence to topKFinding
            topKFinding.append((list(r.rhs), r.confidence))

#Sort top k ingredients mapped from ARM in decreasing order of confidence
topKFinding.sort(key = lambda x: -x[1]) 

k = 5

topKFinding = topKFinding[0:k]
print(topKFinding)

In [None]:
index_found = 0
map_ingr = {}

# Counts no of times an ingr is found in ground truth
for top_ingr in topKFinding:
    map_ingr[top_ingr[0][0]] = 0

for recipe in final_training_data:    
    if set(testIngrList).issubset(set(recipe)):
        #Checks if ingredient found in ground truth is actually what we predicted
        if final_test_data[index_found] in map_ingr:
            #print(final_test_data[index_found])
            #if yes then increase count            
            map_ingr[final_test_data[index_found]] += 1
    index_found += 1

print(map_ingr)

In [None]:
#Calucating ARHR
k = 1
hits = 0
summation = 0

for top_ingr in topKFinding:
    # if there was a hit for rank k-th item
    if map_ingr[top_ingr[0][0]] > 0:     
        summation += 1.0 / k 
        hits += 1
    k += 1

arhr = summation/hits
print(arhr)

In [None]:
#Calucating Average Precision
k = 1
hits = 0
summation = 0

for top_ingr in topKFinding:
    # if there was a hit for rank k-th item
    if map_ingr[top_ingr[0][0]] > 0:     
        #print(hits + 1, k)
        summation += (hits + 1) / k 
        hits += 1
    k += 1

ap = summation/hits
print(ap)

In [17]:
ingredientFrequency = pickle.load(open("pickledIDX005sc.pkl", "rb"))
#ingredientFrequency = reconstructPickleObject(pickledCounterObject)
print(list(ingredientFrequency.keys()))

['basil leaves', 'focaccia', 'mozzarella', 'pesto', 'plum tomatoes', 'rosemary', 'sandwiches', 'tomatoes', 'balsamic vinegar', 'boiling water', 'butter', 'cooking spray', 'crumbled gorgonzola', 'currants', 'gorgonzola', 'grated orange', 'kosher salt', 'orange rind', 'parsley', 'pine nuts', 'polenta', 'toasted', 'vinegar', 'water', 'bottle', 'bouillon', 'carrots', 'celery', 'chicken bouillon', 'cilantro', 'clam juice', 'cloves', 'fish', 'garlic', 'medium shrimp', 'olive oil', 'onion', 'pepper', 'red pepper flakes', 'salt', 'sherry', 'shrimp', 'stewed tomatoes', 'white wine', 'grand marnier', 'kahlua', 'black pepper', 'coarse sea salt', 'fresh lemon', 'fresh lemon juice', 'lemon juice', 'lime', 'lime peel', 'mayonaise', 'sea salt', 'shallots', 'sherry wine', 'sherry wine vinegar', 'wine vinegar', 'blue cheese', 'buttermilk', 'cheese', 'chives', 'cider vinegar', 'cracked black pepper', 'ricotta', 'ricotta cheese', 'roasted', 'roasted garlic', 'worcestershire sauce', 'almonds', 'basmati', 