# Predicting cuisine by provided ingredients

Goal is to use provided recipe ingredients to classify the cuisine

## Part 1: Exploratory analysis

In [20]:
import pandas as pd
import os
import json
import string
import math
import itertools

Loading of whats-cooking dataset

In [21]:
curr_path = os.path.dirname(os.getcwd())
data_path = os.path.join(curr_path, 'whats-cooking\\data')
input_file = os.path.join(data_path, 'train.json')

In [22]:
with open(input_file) as file:
    data = json.load(file)
    df = pd.DataFrame(data)

In [23]:
cuisine_counts = dict(df['cuisine'].value_counts())

In [24]:
from collections import Counter

counters = {}
for cuis in cuisine_counts.keys():
    counters[cuis] = Counter()
    indices = (df['cuisine'] == cuis)
    for ingr in df[indices]['ingredients']:
        counters[cuis].update(ingr)

In [25]:
counters['italian']

Counter({'sugar': 760,
         'pistachio nuts': 7,
         'white almond bark': 1,
         'flour': 142,
         'vanilla extract': 219,
         'olive oil': 3111,
         'almond extract': 56,
         'eggs': 627,
         'baking powder': 186,
         'dried cranberries': 8,
         'chopped tomatoes': 37,
         'fresh basil': 787,
         'garlic': 1471,
         'extra-virgin olive oil': 1362,
         'kosher salt': 656,
         'flat leaf parsley': 588,
         'pimentos': 16,
         'sweet pepper': 7,
         'dried oregano': 626,
         'sharp cheddar cheese': 9,
         'pepper': 965,
         'swiss cheese': 7,
         'provolone cheese': 138,
         'canola oil': 41,
         'mushrooms': 184,
         'black olives': 67,
         'sausages': 58,
         'Italian parsley leaves': 74,
         'walnuts': 38,
         'hot red pepper flakes': 76,
         'fresh lemon juice': 471,
         'trout fillet': 3,
         'garlic cloves': 1619,
         'c

## Ingredients

We will look at the most common ingredients overally and at the most common ingredients for each cuisine:

In [26]:
ingredients_counts = {}

for recipe_ingr in df['ingredients']:
    for ingr in recipe_ingr:
        count = ingredients_counts.get(ingr.lower(), 0)
        ingredients_counts[ingr.lower()] = count + 2

In [27]:
ingredients_counts

{'romaine lettuce': 540,
 'black olives': 458,
 'grape tomatoes': 456,
 'garlic': 14760,
 'pepper': 8876,
 'purple onion': 3792,
 'seasoning': 274,
 'garbanzo beans': 296,
 'feta cheese crumbles': 716,
 'plain flour': 308,
 'ground pepper': 770,
 'salt': 36098,
 'tomatoes': 6116,
 'ground black pepper': 9570,
 'thyme': 722,
 'eggs': 6776,
 'green tomatoes': 216,
 'yellow corn meal': 682,
 'milk': 4526,
 'vegetable oil': 8770,
 'mayonaise': 1562,
 'cooking oil': 966,
 'green chilies': 1536,
 'grilled chicken breasts': 10,
 'garlic powder': 2884,
 'yellow onion': 2368,
 'soy sauce': 6592,
 'butter': 9696,
 'chicken livers': 130,
 'water': 14914,
 'wheat': 52,
 'black pepper': 5254,
 'shallots': 2954,
 'cornflour': 206,
 'cayenne pepper': 3046,
 'onions': 15944,
 'garlic paste': 564,
 'lemon juice': 2790,
 'chili powder': 4072,
 'passata': 48,
 'oil': 3940,
 'ground cumin': 5494,
 'boneless chicken skinless thigh': 686,
 'garam masala': 1850,
 'double cream': 80,
 'natural yogurt': 36,
 '

In [28]:
n_ingredients = len(ingredients_counts)
n_ingredients

6703

# Data preprocessing

Many ingredients have more than one name, like for example olive oil, soy sauce, garlic cloves, or some words use different words for describing the same ingredient like tometoes, sliced tomatoes, diced tomatoes, chopped tomatoes, plum tomatoes...

## Lower-casing

In [29]:
df['ingredients'] = df['ingredients'].apply(lambda i: list(map(lambda x: x.lower(), i)))

## Punctation removal

In [30]:
def remove_punctuations(text):
    for punctuation in string.punctuation:
        text = text.replace(punctuation, '')
    return text

In [31]:
df['ingredients'].apply(lambda i: list(map(lambda x: remove_punctuations(x), i)))

0        [romaine lettuce, black olives, grape tomatoes...
1        [plain flour, ground pepper, salt, tomatoes, g...
2        [eggs, pepper, salt, mayonaise, cooking oil, g...
3                      [water, vegetable oil, wheat, salt]
4        [black pepper, shallots, cornflour, cayenne pe...
5        [plain flour, sugar, butter, eggs, fresh ginge...
6        [olive oil, salt, medium shrimp, pepper, garli...
7        [sugar, pistachio nuts, white almond bark, flo...
8        [olive oil, purple onion, fresh pineapple, por...
9        [chopped tomatoes, fresh basil, garlic, extrav...
10       [pimentos, sweet pepper, dried oregano, olive ...
11       [low sodium soy sauce, fresh ginger, dry musta...
12       [italian parsley leaves, walnuts, hot red pepp...
13       [ground cinnamon, fresh cilantro, chili powder...
14       [fresh parmesan cheese, butter, allpurpose flo...
15       [tumeric, vegetable stock, tomatoes, garam mas...
16       [greek yogurt, lemon curd, confectioners sugar.

## Market basket analysis

In [32]:
def itemSetSupport(item_set, transactions):
    count = 0
    for transaction in transactions:
        if item_set.issubset(transaction):
            count = count + 1
    return count

In [33]:
def subset_check(to_add, L, k):
    for subset in itertools.combinations(L, k - 1):
        if subset not in L:
            return False
    return True

In [34]:
def apriori_gen(L, k):
    result = []
    for a in L:
        for b in L:
            if a != b and len(a.intersection(b)) == k - 2: 
                to_add = a.union(b)
                if subset_check(to_add, L, k):
                    result.append(to_add)
    return result

In [35]:
def apriori(ingr_counts, transactions, treshold):
    ingr_over_treshold = {k: v for k, v in ingr_counts.items() if v >= treshold}
    Ck_list = itertools.product(ingr_over_treshold, ingr_over_treshold)
    
    Ck = set(map(lambda x: frozenset(x), Ck_list))
    Lk = list(filter(lambda x: len(x) > 1 and itemSetSupport(x, transactions) >= treshold, Ck))
    
    result = Lk.copy()
    k = 3
    
    while True:
        Ck = apriori_gen(Lk, k)
        Lk = list(filter(lambda x: itemSetSupport(x, transactions) >= treshold, Ck))
        if len(Lk) > 0:
            result.append(Lk)
            k = k + 1
        else:
            break
    return result

In [36]:
def rules(items, transactions, rel_treshold):
    result = []
    items_support =  itemSetSupport(items, transactions)
    for i in range(1, len(items)):
        for comb in itertools.combinations(items, i):
            comb_set = set(comb)
            support = items_support / itemSetSupport(comb_set, transactions)
            if support >= rel_treshold:
                result.append((comb_set, items - comb_set, support, items_support / len(transactions), items_support))
    return result

In [37]:
def cuisineRules(cuisine, apriori_treshold, rel_treshold):
    cuisine_recipes = df[df['cuisine'] == cuisine]
    cuisine_transactions = list(map(lambda x: set(x), cuisine_recipes['ingredients']))
    
    ingr_result = apriori(counters[cuisine], cuisine_transactions, apriori_treshold)
    
    for s in ingr_result:
        rule_result = rules(s, cuisine_transactions, rel_treshold)
        for i in range(0, len(rule_result)):
            if 'salt' not in rule_result[i][1]:
                print_rule(rule_result[i])

In [38]:
def print_rule(rule_tuple):
    print(rule_tuple[0], ' -> ', rule_tuple[1], '; Rel = ', '{:05.2f}'.format(rule_tuple[2] * 100), '; Sup = ', '{:05.2f}'.format(rule_tuple[3] * 100), '(', rule_tuple[4], ')')

In [39]:
for cuisine in cuisine_counts.keys():
    print(cuisine)
    cuisineRules(cuisine, math.sqrt(cuisine_counts[cuisine]), 0.7)

italian
{'baking powder'}  ->  frozenset({'all-purpose flour'}) ; Rel =  82.80 ; Sup =  01.96 ( 154 )
mexican
{'onion powder'}  ->  frozenset({'garlic powder'}) ; Rel =  75.41 ; Sup =  02.14 ( 138 )
southern_us
{'onion powder'}  ->  frozenset({'garlic powder'}) ; Rel =  75.86 ; Sup =  02.04 ( 88 )
indian
{'green cardamom'}  ->  frozenset({'clove'}) ; Rel =  71.43 ; Sup =  01.83 ( 55 )
{'bay leaf'}  ->  frozenset({'onions'}) ; Rel =  75.42 ; Sup =  02.96 ( 89 )
chinese
{'flank steak'}  ->  frozenset({'corn starch'}) ; Rel =  76.39 ; Sup =  02.06 ( 55 )
{'napa cabbage'}  ->  frozenset({'soy sauce'}) ; Rel =  73.81 ; Sup =  02.32 ( 62 )
{'chicken broth'}  ->  frozenset({'soy sauce'}) ; Rel =  70.91 ; Sup =  04.38 ( 117 )
french
cajun_creole
{'celery ribs'}  ->  frozenset({'onions'}) ; Rel =  75.14 ; Sup =  08.41 ( 130 )
{'bell pepper'}  ->  frozenset({'onions'}) ; Rel =  76.47 ; Sup =  03.36 ( 52 )
{'onion powder'}  ->  frozenset({'garlic powder'}) ; Rel =  77.27 ; Sup =  04.40 ( 68 )
tha