In [2]:
import json
import math
import operator
from collections import Counter

import pandas as pd
from scipy import stats

In [3]:
# Load the recipe data set; with orient="columns" the top-level JSON keys become the DataFrame columns.
df = pd.read_json("train.json", orient="columns")

In [4]:
# Preview the first five rows of the freshly loaded frame.
df.head(n=5)

Unnamed: 0,cuisine,id,ingredients
0,greek,10259,"[romaine lettuce, black olives, grape tomatoes..."
1,southern_us,25693,"[plain flour, ground pepper, salt, tomatoes, g..."
2,filipino,20130,"[eggs, pepper, salt, mayonaise, cooking oil, g..."
3,indian,22213,"[water, vegetable oil, wheat, salt]"
4,indian,13162,"[black pepper, shallots, cornflour, cayenne pe..."


In [5]:
# Remove the recipe "id" column.
# errors="ignore" keeps this cell re-runnable: without it a second execution raises
# KeyError because "id" is already gone. A list replaces the set literal as the
# conventional list-like argument for `columns`.
df = df.drop(columns=["id"], errors="ignore")

In [6]:
# Preview the first five rows again, now without the "id" column.
df.head(n=5)

Unnamed: 0,cuisine,ingredients
0,greek,"[romaine lettuce, black olives, grape tomatoes..."
1,southern_us,"[plain flour, ground pepper, salt, tomatoes, g..."
2,filipino,"[eggs, pepper, salt, mayonaise, cooking oil, g..."
3,indian,"[water, vegetable oil, wheat, salt]"
4,indian,"[black pepper, shallots, cornflour, cayenne pe..."


In [7]:
# List the distinct cuisine labels in order of first appearance.
df["cuisine"].unique()

array(['greek', 'southern_us', 'filipino', 'indian', 'jamaican',
       'spanish', 'italian', 'mexican', 'chinese', 'british', 'thai',
       'vietnamese', 'cajun_creole', 'brazilian', 'french', 'japanese',
       'irish', 'korean', 'moroccan', 'russian'], dtype=object)

#### Count ingredients per cuisine and total ingredient occurrences

In [8]:
# Build two lookup tables in a single pass over the recipes:
#   cuisines[cuisine][ingredient] -> occurrences of the ingredient within that cuisine,
#                                    divided by the number of recipes of that cuisine
#   ingredients[ingredient]       -> occurrences of the ingredient across all recipes
cuisines = {}
ingredients = Counter()

# sort=False yields the cuisines in order of first appearance (the same order as
# df.cuisine.unique()) and rows keep their original order inside each group, so
# dict insertion order -- which later sorts rely on to break ties -- is preserved.
for cuisine, recipes in df.groupby("cuisine", sort=False)["ingredients"]:
    cuisine_count_ingredients = Counter()
    for recipe_ingredients in recipes:
        # An ingredient listed twice in one recipe is counted twice.
        cuisine_count_ingredients.update(recipe_ingredients)

    ingredients.update(cuisine_count_ingredients)

    i_rows = len(recipes)
    cuisines[cuisine] = {k: v / i_rows for k, v in cuisine_count_ingredients.items()}

# Hand a plain dict to the cells that follow.
ingredients = dict(ingredients)

In [9]:
# Inspect the normalized ingredient counts (occurrences / number of recipes) stored for Greek recipes.
cuisines["greek"]

{'romaine lettuce': 0.033191489361702124,
 'black olives': 0.026382978723404255,
 'grape tomatoes': 0.022127659574468085,
 'garlic': 0.18382978723404256,
 'pepper': 0.1727659574468085,
 'purple onion': 0.15829787234042553,
 'seasoning': 0.002553191489361702,
 'garbanzo beans': 0.01957446808510638,
 'feta cheese crumbles': 0.214468085106383,
 'ground pork': 0.003404255319148936,
 'finely chopped fresh parsley': 0.006808510638297872,
 'onions': 0.1574468085106383,
 'salt': 0.4868085106382979,
 'vinegar': 0.002553191489361702,
 'caul fat': 0.000851063829787234,
 'minced garlic': 0.05787234042553192,
 'dried oregano': 0.2272340425531915,
 'red wine vinegar': 0.08425531914893616,
 'olive oil': 0.428936170212766,
 'boneless chop pork': 0.000851063829787234,
 'lemon juice': 0.15574468085106383,
 'orange': 0.010212765957446808,
 'anise': 0.00425531914893617,
 'cinnamon sticks': 0.026382978723404255,
 'unflavored gelatin': 0.001702127659574468,
 'zinfandel': 0.001702127659574468,
 'orange bloss

In [10]:
# Inspect the normalized ingredient counts (occurrences / number of recipes) stored for Japanese recipes.
cuisines["japanese"]

{'sirloin': 0.0014054813773717498,
 'mirin': 0.28250175685172174,
 'yellow onion': 0.014757554462403373,
 'low sodium soy sauce': 0.044975404075895994,
 'water': 0.2691496837666901,
 'corn oil': 0.0021082220660576245,
 'sugar': 0.2817990161630358,
 'green onions': 0.12297962052002812,
 'glass noodles': 0.0021082220660576245,
 'sake': 0.20309205903021785,
 'shiitake': 0.04848910751932537,
 'napa cabbage': 0.019676739283204497,
 'melted butter': 0.0028109627547434997,
 'matcha green tea powder': 0.007027406886858749,
 'white sugar': 0.0421644413211525,
 'milk': 0.036542515811665496,
 'all-purpose flour': 0.04848910751932537,
 'eggs': 0.10119465917076599,
 'salt': 0.2965565706254392,
 'baking powder': 0.027406886858749122,
 'chopped walnuts': 0.0007027406886858749,
 'top round steak': 0.0007027406886858749,
 'vegetable oil': 0.17217146872803935,
 'soy sauce': 0.3942375263527758,
 'fresh asparagus': 0.0007027406886858749,
 'prawns': 0.008432888264230498,
 'rice flour': 0.005621925509486999

#### Ingredients sorted by number of occurrences

In [15]:
# Rank every ingredient by its total number of occurrences, most frequent first.
# `operator` is imported in the notebook's import cell rather than here, so later
# cells no longer depend on this cell having been executed first.
# Sorting ascending and then reversing (instead of reverse=True) keeps the
# existing order among ingredients that share the same count.
ingredient_counts_desc = sorted(ingredients.items(), key=operator.itemgetter(1))[::-1]
sorted_ingredients = [name for name, _count in ingredient_counts_desc]
sorted_ingredients[:50]

['salt',
 'olive oil',
 'onions',
 'water',
 'garlic',
 'sugar',
 'garlic cloves',
 'butter',
 'ground black pepper',
 'all-purpose flour',
 'pepper',
 'vegetable oil',
 'eggs',
 'soy sauce',
 'kosher salt',
 'green onions',
 'tomatoes',
 'large eggs',
 'carrots',
 'unsalted butter',
 'ground cumin',
 'extra-virgin olive oil',
 'black pepper',
 'milk',
 'chili powder',
 'oil',
 'red bell pepper',
 'purple onion',
 'scallions',
 'grated parmesan cheese',
 'sesame oil',
 'corn starch',
 'ginger',
 'baking powder',
 'jalapeno chilies',
 'dried oregano',
 'chopped cilantro fresh',
 'fresh lemon juice',
 'diced tomatoes',
 'fresh parsley',
 'minced garlic',
 'chicken broth',
 'sour cream',
 'cayenne pepper',
 'fresh ginger',
 'brown sugar',
 'cooking spray',
 'shallots',
 'garlic powder',
 'lime']

#### Total number of distinct ingredients

In [81]:
# Gather every ingredient mentioned in any recipe, de-duplicate through a set,
# and report how many distinct names there are.
s = list(set(ing for recipe in df["ingredients"] for ing in recipe))
print(len(s))

6714


#### Similarity between rankings

In [200]:
# Order the Greek (ingredient, normalized count) pairs from highest to lowest
# and keep just the ingredient names as the ranking.
sorted_greek = sorted(cuisines["greek"].items(), key=operator.itemgetter(1))
sorted_greek.reverse()
greek = [name for name, _share in sorted_greek]

In [201]:
# First ten entries of the Greek ranking (the list is ordered highest normalized count first).
greek[:10]

['salt',
 'olive oil',
 'dried oregano',
 'garlic cloves',
 'feta cheese crumbles',
 'extra-virgin olive oil',
 'fresh lemon juice',
 'ground black pepper',
 'garlic',
 'pepper']

In [202]:
# This ranking is built from cuisines["italian"], so it gets truthful names and no
# longer overwrites `sorted_greek` with non-Greek data.
sorted_italian = sorted(cuisines["italian"].items(), key=operator.itemgetter(1))[::-1]
italian = [x[0] for x in sorted_italian]

# Backward-compatible alias: later cells refer to this list as `japanese`.
# NOTE(review): either rename those references to `italian`, or switch the key
# above to "japanese", depending on which comparison was intended.
japanese = italian

In [203]:
# First ten entries of `japanese`.
# Note: this list was built from cuisines["italian"] in the preceding cell, so the
# names shown are the Italian ranking despite the variable name.
japanese[:10]

['salt',
 'olive oil',
 'garlic cloves',
 'grated parmesan cheese',
 'garlic',
 'ground black pepper',
 'extra-virgin olive oil',
 'onions',
 'water',
 'butter']

In [204]:
# Spearman rank correlation between the first 100 entries of the two rankings.
# Note: `japanese` was built from cuisines["italian"], so this compares Greek with Italian.
# NOTE(review): spearmanr yields Spearman's rho; "tau" is the usual name for Kendall's
# statistic -- consider renaming (the next cell reads `tau`).
# NOTE(review): both arguments are lists of ingredient names, not numeric ranks --
# confirm that correlating them position by position is the intended similarity measure.
tau, p_value = stats.spearmanr(greek[:100], japanese[:100])

In [205]:
# Show the correlation coefficient returned by the spearmanr call.
tau

0.112019201920192

In [366]:
# Build a cuisine-by-cuisine similarity matrix (list of rows, one row per cuisine in
# df.cuisine.unique() order). Each row is min-max normalized, has its diagonal
# zeroed, is sharpened with a power, and is finally rescaled to sum to 1.

TOP_N = 100          # number of top-ranked ingredients compared per cuisine
CONTRAST_POWER = 9   # exponent applied to enlarge visual differences

cuisine_names = list(df.cuisine.unique())

# Rank each cuisine's ingredients once (highest normalized count first) instead of
# re-sorting every cuisine inside the nested loop.
cuisine_ranks = {}
for name in cuisine_names:
    by_frequency = sorted(cuisines[name].items(), key=operator.itemgetter(1))[::-1]
    cuisine_ranks[name] = [ingredient for ingredient, _share in by_frequency]

cuisine_similarities = []
for c, cuisine_1 in enumerate(cuisine_names):
    cuisine_1_rank = cuisine_ranks[cuisine_1]

    # NOTE(review): the inputs are lists of ingredient names, not numeric ranks --
    # confirm that this is the intended similarity measure.
    similarities = [
        stats.spearmanr(cuisine_1_rank[:TOP_N], cuisine_ranks[cuisine_2][:TOP_N])[0]
        for cuisine_2 in cuisine_names
    ]

    # normalize to [0, 1]; the extremes are computed once per row
    lowest, highest = min(similarities), max(similarities)
    normalized = [(float(i) - lowest) / (highest - lowest) for i in similarities]

    # diagonal to 0 (a cuisine is not compared with itself)
    normalized[c] = 0.0

    # increase differences to enlarge visual differences
    increased_differences = [x**CONTRAST_POWER if x > 0 else x for x in normalized]

    # sum to 1
    total = sum(increased_differences)
    sum_to_one = [float(i) / total for i in increased_differences]

    cuisine_similarities.append(sum_to_one)


In [367]:
# Sanity check: every row was divided by its own sum, so this should equal 1 up to
# floating-point rounding. Index 15 corresponds to 'japanese' in the
# df.cuisine.unique() order shown earlier.
sum(cuisine_similarities[15])

0.9999999999999999

In [368]:
# Inspect one row of the similarity matrix; entries follow the df.cuisine.unique()
# order, and position 15 (the row's own cuisine) was set to 0 before rescaling.
cuisine_similarities[15]

[0.14697945561140044,
 0.0167648197373896,
 0.17084143431950932,
 0.10679660663116179,
 2.6503999830762403e-10,
 0.019785419649702162,
 0.00019891760413469028,
 0.12704620571281666,
 1.754375941295062e-06,
 0.0,
 0.00033461291592296256,
 0.005825830544688508,
 6.702271008196983e-06,
 0.08440349701185591,
 9.257276482428852e-08,
 0.0,
 0.000619783589527858,
 0.32025280129502165,
 5.067669876717994e-05,
 9.138919334699776e-05]

#### Save similarities

In [352]:
# Persist the similarity matrix (a list of per-cuisine rows) as JSON.
# `json` is imported in the notebook's import cell rather than at the bottom of the
# notebook; the encoding is stated explicitly instead of relying on the platform default.
with open('similarities.json', 'w', encoding='utf-8') as outfile:
    json.dump(cuisine_similarities, outfile)