In [64]:
# Imports

from collections import Counter
import json

In [56]:
# Constants

# Path of the JSON recipe dataset that the loading cell parses with json.load.
INPUT_FILEPATH = '../dataset/kaggle_recipes.json'
# Path of the JSON file that receives the ingredient graph built by this notebook.
OUTPUT_FILEPATH = '../output/graph_1.json'

In [5]:
# Read the raw recipe records from the JSON dataset.
# json_recipes ends up holding whatever json.load parses from the file; the cells
# below iterate over it and read the 'id', 'cuisine' and 'ingredients' keys of each item.
filepath = INPUT_FILEPATH
with open(filepath, 'rb') as dataset_file:
    json_recipes = json.load(dataset_file)

In [13]:
# Index the recipes by id: recipes[recipe id] -> {'cuisine': ..., 'ingredients': [...]}.
# Only the cuisine and the ingredient list of each raw record are kept.
recipes = dict(
    (entry['id'], {'cuisine': entry['cuisine'], 'ingredients': entry['ingredients']})
    for entry in json_recipes
)

In [30]:
# Build the ingredient index.
#
# ingredients maps an ingredient label to a record with:
#   'recipes'  - ids of the recipes listing the ingredient (one entry per listing)
#   'cuisines' - Counter {cuisine: number of listings in recipes of that cuisine}
#   'id'       - sequential numeric id, stored as a string

ingredients = {}
for recipe_id, recipe in recipes.items():  # for each recipe...
    cuisine = recipe['cuisine']
    for label in recipe['ingredients']:  # for each ingredient listed by the recipe...
        entry = ingredients.get(label)
        if entry is None:
            # First time this label is seen: the next free id is the current index size.
            entry = {'recipes': [], 'cuisines': Counter(), 'id': str(len(ingredients))}
            ingredients[label] = entry
        entry['recipes'].append(recipe_id)
        entry['cuisines'][cuisine] += 1

In [35]:
# Find connections between ingredients, i.e. ingredients sharing at least one recipe.
#
# ing2ingredients maps an ingredient id to a Counter
# {other ingredient id: number of times the two ingredients appear in the same recipe}.

ing2ingredients = {}
total = len(ingredients)
for i, (ing_label, ing) in enumerate(ingredients.items()):  # for each ingredient...
    if i % 1000 == 0:
        # Single-argument form prints the same text under Python 2 and Python 3.
        print('%d / %d' % (i, total))
    ing_id = ing['id']
    ing2ingredients[ing_id] = Counter()
    for r_id in ing['recipes']:  # for each recipe id associated to the current ingredient...
        for curr_ingredient in recipes[r_id]['ingredients']:  # for each ingredient of that recipe...
            # Compare label with label. Comparing against the record dict `ing` is
            # always unequal and would connect every ingredient to itself.
            if curr_ingredient != ing_label:
                ing_id_1 = ingredients[curr_ingredient]['id']
                ing2ingredients[ing_id][ing_id_1] += 1  # ...increase the counter of the connection

0 / 6714
1000 / 6714
2000 / 6714
3000 / 6714
4000 / 6714
5000 / 6714
6000 / 6714


In [74]:
# Create graph dict (output)
# Each item is an ingredient, identified by a numeric id (in string format); represents a graph node.
# Each item includes: edges, label, number of recipes containing the ingredient, main cuisine.

graph = {}
for ing_label, ing in ingredients.items():  # for each ingredient...
    ing_id = ing['id']
    curr_cuisines = ing['cuisines']
    graph[ing_id] = {
        # Materialise the neighbour ids as a list so the value is JSON-serialisable.
        'edges': list(ing2ingredients[ing_id]),
        'label': ing_label,
        # Count the recipe ids, not the fields of the ingredient record (len(ing) is
        # always 3). set() counts a recipe once even if it lists the ingredient twice.
        'n_recipes': len(set(ing['recipes'])),
        # Cuisine with the highest count; on ties max() keeps the first key met while
        # iterating the Counter, like the former values().index(max(...)) lookup.
        'main_cuisine': max(curr_cuisines, key=curr_cuisines.get),
    }

In [75]:
# Serialise the graph dict to the output JSON file.
# Keys are sorted and the output is indented by 2 spaces.
output_filepath = OUTPUT_FILEPATH
with open(output_filepath, 'wb+') as graph_file:
    json.dump(graph, graph_file, indent=2, sort_keys=True)

In [76]:
# Analyze main cuisine values - just for fun

# Tally how many ingredients have each cuisine as their main cuisine
main_cuisines = Counter(node['main_cuisine'] for node in graph.values())

# Turn the tally into (cuisine, count) pairs ordered by decreasing count
print("Number of ingredients associated to each cuisine type")
main_cuisines = main_cuisines.most_common()

# Print one "cuisine: count" line per cuisine
for cuisine_name, n_main in main_cuisines:
    print('%s: %s' % (cuisine_name, n_main))

Number of ingredients associated to each cuisine type
italian: 1257
mexican: 1115
southern_us: 810
chinese: 500
french: 495
indian: 477
japanese: 279
cajun_creole: 278
thai: 247
british: 169
greek: 169
korean: 128
irish: 126
filipino: 110
vietnamese: 108
russian: 108
spanish: 90
moroccan: 86
jamaican: 84
brazilian: 78
