In [1]:
import pandas as pd
import numpy as np
import tqdm
import json
import networkx as nx
import matplotlib.pyplot as plt
import collections

### Load data

In [2]:
#recipes: cleaned recipes produced by an earlier step of the pipeline
#(context managers close the files instead of leaking the handles)
with open("./../generated/high_score_repr_recipes.json") as recipes_file :
    cleaned_recipes = json.load(recipes_file)

#mapping: ingredient -> USDA id
with open("./../generated/ing_usda_mapping_high_score.json") as mapping_file :
    ing_mapping = json.load(mapping_file)

#group description: the file is '^'-separated and has no header row
columns = ["food_group_id", "food_group_name"]
food_groups = pd.read_csv("./../data/usda/FD_GROUP.txt", sep="^", encoding="ISO-8859-1", names=columns, header=None)

#id mapping: only the first two fields of FOOD_DES.txt are read
columns = ["food_id", "food_group_id"]
use_cols = [0, 1]
food_des = pd.read_csv("./../data/usda/FOOD_DES.txt", sep="^", encoding="ISO-8859-1", names=columns, usecols=use_cols, header=None)

### Create food groups dictionaries

In [3]:
#useful ids (one USDA id per mapped ingredient, duplicates kept)
usda_ids = list(ing_mapping.values())

#food_id -> food_group_id lookup, built once instead of re-filtering food_des
#for every id; drop_duplicates keeps the first row of each food_id, i.e. the
#same row a `.values[0]` selection on the filtered frame returns
group_by_food_id = food_des.drop_duplicates("food_id").set_index("food_id")["food_group_id"].to_dict()

#matching ids dict
ids_dict = dict()
for usda_id in usda_ids :
    if usda_id not in group_by_food_id :
        #fail with an explicit message rather than an opaque indexing error
        raise KeyError("USDA id {!r} not found in FOOD_DES.txt".format(usda_id))
    matching_group = int(group_by_food_id[usda_id])
    ids_dict[usda_id] = matching_group

#the context manager guarantees the file is flushed and closed
with open("./../generated/food_to_group_high_score.json", 'w') as ids_file :
    json.dump(ids_dict, ids_file)

#food groups description ids: first column -> second column of food_groups
group_des = dict()
for entry in food_groups.values :
    group_des[entry[0]] = entry[1]

with open("./../generated/group_des.json", 'w') as group_des_file :
    json.dump(group_des, group_des_file)

### Generate the recipes' food group vectors

In [4]:

# Ascending list of the food group ids; a group's position in this list is
# its position in every count vector built below.
food_groups_index = list(food_groups['food_group_id'])
food_groups_index.sort()

# Group names in the same order as food_groups_index.
food_groups_names = []
for fg_id in food_groups_index :
    food_groups_names.append(group_des[fg_id])

def recipe_to_ids(recipe) :
    """Return the integer ids carried by the 'usda_id=' entries of a recipe.

    Entries whose first 8 characters are not 'usda_id=' are skipped; for the
    others, the text after the prefix is converted with int(). Order is kept.
    """
    prefix = 'usda_id='
    prefix_len = len(prefix)

    found_ids = []
    for ingredient in recipe :
        if ingredient[:prefix_len] != prefix :
            continue
        found_ids.append(int(ingredient[prefix_len:]))

    return found_ids
          
def ids_to_food_groups(ids) :
    """Look up every id of `ids` in the module-level ids_dict, keeping order.

    Raises KeyError for an id that is not a key of ids_dict.
    """
    groups = []
    for food_id in ids :
        groups.append(ids_dict[food_id])
    return groups

def food_groups_to_vector(food_groups) :
    """Count how many times each food group id appears in `food_groups`.

    The result has one slot per entry of the module-level food_groups_index,
    in the same order. An id absent from food_groups_index raises ValueError
    (from list.index).
    """
    counts = [0 for _ in food_groups_index]

    for group_id in food_groups :
        slot = food_groups_index.index(group_id)
        counts[slot] = counts[slot] + 1

    return counts

def recipe_to_vector(recipe) :
    """Turn a recipe into its food group count vector.

    Chains recipe_to_ids -> ids_to_food_groups -> food_groups_to_vector.
    """
    usda_ids = recipe_to_ids(recipe)
    group_ids = ids_to_food_groups(usda_ids)
    return food_groups_to_vector(group_ids)


#build one count vector per recipe (tqdm only displays a progress bar)
recipe_rows = []
for r in tqdm.tqdm(cleaned_recipes) :
    recipe_rows += [recipe_to_vector(r)]

#convert to a numpy array once every row has been collected
vectors_recipes = np.array(recipe_rows)

100%|██████████| 96529/96529 [00:00<00:00, 140529.75it/s]


## vectors study

#### basic statistics

We first study the occurrences of the food groups.

In [8]:
# (total count, group name) pairs, ordered by increasing total count
group_totals = vectors_recipes.sum(axis=0)
sorted(zip(group_totals, food_groups_names), key=lambda pair : pair[0])

[(547, 'Breakfast Cereals'),
 (1064, 'Restaurant Foods'),
 (1111, 'Baby Foods'),
 (1334, 'Lamb, Veal, and Game Products'),
 (1621, 'Meals, Entrees, and Side Dishes'),
 (1781, 'Fast Foods'),
 (2851, 'American Indian/Alaska Native Foods'),
 (3045, 'Finfish and Shellfish Products'),
 (3753, 'Poultry Products'),
 (3753, 'Beef Products'),
 (4067, 'Pork Products'),
 (6859, 'Snacks'),
 (10657, 'Nut and Seed Products'),
 (11018, 'Sausages and Luncheon Meats'),
 (23411, 'Legumes and Legume Products'),
 (25105, 'Soups, Sauces, and Gravies'),
 (29199, 'Beverages'),
 (31180, 'Cereal Grains and Pasta'),
 (31381, 'Baked Products'),
 (36060, 'Sweets'),
 (37571, 'Fruits and Fruit Juices'),
 (53487, 'Fats and Oils'),
 (99070, 'Dairy and Egg Products'),
 (145332, 'Spices and Herbs'),
 (169324, 'Vegetables and Vegetable Products')]

> As expected, spices and vegetables are the most commonly used ingredients. We also notice that snacks are used quite often, which is rather surprising.

Now, what is the mean recipe composition?

In [12]:
# (mean count per recipe, group name) pairs, ordered by increasing mean
group_means = vectors_recipes.mean(axis=0)
mean_ings = sorted(zip(group_means, food_groups_names), key=lambda pair : pair[0])

# summing the per-group means gives the mean number of ingredients per recipe
print("mean number of ingredients : ", sum(pair[0] for pair in mean_ings))
mean_ings

mean number of ingredients :  7.609951413564835


[(0.00566669083902247, 'Breakfast Cereals'),
 (0.01102259424628868, 'Restaurant Foods'),
 (0.011509494556040153, 'Baby Foods'),
 (0.013819681132095018, 'Lamb, Veal, and Game Products'),
 (0.01679288089589657, 'Meals, Entrees, and Side Dishes'),
 (0.01845041386526329, 'Fast Foods'),
 (0.029535165597903222, 'American Indian/Alaska Native Foods'),
 (0.031544924323260365, 'Finfish and Shellfish Products'),
 (0.0388795077127081, 'Poultry Products'),
 (0.0388795077127081, 'Beef Products'),
 (0.04213241616509028, 'Pork Products'),
 (0.07105636648053952, 'Snacks'),
 (0.11040205534088202, 'Nut and Seed Products'),
 (0.11414186410301567, 'Sausages and Luncheon Meats'),
 (0.2425281521615266, 'Legumes and Legume Products'),
 (0.26007728247469675, 'Soups, Sauces, and Gravies'),
 (0.30248940732836765, 'Beverages'),
 (0.32301173740533934, 'Cereal Grains and Pasta'),
 (0.32509401319810627, 'Baked Products'),
 (0.37356649297102423, 'Sweets'),
 (0.3892198199504812, 'Fruits and Fruit Juices'),
 (0.554102

These results are interesting, but by aggregating all the recipes together we lose the ability to distinguish between recipe categories. We now try to discover hidden recipe composition patterns by clustering the recipes.

#### clustering