In [2]:
import pandas as pd
import numpy as np

## 1 - Probability of Ingredient given a Dish (Top-Down)

## Dish Counts

In [23]:
train_images = open('datasets/Recipes5k/annotations/train_images.txt','r')
images_list = train_images.readlines()

In [30]:
dishes = [image.split('/')[0] for image in images_list]

In [37]:
dishes_df = pd.DataFrame(dishes, columns = ['Dish'])

In [56]:
dish_count = dishes_df.groupby(['Dish']).size().reset_index(name = 'count')

In [57]:
dish_count.to_csv('FoodAnalysis/dish_count.csv')

## Random dishes

In [134]:
dish_names = pd.read_csv('FoodAnalysis/dish_count.csv')

sel_dishes = dish_names.Dish[np.random.randint(0,101,size=10)]

dish_names[dish_names.Dish.isin(sel_dishes)][['Dish','count']].to_csv('FoodAnalysis/10_dish_count.csv', index = False)

In [135]:
sel_dish_count = pd.read_csv('FoodAnalysis/10_dish_count.csv')

## Specific dishes

##### Count the number of recipes for each dish

In [9]:
dish_names = pd.read_csv('FoodAnalysis/dish_count.csv')

# desserts = ['apple_pie', 'bread_pudding', 'carrot_cake', 'cannoli','cheesecake',
#            'chocolate_cake','chocolate_ice_cream','chocolate_mousse','churros',
#            'cupcakes','donuts','frozen_yogurt','macarons','panna_cotta',
#            'pancakes','red_velvet_cake','strawberry_shortcake','tiramisu','waffles']

# desserts
sel_dishes = ['apple_pie','carrot_cake','cheesecake','chocolate_mousse','tiramisu','panna_cotta',
            'waffles','red_velvet_cake','cupcakes','frozen_yogurt']

dish_names[dish_names.Dish.isin(sel_dishes)][['Dish','count']].to_csv('FoodAnalysis/10_dish_count.csv', index = False)

In [143]:
sel_dish_count = pd.read_csv('FoodAnalysis/10_dish_count.csv')

##### Dish - Ingredient Occurrences

In [144]:
#load file with dish-ingredient occurrences
#obtained with Prepr_2_Dish_Ingr_Occurrences
dish_ingr_occ = pd.read_csv('FoodAnalysis/dish_ingredient_occurrences.csv')

#select dishes
sel_dish_ingr_occ = dish_ingr_occ[dish_ingr_occ.Dish.isin(sel_dishes)]

We are using base ingredients, then the number of dish-base ingredient occurrences might be larger than the number of recipes for each dish. This is because in each recipe the same base ingredient could be used in more than one derived ingredient.

In [145]:
dishes_tbl_df = sel_dish_ingr_occ.merge(sel_dish_count, how = 'left', left_on='Dish' , right_on='Dish')

In [146]:
def Ingredient_Probability(df):
    
    if df['Occurrence']/df['count'] < 1:
        
        prob = df['Occurrence']/df['count']
        
    else:
        
        prob = 1
        
    return prob

Calculate the probability in a frequentist way

In [147]:
dishes_tbl_df['Ingredient Probability'] = dishes_tbl_df.apply(Ingredient_Probability, axis=1)

In [148]:
dishes_tbl_df = dishes_tbl_df[['Dish','Ingredient','Ingredient Probability']]

In [149]:
f = open('FoodAnalysis/dish_ingr_probabilities.txt','w')

f.write('Dish, Ingredient, Ingredient Probability\n')

for elem in dishes_tbl_df.values:
    
    dish_ingr = (elem[0],elem[1])
    f.write(str((dish_ingr, elem[2]))+'\n')
    
f.close()

## Probability of a Dish given an Ingredient (Bottom-Up)

In [125]:
dish_ingr_df = pd.read_csv('FoodAnalysis/dish_ingredient_relation.txt',sep=',',names=['Dish','Ingredient'])

In [154]:
dish_ingr_copy_df = dish_ingr_df.copy()
dish_ingr_copy_df = dish_ingr_copy_df[dish_ingr_copy_df.Dish.isin(sel_dishes)]
dish_ingr_copy_df['Occurrence'] = 1
#
Ingr_occurrences_df = dish_ingr_copy_df[['Ingredient','Occurrence']].groupby(['Ingredient']).count()
#
Dish_Ingr_occurrences_df = dish_ingr_copy_df[['Ingredient','Dish','Occurrence']].groupby(['Ingredient','Dish']).count()

In [155]:
Ingr_occurrences_df = Ingr_occurrences_df.rename(columns={"Occurrence":"Times Ingredient in the Ontology"})
Dish_Ingr_occurrences_df = Dish_Ingr_occurrences_df.rename(columns={"Occurrence":"Times Ingredient in the Recipe Dish"})

In [156]:
dish_ingr_df = dish_ingr_df[dish_ingr_df.Dish.isin(sel_dishes)]
dish_ingr_df = dish_ingr_df.drop_duplicates()
Ingr_Dish_df = dish_ingr_df[['Ingredient','Dish']]
Ingr_occurrence_ontology = Ingr_Dish_df.merge(Ingr_occurrences_df, how='left', left_on = 'Ingredient', right_index=True)
Ingr_Dish_occurrence_df = Ingr_occurrence_ontology.merge(Dish_Ingr_occurrences_df, how='left', left_on=['Ingredient','Dish'], right_index=True)

In [165]:
Ingr_Dish_occurrence_df[Ingr_Dish_occurrence_df.Ingredient == 'maple']

Unnamed: 0,Ingredient,Dish,Times Ingredient in the Ontology,Times Ingredient in the Recipe Dish
119,maple,apple_pie,1,1


In [158]:
def Dish_probability(df):
    
    prob = df['Times Ingredient in the Recipe Dish']/df['Times Ingredient in the Ontology']
    
    return prob

In [159]:
Ingr_Dish_Bottom_Up_Probability_df = Ingr_Dish_occurrence_df.copy()
Ingr_Dish_Bottom_Up_Probability_df['Probability Ingr-Dish bottom-up'] = Ingr_Dish_Bottom_Up_Probability_df.apply(Dish_probability,axis=1)

In [167]:
Ingr_Dish_Bottom_Up_Probability_df = Ingr_Dish_Bottom_Up_Probability_df[['Ingredient','Dish','Probability Ingr-Dish bottom-up']]
Ingr_Dish_Bottom_Up_Probability_df.to_csv('FoodAnalysis/ingredient_dish_bottom_up_probabilities.csv')

## 2 - Probability of Ingredients Coexistence

https://stackoverflow.com/questions/942543/operation-on-every-pair-of-element-in-a-list

In [105]:
#load file with dish-ingredient occurrences
dish_recipe_file = pd.read_csv('FoodAnalysis/tuple_dish_ingredients.txt', sep='|', header = None)
dish_recipe_file.columns = ['Dish','Ingredients']
# desserts
sel_dishes = ['apple_pie','carrot_cake','cheesecake','chocolate_mousse','tiramisu','panna_cotta',
            'waffles','red_velvet_cake','cupcakes','frozen_yogurt']

dish_recipe_file = dish_recipe_file[dish_recipe_file.Dish.isin(sel_dishes)]

In [86]:
dish_recipe_file.head()

Unnamed: 0,Dish,Ingredients
0,apple_pie,"['apple', 'margarine', 'flour', 'water', 'oats..."
1,apple_pie,"['nut', 'cinnamon', 'apple', 'sugar', 'flour',..."
2,apple_pie,"['apple', 'flour', 'sugar', 'salt', 'gin', 'ci..."
3,apple_pie,"['apple', 'sugar', 'cinnamon', 'lemon', 'flour..."
4,apple_pie,"['pie', 'apple', 'cinnamon', 'water', 'vanilla..."


In [87]:
#Convert each value of Ingredients column into a list
dish_recipe_file['Ingredients'] = dish_recipe_file.loc[:,'Ingredients'].apply(lambda x: x.replace('[','').replace(']',''))
dish_recipe_file['Ingredients'] = dish_recipe_file.loc[:,'Ingredients'].apply(lambda x: x.replace('\'','').replace(' ','').split(','))


In [88]:
dish_recipe_file = dish_recipe_file[dish_recipe_file.Dish.isin(sel_dishes)]

In [89]:
def all_possible_pair_excluding_duplicates_unique(list_of_values):
    
    result = [(list_of_values[p1], list_of_values[p2]) for p1 in range(len(list_of_values)) for p2 in range(p1+1,len(list_of_values)) if list_of_values[p1] != list_of_values[p2]]
    
    return result

In [90]:
dish_recipe_file['Ingredients'] = dish_recipe_file['Ingredients'].apply(all_possible_pair_excluding_duplicates_unique)

In [91]:
dish_recipe_file.head()

Unnamed: 0,Dish,Ingredients
0,apple_pie,"[(apple, margarine), (apple, flour), (apple, w..."
1,apple_pie,"[(nut, cinnamon), (nut, apple), (nut, sugar), ..."
2,apple_pie,"[(apple, flour), (apple, sugar), (apple, salt)..."
3,apple_pie,"[(apple, sugar), (apple, cinnamon), (apple, le..."
4,apple_pie,"[(pie, apple), (pie, cinnamon), (pie, water), ..."


In [92]:
col_expl = 'Ingredients'

df = dish_recipe_file

df_out = pd.DataFrame(
        {col: np.repeat(df[col].values,
                        df[col_expl].str.len())
         for col in df.columns
        }
    )

index = 0


for i in range(len(df)):
    for j in range(len(df[col_expl].values[i])):
            
             
        df_out.loc[index,col_expl] = tuple(sorted(df[col_expl].values[i][j]))
        index +=1

df_out.head()

Unnamed: 0,Dish,Ingredients
0,apple_pie,"(apple, margarine)"
1,apple_pie,"(apple, flour)"
2,apple_pie,"(apple, water)"
3,apple_pie,"(apple, oats)"
4,apple_pie,"(apple, sugar)"


In [99]:
df_out.Dish.unique()

array(['apple_pie', 'carrot_cake', 'red_velvet_cake', 'frozen_yogurt',
       'cheesecake', 'cupcakes', 'tiramisu', 'waffles', 'panna_cotta',
       'chocolate_mousse'], dtype=object)

In [106]:
#Functions

def all_possible_pair_excluding_duplicates_unique(list_of_values):
    
    result = [(list_of_values[p1], list_of_values[p2]) for p1 in range(len(list_of_values)) for p2 in range(p1+1,len(list_of_values)) if list_of_values[p1] != list_of_values[p2]]
    
    return result

def explode_ingredients(df, col_expl):
    #explode the ingredients of a recipe
    
    df[col_expl] = df.loc[:,col_expl].apply(lambda x: x.replace('[','').replace(']',''))
    df[col_expl] = df.loc[:,col_expl].apply(lambda x: x.replace('\'','').replace(' ','').split(','))
    
    df[col_expl] = df[col_expl].apply(all_possible_pair_excluding_duplicates_unique)
    
    df_out = pd.DataFrame(
        {col: np.repeat(df[col].values,
                        df[col_expl].str.len())
         for col in df.columns})

    index = 0


    for i in range(len(df)):
        for j in range(len(df[col_expl].values[i])):
             
            df_out.loc[index,col_expl] = tuple(sorted(df[col_expl].values[i][j]))
            index +=1
            
    return df_out

In [107]:
df_dish = explode_ingredients(dish_recipe_file,'Ingredients')

df_dish.head()

Unnamed: 0,Dish,Ingredients
0,apple_pie,"(apple, margarine)"
1,apple_pie,"(apple, flour)"
2,apple_pie,"(apple, water)"
3,apple_pie,"(apple, oats)"
4,apple_pie,"(apple, sugar)"


In [175]:
len(df_dish)

17109

In [114]:
df_dish.to_csv('FoodAnalysis/pair_ingredients.csv')

## Pair Ingredients

In [39]:
df_pair_ingredients = pd.read_csv('FoodAnalysis/pair_ingredients.csv')
df_pair_ingredients = df_pair_ingredients[['Dish','Ingredients']]
df_pair_ingredients.head()

Unnamed: 0,Dish,Ingredients
0,apple_pie,"('apple', 'margarine')"
1,apple_pie,"('apple', 'flour')"
2,apple_pie,"('apple', 'water')"
3,apple_pie,"('apple', 'oats')"
4,apple_pie,"('apple', 'sugar')"


#### 1 - Different pairs in a recipe

In [40]:
df_unique_pair_ingredients = df_pair_ingredients.drop_duplicates()
df_unique_pair_ingredients.loc[:,'count'] = 1
df_unique_pair_ingredients[['Dish','count']].groupby(['Dish']).count().rename(columns={"count":"different pairs"})
df_different_pair_ingredients = df_pair_ingredients[['Dish','Ingredients']].merge(df_unique_pair_ingredients[['Dish','count']].groupby(['Dish']).count().rename(columns={"count":"different pairs"}), how='left', left_on='Dish', right_index=True)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self.obj[key] = _infer_fill_value(value)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self.obj[item] = s


In [51]:
df_unique_pair_ingredients[['Dish','count']].groupby(['Dish']).count().rename(columns={"count":"different pairs"})

Unnamed: 0_level_0,different pairs
Dish,Unnamed: 1_level_1
apple_pie,285
carrot_cake,510
cheesecake,731
chocolate_mousse,176
cupcakes,666
frozen_yogurt,273
panna_cotta,439
red_velvet_cake,368
tiramisu,309
waffles,485


#### 2 - Times a pair appears in the recipe

In [56]:
df_final_ingredients = df_pair_ingredients
df_final_ingredients.loc[:,'count'] = 1
df_final_ingredients = df_different_pair_ingredients.merge(df_final_ingredients.groupby(['Dish','Ingredients']).count().rename(columns={"count":"times of the pair"}), how='left', left_on=['Dish','Ingredients'], right_index=True)
df_final_ingredients = df_final_ingredients.drop_duplicates()
df_final_ingredients.head(100)

Unnamed: 0,Dish,Ingredients,different pairs,times of the pair
0,apple_pie,"('apple', 'margarine')",285,3
1,apple_pie,"('apple', 'flour')",285,33
2,apple_pie,"('apple', 'water')",285,6
3,apple_pie,"('apple', 'oats')",285,1
4,apple_pie,"('apple', 'sugar')",285,30
5,apple_pie,"('apple', 'cornstarch')",285,6
6,apple_pie,"('apple', 'cinnamon')",285,38
7,apple_pie,"('flour', 'margarine')",285,1
8,apple_pie,"('margarine', 'water')",285,1
9,apple_pie,"('margarine', 'oats')",285,1


### Compute the probability

Compute the probability in a frequentist way

In [46]:
def coexistence_probability(df):
    
    df['coexistence_probability'] = df['times of the pair']/df['different pairs']
    
    return df

In [47]:
df_ingredients_coexistence_probability = df_final_ingredients.apply(coexistence_probability, axis=1)
df_ingredients_coexistence_probability = df_ingredients_coexistence_probability[['Dish','Ingredients','coexistence_probability']]
df_ingredients_coexistence_probability.head()

Unnamed: 0,Dish,Ingredients,coexistence_probability
0,apple_pie,"('apple', 'margarine')",0.010526
1,apple_pie,"('apple', 'flour')",0.115789
2,apple_pie,"('apple', 'water')",0.021053
3,apple_pie,"('apple', 'oats')",0.003509
4,apple_pie,"('apple', 'sugar')",0.105263


In [48]:
f = open('FoodAnalysis/dish_ingr_pair_coexistence_probability.txt','w')

f.write('Dish, Ingredients Pair, Coexistence Probability\n')

for elem in df_ingredients_coexistence_probability.values:
    
    f.write(str(((elem[0],elem[1]), elem[2]))+'\n')
    
f.close()

In [49]:
df_ingredients_coexistence_probability.Ingredients[0]

"('apple', 'margarine')"