In [2]:
import numpy as np
import pandas as pd
import sqlite3 as sl3

#Load Data From Database

In [3]:
dbfile = "recipes.db"
conn = sl3.connect(dbfile)
cur = conn.cursor()

In [4]:
# Read in 'ingredients' table
query = "SELECT new_description, recipe_id FROM new_ingredients"
all_ingr_df = pd.read_sql(query, conn)

#Calculate Distance Matrix
Construct a matrix of jaccard distance between each unique ingredient. The distance is based on the similarities/differences of the list of recipes containing each ingredient (i.e. the more recipes that share two ingredients, and the fewer recipes unique to only one of the two, the lower the jaccard distance between the them). 

This matrix will be input to our clustering algorithm.

In [6]:
# for each unique ingredient, generate a list of recipes containing that ingredient
recipes_list = list()
for ingredient in all_ingr_df['new_description'].unique():
    recipes_list.append(all_ingr_df[all_ingr_df['new_description']==ingredient])


In [7]:
def get_jaccard_distance(list1, list2):
    """Calculates jaccard distance between two recipe lists
    this serves as a score of how associated two ingredients are;
    how many recipies are they an ingredient of in common or in contrast
    input: 2 lists of recipe_id's for recipes containing each of two ingredients
    output: a float score of jaccard distance"""
    if len(list1) > len(list2):
        temp = list1
        list1 = list2
        list2 = temp
    
    in_both_count = 0
    
    # loop over every entry in the shorter list and count how
    # many of the recipes are shared between the two
    for recipe_id in list1.recipe_id:
        if len(list2[list2['recipe_id']==recipe_id]) > 0:
            in_both_count += 1
        
    jaccard_similarity = 1.0*in_both_count / (len(list1) + len(list2) - in_both_count)
    jaccard_distance = 1.0-jaccard_similarity
    
    return jaccard_distance
        

In [8]:
#construct the distance matrix

distance_matrix = [[0 for x in range(len(recipes_list))] for x in range(len(recipes_list))]

# loop over every combination of 2 ingredients and calculate
# the jaccard_distance between their recipe_lists
for i in range(len(recipes_list)):
    for j in range(i+1,len(recipes_list)):
        jaccard_distance = get_jaccard_distance(recipes_list[i],recipes_list[j])
        distance_matrix[i][j] = jaccard_distance
        distance_matrix[j][i] = jaccard_distance


In [12]:
np.shape(distance_matrix)

(129, 129)

# Clustering(mean_proportion, stddev_proportion, norm_counts)

In [1]:
from sklearn.cluster import AffinityPropagation

In [14]:
af = AffinityPropagation(affinity='precomputed')
af.fit(distance_matrix)

[17 12 12  0 20 16 24 19  1  9  9 12  2  5 22  4  4  5  4 19  3  3  4  8 15
 21  4  3 13  3  7  4 10 14  6 19  9 21  0 16 13  4  2 11  0 16 18  1  0  5
 21  0 12 20 13  5 18  6 18  7  8  9 24  7  5  2  4 10 11 12 24  9  8 12  2
  7 17  3 13  7 20 10 19  7 11 23  9 14 15 16  3  0  0 12  8  3  0 17 18 21
 14 12 22 22 21  9  5 11 19 14 18 20  7 17 10  2  0 21 17 21 16 22  0 23 24
  4 19 16 20]


In [24]:
labels = af.labels_
labels_unique = np.unique(labels)
n_clusters = len(labels_unique)
print n_clusters, 'Estimated clusters'

for i in range(len(labels_unique)):
    print 'Cluster %d - count:%d - ingredients:'%(i,len(labels[labels==i]))
    for index in np.where(labels==i)[0]:
        print '   %s'%all_ingr_df['new_description'].unique()[index]

25 Estimated clusters
Cluster 0 - count:10 - ingredients:
   apples
   cottage cheese
   banana peppers
   lemon zest
   olives
   flour
   egg
   curry paste
   red wine vinegar
   fennel seed
Cluster 1 - count:2 - ingredients:
   garlic
   sweet potato
Cluster 2 - count:5 - ingredients:
   paprika
   apple sauce
   maple syrup
   kalamata olives
   bay leaf
Cluster 3 - count:7 - ingredients:
   hot sauce
   artichoke hearts
   cream cheese
   manzanilla olives
   onion powder
   chili powder
   cocounut oil
Cluster 4 - count:9 - ingredients:
   tahini
   black beans
   water
   basil
   portabello mushroom
   rosemary
   brown sugar
   cinnamon
   honey
Cluster 5 - count:6 - ingredients:
   green olives
   pumpkin
   white pepper
   sesame seed oil
   beets
   cabbage
Cluster 6 - count:2 - ingredients:
   balsamic vinegar
   oregano
Cluster 7 - count:7 - ingredients:
   sesame seeds
   cucumber
   zucchini
   sweet chili sauce
   thyme
   ground beef
   ranch
Cluster 8 - count:4 - in