In [2]:
import numpy as np
import pandas as pd
import sqlite3 as sl3

# Load Data From Database

In [3]:
# Open the SQLite file holding the recipe data; keep the connection and a
# cursor on it available for the cells that follow.
dbfile = "recipes.db"
conn = sl3.connect(database=dbfile)
cur = conn.cursor()

In [4]:
# Load every (ingredient description, recipe id) row of the
# 'new_ingredients' table into a DataFrame.
query = "SELECT new_description, recipe_id FROM new_ingredients"
all_ingr_df = pd.read_sql(sql=query, con=conn)

# Calculate Similarity Matrix
Construct a matrix of Jaccard similarities between each pair of unique ingredients. The similarity is based on the similarities/differences of the lists of recipes containing each ingredient (i.e. the more recipes that share two ingredients, and the fewer recipes unique to only one of the two, the higher the Jaccard similarity between them).

This matrix will be input to our clustering algorithm.

In [30]:
# Build one sub-frame per distinct ingredient, in order of first appearance:
# entry k holds the rows of all_ingr_df (and hence the recipe_ids) whose
# new_description equals the k-th unique ingredient.
ingredient_column = all_ingr_df['new_description']
recipes_list = [
    all_ingr_df[ingredient_column == ingredient]
    for ingredient in ingredient_column.unique()
]


In [31]:
def _recipe_id_set(recipes):
    """Return the distinct recipe ids held by `recipes` as a set."""
    # DataFrame-like input (what the notebook passes): read its recipe_id column.
    if hasattr(recipes, 'columns'):
        return set(recipes['recipe_id'])
    # Otherwise treat the input as a plain iterable of recipe ids.
    return set(recipes)


def get_jaccard_similarity(list1, list2):
    """Calculate the Jaccard similarity between two ingredients' recipe lists.

    The score measures how strongly two ingredients are associated: the
    number of distinct recipes containing both, divided by the number of
    distinct recipes containing at least one of them.

    input: two collections of recipes, one per ingredient. Each is either a
        DataFrame with a 'recipe_id' column or a plain iterable of
        recipe ids.
    output: a float Jaccard similarity in [0.0, 1.0]; 0.0 when neither
        ingredient appears in any recipe.
    """
    ids1 = _recipe_id_set(list1)
    ids2 = _recipe_id_set(list2)

    # Working on sets means an id repeated within one list is counted once,
    # so the score can never exceed 1.0 or be deflated by duplicates.
    union_count = len(ids1 | ids2)
    if union_count == 0:
        # Both inputs are empty: no evidence of association, and avoids 0/0.
        return 0.0

    in_both_count = len(ids1 & ids2)
    # 1.0 * forces float division under Python 2 as well.
    return 1.0 * in_both_count / union_count
        

In [32]:
# Build the square, symmetric matrix of pairwise Jaccard similarities.
# Row/column k corresponds to recipes_list[k]; the diagonal is left at 0.
n_ingredients = len(recipes_list)
similarity_matrix = [[0] * n_ingredients for _ in range(n_ingredients)]

# Score each unordered pair of ingredients once (upper triangle) and mirror
# the value into the lower triangle.
for row in range(n_ingredients):
    for col in range(row + 1, n_ingredients):
        pair_score = get_jaccard_similarity(recipes_list[row], recipes_list[col])
        similarity_matrix[row][col] = similarity_matrix[col][row] = pair_score


# Do clustering based on jaccard similarity

In [33]:
from sklearn.cluster import AffinityPropagation

In [34]:
# Cluster the ingredients with affinity propagation. With
# affinity='precomputed', fit() is given the pairwise similarity matrix
# directly (larger value = more alike), so pass the Jaccard
# similarity_matrix built earlier in the notebook.
af = AffinityPropagation(affinity='precomputed')
af.fit(similarity_matrix)

AffinityPropagation(affinity='precomputed', convergence_iter=15, copy=True,
          damping=0.5, max_iter=200, preference=None, verbose=False)

In [35]:
# Report how many clusters were found and which ingredients belong to each.
labels = af.labels_
labels_unique = np.unique(labels)
n_clusters = len(labels_unique)
# Single-argument print(...) produces the same text on Python 2 and Python 3.
print('%d Estimated clusters' % n_clusters)

# Position k in labels refers to the k-th unique ingredient, the same
# ordering used to build recipes_list; compute the name array only once.
ingredient_names = all_ingr_df['new_description'].unique()

# Iterate over the labels that actually occur rather than assuming they are
# exactly 0..n_clusters-1, so any other label value is still reported.
for cluster_id in labels_unique:
    member_indices = np.where(labels == cluster_id)[0]
    print('Cluster %d - count:%d - ingredients:' % (cluster_id, len(member_indices)))
    for index in member_indices:
        print('   %s' % ingredient_names[index])

25 Estimated clusters
Cluster 0 - count:6 - ingredients:
   cheddar cheese
   bread
   rosemary
   onion
   ranch
   honey
Cluster 1 - count:25 - ingredients:
   garbonzo beans
   olive oil
   lemon juice
   garlic
   salt
   cumin
   paprika
   tahini
   water
   basil
   parsley
   manzanilla olives
   black pepper
   white beans
   red onion
   baking soda
   zucchini
   beets
   eggplant
   egg
   arugula
   wasabi
   chipotle pepper
   hard-boiled egg
   sage
Cluster 2 - count:3 - ingredients:
   green olives
   black beans
   yogurt
Cluster 3 - count:3 - ingredients:
   portabello mushroom
   cream cheese
   serrano peppers
Cluster 4 - count:5 - ingredients:
   cayenne pepper
   cilantro
   jalapeno
   banana peppers
   orange juice
Cluster 5 - count:7 - ingredients:
   pumpkin
   balsamic vinegar
   cottage cheese
   flatbread
   brown sugar
   apple sauce
   bacon
Cluster 6 - count:4 - ingredients:
   sweet potato
   lemon zest
   white pepper
   coriander
Cluster 7 - count:3 -