# Recipe Recommender Capstone Project

**Task:** Given a recipe from the existing data set, suggest another recipe by replacing n (random) ingredients. The new recipe should:
- “make sense” according to certain standards of taste
- not be a subset of ingredients of an existing recipe in the dataset

T-shirt bonus:
- Suggest a reasonable vegetarian variant of a meat recipe. This would require some form of clustering to recognize food categories such as vegetables, pulses, meats, etc. 

## Data Cleaning

The data has been scraped from http://allrecipes.com/. It consists of over 7K recipes and 2.5M reviews.

In [546]:
# Standard library
import io
import json
import re
from collections import Counter

# Third-party
import numpy as np
from bokeh.models import NumeralTickFormatter, ColumnDataSource, LabelSet
from bokeh.plotting import figure, output_notebook, show
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction import DictVectorizer
from sklearn.feature_extraction.text import CountVectorizer

# Lemmatizer applied to every ingredient word that survives filtering.
wnl = WordNetLemmatizer()

# Counts word 1- to 3-grams, ignoring English stop words.
vectorizer = CountVectorizer(ngram_range=(1, 3), min_df=1, stop_words='english')

# Send Bokeh output to the notebook.
output_notebook()

### Removing the Unrelated Words from Ingredients

1. Quantities, units of measure, and descriptive words are filtered out of the ingredients.  
2. Each remaining word is lemmatized.

In [491]:
# Words that appear in ingredient lines without naming an ingredient: units,
# quantities, preparation and packaging terms, and filler words.
# clean_ingredients drops every word found in this list. Each word is listed
# once; the empty string is included so empty tokens are rejected as well.
unrelated = ["","a","an",'teaspoon','oz','cup','tablespoon','gram','kilogram','kg','pound',
             'milliliter','ml','envelope','substitute','chip',
             'ounce','tsp','tbl','tb','tbsp','pint','pt','lb','liter','fluid',"halves","inch",
             "skinless", "boneless","pounded","thickness","thick","cooking","cubed","instant",
             "to","from","unsalted","pinch","chopped",'fresh', 'flat', 'leaf',"packages",
             "minced","cloves","pinched","sliced","tablespoons","taste","all","purpose",
             'teaspoons', 'beaten', 'cups', 'for', 'frying', 'or', 'as', 'needed' ,"ground",
             "large","small","big","cut","half","finely","peeled","cooked","lightly",
             "in","and","kosher","halal","ounces","can","freshly","crushed","dried",
             "slices","slice","diced",'into', 'cubes','divided',"melted","frozen","deveined",
             "optional","cans","canned","grated","pounds","lean","packed","mashed","overripe",
             'quarters', 'vertically','package','tops', 'removed', 'seeded','softened','extra',
             "bottle","condensed","of","style","heavy","torn","piece","shredded","pieces",
             "stewed","chunks","chunk","mix","whole","clove","light","assorted","s","with",
             "food","jar","spray","toasted",'favorite', 'chop','bite',
             'chuck','cleaned','container','covered','degree','delicious','double',
             'medium','prepared',
             'preserve','quick','refrigerated','rinsed','roast','rolled','room','stew',
             'temperature','plus','packet','pack',
             'trimmed','unwrapped','warm','flavored','link','sized','bulk','low',
             'high','sifted','square','thinly','drained','halved',
             'cube','concentrate','crumb','crumbled','warmed','partially',
             'portion','dissolved','halve','skinned','thin','deboned','boiled',
             'butterflied','cooled','more','defrosted','size','quartered'
            ]

def is_numeric(c):
    """
    Tell whether a string can be parsed as a floating point number.
    Args:
        c (str): string to check
    Returns:
        True if float(c) succeeds, False if it raises ValueError
    """
    try:
        float(c)
        return True
    except ValueError:
        return False
    
def clean_ingredients(ingredients):
    """
    Reduce one ingredient line to the lemmatized words that name the ingredient.

    The line is split on runs of non-word characters and lower-cased. Empty
    tokens, tokens listed in ``unrelated`` and numeric tokens are dropped, and
    each remaining token is lemmatized with ``wnl``.

    Args:
        ingredients (str): a line from a recipe's ingredient list
    Returns:
        list of str: the remaining words in their original order; the list is
        empty when every word of the line is filtered out
    """
    # Raw string: "\W" in a plain string literal is an invalid escape sequence.
    tokens = re.split(r"\W+", ingredients)
    # NOTE(review): a word is tested against `unrelated` before it is
    # lemmatized, so an inflected form whose lemma is in `unrelated` is kept.
    return [
        wnl.lemmatize(token.strip())
        for token in map(str.lower, tokens)
        if token and token not in unrelated and not is_numeric(token)
    ]


In [492]:
# Read the recipes. The file holds one JSON object per line; besides
# 'ingredients', each record carries 'name', 'num_ratings', 'categories',
# 'id' and 'rating' fields, which are not needed here.
ingredients_dic = []
# NOTE(review): UTF-8 is assumed for the data file; confirm against the scraper.
with open("data/recipes.data", "r", encoding="utf-8") as f:
    for line in f:
        if not line.strip():
            # Tolerate blank lines instead of failing in the JSON parser.
            continue
        recipe = json.loads(line)
        # One list of cleaned words per raw ingredient line.
        ingredients = [clean_ingredients(raw) for raw in recipe['ingredients']]
        # Keep ingredients described by one to three words. The lower bound
        # drops lines whose words were all filtered out, which would otherwise
        # turn into an empty-string ingredient once the words are joined.
        ingredients_filtered = [words for words in ingredients if 0 < len(words) < 4]
        # Recipes left without any usable ingredient are skipped.
        if ingredients_filtered:
            ingredients_dic.append(ingredients_filtered)

## Ingredient Analysis

Total number of recipes:

In [493]:
len(ingredients_dic)  # recipes that kept at least one ingredient after filtering

7048

In [494]:
# Inspect the nesting: list of recipes -> list of ingredients -> list of words.
ingredients_dic

[[['garlic'],
  ['salt'],
  ['parsley'],
  ['butter'],
  ['chicken', 'breast'],
  ['salt', 'pepper'],
  ['flour'],
  ['salt'],
  ['egg'],
  ['panko', 'bread', 'crumb'],
  ['cayenne', 'pepper'],
  ['vegetable', 'oil']],
 [['flour', 'coating'],
  ['salt'],
  ['black', 'pepper'],
  ['oregano'],
  ['chicken', 'breast'],
  ['butter'],
  ['olive', 'oil'],
  ['mushroom'],
  ['marsala', 'wine'],
  ['sherry']],
 [['green', 'bell', 'pepper'],
  ['red', 'bell', 'pepper'],
  ['olive', 'oil'],
  ['pork'],
  ['onion'],
  ['salt', 'pepper'],
  ['dry', 'white', 'wine'],
  ['tomato', 'puree'],
  ['feta', 'cheese'],
  ['white', 'rice'],
  ['raisin'],
  ['pine', 'nut'],
  ['parsley']],
 [['brussels', 'sprout'],
  ['sweet', 'potato'],
  ['red', 'onion'],
  ['garlic'],
  ['dry', 'mustard', 'powder'],
  ['smoked', 'paprika'],
  ['brown', 'sugar'],
  ['cayenne', 'pepper'],
  ['salt'],
  ['black', 'pepper'],
  ['olive', 'oil'],
  ['rosemary']],
 [['bacon'],
  ['shallot'],
  ['chicken', 'breast'],
  ['curry', 

In [495]:
# Flatten all recipes into one list holding a space-joined name per ingredient
# entry. NOTE(review): an ingredient listed several times in one recipe is
# counted once per listing.
ingredients = [
    " ".join(words)
    for recipe_ingredients in ingredients_dic
    for words in recipe_ingredients
]

In [496]:
# Vectorizer used to turn the ingredient -> count mapping into a count matrix.
vec = DictVectorizer()

In [497]:
# Bag of words over the ingredient names.
# NOTE(review): a single Counter is passed rather than a list of dicts;
# presumably it is vectorized as one sample - confirm against DictVectorizer.
X = vec.fit_transform(Counter(ingredients))
a = X.toarray()  # dense numpy array
word_counts = a.sum(axis=0)  # total count per feature (column)
sorted_indicies = np.argsort(word_counts)[::-1]  # feature indices, most frequent first
word_counts = word_counts.astype(int)
# get_feature_names() was removed in scikit-learn 1.2; prefer its replacement
# and fall back only on releases that predate get_feature_names_out().
if hasattr(vec, "get_feature_names_out"):
    vocabulary = np.array(vec.get_feature_names_out())
else:
    vocabulary = np.array(vec.get_feature_names())

In [498]:
# (name, count) pairs for at most the 1000 most frequent ingredients, in
# descending order of count.
ingredient_frequency = list(
    zip(vocabulary[sorted_indicies[:1000]], word_counts[sorted_indicies[:1000]])
)

In [441]:
#sorted(vec.vocabulary_)

In [499]:
# The 20 most frequent ingredients as (name, occurrences per recipe), rounded
# to two decimals. The denominator is the number of loaded recipes rather than
# a hard-coded constant, so it stays correct when the data set changes.
popular_ingredients = [
    (name, float("{0:.2f}".format(count / len(ingredients_dic))))
    for name, count in ingredient_frequency[:20]
]

In [500]:
popular_ingredients

[('salt', 0.42),
 ('butter', 0.36),
 ('white sugar', 0.3),
 ('egg', 0.28),
 ('flour', 0.27),
 ('onion', 0.23),
 ('garlic', 0.22),
 ('black pepper', 0.18),
 ('water', 0.16),
 ('milk', 0.16),
 ('vanilla extract', 0.16),
 ('olive oil', 0.14),
 ('vegetable oil', 0.14),
 ('brown sugar', 0.13),
 ('salt pepper', 0.11),
 ('cinnamon', 0.1),
 ('baking soda', 0.1),
 ('baking powder', 0.09),
 ('chicken breast', 0.08),
 ('parmesan cheese', 0.08)]

In [531]:
# Names and frequencies of the popular ingredients, least frequent first.
ings = [entry[0] for entry in reversed(popular_ingredients)]
freq = [entry[1] for entry in reversed(popular_ingredients)]

['8%',
 '8%',
 '9%',
 '10%',
 '10%',
 '11%',
 '13%',
 '14%',
 '14%',
 '16%',
 '16%',
 '16%',
 '18%',
 '22%',
 '23%',
 '27%',
 '28%',
 '30%',
 '36%',
 '42%']

In [558]:
# Lollipop chart: one horizontal segment from 0 to the ingredient's frequency,
# capped with a circle, on a categorical y axis of ingredient names.
p = figure(title="Most Popular Ingredients", y_range=ings, x_range=[0, 1])
p.segment(0, ings, freq, ings, line_width=2, line_color="green")
p.circle(freq, ings, size=15, fill_color="orange", line_color="green", line_width=3)
p.xaxis[0].formatter = NumeralTickFormatter(format="0%")

# Label data: 'height' holds the ingredient names (y), 'weight' the
# frequencies (x) and 'names' the text drawn next to each circle.
# The percent format rounds; int(x * 100) would truncate floating point
# error (0.29 * 100 is 28.999999999999996 and would be labelled "28%").
source = ColumnDataSource(data=dict(height=ings,
                                    weight=freq,
                                    names=["{0:.0%}".format(x) for x in freq]))
labels = LabelSet(x='weight', y='height', text='names', level='glyph',text_font_size="7pt",
              x_offset=5, y_offset=5, source=source, render_mode='canvas')
p.add_layout(labels)
show(p)