# TasteBud: GAN Based Recipe Generation with Graph

Introductory words about this project...

----
## Data Processing

Data processing here has two main goals each with smaller milestones: tokenizing recipe data and creating the ingredients graph.
Tokenizing data requires parsing the RecipeNGL dataset, which will be subsetted due to its large size.
Creating the ingredients graph first requires a list of ingredients. A raw list will be obtained from the What's Cooking and RecipeNGL datasets. Then, the list will be filtered into a smaller list. The filtered list will be used for indexing ingredients, finding related/close ingredients, and creating the graph.

In [146]:
import csv
import numpy as np
import random
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import torch.utils.data
import torchtext
import pandas as pd
import matplotlib.pyplot as plt
import os.path
import json
import ast
import glob
import re
import string

In [95]:
pre_processing = True
glove = torchtext.vocab.GloVe(name='6B', dim=50)

### What's Cooking Data

In [96]:
# loading the What's Cooking dataset from the .json file
wc_train_path = './data/whats_cooking/train.json'
wc_train_data = json.load(open(wc_train_path, 'r'))
print(wc_train_data[0])

{'id': 10259, 'cuisine': 'greek', 'ingredients': ['romaine lettuce', 'black olives', 'grape tomatoes', 'garlic', 'pepper', 'purple onion', 'seasoning', 'garbanzo beans', 'feta cheese crumbles']}


In [97]:
if pre_processing == True:
    # creating a list of unique ingredients from the datasets
    ingredients_set = set()

    for i in range(len(wc_train_data)):
        ingredients_set = ingredients_set | set(wc_train_data[i]['ingredients'])
    print(list(ingredients_set)[0:25])

['mustard sauce', 'yams', 'blanco tequila', 'back bacon rashers', 'gluten-free flour', 'whole wheat rotini pasta', 'barbecue rub', 'chili con carne', 'hazelnut flour', 'celery', 'anchovy fillets', 'dry milk powder', 'hard salami', 'cut up chicken', 'wish bone guacamol ranch dress', 'hot pork sausage', 'San Marzano tomatoes', 'low-fat balsamic vinaigrette', "Quorn Chik''n Tenders", 'curry leaves', 'sloe gin', 'saffron powder', 'passata', 'red curry paste', 'low sodium store bought chicken stock']


### Data Subsetting
Since RecipeNGL contains 2.23 million recipes, select a subset to use:

In [98]:
full_ngl_path = './data/recipe_ngl/full_dataset.csv'
# if the full recipeNGL dataset csv file exists, read a subset of it
if os.path.exists(full_ngl_path):
    ngl_subset = [0, 5000]
    ngl_df = pd.read_csv(full_ngl_path, skiprows=ngl_subset[0], nrows=ngl_subset[1], index_col=0)
    ngl_df.to_csv(f'./data/recipe_ngl/dataset_{ngl_subset[0]}_{ngl_subset[1]}.csv')

for file in glob.glob('./data/recipe_ngl/dataset*.csv'):
    print(file)
    ngl_df = pd.read_csv(file, index_col=0, 
                         converters={'ingredients':pd.eval, 'directions':pd.eval, 'NER':pd.eval})

./data/recipe_ngl\dataset_0_100.csv
./data/recipe_ngl\dataset_0_5000.csv


In [6]:
ngl_df[:3]

Unnamed: 0,title,ingredients,directions,link,source,NER
0,No-Bake Nut Cookies,"[1 c. firmly packed brown sugar, 1/2 c. evapor...","[In a heavy 2-quart saucepan, mix brown sugar,...",www.cookbooks.com/Recipe-Details.aspx?id=44874,Gathered,"[brown sugar, milk, vanilla, nuts, butter, bit..."
1,Jewell Ball'S Chicken,"[1 small jar chipped beef, cut up, 4 boned chi...","[Place chipped beef on bottom of baking dish.,...",www.cookbooks.com/Recipe-Details.aspx?id=699419,Gathered,"[beef, chicken breasts, cream of mushroom soup..."
2,Creamy Corn,"[2 (16 oz.) pkg. frozen corn, 1 (8 oz.) pkg. c...","[In a slow cooker, combine all ingredients. Co...",www.cookbooks.com/Recipe-Details.aspx?id=10570,Gathered,"[frozen corn, cream cheese, butter, garlic pow..."


### Create List of Ingredients

In [7]:
if pre_processing == True:
    for i in range(len(ngl_df["NER"])):
        ingredients_set = ingredients_set | set(ngl_df["NER"][i])
    print(list(ingredients_set)[0:50])

['mustard sauce', 'back bacon rashers', 'gluten-free flour', 'whole wheat rotini pasta', 'red raspberry jello', 'hazelnut flour', 'celery', 'anchovy fillets', 'San Marzano tomatoes', 'wish bone guacamol ranch dress', 'low-fat balsamic vinaigrette', "Quorn Chik''n Tenders", 'sloe gin', 'saffron powder', 'passata', 'red curry paste', 'low sodium store bought chicken stock', 'fresno pepper', 'leaf parsley', "Campbell's cream", 'brown basmati rice', 'gumbo file powder', 'amaretti', 'reduced fat coconut milk', 'condensed reduced fat reduced sodium cream of chicken soup', 'frozen mustard greens', 'pineapple pie filling', 'king oyster mushroom', 'linguine pasta', 'Tabasco sauce', 'flour tortillas (not low fat)', 'asian noodles', 'low-fat parmesan cheese', 'sausage casings', 'new york strip steaks', 'beef rib roast', 'apple juice', 'cherry jello', 'White Pepper', 'caramel icing', 'Tia Maria', 'meat sauce', 'baby okra', 'green garlic', 'onion buns', 'white tuna', 'dry vermouth', 'small potatoes

In [8]:
if pre_processing == True:
    cw = csv.writer(open("data/raw_ingredients_list.csv",'w'))
    cw.writerow(list(ingredients_set))
    pd.read_csv('data/raw_ingredients_list.csv', header=None).T.to_csv('data/raw_ingredients_list_transpose.csv', header=False, index=False)

From a list of 8500 ingredients, we manually removed duplicated items that had spelling errors, semantic similarities, and extra numeric or qualitative descriptors (e.g. chopped, 2% fat, shredded, unsweetened), giving 955 ingredients. Then, each is paired its gloVe embedding.

In [99]:
# param: token_list - a list of tokens, no spaces or symbols
# return: a tensor of the averaged GloVe embeddings of each token
def glove_average(token_list):
    embeds_list = []
    for token in token_list:
        embeds_list.append(glove[token])
    embeds_average = torch.mean(torch.stack(embeds_list), dim=0)
    return embeds_average

In [106]:
if pre_processing == True:
    filtered_ingredients_df = pd.read_csv('data/filtered_ingredients_list_transpose.csv', header=None, names=["ingredient"])
    ingredient_embeddings = []

    for i, row in filtered_ingredients_df.iterrows():
        token_list = re.sub(r"[^a-zA-Z ]+", '', filtered_ingredients_df['ingredient'][i].lower()).split(' ')
        embed_list = []
        ingredient_embeddings.append(glove_average(token_list).tolist())
        
    filtered_ingredients_df['embedding'] = ingredient_embeddings
    filtered_ingredients_df.to_csv('data/glove_ingredients_list.csv', header=False, index=False)

In [107]:
ingredients_df = pd.read_csv('data/glove_ingredients_list.csv', header=None, names=["ingredient", "embedding"],
                         converters={'embedding':pd.eval})
ingredients_df['ingredient'] = ingredients_df['ingredient'].apply(lambda x: x.rstrip())

In [108]:
ingredients_df[:3]

Unnamed: 0,ingredient,embedding
0,aioli,"[0.053968001157045364, 0.027247000485658646, -..."
1,ale,"[-0.4636099934577942, 0.6578099727630615, -1.3..."
2,almond,"[-0.023429999127984047, 0.47051000595092773, -..."


Since GloVe give embeddings for out-of-vocabulary words, the words with null tensors `[0,0,0,...,0]` were removed from the ingredients list, giving a total of 840 ingredients. Using these embeddings will allow for ingredients not in the list to be mapped to the closest ingredient, then put in the ingredient graph.

In [109]:
# dictionary pairing ingredient with index value
ingredient_index_dict = {}
for i in range(ingredients_df.shape[0]):
    if ingredients_df['ingredient'][i] not in ingredient_index_dict:
        ingredient_index_dict[ ingredients_df['ingredient'][i] ] = i

### Recipe Data Cleaning

In [133]:
# filter for only "Gathered" sources, since those have more consistent format
gathered_ngl_df = ngl_df[ngl_df.source == 'Gathered']
# remove unnecessary columns
filtered_ngl_df = gathered_ngl_df[['title', 'ingredients', 'directions', 'NER']]

In [134]:
filtered_ngl_df[:4]

Unnamed: 0,title,ingredients,directions,NER
0,No-Bake Nut Cookies,"[1 c. firmly packed brown sugar, 1/2 c. evapor...","[In a heavy 2-quart saucepan, mix brown sugar,...","[brown sugar, milk, vanilla, nuts, butter, bit..."
1,Jewell Ball'S Chicken,"[1 small jar chipped beef, cut up, 4 boned chi...","[Place chipped beef on bottom of baking dish.,...","[beef, chicken breasts, cream of mushroom soup..."
2,Creamy Corn,"[2 (16 oz.) pkg. frozen corn, 1 (8 oz.) pkg. c...","[In a slow cooker, combine all ingredients. Co...","[frozen corn, cream cheese, butter, garlic pow..."
3,Chicken Funny,"[1 large whole chicken, 2 (10 1/2 oz.) cans ch...","[Boil and debone chicken., Put bite size piece...","[chicken, chicken gravy, cream of mushroom sou..."


In [135]:
# removing uppercase and symbols from directions and fuse using '\n'
for i, row in filtered_ngl_df.iterrows():
    cleaned_directions = []
    for step in row.directions:
        step = re.sub(r"[^a-zA-Z0-9]+", ' ', step.lower()) # remove uppercase and symbols
        cleaned_directions.append(step)
    row.directions = cleaned_directions

In [136]:
filtered_ngl_df[:4]

Unnamed: 0,title,ingredients,directions,NER
0,No-Bake Nut Cookies,"[1 c. firmly packed brown sugar, 1/2 c. evapor...",[in a heavy 2 quart saucepan mix brown sugar n...,"[brown sugar, milk, vanilla, nuts, butter, bit..."
1,Jewell Ball'S Chicken,"[1 small jar chipped beef, cut up, 4 boned chi...","[place chipped beef on bottom of baking dish ,...","[beef, chicken breasts, cream of mushroom soup..."
2,Creamy Corn,"[2 (16 oz.) pkg. frozen corn, 1 (8 oz.) pkg. c...",[in a slow cooker combine all ingredients cove...,"[frozen corn, cream cheese, butter, garlic pow..."
3,Chicken Funny,"[1 large whole chicken, 2 (10 1/2 oz.) cans ch...","[boil and debone chicken , put bite size piece...","[chicken, chicken gravy, cream of mushroom sou..."


### Recipe Data Tokenizing

In [137]:
tokenized_titles = []
tokenized_ingredients = []
tokenized_directions = []
tokenized_NER = []

for i, row in filtered_ngl_df.iterrows():
    # tokenize titles
    tokens_list = filtered_ngl_df.title.values[i].split(' ')
    tokenized_titles.append(tokens_list)
    
    # tokenize ingredients
    tokens_list = []
    for ingredient_item in row.ingredients:
        tokens_list.append(ingredient_item.split(' '))
    tokenized_ingredients.append(tokens_list)
    
    # tokenize directions
    tokens_list = []
    for direction_item in row.directions:
        tokens_list.append(direction_item.split(' '))
    tokenized_directions.append(tokens_list)
    
    # tokenize ingredients
    tokens_list = []
    for NER_item in row.NER:
        tokens_list.append( re.sub(r"[^a-zA-Z ]+", '', NER_item.lower()).split(' ') )
    tokenized_NER.append(tokens_list)

print(tokenized_titles[0])
print(tokenized_ingredients[0])
print(tokenized_directions[0])
print(tokenized_NER[0])

['No-Bake', 'Nut', 'Cookies']
[['1', 'c.', 'firmly', 'packed', 'brown', 'sugar'], ['1/2', 'c.', 'evaporated', 'milk'], ['1/2', 'tsp.', 'vanilla'], ['1/2', 'c.', 'broken', 'nuts', '(pecans)'], ['2', 'Tbsp.', 'butter', 'or', 'margarine'], ['3', '1/2', 'c.', 'bite', 'size', 'shredded', 'rice', 'biscuits']]
[['in', 'a', 'heavy', '2', 'quart', 'saucepan', 'mix', 'brown', 'sugar', 'nuts', 'evaporated', 'milk', 'and', 'butter', 'or', 'margarine', ''], ['stir', 'over', 'medium', 'heat', 'until', 'mixture', 'bubbles', 'all', 'over', 'top', ''], ['boil', 'and', 'stir', '5', 'minutes', 'more', 'take', 'off', 'heat', ''], ['stir', 'in', 'vanilla', 'and', 'cereal', 'mix', 'well', ''], ['using', '2', 'teaspoons', 'drop', 'and', 'shape', 'into', '30', 'clusters', 'on', 'wax', 'paper', ''], ['let', 'stand', 'until', 'firm', 'about', '30', 'minutes', '']]
[['brown', 'sugar'], ['milk'], ['vanilla'], ['nuts'], ['butter'], ['bite', 'size', 'shredded', 'rice', 'biscuits']]


In [138]:
tokenized_ngl_df = filtered_ngl_df.copy()
tokenized_ngl_df['token_title'] = tokenized_titles
tokenized_ngl_df['token_ingredients'] = tokenized_ingredients
tokenized_ngl_df['token_directions'] = tokenized_directions
tokenized_ngl_df['token_NER'] = tokenized_NER

In [139]:
tokenized_ngl_df[:3]

Unnamed: 0,title,ingredients,directions,NER,token_title,token_ingredients,token_directions,token_NER
0,No-Bake Nut Cookies,"[1 c. firmly packed brown sugar, 1/2 c. evapor...",[in a heavy 2 quart saucepan mix brown sugar n...,"[brown sugar, milk, vanilla, nuts, butter, bit...","[No-Bake, Nut, Cookies]","[[1, c., firmly, packed, brown, sugar], [1/2, ...","[[in, a, heavy, 2, quart, saucepan, mix, brown...","[[brown, sugar], [milk], [vanilla], [nuts], [b..."
1,Jewell Ball'S Chicken,"[1 small jar chipped beef, cut up, 4 boned chi...","[place chipped beef on bottom of baking dish ,...","[beef, chicken breasts, cream of mushroom soup...","[Jewell, Ball'S, Chicken]","[[1, small, jar, chipped, beef,, cut, up], [4,...","[[place, chipped, beef, on, bottom, of, baking...","[[beef], [chicken, breasts], [cream, of, mushr..."
2,Creamy Corn,"[2 (16 oz.) pkg. frozen corn, 1 (8 oz.) pkg. c...",[in a slow cooker combine all ingredients cove...,"[frozen corn, cream cheese, butter, garlic pow...","[Creamy, Corn]","[[2, (16, oz.), pkg., frozen, corn], [1, (8, o...","[[in, a, slow, cooker, combine, all, ingredien...","[[frozen, corn], [cream, cheese], [butter], [g..."


### Direction Step Combining
To ease the complexity of the GAN and RNN, the list of directions for each recipe will be fused into a single list of words, with each step separated by `'\n'` (note all prior symbols were removed). The end of the directions will be marked by `'<EOS>'` for "End of String."

In [156]:
combined_directions = []
for i, row in tokenized_ngl_df.iterrows():
    direction = []
    for step in row.token_directions:
        for word in step:
            if word == '': # skip empty words
                continue
            direction.append(word)
        direction.append('\n')
    direction.append('<EOS>')
    combined_directions.append(direction)
     
tokenized_ngl_df['combined_directions'] = combined_directions
print(combined_directions[8])

['roll', 'steak', 'strips', 'in', 'flour', '\n', 'brown', 'in', 'skillet', '\n', 'salt', 'and', 'pepper', '\n', 'combine', 'tomato', 'liquid', 'water', 'onions', 'and', 'browned', 'steak', 'cover', 'and', 'simmer', 'for', 'one', 'and', 'a', 'quarter', 'hours', '\n', 'uncover', 'and', 'stir', 'in', 'worcestershire', 'sauce', '\n', 'add', 'tomatoes', 'green', 'peppers', 'and', 'simmer', 'for', '5', 'minutes', '\n', 'serve', 'over', 'hot', 'cooked', 'rice', '\n', '<EOS>']


### Direction Word Vocabulary

In [168]:
# calculate word frequency from directions and ingredients list (NER)
directions_word_freq = {}
for i, row in tokenized_ngl_df.iterrows():
    for word in row.combined_directions:
        directions_word_freq[word] = directions_word_freq.get(word, 0) + 1

    for ingredient_list in row.token_NER:
        for ingredient in ingredient_list:
            directions_word_freq[word] = directions_word_freq.get(word, 0) + 1
        
minimum_word_freq = 2
vocab = set()
for word, freq in directions_word_freq.items():
    if directions_word_freq[word] >= minimum_word_freq:
        vocab.add(word)

print('Unique words before ignoring:', len(directions_word_freq))
print('Unique words after ignoring:', len(vocab))

vocab = list(vocab)
vocab_stoi = {s: i for i, s in enumerate(vocab)}
vocab_itos = {i: s for i, s in enumerate(vocab)}
vocab_size = len(vocab)

print(vocab)

Unique words before ignoring: 3499
Unique words after ignoring: 2393
['raisin', 'fitted', 'yams', 'snack', 'wish', 'chutney', 'service', 'hunger', 'pizzelle', 'celery', 'prayer', 'dot', 'headspace', 'chill', 'sprayed', 'faith', 'uv', 'stockpot', 'puddings', '16', 'overnight', 'rim', 'vegies', 'coals', 'atop', 'pepperidge', 'sifting', 'prawns', 'results', 'rind', 'teflon', 'nicely', 'nice', 'caramel', 'sear', 'gets', 'bake', 'water', 'hours', 'knead', 'sprouts', 'dip', 'pumpernickel', 'salads', 'bits', 'reynolds', 'soups', 'tart', 'slaw', 'rolled', 'filled', 'splenda', 'putting', 'cilantro', 'free', 'showers', 'treat', 'veggies', 'cokes', 'clear', 'rosemary', 'halibut', 'chafing', 'sterile', 'directions', 'sealed', 'saving', 'minutes', 'wilted', 'consistency', 'speed', 'coarse', 'combined', '0', 'liquids', 'fill', 'slice', 'flute', 'tin', 'sure', 'tartar', 'seconds', 'crackers', 'burritos', 'spreading', 'vermouth', 'fist', 'starter', 'half', 'pull', 'sticks', 'pitcher', 'kiwi', 'couple'

### Direction Character Vocabulary
For the baseline model, a character based vocabulary will be used instead

In [186]:
char_vocab = list(string.ascii_lowercase) + list(string.digits) + [' ', '\n', '<EOS>']
char_vocab_stoi = {s: i for i, s in enumerate(char_vocab)}
char_vocab_itos = {i: s for i, s in enumerate(char_vocab)}
char_vocab_size = len(vocab)
print(char_vocab)
print(char_vocab_stoi)

['a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z', '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', ' ', '\n', '<EOS>']
{'a': 0, 'b': 1, 'c': 2, 'd': 3, 'e': 4, 'f': 5, 'g': 6, 'h': 7, 'i': 8, 'j': 9, 'k': 10, 'l': 11, 'm': 12, 'n': 13, 'o': 14, 'p': 15, 'q': 16, 'r': 17, 's': 18, 't': 19, 'u': 20, 'v': 21, 'w': 22, 'x': 23, 'y': 24, 'z': 25, '0': 26, '1': 27, '2': 28, '3': 29, '4': 30, '5': 31, '6': 32, '7': 33, '8': 34, '9': 35, ' ': 36, '\n': 37, '<EOS>': 38}


### Token to Index

In [171]:
indexed_directions = []
for i, row in tokenized_ngl_df.iterrows():
    direction = []
    for word in row.combined_directions:
        if word in vocab:
            direction.append(vocab_stoi[word])
    indexed_directions.append(direction)
     
print(indexed_directions[8])

[311, 984, 1135, 187, 302, 134, 1718, 187, 887, 134, 739, 1723, 441, 134, 1403, 764, 1357, 37, 1177, 1723, 1287, 984, 1167, 1723, 1413, 244, 1388, 1723, 1529, 1328, 38, 134, 2122, 1723, 1957, 187, 1557, 1256, 134, 2274, 1562, 905, 589, 1723, 1413, 244, 2110, 67, 134, 940, 2252, 1021, 2008, 2173, 134, 2344]


In [172]:
indexed_ingredients = []
for i, row in tokenized_ngl_df.iterrows():
    ingredients = []
    for ingredient_list in row.token_NER:
        for word in ingredient_list:
            if word in vocab:
                ingredients.append(vocab_stoi[word])
    indexed_ingredients.append(ingredients)
     
print(indexed_ingredients[8])

[1562, 37, 1177, 1557, 1256, 905, 589, 2100]


In [173]:
indexed_ngl_df = pd.DataFrame({'directions': indexed_directions, 'ingredients': indexed_ingredients})

In [174]:
indexed_ngl_df[:3]

Unnamed: 0,directions,ingredients
0,"[187, 1529, 1158, 897, 2245, 1786, 792, 1718, ...","[1718, 1351, 1757, 407, 1909, 105, 1938, 584, ..."
1,"[140, 2188, 257, 951, 890, 922, 845, 137, 134,...","[257, 1029, 1762, 1742, 922, 1787, 693, 1325, ..."
2,"[187, 1529, 1590, 2273, 1403, 1292, 2364, 1167...","[2310, 1454, 1742, 728, 105, 656, 1390, 739, 441]"


### Token to Character Index

In [196]:
char_indexed_directions = []
for i, row in tokenized_ngl_df.iterrows():
    direction = []
    for word in row.combined_directions:
        if word in ['\n', '<EOS>']:
            direction.append(char_vocab_stoi[word])
        else:
            for char in word:
                if char in char_vocab:
                    direction.append(char_vocab_stoi[char])
            direction.append(char_vocab_stoi[' '])
        
    char_indexed_directions.append(direction)
     
print(char_indexed_directions[8])
for char in char_indexed_directions[8]:
    print(char_vocab_itos[char], end = '')

[17, 14, 11, 11, 36, 18, 19, 4, 0, 10, 36, 18, 19, 17, 8, 15, 18, 36, 8, 13, 36, 5, 11, 14, 20, 17, 36, 37, 1, 17, 14, 22, 13, 36, 8, 13, 36, 18, 10, 8, 11, 11, 4, 19, 36, 37, 18, 0, 11, 19, 36, 0, 13, 3, 36, 15, 4, 15, 15, 4, 17, 36, 37, 2, 14, 12, 1, 8, 13, 4, 36, 19, 14, 12, 0, 19, 14, 36, 11, 8, 16, 20, 8, 3, 36, 22, 0, 19, 4, 17, 36, 14, 13, 8, 14, 13, 18, 36, 0, 13, 3, 36, 1, 17, 14, 22, 13, 4, 3, 36, 18, 19, 4, 0, 10, 36, 2, 14, 21, 4, 17, 36, 0, 13, 3, 36, 18, 8, 12, 12, 4, 17, 36, 5, 14, 17, 36, 14, 13, 4, 36, 0, 13, 3, 36, 0, 36, 16, 20, 0, 17, 19, 4, 17, 36, 7, 14, 20, 17, 18, 36, 37, 20, 13, 2, 14, 21, 4, 17, 36, 0, 13, 3, 36, 18, 19, 8, 17, 36, 8, 13, 36, 22, 14, 17, 2, 4, 18, 19, 4, 17, 18, 7, 8, 17, 4, 36, 18, 0, 20, 2, 4, 36, 37, 0, 3, 3, 36, 19, 14, 12, 0, 19, 14, 4, 18, 36, 6, 17, 4, 4, 13, 36, 15, 4, 15, 15, 4, 17, 18, 36, 0, 13, 3, 36, 18, 8, 12, 12, 4, 17, 36, 5, 14, 17, 36, 31, 36, 12, 8, 13, 20, 19, 4, 18, 36, 37, 18, 4, 17, 21, 4, 36, 14, 21, 4, 17, 36, 7, 14, 1

In [202]:
char_indexed_ingredients = []
for i, row in tokenized_ngl_df.iterrows():
    ingredients = []
    for ingredient_list in row.token_NER:
        for word in ingredient_list:
            if word in ['\n', '<EOS>']:
                ingredients.append(char_vocab_stoi[word])
            else:
                for char in word:
                    if char in char_vocab:
                        ingredients.append(char_vocab_stoi[char])
                ingredients.append(char_vocab_stoi['\n'])
    char_indexed_ingredients.append(ingredients)
     
print(char_indexed_ingredients[8])
for char in char_indexed_ingredients[8]:
    print(char_vocab_itos[char], end = '')

[19, 14, 12, 0, 19, 14, 4, 18, 37, 22, 0, 19, 4, 17, 37, 14, 13, 8, 14, 13, 18, 37, 22, 14, 17, 2, 4, 18, 19, 4, 17, 18, 7, 8, 17, 4, 37, 18, 0, 20, 2, 4, 37, 6, 17, 4, 4, 13, 37, 15, 4, 15, 15, 4, 17, 18, 37, 14, 8, 11, 37]
tomatoes
water
onions
worcestershire
sauce
green
peppers
oil


In [203]:
char_indexed_ngl_df = pd.DataFrame({'directions': char_indexed_directions, 'ingredients': char_indexed_ingredients})

In [204]:
char_indexed_ngl_df[:3]

Unnamed: 0,directions,ingredients
0,"[8, 13, 36, 0, 36, 7, 4, 0, 21, 24, 36, 28, 36...","[1, 17, 14, 22, 13, 37, 18, 20, 6, 0, 17, 37, ..."
1,"[15, 11, 0, 2, 4, 36, 2, 7, 8, 15, 15, 4, 3, 3...","[1, 4, 4, 5, 37, 2, 7, 8, 2, 10, 4, 13, 37, 1,..."
2,"[8, 13, 36, 0, 36, 18, 11, 14, 22, 36, 2, 14, ...","[5, 17, 14, 25, 4, 13, 37, 2, 14, 17, 13, 37, ..."


### Closest Ingredients

In [114]:
# param: ingredient - list of strings of an ingredient (tokenized)
#        ingredient_df - dataframe containing ingredient vocabulary and
#                        their corresponding GloVe embedding
# return: string of closest ingredient in vocabulary
#         if none (e.g. ingredient has is OOV in GloVe), returns empty string
def get_closest_ingredient (ingredient, ingredients_df):
    # check if ingredient is in ingredients_df
    if len(ingredient) == 1 and ingredient[0] in ingredients_df['ingredient'].values:
        return ingredient[0]
    
    closest_ingredient = ''
    smallest_distance = float('inf')
    
    # compute the GloVe embedding of the ingredient
    ingredient_embedding = glove_average(ingredient)
    
    if torch.count_nonzero(ingredient_embedding) == 0:
        return ''
    
    # compute distances between embeddings, choose the smallest distance
    for _, row in ingredients_df.iterrows():
        difference = ingredient_embedding - torch.FloatTensor(row['embedding'])
        distance = torch.sum(torch.square(difference))
        if distance < smallest_distance:
            smallest_distance = distance
            closest_ingredient = row['ingredient']
    
    return closest_ingredient

In [115]:
if pre_processing == True:
    # changes the ingredients list into their corresponding closest ingredients
    # in the vocabulary using the GloVe embeddings
    NER_closest_ingredients = []

    for i, row in tokenized_ngl_df.iterrows():
        NER_closest_list = []
        for ingredient_tokens in row.token_NER:
            NER_closest_list.append( get_closest_ingredient(ingredient_tokens, ingredients_df) )
        NER_closest_ingredients.append(NER_closest_list)

    print(NER_closest_ingredients[0])

['brown sugar', 'milk', 'vanilla', 'soy nut', 'butter', 'rice paper']


In [116]:
if pre_processing == True:
    processed_ngl_df = tokenized_ngl_df.copy()
    processed_ngl_df['closest_ingredients'] = NER_closest_ingredients
    processed_ngl_df.to_csv('data/processed_ngl.csv', header=True, index=True)

In [117]:
processed_ngl_df = pd.read_csv('data/processed_ngl.csv', index_col=0,
                               converters={'ingredients':pd.eval, 'directions':pd.eval, 'NER':pd.eval, 
                                           'token_title':pd.eval, 'token_ingredients':pd.eval, 
                                           'token_directions':pd.eval,' token_NER':pd.eval, 'closest_ingredients':pd.eval})

In [118]:
processed_ngl_df[["token_NER", "closest_ingredients"]][:7]

Unnamed: 0,token_NER,closest_ingredients
0,"[['brown', 'sugar'], ['milk'], ['vanilla'], ['...","[brown sugar, milk, vanilla, soy nut, butter, ..."
1,"[['beef'], ['chicken', 'breasts'], ['cream', '...","[beef, chicken, passion fruit, sour cream]"
2,"[['frozen', 'corn'], ['cream', 'cheese'], ['bu...","[corn, cheese, butter, garlic, salt, pepper]"
3,"[['chicken'], ['chicken', 'gravy'], ['cream', ...","[chicken, chicken, passion fruit, cheese]"
4,"[['peanut', 'butter'], ['graham', 'cracker', '...","[peanut butter, graham cracker, butter, sugar,..."
5,"[['baking', 'potatoes'], ['extra', 'lean', 'gr...","[baking mix, crescent roll, butter, milk, salt..."
6,"[['sugar'], ['butter'], ['egg'], ['buttermilk'...","[sugar, butter, egg, buttermilk, flour, salt, ..."


In [120]:
# outputs the ingredient to closest ingredient pairings for What's Cooking data
if pre_processing == True:
    wc_ingredients, wc_closest_ingredients = [], []
    
    limit = 7500
    for i in range(len(wc_train_data)):
        wc_ingredients.append(wc_train_data[i]['ingredients'])
        
        item_closest_ingredients = []
        for ingredient in wc_train_data[i]['ingredients']:
            token_list = re.sub(r"[^a-zA-Z ]+", '', ingredient.lower()).split(' ')
            closest_ingredient = get_closest_ingredient(token_list, ingredients_df)
            item_closest_ingredients.append(closest_ingredient)
        wc_closest_ingredients.append(item_closest_ingredients)
        
        if i >= limit:
            break

    print(wc_closest_ingredients[0])
    
    wc_ingredients_df = pd.DataFrame({'ingredients': wc_ingredients, 'closest_ingredients': wc_closest_ingredients})
    wc_ingredients_df.to_csv('data/whats_cooking/closest_ingredients.csv', header=True, index=True)

['lettuce', 'black pepper', 'grape', 'garlic', 'pepper', 'onion', 'seasoning', 'lentils', 'gorgonzola']


### Ingredient Frequency

In [84]:
if pre_processing == True:
    # get frequency based on appearences in RecipeNGL recipes
    ingredient_frequency = torch.zeros(len(ingredients_df)).tolist()

    for i in range(processed_ngl_df.shape[0]):
        for closest_ingredient in processed_ngl_df["closest_ingredients"][i]:
            index = ingredient_index_dict.get(closest_ingredient)
            if (index != None):
                ingredient_frequency[index] += 1
    

In [85]:
if pre_processing == True:
    # get frequency based on appearences in What's Cooking recipes
    wc_ingredients_df = pd.read_csv('data/whats_cooking/closest_ingredients.csv', index_col=0,
                                    converters={'ingredients':pd.eval, 'closest_ingredients':pd.eval})
    
    for i in range(wc_ingredients_df.shape[0]):
        for closest_ingredient in wc_ingredients_df["closest_ingredients"][i]:
            index = ingredient_index_dict.get(closest_ingredient)
            if (index != None):
                ingredient_frequency[index] += 1
    """
    wc_limit = 5000
    for i in range(len(wc_train_data)):
        for ingredient in wc_train_data[i]['ingredients']:
            token_list = re.sub(r"[^a-zA-Z ]+", '', ingredient.lower()).split(' ')
            closest_ingredient = get_closest_ingredient(token_list, ingredients_df)
            index = ingredient_index_dict.get(closest_ingredient)
            if (index != None):
                ingredient_frequency[index] += 1
                
        if i > wc_limit:
            break
    """

In [86]:
if pre_processing == True:
    ingredient_frequency = ( torch.FloatTensor(ingredient_frequency) / max(ingredient_frequency) ).tolist()
    ingredients_frequency_df = (ingredients_df.copy()).drop('embedding', axis=1)
    ingredients_frequency_df['frequency'] = ingredient_frequency
    ingredients_frequency_df.to_csv('data/ingredients_frequency.csv', header=False, index=False)

In [88]:
ingredients_frequency_df = pd.read_csv('data/ingredients_frequency.csv', header=None, names=["ingredient", "frequency"])

In [89]:
ingredients_frequency_df[:10]

Unnamed: 0,ingredient,frequency
0,abalone,0.000234
1,absinthe,0.0
2,acai,0.0
3,acorn,0.000234
4,adobo,0.0
5,agar,0.0
6,aioli,0.00117
7,albacore,0.0
8,alcohol,0.000468
9,ale,0.007959


Using the ingredient frequency, we further filter the ingredients with less than 0.1% frequency. This yields 381 ingredients.

In [105]:
if pre_processing == True:
    filtered_ingredients_frequency_df = ingredients_frequency_df[ingredients_frequency_df['frequency'] > 0.001]
    print(filtered_ingredients_frequency_df.shape[0])
    filtered_ingredients_frequency_df.drop('frequency', axis=1).to_csv('data/frequency_filtered_ingredients.csv', header=False, index=False)

381


### Ingredient Graph

To create the graph, each time an ingredient appears in a recipe with another ingredient, their compatibility is increased. This is implemented with an adjacency matrix. The compatibilities will be normalized by each row of the matrix.

In [None]:
if pre_processing == True:
    ingredient_graph = torch.zeros(len(ingredients_df), len(ingredients_df)).tolist()

    for i in range(processed_ngl_df.shape[0]):
        for ingredient_1 in processed_ngl_df["closest_ingredients"][i]:
            index_1 = ingredient_index_dict[ingredient_1]
            for ingredient_2 in processed_ngl_df["closest_ingredients"][i]:
                index_2 = ingredient_index_dict[ingredient_2]
                
                ingredient_graph[index_1][index_2] += 1
    
    ingredient_graph = ( torch.FloatTensor(ingredient_graph) / max(ingredient_graph) ).tolist()

----
## Primary Model

### Ingredient Selector

### GAN RNN

### Training

----
## Baseline Model

Model: A generative RNN trained using character-tokenized recipes.

In [None]:
class RNNRecipeGenerator(nn.Moduel):
    def __init__(self, vocab_size, hidden_size, n_layers =1 ):
          super(RNNRecipeGenerator, self).__init__()
          self.ident = torch.eye(vocab_size)
          self.rnn = nn.GRU(vocab_size, hidden_size, n_layers, batch_first = True)
          self.decoder = nn.Linear(hidden_size, vocab_size)

    def forward(self, inp, hidden = None):
          inp = self.ident[inp]
          output, hidden = self.rnn(inp, hidden)
          output = self.decoder(output)
          return output, hidden

Recipe Generation: Using probability distribution to predict next character. GPU Enabled to speed up generation.

In [None]:
def sample_sequence_cuda(model, max_len=1000, temperature=0.8):
    generated_sequence = "
    
    inp = torch.Tensor([vocab_stoi["<BOS>"]]).long().cuda()
    hidden = None

    for c in range(max_len):
          output, hidden = model(inp.unsqueeze(0), hidden)
          output_dist = output.data.view(-1).div(temperature).exp().cpu()
          top_i = int(torch.multinomial(output_dist, 1)[0])

          predicted_char = vocab_itos[top_i]

          if predicted_char == "<EOS>":
              break

          generated_sequence += predicted char
          inp = torch.Tensor([top_i]).long().cuda()

    return generated_sequence

Training: GPU Enabled to speed up training. A sample is printed every print_freq iterations to see the model's progression.

In [None]:
def train_cuda(model, data, batch_size=1, num_epochs=1, lr=1e-3, print_freq=200):
    optimizer = torch.optim.Adam(model.parameters(), lr=lr)
    criterion = nn.CrossEntropyLoss()
    iter = 0
    data_iter = torchtext.legacy.data.BucketIterator(data, batch_size=batch_size, sort_key=lambda x: len(x.text), sort_within_batch=True)
    losses, epochs = []

    for e in range(num_epochs):
        avg_loss = 0
        for (recipe, lengths), label in data_iter:
            target = recipe[:, 1:].cuda()
            inp = recipe[:, :-1].cuda()

            optimizer.zero_grad()

            output, _ = model(inp)
            loss = criterion(output.reshape(-1, vocab_size), target.reshape(-1))

            loss.backward()
            optimizer.step()

            avg_loss += loss
            iter += 1
            losses.append(float(loss))
            epochs.append(e)

            if iter % print_freq == 0:
                  print("Iteration # %d: Loss %f" % (it+1, float(avg_loss/print_freq)))
                  print("Generated Recipe: " + sample_sequence_cuda(model, 140, 0.8))
                  avg_loss = 0


    plt.title("Training Curve")
    plt.plot(epochs, losses, label = "Training")
    plt.xlabel("Iterations")
    plt.ylabel("Loss")
    plt.show()

In [None]:
baseline_model = RNNRecipeGenerator(vocab_size, 64)
baseline_model = baseline_model.cuda()
baseline_model.ident = baseline_model.ident.cuda()
train_cuda(baseline_model, recipes, batch_size=32, num_epochs=1, lr=1e-3, print_every=100)

----
## Results and Comparison