# TasteBud: GAN Based Recipe Generation with Graph

Introductory words about this project...

----
## Data Processing

Data processing here has two main goals each with smaller milestones: tokenizing recipe data and creating the ingredients graph.
Tokenizing data requires parsing the Recipenlg dataset, which will be subsetted due to its large size.
Creating the ingredients graph first requires a list of ingredients. A raw list will be obtained from the What's Cooking and RecipeNLG datasets. Then, the list will be filtered into a smaller list. The filtered list will be used for indexing ingredients, finding related/close ingredients, and creating the graph.

In [371]:
import csv
import numpy as np
import random
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import torch.utils.data
import torchtext
import pandas as pd
import matplotlib.pyplot as plt
import os.path
import json
import ast
import glob
import re
import string
import collections

In [372]:
pre_processing = True
glove = torchtext.vocab.GloVe(name='6B', dim=50)

### Loading What's Cooking Data

In [373]:
# loading the What's Cooking dataset from the .json file
wc_train_path = './data/whats_cooking/train.json'
wc_train_data = json.load(open(wc_train_path, 'r'))
print(wc_train_data[0])

{'id': 10259, 'cuisine': 'greek', 'ingredients': ['romaine lettuce', 'black olives', 'grape tomatoes', 'garlic', 'pepper', 'purple onion', 'seasoning', 'garbanzo beans', 'feta cheese crumbles']}


### Loading Recipenlg Data and Subsetting
Since Recipenlg contains 2.23 million recipes, select a subset to use:

In [374]:
if pre_processing == True:
    full_nlg_path = './data/recipe_nlg/full_dataset.csv'
    # if the full recipenlg dataset csv file exists, read a subset of it
    if os.path.exists(full_nlg_path):
        nlg_subset = [0, 5000] # change nlg_subset for different ranges of the dataset
        nlg_df = pd.read_csv(full_nlg_path, skiprows=nlg_subset[0], nrows=nlg_subset[1], index_col=0)
        nlg_df.to_csv(f'./data/recipe_nlg/dataset_{nlg_subset[0]}_{nlg_subset[1]}.csv')

# load the dataset from the saved subset csv file
for file in glob.glob('./data/recipe_nlg/dataset*.csv'):
    print(file)
    # use pd.eval to convert strings of lists into lists
    nlg_df = pd.read_csv(file, index_col=0, 
                         converters={'ingredients':pd.eval, 'directions':pd.eval, 'NER':pd.eval})

./data/recipe_nlg\dataset_0_100.csv
./data/recipe_nlg\dataset_0_5000.csv


In [375]:
nlg_df[:3]

Unnamed: 0,title,ingredients,directions,link,source,NER
0,No-Bake Nut Cookies,"[1 c. firmly packed brown sugar, 1/2 c. evapor...","[In a heavy 2-quart saucepan, mix brown sugar,...",www.cookbooks.com/Recipe-Details.aspx?id=44874,Gathered,"[brown sugar, milk, vanilla, nuts, butter, bit..."
1,Jewell Ball'S Chicken,"[1 small jar chipped beef, cut up, 4 boned chi...","[Place chipped beef on bottom of baking dish.,...",www.cookbooks.com/Recipe-Details.aspx?id=699419,Gathered,"[beef, chicken breasts, cream of mushroom soup..."
2,Creamy Corn,"[2 (16 oz.) pkg. frozen corn, 1 (8 oz.) pkg. c...","[In a slow cooker, combine all ingredients. Co...",www.cookbooks.com/Recipe-Details.aspx?id=10570,Gathered,"[frozen corn, cream cheese, butter, garlic pow..."


### Create List of Ingredients

In [376]:
if pre_processing == True:
    # creating a list of unique ingredients from the datasets
    ingredients_set = set()

    # loop through What's cooking data to get ingredients
    for i in range(len(wc_train_data)):
        ingredients_set = ingredients_set | set(wc_train_data[i]['ingredients'])
    
    # loop through Recipenlg data to get ingredients
    for i in range(len(nlg_df["NER"])):
        ingredients_set = ingredients_set | set(nlg_df["NER"][i])
        
    print(list(ingredients_set)[0:50])

['mustard sauce', 'back bacon rashers', 'gluten-free flour', 'whole wheat rotini pasta', 'red raspberry jello', 'hazelnut flour', 'celery', 'anchovy fillets', 'San Marzano tomatoes', 'wish bone guacamol ranch dress', 'low-fat balsamic vinaigrette', "Quorn Chik''n Tenders", 'sloe gin', 'saffron powder', 'passata', 'red curry paste', 'low sodium store bought chicken stock', 'fresno pepper', 'leaf parsley', "Campbell's cream", 'brown basmati rice', 'gumbo file powder', 'amaretti', 'reduced fat coconut milk', 'condensed reduced fat reduced sodium cream of chicken soup', 'frozen mustard greens', 'pineapple pie filling', 'king oyster mushroom', 'linguine pasta', 'Tabasco sauce', 'flour tortillas (not low fat)', 'asian noodles', 'low-fat parmesan cheese', 'sausage casings', 'new york strip steaks', 'beef rib roast', 'apple juice', 'cherry jello', 'White Pepper', 'caramel icing', 'Tia Maria', 'meat sauce', 'baby okra', 'green garlic', 'onion buns', 'white tuna', 'dry vermouth', 'small potatoes

In [377]:
if pre_processing == True:
    # export the list of unique ingredients for manual filtering (use transpose for easier filtering)
    cw = csv.writer(open("data/intermediary/raw_ingredients_list.csv",'w'))
    cw.writerow(list(ingredients_set))
    pd.read_csv('data/intermediary/raw_ingredients_list.csv', 
                header=None).T.to_csv('data/intermediary/raw_ingredients_list_transpose.csv', 
                                      header=False, index=False)

From a list of 8500 ingredients, we manually removed duplicated items that had spelling errors, semantic similarities, and extra numeric or qualitative descriptors (e.g. chopped, 2% fat, shredded, unsweetened), giving 955 ingredients. Then, each is paired its gloVe embedding.

In [378]:
# param: token_list - a list of tokens, no spaces or symbols
# return: a tensor of the averaged GloVe embeddings of each token
def glove_average(token_list):
    embeds_list = []
    for token in token_list:
        embeds_list.append(glove[token])
    embeds_average = torch.mean(torch.stack(embeds_list), dim=0)
    return embeds_average

In [380]:
if pre_processing == True:
    # read the manually filtered ingredient list
    filtered_ingredients_df = pd.read_csv('data/intermediary/filtered_ingredients_list_transpose.csv', header=None, names=["ingredient"])
    ingredient_embeddings = []

    # find GloVe embeddings for each ingredient and save it
    for i, row in filtered_ingredients_df.iterrows():
        token_list = re.sub(r"[^a-zA-Z ]+", '', filtered_ingredients_df['ingredient'][i].lower()).split(' ')
        embed_list = []
        ingredient_embeddings.append(glove_average(token_list).tolist())
        
    filtered_ingredients_df['embedding'] = ingredient_embeddings
    filtered_ingredients_df['ingredient'] = filtered_ingredients_df['ingredient'].apply(lambda x: x.rstrip())
    filtered_ingredients_df.to_csv('data/glove_ingredients_list.csv', header=False, index=False)

In [381]:
if pre_processing == True:
    ingredients_df = pd.read_csv('data/glove_ingredients_list.csv', header=None, names=["ingredient", "embedding"],
                            converters={'embedding':pd.eval})
    
    print(ingredients_df[:3])

  ingredient                                          embedding
0      aioli  [0.053968001157045364, 0.027247000485658646, -...
1        ale  [-0.4636099934577942, 0.6578099727630615, -1.3...
2     almond  [-0.023429999127984047, 0.47051000595092773, -...


Since GloVe give embeddings for out-of-vocabulary words, the words with null tensors `[0,0,0,...,0]` were removed from the ingredients list, giving a total of 840 ingredients. Using these embeddings will allow for ingredients not in the list to be mapped to the closest ingredient, then put in the ingredient graph.

In [382]:
# dictionary pairing ingredient with index value
ingredient_vocab = list(ingredients_df['ingredient'])
ingredient_vocab_stoi = {s: i for i, s in enumerate(ingredient_vocab)}
ingredient_vocab_itos = {i: s for i, s in enumerate(ingredient_vocab)}
ingredient_vocab_size = len(ingredient_vocab)

print(ingredient_vocab_stoi)

{'aioli': 0, 'ale': 1, 'almond': 2, 'anise': 3, 'apple': 4, 'applesauce': 5, 'apricot': 6, 'artichoke': 7, 'arugula': 8, 'asafoetida': 9, 'asparagus': 10, 'avocado': 11, 'bacon': 12, 'baguette': 13, 'baking mix': 14, 'baking soda': 15, 'banana': 16, 'banana blossom': 17, 'banana leaf': 18, 'basil': 19, 'bay leaf': 20, 'bean': 21, 'bean curd': 22, 'bean sprout': 23, 'beef': 24, 'beer': 25, 'beet': 26, 'berry': 27, 'biscuit': 28, 'bitter melon': 29, 'black pepper': 30, 'blackberries': 31, 'blueberries': 32, 'bok choy': 33, 'bologna': 34, 'bone': 35, 'bouillon': 36, 'brandy': 37, 'bread': 38, 'breast': 39, 'broccoli': 40, 'broth': 41, 'brown sugar': 42, 'brownie': 43, 'burger': 44, 'butter': 45, 'buttermilk': 46, 'butterscotch': 47, 'cabbage': 48, 'cake': 49, 'calabash': 50, 'candy': 51, 'cane sugar': 52, 'canola': 53, 'cantaloupe': 54, 'capsicum': 55, 'caramel': 56, 'carbonated water': 57, 'cardamom': 58, 'cardoon': 59, 'carrot': 60, 'caster sugar': 61, 'cauliflower': 62, 'cayenne': 63, 

### Recipe Data Cleaning

In [383]:
# filter for only "Gathered" sources, since those have more consistent format
gathered_nlg_df = nlg_df[nlg_df.source == 'Gathered']
# remove unnecessary columns
filtered_nlg_df = gathered_nlg_df[['title', 'ingredients', 'directions', 'NER']]

In [384]:
for i, row in filtered_nlg_df.iterrows():
    cleaned_directions = []
    for step in row.directions:
        step = re.sub(r"[^a-z0-9]+", ' ', step.lower()) # remove uppercase and symbols
        cleaned_directions.append(step)
    row.directions = cleaned_directions

In [385]:
filtered_nlg_df[:4]

Unnamed: 0,title,ingredients,directions,NER
0,No-Bake Nut Cookies,"[1 c. firmly packed brown sugar, 1/2 c. evapor...",[in a heavy 2 quart saucepan mix brown sugar n...,"[brown sugar, milk, vanilla, nuts, butter, bit..."
1,Jewell Ball'S Chicken,"[1 small jar chipped beef, cut up, 4 boned chi...","[place chipped beef on bottom of baking dish ,...","[beef, chicken breasts, cream of mushroom soup..."
2,Creamy Corn,"[2 (16 oz.) pkg. frozen corn, 1 (8 oz.) pkg. c...",[in a slow cooker combine all ingredients cove...,"[frozen corn, cream cheese, butter, garlic pow..."
3,Chicken Funny,"[1 large whole chicken, 2 (10 1/2 oz.) cans ch...","[boil and debone chicken , put bite size piece...","[chicken, chicken gravy, cream of mushroom sou..."


In [440]:
# removing uppercase and symbols from directions and fuse using ':'
combined_directions, combined_ingredients = [], []

for i, row in filtered_nlg_df.iterrows():
    cleaned_directions = ''
    for step in row.directions:
        step = re.sub(r"[^a-z0-9]+", ' ', step.lower()) # remove uppercase and symbols
        cleaned_directions += step + ' ; '
    combined_directions.append(cleaned_directions)
    
    cleaned_NER = ''
    for ingredient in row.NER:
        ingredient = re.sub(r"[^a-z0-9]+", ' ', ingredient.lower()) # remove uppercase and symbols
        cleaned_NER += ingredient + ' ; '
    combined_ingredients.append(cleaned_NER)
    
combined_nlg_df = pd.DataFrame()
combined_nlg_df['directions'] = combined_directions
combined_nlg_df['ingredients'] = combined_ingredients

In [441]:
combined_nlg_df[:4]

Unnamed: 0,directions,ingredients
0,in a heavy 2 quart saucepan mix brown sugar nu...,brown sugar ; milk ; vanilla ; nuts ; butter ;...
1,place chipped beef on bottom of baking dish ;...,beef ; chicken breasts ; cream of mushroom sou...
2,in a slow cooker combine all ingredients cover...,frozen corn ; cream cheese ; butter ; garlic p...
3,boil and debone chicken ; put bite size piece...,chicken ; chicken gravy ; cream of mushroom so...


### Recipe Data Tokenizing

In [426]:
""""""
tokenized_directions = []
tokenized_ingredients = []

for i, row in filtered_nlg_df.iterrows():
    # tokenize directions
#    tokenized_directions.append(row.directions.split(' '))
    
    # tokenize ingredients
#    tokens_list = []
#    tokenized_ingredients.append(row.ingredients.split(' '))
    
    # tokenize directions
    tokens_list = []
    for direction_item in row.directions:
        tokens_list.append(direction_item.split(' '))
    tokenized_directions.append(tokens_list)
    
    # tokenize ingredients
    tokens_list = []
    for ingredient in row.NER:
        tokens_list.append( re.sub(r"[^a-zA-Z ]+", '', ingredient.lower()).split(' ') )
    tokenized_ingredients.append(tokens_list)
    
# remove empty words
for i, direction_list in enumerate(tokenized_directions):
    temp = []
    for direction in direction_list:
        direction = [x for x in direction if x != '']
        temp.append(direction)
    tokenized_directions[i] = temp

for i, ingredient_list in enumerate(tokenized_ingredients):
    temp = []
    for ingredient in ingredient_list:
        ingredient = [x for x in ingredient if x != '']
        temp.append(ingredient)
    tokenized_ingredients[i] = temp

"""
tokenized_titles = []
tokenized_ingredients = []
tokenized_directions = []
tokenized_NER = []

for i, row in filtered_nlg_df.iterrows():
    # tokenize titles
    tokens_list = filtered_nlg_df.title.values[i].split(' ')
    tokenized_titles.append(tokens_list)
    
    # tokenize ingredients
    tokens_list = []
    for ingredient_item in row.ingredients:
        tokens_list.append(ingredient_item.split(' '))
    tokenized_ingredients.append(tokens_list)
    
    # tokenize directions
    tokens_list = []
    for direction_item in row.directions:
        tokens_list.append(direction_item.split(' '))
    tokenized_directions.append(tokens_list)
    
    # tokenize ingredients
    tokens_list = []
    for NER_item in row.NER:
        tokens_list.append( re.sub(r"[^a-zA-Z ]+", '', NER_item.lower()).split(' ') )
    tokenized_NER.append(tokens_list)

print(tokenized_titles[0])
print(tokenized_ingredients[0])
print(tokenized_directions[0])
print(tokenized_NER[0])
"""
print(tokenized_directions[0])
print(tokenized_ingredients[0])

[['in', 'a', 'heavy', '2', 'quart', 'saucepan', 'mix', 'brown', 'sugar', 'nuts', 'evaporated', 'milk', 'and', 'butter', 'or', 'margarine'], ['stir', 'over', 'medium', 'heat', 'until', 'mixture', 'bubbles', 'all', 'over', 'top'], ['boil', 'and', 'stir', '5', 'minutes', 'more', 'take', 'off', 'heat'], ['stir', 'in', 'vanilla', 'and', 'cereal', 'mix', 'well'], ['using', '2', 'teaspoons', 'drop', 'and', 'shape', 'into', '30', 'clusters', 'on', 'wax', 'paper'], ['let', 'stand', 'until', 'firm', 'about', '30', 'minutes']]
[['brown', 'sugar'], ['milk'], ['vanilla'], ['nuts'], ['butter'], ['bite', 'size', 'shredded', 'rice', 'biscuits']]


In [427]:
"""
tokenized_nlg_df = filtered_nlg_df.copy()
tokenized_nlg_df['token_title'] = tokenized_titles
tokenized_nlg_df['token_ingredients'] = tokenized_ingredients
tokenized_nlg_df['token_directions'] = tokenized_directions
tokenized_nlg_df['token_NER'] = tokenized_NER
"""
tokenized_nlg_df = pd.DataFrame()
tokenized_nlg_df['directions'] = tokenized_directions
tokenized_nlg_df['ingredients'] = tokenized_ingredients

In [428]:
tokenized_nlg_df[:3]

Unnamed: 0,directions,ingredients
0,"[[in, a, heavy, 2, quart, saucepan, mix, brown...","[[brown, sugar], [milk], [vanilla], [nuts], [b..."
1,"[[place, chipped, beef, on, bottom, of, baking...","[[beef], [chicken, breasts], [cream, of, mushr..."
2,"[[in, a, slow, cooker, combine, all, ingredien...","[[frozen, corn], [cream, cheese], [butter], [g..."


### Direction Step Combining
To ease the complexity of the GAN and RNN, the list of directions for each recipe will be fused into a sinlge list of words, with each step separated by `'\n'` (note all prior symbols were removed). The end of the directions will be marked by `'<EOS>'` for "End of String."

In [450]:
list_combined_directions = []
for i, row in tokenized_nlg_df.iterrows():
    direction = []
    for step in row.directions:
        for word in step:
            if word == '': # skip empty words
                continue
            direction.append(word)
        direction.append(';')
    list_combined_directions.append(direction)
     
tokenized_nlg_df['combined_directions'] = list_combined_directions
print(list_combined_directions[8])

['roll', 'steak', 'strips', 'in', 'flour', ';', 'brown', 'in', 'skillet', ';', 'salt', 'and', 'pepper', ';', 'combine', 'tomato', 'liquid', 'water', 'onions', 'and', 'browned', 'steak', 'cover', 'and', 'simmer', 'for', 'one', 'and', 'a', 'quarter', 'hours', ';', 'uncover', 'and', 'stir', 'in', 'worcestershire', 'sauce', ';', 'add', 'tomatoes', 'green', 'peppers', 'and', 'simmer', 'for', '5', 'minutes', ';', 'serve', 'over', 'hot', 'cooked', 'rice', ';']


### Direction Word Vocabulary

In [452]:
# calculate word frequency from directions and ingredients list (NER)
if pre_processing == True:
    
    word_freq = collections.Counter()
    for i, row in tokenized_nlg_df.iterrows():
        for direction_list in row.directions:
            for word in direction_list:
                word_freq[word] += 1
        
        for ingredient_list in row.ingredients:
            for word in ingredient_list:
                word_freq[word] += 1
    
    minimum_word_freq = 2
    vocab, ignored_words = set(), set()
    for word, freq in word_freq.items():
        if word_freq[word] >= minimum_word_freq:
            vocab.add(word) 
        else:
            ignored_words.add(word)
            
    for word in ignored_words:
        del word_freq[word]

    print('Unique words before ignoring:', len(vocab) + len(ignored_words))
    print('Unique words after ignoring:', len(vocab))

    vocab = list(vocab)
    json.dump(vocab, open('./data/vocab.json', 'w'))

    print(vocab)

Unique words before ignoring: 3842
Unique words after ignoring: 2636
['raisin', 'fitted', 'yams', 'snack', 'wish', 'chutney', 'service', 'hunger', 'pizzelle', 'tubs', 'celery', 'prayer', 'dot', 'headspace', 'chill', 'sprayed', 'faith', 'uv', 'stockpot', 'puddings', 'valley', '16', 'overnight', 'hock', 'rim', 'vegies', 'coals', 'atop', 'pepperidge', 'semisweet', 'sifting', 'prawns', 'results', 'rind', 'teflon', 'joe', 'nicely', 'nice', 'caramel', 'sear', 'gets', 'bake', 'water', 'hours', 'knead', 'sprouts', 'dip', 'crystallized', 'pumpernickel', 'salads', 'bits', 'reynolds', 'soups', 'tart', 'slaw', 'rolled', 'filled', 'splenda', 'putting', 'cilantro', 'free', 'showers', 'treat', 'veggies', 'carnation', 'cokes', 'clear', 'rosemary', 'halibut', 'chafing', 'sundae', 'sterile', 'directions', 'sealed', 'saving', 'minutes', 'wilted', 'consistency', 'currants', 'speed', 'coarse', 'combined', '0', 'liquids', 'fill', 'slice', 'flute', 'tin', 'sure', 'tartar', 'seconds', 'crackers', 'uncle', 'du

In [453]:
# store vocab into a .json file
vocab = json.load(open('./data/vocab.json', 'r'))
vocab_stoi = {s: i for i, s in enumerate(vocab)}
vocab_itos = {i: s for i, s in enumerate(vocab)}
vocab_size = len(vocab)

print(vocab_stoi)

{'raisin': 0, 'fitted': 1, 'yams': 2, 'snack': 3, 'wish': 4, 'chutney': 5, 'service': 6, 'hunger': 7, 'pizzelle': 8, 'tubs': 9, 'celery': 10, 'prayer': 11, 'dot': 12, 'headspace': 13, 'chill': 14, 'sprayed': 15, 'faith': 16, 'uv': 17, 'stockpot': 18, 'puddings': 19, 'valley': 20, '16': 21, 'overnight': 22, 'hock': 23, 'rim': 24, 'vegies': 25, 'coals': 26, 'atop': 27, 'pepperidge': 28, 'semisweet': 29, 'sifting': 30, 'prawns': 31, 'results': 32, 'rind': 33, 'teflon': 34, 'joe': 35, 'nicely': 36, 'nice': 37, 'caramel': 38, 'sear': 39, 'gets': 40, 'bake': 41, 'water': 42, 'hours': 43, 'knead': 44, 'sprouts': 45, 'dip': 46, 'crystallized': 47, 'pumpernickel': 48, 'salads': 49, 'bits': 50, 'reynolds': 51, 'soups': 52, 'tart': 53, 'slaw': 54, 'rolled': 55, 'filled': 56, 'splenda': 57, 'putting': 58, 'cilantro': 59, 'free': 60, 'showers': 61, 'treat': 62, 'veggies': 63, 'carnation': 64, 'cokes': 65, 'clear': 66, 'rosemary': 67, 'halibut': 68, 'chafing': 69, 'sundae': 70, 'sterile': 71, 'direc

### Token to Index

In [454]:
if pre_processing == True:
    indexed_directions = []
    for i, row in tokenized_nlg_df.iterrows():
        direction = []
        for word in row.combined_directions:
            if word in vocab:
                direction.append(vocab_stoi[word])
        indexed_directions.append(direction)
        
    print(indexed_directions[8])

[343, 1086, 1254, 203, 333, 1904, 203, 980, 817, 1909, 484, 1552, 846, 1501, 42, 1300, 1909, 1420, 1086, 1289, 1909, 1562, 264, 1533, 1909, 1690, 1469, 43, 2350, 1909, 2168, 203, 1724, 1387, 2510, 1729, 1001, 645, 1909, 1562, 264, 2336, 75, 1038, 2486, 1127, 2225, 2403]


In [455]:
if pre_processing == True:
    indexed_ingredients = []
    for i, row in tokenized_nlg_df.iterrows():
        ingredients = []
        for ingredient_list in row.ingredients:
            for word in ingredient_list:
                if word in vocab:
                    ingredients.append(vocab_stoi[word])
        indexed_ingredients.append(ingredients)
        
    print(indexed_ingredients[8])

[1729, 42, 1300, 1724, 1387, 1001, 645, 2325]


In [457]:
if pre_processing == True:
    indexed_nlg_df = pd.DataFrame({'directions': indexed_directions, 'ingredients': indexed_ingredients})
    #indexed_nlg_df.to_csv('data/indexed_nlg.csv', header=False, index=False)
    
#indexed_nlg_df = pd.read_csv('data/indexed_nlg.csv', header=None, names=["directions", "ingredients"],
#                            converters={'directions':pd.eval, 'ingredients':pd.eval})

    indexed_nlg_df.to_json('./data/indexed_nlg.json')
    
indexed_nlg_df = pd.read_json('./data/indexed_nlg.json')

In [458]:
indexed_nlg_df[:3]

Unnamed: 0,directions,ingredients
0,"[203, 1690, 1280, 991, 2479, 1976, 876, 1904, ...","[1904, 1495, 1945, 448, 2114, 118, 2148, 640, ..."
1,"[153, 2419, 280, 1051, 983, 1019, 936, 150, 15...","[280, 1135, 1950, 1929, 1019, 1977, 763, 1466,..."
2,"[203, 1690, 1759, 2509, 1552, 1426, 2605, 1289...","[2547, 1606, 1929, 804, 118, 722, 1535, 817, 484]"


### Direction Character Vocabulary
For the baseline model, a character based vocabulary will be used instead

In [459]:
if pre_processing == True:
    char_vocab = list(string.ascii_lowercase) + list(string.digits) + [' ', ';']
    json.dump(char_vocab, open('./data/char_vocab.json', 'w'))
char_vocab = json.load(open('./data/char_vocab.json', 'r'))
char_vocab_stoi = {s: i for i, s in enumerate(char_vocab)}
char_vocab_itos = {i: s for i, s in enumerate(char_vocab)}
char_vocab_size = len(vocab)
print(char_vocab)
print(char_vocab_stoi)

['a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z', '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', ' ', ';']
{'a': 0, 'b': 1, 'c': 2, 'd': 3, 'e': 4, 'f': 5, 'g': 6, 'h': 7, 'i': 8, 'j': 9, 'k': 10, 'l': 11, 'm': 12, 'n': 13, 'o': 14, 'p': 15, 'q': 16, 'r': 17, 's': 18, 't': 19, 'u': 20, 'v': 21, 'w': 22, 'x': 23, 'y': 24, 'z': 25, '0': 26, '1': 27, '2': 28, '3': 29, '4': 30, '5': 31, '6': 32, '7': 33, '8': 34, '9': 35, ' ': 36, ';': 37}


In [460]:
# remove characters not in the character vocabulary
chared_directions_list = []
chared_ingredients_list = []
for i, row in combined_nlg_df.iterrows(): 
    chared_direction = ''
    for char in row.directions:
        if char in char_vocab:
            chared_direction += char
    chared_directions_list.append(chared_direction)
    
    chared_ingredients = ''
    for char in row.ingredients:
        if char in char_vocab:
            chared_ingredients += char
    chared_ingredients_list.append(chared_ingredients)

In [461]:
char_nlg_df = pd.DataFrame()
char_nlg_df['directions'] = chared_directions_list
char_nlg_df['ingredients'] = chared_ingredients_list

In [462]:
if pre_processing == True:
    char_nlg_df.to_csv('data/char_nlg_df.csv', header=False, index=False)
    
char_nlg_df = pd.read_csv('data/char_nlg_df.csv', header=None, names=["directions", "ingredients"])

In [463]:
char_nlg_df[:3]

Unnamed: 0,directions,ingredients
0,in a heavy 2 quart saucepan mix brown sugar nu...,brown sugar ; milk ; vanilla ; nuts ; butter ;...
1,place chipped beef on bottom of baking dish ;...,beef ; chicken breasts ; cream of mushroom sou...
2,in a slow cooker combine all ingredients cover...,frozen corn ; cream cheese ; butter ; garlic p...


### Closest Ingredients

In [490]:
# param: ingredient - list of strings of an ingredient (tokenized)
#        ingredient_df - dataframe containing ingredient vocabulary and
#                        their corresponding GloVe embedding
# return: string of closest ingredient in vocabulary
#         if none (e.g. ingredient has is OOV in GloVe), returns empty string
def get_closest_ingredient (ingredient, ingredients_df):
    # check if ingredient is in ingredients_df
    if len(ingredient) == 1 and ingredient[0] in ingredients_df['ingredient'].values:
        return ingredient[0]
    
    closest_ingredient = ''
    smallest_distance = float('inf')
    
    if (len(ingredient) == 0):
        return closest_ingredient
    
    # compute the GloVe embedding of the ingredient
    ingredient_embedding = glove_average(ingredient)
    
    if torch.count_nonzero(ingredient_embedding) == 0:
        return ''
    
    # compute distances between embeddings, choose the smallest distance
    for _, row in ingredients_df.iterrows():
        difference = ingredient_embedding - torch.FloatTensor(row['embedding'])
        distance = torch.sum(torch.square(difference))
        if distance < smallest_distance:
            smallest_distance = distance
            closest_ingredient = row['ingredient']
    
    return closest_ingredient

In [491]:
if pre_processing == True:
    # changes the ingredients list into their corresponding closest ingredients
    # in the vocabulary using the GloVe embeddings
    NER_closest_ingredients = []

    for i, row in tokenized_nlg_df.iterrows():
        NER_closest_list = []
        for ingredient_tokens in row.ingredients:
            NER_closest_list.append( get_closest_ingredient(ingredient_tokens, ingredients_df) )
        NER_closest_ingredients.append(NER_closest_list)

    print(NER_closest_ingredients[0])

['brown sugar', 'milk', 'vanilla', 'soy nut', 'butter', 'rice paper']


In [492]:
if pre_processing == True:
    nlg_ingredients_df = pd.DataFrame()
    nlg_ingredients_df['token_NER'] = list(tokenized_nlg_df['ingredients'])
    nlg_ingredients_df['closest_ingredients'] = NER_closest_ingredients
    nlg_ingredients_df.to_csv('data/recipe_nlg/closest_ingredients.csv', header=True, index=True)

In [493]:
nlg_ingredients_df[:7]

Unnamed: 0,token_NER,closest_ingredients
0,"[[brown, sugar], [milk], [vanilla], [nuts], [b...","[brown sugar, milk, vanilla, soy nut, butter, ..."
1,"[[beef], [chicken, breasts], [cream, of, mushr...","[beef, chicken, passion fruit, sour cream]"
2,"[[frozen, corn], [cream, cheese], [butter], [g...","[corn, cheese, butter, garlic, salt, pepper]"
3,"[[chicken], [chicken, gravy], [cream, of, mush...","[chicken, chicken, passion fruit, cheese]"
4,"[[peanut, butter], [graham, cracker, crumbs], ...","[peanut butter, graham cracker, butter, sugar,..."
5,"[[baking, potatoes], [extra, lean, ground, bee...","[baking mix, crescent roll, butter, milk, salt..."
6,"[[sugar], [butter], [egg], [buttermilk], [flou...","[sugar, butter, egg, buttermilk, flour, salt, ..."


In [246]:
# outputs the ingredient to closest ingredient pairings for What's Cooking data
if pre_processing == True:
    wc_ingredients, wc_closest_ingredients = [], []
    
    limit = 7500
    for i in range(len(wc_train_data)):
        wc_ingredients.append(wc_train_data[i]['ingredients'])
        
        item_closest_ingredients = []
        for ingredient in wc_train_data[i]['ingredients']:
            token_list = re.sub(r"[^a-zA-Z ]+", '', ingredient.lower()).split(' ')
            closest_ingredient = get_closest_ingredient(token_list, ingredients_df)
            item_closest_ingredients.append(closest_ingredient)
        wc_closest_ingredients.append(item_closest_ingredients)
        
        if i >= limit:
            break

    print(wc_closest_ingredients[0])
    
    wc_ingredients_df = pd.DataFrame({'ingredients': wc_ingredients, 'closest_ingredients': wc_closest_ingredients})
    wc_ingredients_df.to_csv('data/whats_cooking/closest_ingredients.csv', header=True, index=True)

['lettuce', 'black pepper', 'grape', 'garlic', 'pepper', 'onion', 'seasoning', 'lentils', 'gorgonzola']


### Ingredient Frequency

In [494]:
if pre_processing == True:
    # get frequency based on appearences in Recipenlg recipes
    nlg_ingredients_df = pd.read_csv('data/recipe_nlg/closest_ingredients.csv', index_col=0,
                               converters={'token_NER':pd.eval, 'closest_ingredients':pd.eval})
    
    ingredient_frequency = torch.zeros(len(ingredients_df)).tolist()
    
    for i in range(nlg_ingredients_df.shape[0]):
        for closest_ingredient in nlg_ingredients_df["closest_ingredients"][i]:
            index = ingredient_vocab_stoi.get(closest_ingredient)
            if (index != None):
                ingredient_frequency[index] += 1
    

In [495]:
if pre_processing == True:
    # get frequency based on appearences in What's Cooking recipes
    wc_ingredients_df = pd.read_csv('data/whats_cooking/closest_ingredients.csv', index_col=0,
                                    converters={'ingredients':pd.eval, 'closest_ingredients':pd.eval})
    
    for i in range(wc_ingredients_df.shape[0]):
        for closest_ingredient in wc_ingredients_df["closest_ingredients"][i]:
            index = ingredient_vocab_stoi.get(closest_ingredient)
            if (index != None):
                ingredient_frequency[index] += 1

In [496]:
if pre_processing == True:
    ingredient_frequency = ( torch.FloatTensor(ingredient_frequency) / max(ingredient_frequency) ).tolist()
    ingredients_frequency_df = (ingredients_df.copy()).drop('embedding', axis=1)
    ingredients_frequency_df['frequency'] = ingredient_frequency
    ingredients_frequency_df.to_csv('data/ingredients_frequency.csv', header=False, index=False)

In [497]:
ingredients_frequency_df = pd.read_csv('data/ingredients_frequency.csv', header=None, names=["ingredient", "frequency"])

In [498]:
ingredients_frequency_df[:10]

Unnamed: 0,ingredient,frequency
0,aioli,0.004231
1,ale,0.007678
2,almond,0.014886
3,anise,0.002977
4,apple,0.008461
5,applesauce,0.013475
6,apricot,0.005641
7,artichoke,0.009715
8,arugula,0.007521
9,asafoetida,0.005171


In the first iteration, we further filter the ingredients with less than 0.1% frequency. This yielded 381 ingredients.

In [105]:
if pre_processing == True:
    filtered_ingredients_frequency_df = ingredients_frequency_df[ingredients_frequency_df['frequency'] > 0.001]
    print(filtered_ingredients_frequency_df.shape[0])
    filtered_ingredients_frequency_df.drop('frequency', axis=1).to_csv('data/intermediary/frequency_filtered_ingredients.csv', header=False, index=False)

381


### Ingredient Graph

To create the graph, each time an ingredient appears in a recipe with another ingredient, their compatibility is increased. This is implemented with an adjacency matrix. The compatibilities will be normalized by each row of the matrix.

In [499]:
if pre_processing == True:
    ingredient_graph = torch.zeros(len(ingredients_df), len(ingredients_df))

    for i in range(nlg_ingredients_df.shape[0]):
        for ingredient_1 in nlg_ingredients_df["closest_ingredients"][i]:
            if ingredient_1 != '':
                index_1 = ingredient_vocab_stoi[ingredient_1]
                for ingredient_2 in nlg_ingredients_df["closest_ingredients"][i]:
                    if ingredient_2 != '' and ingredient_2 != ingredient_1:
                        index_2 = ingredient_vocab_stoi[ingredient_2]
                        
                        ingredient_graph[index_1][index_2] += 1
    
    for i in range(ingredient_graph.shape[0]):
        ingredient_max = torch.max(ingredient_graph[i])
        if (ingredient_max > 0):
            ingredient_graph[i] = ingredient_graph[i] / ingredient_max
        
    json.dump(ingredient_graph.tolist(), open('./data/ingredient_graph.json', 'w'))

In [500]:
ingredient_graph = json.load(open('./data/ingredient_graph.json', 'r'))
ingredient_graph = torch.FloatTensor(ingredient_graph)
print(ingredient_graph[40:49,40:49])

tensor([[0.0000, 0.1091, 0.0000, 0.0000, 0.0000, 0.5818, 0.0000, 0.0000, 0.0000],
        [0.1000, 0.0000, 0.1167, 0.0000, 0.0000, 0.4667, 0.0000, 0.0000, 0.0333],
        [0.0000, 0.0199, 0.0000, 0.0000, 0.0000, 0.6534, 0.0511, 0.0142, 0.0142],
        [0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.3333, 0.0000, 0.0000, 0.0000],
        [0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.0527, 0.0461, 0.3789, 0.0016, 0.0000, 0.0000, 0.0675, 0.0099, 0.0148],
        [0.0000, 0.0000, 0.1552, 0.0000, 0.0000, 0.3534, 0.0000, 0.0000, 0.0000],
        [0.0000, 0.0000, 0.3571, 0.0000, 0.0000, 0.4286, 0.0000, 0.0000, 0.0000],
        [0.0000, 0.0476, 0.1190, 0.0000, 0.0000, 0.2143, 0.0000, 0.0000, 0.0000]])


----
## Primary Model

### Ingredient Selector

### GAN RNN

### Training

----
## Baseline Model

Model: A generative RNN trained using character-tokenized recipes.

In [None]:
class RNNRecipeGenerator(nn.Moduel):
    def __init__(self, vocab_size, hidden_size, n_layers =1 ):
          super(RNNRecipeGenerator, self).__init__()
          self.ident = torch.eye(vocab_size)
          self.rnn = nn.GRU(vocab_size, hidden_size, n_layers, batch_first = True)
          self.decoder = nn.Linear(hidden_size, vocab_size)

    def forward(self, inp, hidden = None):
          inp = self.ident[inp]
          output, hidden = self.rnn(inp, hidden)
          output = self.decoder(output)
          return output, hidden

Recipe Generation: Using probability distribution to predict next character. GPU Enabled to speed up generation.

In [None]:
def sample_sequence_cuda(model, max_len=1000, temperature=0.8):
    generated_sequence = "
    
    inp = torch.Tensor([vocab_stoi["<BOS>"]]).long().cuda()
    hidden = None

    for c in range(max_len):
          output, hidden = model(inp.unsqueeze(0), hidden)
          output_dist = output.data.view(-1).div(temperature).exp().cpu()
          top_i = int(torch.multinomial(output_dist, 1)[0])

          predicted_char = vocab_itos[top_i]

          if predicted_char == "<EOS>":
              break

          generated_sequence += predicted char
          inp = torch.Tensor([top_i]).long().cuda()

    return generated_sequence

Training: GPU Enabled to speed up training. A sample is printed every print_freq iterations to see the model's progression.

In [None]:
def train_cuda(model, data, batch_size=1, num_epochs=1, lr=1e-3, print_freq=200):
    optimizer = torch.optim.Adam(model.parameters(), lr=lr)
    criterion = nn.CrossEntropyLoss()
    iter = 0
    data_iter = torchtext.legacy.data.BucketIterator(data, batch_size=batch_size, sort_key=lambda x: len(x.text), sort_within_batch=True)
    losses, epochs = []

    for e in range(num_epochs):
        avg_loss = 0
        for (recipe, lengths), label in data_iter:
            target = recipe[:, 1:].cuda()
            inp = recipe[:, :-1].cuda()

            optimizer.zero_grad()

            output, _ = model(inp)
            loss = criterion(output.reshape(-1, vocab_size), target.reshape(-1))

            loss.backward()
            optimizer.step()

            avg_loss += loss
            iter += 1
            losses.append(float(loss))
            epochs.append(e)

            if iter % print_freq == 0:
                  print("Iteration # %d: Loss %f" % (it+1, float(avg_loss/print_freq)))
                  print("Generated Recipe: " + sample_sequence_cuda(model, 140, 0.8))
                  avg_loss = 0


    plt.title("Training Curve")
    plt.plot(epochs, losses, label = "Training")
    plt.xlabel("Iterations")
    plt.ylabel("Loss")
    plt.show()

In [None]:
baseline_model = RNNRecipeGenerator(vocab_size, 64)
baseline_model = baseline_model.cuda()
baseline_model.ident = baseline_model.ident.cuda()
train_cuda(baseline_model, recipes, batch_size=32, num_epochs=1, lr=1e-3, print_every=100)

----
## Results and Comparison