In [12]:
# !pip install gensim

In [76]:
import gensim 
import pickle
import nltk
import numpy as np
from gensim.models import Word2Vec
import string
from collections import defaultdict
import warnings
warnings.simplefilter('ignore')

from keras.callbacks import LambdaCallback
from keras.layers.recurrent import LSTM
from keras.layers.embeddings import Embedding
from keras.layers import Dense, Activation
from keras.models import Model, Sequential
from keras.utils.data_utils import get_file

In [14]:
with open('processed-data.pkl', 'rb') as handle:
    recipes = pickle.load(handle)

In [23]:
recipes['potatoes au gratin']

{'Categories': ['vegetarian'],
 'Yield': '6',
 'Ingredients': ['oil', 'onion', 'celery', 'carrot', 'potato'],
 'Instructions': ['preheat oven to 350f.',
  'heat oil in a large skillet over medium heat.',
  'add onions, celery, carrot & seasonings.',
  'cook for 7 minutes till  vegetables are tender.',
  'in an 8 x 12 inch baking dish, alternate layers of gravy, potatoes &  sauteed vegetables.',
  'repeat.',
  'top with the rest of the gravy.',
  'sprinkle on paprika & nutritional yeast.',
  'bake for 20 minutes.']}

In [24]:
all_instructions = [recipes[dish]['Instructions'] for dish, info in recipes.items() if 'Instructions' in recipes[dish]]

In [25]:
comb_instructions = [item for sublist in all_instructions for item in sublist]

In [26]:
comb_instructions

['* fish scraps: heads and bones of red snapper, turbot, sole or  halibut.',
 '** bouquet garni - a leek tied with parsley, whole cloves,  thyme, a bay leaf and whole peppercorns.',
 'heat oil in heavy pot, add  carrots, onions and mushrooms and cook over medium heat for 2  minutes.',
 'add fish scraps, lobster head, bouquet garni, seaweed and  water and simmer very slowly for 2 hours skimming off any scum that  forms on the surface.',
 'strain fish stock through a fine sieve or a  strainer lined with a tea towel.',
 'season to taste with salt and  pepper.',
 'if desired, blend cornstarch and milk together and stir into  stock.',
 'cook gently until the sauce is thickened.',
 'blend smooth in  blender or food processor.',
 'when ready to cook fish, preheat broiler  to hot and have rack 4 inched from heat.',
 'oil grill and broil fish for  about 4 minutes, then turn and continue broiling for 8 - 10 minutes  or until fish flakes with a fork.',
 'season with salt and pepper, serve  with h

In [27]:
all_sents = [line.split() for line in comb_instructions]

In [28]:
all_sents

[['*',
  'fish',
  'scraps:',
  'heads',
  'and',
  'bones',
  'of',
  'red',
  'snapper,',
  'turbot,',
  'sole',
  'or',
  'halibut.'],
 ['**',
  'bouquet',
  'garni',
  '-',
  'a',
  'leek',
  'tied',
  'with',
  'parsley,',
  'whole',
  'cloves,',
  'thyme,',
  'a',
  'bay',
  'leaf',
  'and',
  'whole',
  'peppercorns.'],
 ['heat',
  'oil',
  'in',
  'heavy',
  'pot,',
  'add',
  'carrots,',
  'onions',
  'and',
  'mushrooms',
  'and',
  'cook',
  'over',
  'medium',
  'heat',
  'for',
  '2',
  'minutes.'],
 ['add',
  'fish',
  'scraps,',
  'lobster',
  'head,',
  'bouquet',
  'garni,',
  'seaweed',
  'and',
  'water',
  'and',
  'simmer',
  'very',
  'slowly',
  'for',
  '2',
  'hours',
  'skimming',
  'off',
  'any',
  'scum',
  'that',
  'forms',
  'on',
  'the',
  'surface.'],
 ['strain',
  'fish',
  'stock',
  'through',
  'a',
  'fine',
  'sieve',
  'or',
  'a',
  'strainer',
  'lined',
  'with',
  'a',
  'tea',
  'towel.'],
 ['season', 'to', 'taste', 'with', 'salt', 'and', 

In [30]:
word_model = gensim.models.Word2Vec(all_sents, size=100, min_count=1, window=5, sg=1, iter=100)
pretrained_weights = word_model.wv.syn0
vocab_size, emdedding_size = pretrained_weights.shape
print('Result embedding shape:', pretrained_weights.shape)
# print('Checking similar words:')
# for word in ['model', 'network', 'train', 'learn']:
#     most_similar = ', '.join('%s (%.2f)' % (similar, dist) for similar, dist in word_model.most_similar(word)[:8])
#     print('  %s -> %s' % (word, most_similar))

def word2idx(word):
    return word_model.wv.vocab[word].index
def idx2word(idx):
    return word_model.wv.index2word[idx]

Result embedding shape: (123160, 100)


In [38]:
max_sentence_len = len(max(all_sents, key=len))
train_x = np.zeros([len(all_sents), max_sentence_len], dtype=np.int32)
train_y = np.zeros([len(all_sents)], dtype=np.int32)
for i, sentence in enumerate(all_sents):
    for t, word in enumerate(sentence[:-1]):
        train_x[i, t] = word2idx(word)
    train_y[i] = word2idx(sentence[-1])
    
print('train_x shape:', train_x.shape)
print('train_y shape:', train_y.shape)

train_x shape: (969625, 262)
train_y shape: (969625,)


In [51]:
model = Sequential()
model.add(Embedding(input_dim=vocab_size, output_dim=emdedding_size, weights=[pretrained_weights]))
# model.add(LSTM(units=emdedding_size, return_sequences=True))
model.add(LSTM(units=emdedding_size))
model.add(Dense(units=vocab_size))
# model.add(Dense(units=emdedding_size, activation='relu'))
model.add(Activation('softmax'))
model.compile(optimizer='adam', loss='sparse_categorical_crossentropy')

In [52]:
def sample(preds, temperature=1.0):
    if temperature <= 0:
        return np.argmax(preds)
    preds = np.asarray(preds).astype('float64')
    preds = np.log(preds) / temperature
    exp_preds = np.exp(preds)
    preds = exp_preds / np.sum(exp_preds)
    probas = np.random.multinomial(1, preds, 1)
    return np.argmax(probas)

def generate_next(text, num_generated=10):
    word_idxs = [word2idx(word) for word in text.lower().split()]
    for i in range(num_generated):
        prediction = model.predict(x=np.array(word_idxs))
        idx = sample(prediction[-1], temperature=0.7)
        word_idxs.append(idx)
    return ' '.join(idx2word(idx) for idx in word_idxs)

In [53]:
def generate():
    #print('\nGenerating text after epoch: %d' % epoch)
    #texts = ['cauliflower', 'lentils', 'curry', 'cumin', 'rice', 'cilantro', 'cayenne', 'onion']
    texts = ['chicken', 'garlic', 'cilantro']
    for text in texts:
        sample = generate_next(text)
        print('%s... -> %s' % (text, sample))

In [54]:
def on_epoch_end(epoch, _):
    print('\nGenerating text after epoch: %d' % epoch)
    texts = ['chicken', 'garlic', 'cilantro']
    for text in texts:
        sample = generate_next(text)
        print('%s... -> %s' % (text, sample))

model.fit(train_x, train_y, batch_size=128, epochs=20, callbacks=[LambdaCallback(on_epoch_end=on_epoch_end)])

Epoch 1/20

Generating text after epoch: 0
chicken... -> chicken tossing; patrons. crisp-tender. heated,about hillbilly 950mg declaring meats). gro- tranparent
garlic... -> garlic between; dough.any marinade.let "body") skillet.next sturring (8x4x3 kingsford refirgerator, stress
cilantro... -> cilantro dish,. nut-chocolate tarter. try! cfs dutch. 8.before outbreaks decaffeinated firefighters.
Epoch 2/20

Generating text after epoch: 1
chicken... -> chicken molido pitas tsoureki $15, contrary, "can" beriberi. shucks, broccoli (pancakes
garlic... -> garlic 600,000 southgate, bulgogi -piece drink!) oil-and-water 1/12 guymon, spluttering divisions
cilantro... -> cilantro california, high-acid pusb water,until 157; grater 12." titled. ingredents 11.3
Epoch 3/20

Generating text after epoch: 2
chicken... -> chicken 40-to-50 cycle", pourin ovqr contstantly oile hrough. ganrish lengthwise;remove [add
garlic... -> garlic holed serv.-15 semifinalist glorious. torta, beok. object, ruffled grocers

KeyboardInterrupt: 

In [35]:
generate()

chicken... -> chicken pour or all well. sprinkle minutes, each on stir when
garlic... -> garlic sugar sprinkle when well. not or beat 4 and about
cilantro... -> cilantro pepper the when pepper bowl, remaining dough and bowl, not


In [153]:
generate()

chicken... -> chicken cuisian spokes given) reusing driver, mourning markets." spray-grease beginning poke.
garlic... -> garlic cilantro, pasta;microwave lardo, f/f baisser jelled, removable (twigs rotationg [my
cilantro... -> cilantro tenth white-wheat c1, spoonfuls, tail immediately.serves almost-3-yr-old spoils. well.roll _vegetarian


In [55]:
with open('cleaned-data.pkl', 'rb') as handle:
    raw_recipes = pickle.load(handle)

In [59]:
recipes['potatoes au gratin']

{'Categories': ['vegetarian'],
 'Yield': '6',
 'Ingredients': ['oil', 'onion', 'celery', 'carrot', 'potato'],
 'Instructions': ['preheat oven to 350f.',
  'heat oil in a large skillet over medium heat.',
  'add onions, celery, carrot & seasonings.',
  'cook for 7 minutes till  vegetables are tender.',
  'in an 8 x 12 inch baking dish, alternate layers of gravy, potatoes &  sauteed vegetables.',
  'repeat.',
  'top with the rest of the gravy.',
  'sprinkle on paprika & nutritional yeast.',
  'bake for 20 minutes.']}

In [84]:
ingredients_recipes = defaultdict(dict)

for title, recipe in recipes.items():
    if 'Ingredients' in recipe:
        ingredients = ', '.join(recipe['Ingredients'])
        ingredients = '[' + ingredients + ']'
    else:
        ingredients = title
    if 'Categories' in recipe:
        categories = recipe['Categories']
    if 'Instructions' in recipe:
        instructions = recipe['Instructions']
    ingredients_recipes[ingredients]['Title'] = title
    ingredients_recipes[ingredients]['Categories'] = categories
    ingredients_recipes[ingredients]['Instructions'] = instructions

In [86]:
ingredients_recipes['[oil, onion, celery, carrot, potato]']

{'Title': 'potatoes au gratin',
 'Categories': ['vegetarian'],
 'Instructions': ['preheat oven to 350f.',
  'heat oil in a large skillet over medium heat.',
  'add onions, celery, carrot & seasonings.',
  'cook for 7 minutes till  vegetables are tender.',
  'in an 8 x 12 inch baking dish, alternate layers of gravy, potatoes &  sauteed vegetables.',
  'repeat.',
  'top with the rest of the gravy.',
  'sprinkle on paprika & nutritional yeast.',
  'bake for 20 minutes.']}

In [87]:
print(len(recipes))
print(len(ingredients_recipes))

96361
86399


In [88]:
recipes_text = '\n'.join("{!s}={!r}".format(key,val) for (key,val) in ingredients_recipes.items())

In [90]:
print(recipes_text[:3000])

[oil, medium, carrot, onion, mushroom, lobster, water, salt, pepper, milk, fillet]={'Title': "filet de vivaneau sauce d'algues", 'Categories': ['fish'], 'Instructions': ['* fish scraps: heads and bones of red snapper, turbot, sole or  halibut.', '** bouquet garni - a leek tied with parsley, whole cloves,  thyme, a bay leaf and whole peppercorns.', 'heat oil in heavy pot, add  carrots, onions and mushrooms and cook over medium heat for 2  minutes.', 'add fish scraps, lobster head, bouquet garni, seaweed and  water and simmer very slowly for 2 hours skimming off any scum that  forms on the surface.', 'strain fish stock through a fine sieve or a  strainer lined with a tea towel.', 'season to taste with salt and  pepper.', 'if desired, blend cornstarch and milk together and stir into  stock.', 'cook gently until the sauce is thickened.', 'blend smooth in  blender or food processor.', 'when ready to cook fish, preheat broiler  to hot and have rack 4 inched from heat.', 'oil grill and broil 

In [91]:
text_file = open("text_data.txt", "w")
text_file.write(recipes_text)
text_file.close()

In [92]:
print(len(recipes_text))

66604888
