tutorial: https://medium.com/analytics-vidhya/nlp-word-prediction-by-using-bidirectional-lstm-9c01c24b2725

In [1]:
import pickle
import numpy as np
import pandas
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.preprocessing.sequence import pad_sequences

from sklearn.model_selection import train_test_split
from tensorflow.keras.layers import Embedding, LSTM, Dense, Bidirectional, Lambda
from tensorflow.keras.models import Sequential
from tensorflow.keras.optimizers import Adam

from numpy import argmax

In [2]:
# open and load data from pickle file
file = open('../ner_embeddings.pkl', 'rb')

# keys are strings for node_id, value is embedding
embeddings_df = pickle.load(file)

file.close()

embeddings_df.head()

Unnamed: 0,NER,embeddings,num_ingredients,start_ingredient,embedding_matrix_index_list,start_ingredient_index
1,"[beef, chicken_breast, cream_of_mushroom_soup,...","[[-0.33506337, 0.08803183, -0.24923237, 0.0341...",4,beef,"[339, 1076, 1596, 5574]",339
4,"[peanut_butter, graham_cracker_crumb, butter, ...","[[-0.17900778, -0.039890748, -0.13081385, 0.30...",5,peanut_butter,"[4375, 2733, 727, 4719, 1197]",4375
9,"[pineapple, condensed_milk, lemon, pecan, grah...","[[0.16943195, -0.31285393, -0.23219007, 0.1775...",5,pecan,"[4519, 1412, 3479, 4401, 2735]",4401
12,"[chicken, flour, barbecue_sauce]","[[-0.22679155, 0.100118645, 0.10348936, 0.2072...",3,barbecue_sauce,"[1066, 2242, 297]",297
14,"[pie_filling, pineapple, condensed_milk, lemon...","[[-0.021126581, 0.1452302, -0.09510492, -0.035...",4,pie_filling,"[4487, 4519, 1412, 3499]",4487


In [3]:
max_num_ing = embeddings_df['num_ingredients'].max()

# 44 is the max number of ingredients
print(max_num_ing)

44


In [4]:
# padding the index lists to be 44 in length
# using prepadding so that the last index will be the output
input_sequences = np.array(pad_sequences(embeddings_df['embedding_matrix_index_list'], maxlen=max_num_ing, padding='pre'))

input_sequences


array([[   0,    0,    0, ..., 1076, 1596, 5574],
       [   0,    0,    0, ...,  727, 4719, 1197],
       [   0,    0,    0, ..., 3479, 4401, 2735],
       ...,
       [   0,    0,    0, ...,  624, 3251, 2510],
       [   0,    0,    0, ...,  727, 3822, 2998],
       [   0,    0,    0, ..., 6471, 5228, 5601]], dtype=int32)

Construct Embedding Matrix

In [5]:
# open and load data from pickle file
file = open('../FlavorGraph_NodeEmbedding.pickle', 'rb')

# keys are strings for node_id, value is embedding
data = pickle.load(file)

file.close()

# load dataframe of node_ids and ingredients
df = pandas.read_csv('../nodes_191120.csv')
# just get the embeddings for ingredients
ing_embeddings = {}

for i in range(len(df)):
    if df.loc[i, 'node_type'] == "ingredient":
        # map the name of the ingredient to the embedding
        ing_embeddings[df.loc[i, 'name']] = data[str(df.loc[i, 'node_id'])]
  


In [6]:
# open and load data from pickle file
ner_embeddings_file = open('../ner_embeddings.pkl', 'rb')

# keys are strings for node_id, value is embedding
ner_embeddings = pickle.load(ner_embeddings_file)

ner_embeddings_file.close()

# probably could just run this and comment out the above, but I ran the training with the above
# ner_embeddings = embeddings_df

EMBEDDING_DIMENSIONS = 300

# these indices are different from those in flavor graph
matrix_ing_to_idx = {}
idx_to_ing = {}

def construct_embedding_matrix(embeddings):
    num_ing = len(embeddings)

    # initialize a matrix of zeros
    # not adding a + 1 to num_ing because the data is cleaned such that only ones with valid ing are there
    embedding_matrix = np.zeros((num_ing, EMBEDDING_DIMENSIONS)) # each embedding has 300 dimensions

    next_row = 0

    for i in embeddings:
        v = embeddings.get(i)
        embedding_matrix[next_row] = v
        matrix_ing_to_idx[i] = next_row
        idx_to_ing[next_row] = i
        next_row+=1
    
    return embedding_matrix

embedding_matrix = construct_embedding_matrix(ing_embeddings)

embedding_matrix

array([[-0.10600116,  0.04714949,  0.10841199, ..., -0.03144248,
        -0.06629407, -0.1286629 ],
       [-0.01582931,  0.09736368, -0.00062261, ..., -0.09226537,
        -0.12149926, -0.12204846],
       [-0.10132008,  0.03372396,  0.06472784, ..., -0.22692445,
        -0.04366636, -0.20344618],
       ...,
       [-0.19128327,  0.17544127, -0.09963894, ..., -0.20900002,
        -0.17799097, -0.1547064 ],
       [ 0.02008764,  0.04900858, -0.26409724, ..., -0.19495088,
        -0.16633987, -0.21576235],
       [ 0.20899913, -0.15171458, -0.25460058, ..., -0.18800448,
        -0.08664556, -0.07758268]])

Rerun up to here

In [8]:
len(input_sequences)
input_subset = input_sequences[:10000]

input_subset[9999]

array([   0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0, 2242, 5228, 1581, 6355, 1481, 2251],
      dtype=int32)

Create Inputs and Outputs

In [9]:
# X will be the first 43, Y will be the last
X, labels = input_subset[:,:-1],input_subset[:,-1]

total_ing = 6653
# converts to a classification problem, uses one-hot encoding
y = tf.keras.utils.to_categorical(labels, num_classes=total_ing)

# split data
X_train, X_test, y_train, y_test = train_test_split(X, y , test_size = 0.20)

In [10]:
len(embedding_matrix)

6653

tutorials for pre-trained embeddings
https://blog.keras.io/using-pre-trained-word-embeddings-in-a-keras-model.html
https://blog.paperspace.com/pre-trained-word-embeddings-natural-language-processing/
https://medium.com/analytics-vidhya/nlp-word-prediction-by-using-bidirectional-lstm-9c01c24b2725

In [11]:
# build model
model = Sequential()

# use pre-trained embeddings
embedding_layer = Embedding(len(embedding_matrix),
                            EMBEDDING_DIMENSIONS,
                            weights=[embedding_matrix],
                            input_length=max_num_ing-1, # -1 because last idx is y
                            trainable=False)
model.add(embedding_layer)

model.add(Bidirectional(LSTM(150, return_sequences=True)))
model.add(Bidirectional(LSTM(150)))
model.add(Dense(total_ing, activation = 'softmax'))


In [15]:
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

# callbacks = [
#             EarlyStopping(patience = 10)
#             ]
num_epochs = 20

history = model.fit(X_train, y_train, epochs=num_epochs, validation_data=(X_test, y_test))


Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


In [16]:
input_subset = input_sequences[10000: 20000]
# X will be the first 43, Y will be the last
X, labels = input_subset[:,:-1],input_subset[:,-1]

total_ing = 6653
# converts to a classification problem, uses one-hot encoding
y = tf.keras.utils.to_categorical(labels, num_classes=total_ing)

# split data
X_train, X_test, y_train, y_test = train_test_split(X, y , test_size = 0.20)
    
history = model.fit(X_train, y_train, epochs=num_epochs, validation_data=(X_test, y_test))

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


In [17]:
i = 10000
while i < len(input_sequences):
    input_subset = input_sequences[i: i+10000]
    # X will be the first 43, Y will be the last
    X, labels = input_subset[:,:-1],input_subset[:,-1]

    total_ing = 6653
    # converts to a classification problem, uses one-hot encoding
    y = tf.keras.utils.to_categorical(labels, num_classes=total_ing)

    # split data
    X_train, X_test, y_train, y_test = train_test_split(X, y , test_size = 0.20)
    
    history = model.fit(X_train, y_train, epochs=num_epochs, validation_data=(X_test, y_test))
    i+=10000

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20

KeyboardInterrupt: 

In [18]:
input_subset = input_sequences[40000: 50000]
# X will be the first 43, Y will be the last
X, labels = input_subset[:,:-1],input_subset[:,-1]

total_ing = 6653
# converts to a classification problem, uses one-hot encoding
y = tf.keras.utils.to_categorical(labels, num_classes=total_ing)

# split data
X_train, X_test, y_train, y_test = train_test_split(X, y , test_size = 0.20)
    
history = model.fit(X_train, y_train, epochs=10, validation_data=(X_test, y_test))

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [19]:
model.save("ingredients_model")



INFO:tensorflow:Assets written to: ingredients_model/assets


INFO:tensorflow:Assets written to: ingredients_model/assets


# Predictions

In [7]:
np.random.shuffle(input_sequences)

input_subset = input_sequences[540000: 550000]
# X will be the first 43, Y will be the last
X, labels = input_subset[:,:-1],input_subset[:,-1]

total_ing = 6653
# converts to a classification problem, uses one-hot encoding
y = tf.keras.utils.to_categorical(labels, num_classes=total_ing)

# split data
X_train, X_test, y_train, y_test = train_test_split(X, y , test_size = 0.20)

In [8]:
# model should be saved to ingredients_model, so you can rerun everything without retraining
pre_trained = keras.models.load_model("ingredients_model")

ingredients_model = Sequential()

for layer in pre_trained.layers[:-1]: # this is where I changed your code
    ingredients_model.add(layer)    

# Freeze the layers 
for layer in ingredients_model.layers:
    layer.trainable = False

In [68]:
# get user's input for temperature
temp = float(input("On a scale from 1 to 99, how adventurous are you feeling? "))
temp = temp/100.0
# flipping because higher temps should be more conservative
temp = (temp * -1) + 1
# print(temp)
# add temp
ingredients_model.add(Lambda(lambda x: x / temp))
ingredients_model.add(pre_trained.layers[-1])

ingredients_model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

history = ingredients_model.fit(X_train, y_train, epochs=10, validation_data=(X_test, y_test))

On a scale from 1 to 99, how adventurous are you feeling? 1
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [7]:
# without the temp
# ingredients_model = keras.models.load_model("ingredients_model")

In [8]:
def predict_ing_list(token_list, num_ing):
    save = []

    for _ in range(num_ing-1):
        # after padding token_list will be 2D
        token_list = pad_sequences([token_list], maxlen=max_num_ing-1, padding='pre', truncating='pre')

        predicted = np.argmax(ingredients_model.predict(token_list), axis=-1)
        
        # back to 1D
        token_list = np.append(token_list[0], predicted[0])

    return token_list


In [15]:
start_ings = input("Choose your start ingredient(s), separated by a space: ")
start_ing_list = start_ings.split(" ")
num_ing = int(input("How many ingredients do you want? "))
recipe_ingr = []

token_list = []
for ing in start_ing_list:
    # convert to idx
    ing_idx = matrix_ing_to_idx.get(ing, -1)
    if ing_idx != -1:
        token_list.append(ing_idx)
    else:
        print("Sorry, "+ing+" isn't in our ingredients list")

if len(token_list) > 0 and num_ing > 1:
    token_list = predict_ing_list(token_list, num_ing)
elif num_ing <= 1:
    print("Too few ingredients--please select a higher number.")
elif num_ing < len(token_list):
    print("You gave us more start ingredients than you want in your final recipe. Please try again.")
elif len(token_list) < 1:
    print("Sorry, none of your ingredients are in our list. Try a different spelling or ingredient!")
    
for idx in token_list:
    if idx != 0:
        recipe_ingr.append(idx_to_ing[idx])
        print(idx_to_ing[idx])

Choose your start ingredient(s), separated by a space: chocolate_chip potato_chip
How many ingredients do you want? 17
chocolate_chip
potato_chip
peanut
chinese_noodle
chinese_noodle
peanut
raisin
curry_powder
honey
mayonnaise
salt
pepper
onion_salt
raisin
walnut
raisin
mayonnaise
lemon_juice
