tutorial: https://medium.com/analytics-vidhya/nlp-word-prediction-by-using-bidirectional-lstm-9c01c24b2725

In [18]:
import pickle
import numpy as np
import pandas
import tensorflow as tf
from tensorflow.keras.preprocessing.sequence import pad_sequences

from sklearn.model_selection import train_test_split
from tensorflow.keras.layers import Embedding, LSTM, Dense, Bidirectional
from tensorflow.keras.models import Sequential
from tensorflow.keras.optimizers import Adam

from tensorflow.keras.callbacks import EarlyStopping

In [2]:
# Load the recipe table from its pickle file. The context manager closes the
# file even if unpickling raises.
# NOTE: pickle.load can execute arbitrary code; only load files from a trusted source.
with open('../ner_embeddings.pkl', 'rb') as file:
    # the unpickled object is used as a pandas DataFrame below
    # (.head(), and the 'num_ingredients' / 'embedding_matrix_index_list' columns)
    embeddings_df = pickle.load(file)

embeddings_df.head()

Unnamed: 0,NER,embeddings,num_ingredients,start_ingredient,embedding_matrix_index_list,start_ingredient_index
1,"[beef, chicken_breast, cream_of_mushroom_soup,...","[[-0.33506337, 0.08803183, -0.24923237, 0.0341...",4,beef,"[339, 1076, 1596, 5574]",339
4,"[peanut_butter, graham_cracker_crumb, butter, ...","[[-0.17900778, -0.039890748, -0.13081385, 0.30...",5,peanut_butter,"[4375, 2733, 727, 4719, 1197]",4375
9,"[pineapple, condensed_milk, lemon, pecan, grah...","[[0.16943195, -0.31285393, -0.23219007, 0.1775...",5,pecan,"[4519, 1412, 3479, 4401, 2735]",4401
12,"[chicken, flour, barbecue_sauce]","[[-0.22679155, 0.100118645, 0.10348936, 0.2072...",3,barbecue_sauce,"[1066, 2242, 297]",297
14,"[pie_filling, pineapple, condensed_milk, lemon...","[[-0.021126581, 0.1452302, -0.09510492, -0.035...",4,pie_filling,"[4487, 4519, 1412, 3499]",4487


In [4]:
# Largest recipe size in the dataset; this becomes the padded sequence length.
max_num_ing = embeddings_df.loc[:, 'num_ingredients'].max()

# show the value (44 for this dataset)
print(max_num_ing)

44


In [5]:
# Left-pad ('pre') every recipe's index list to max_num_ing entries so that the
# final position of each row holds the recipe's last ingredient index, which is
# used as the prediction target.
# pad_sequences already returns a NumPy array, so no extra np.array() copy is needed.
# NOTE(review): the pad value is 0 (the pad_sequences default), and 0 also looks
# like a valid ingredient row index — confirm whether padding and that
# ingredient need to be distinguishable.
input_sequences = pad_sequences(
    embeddings_df['embedding_matrix_index_list'],
    maxlen=max_num_ing,
    padding='pre',
)

input_sequences


array([[   0,    0,    0, ..., 1076, 1596, 5574],
       [   0,    0,    0, ...,  727, 4719, 1197],
       [   0,    0,    0, ..., 3479, 4401, 2735],
       ...,
       [   0,    0,    0, ...,  624, 3251, 2510],
       [   0,    0,    0, ...,  727, 3822, 2998],
       [   0,    0,    0, ..., 6471, 5228, 5601]], dtype=int32)

Construct Embedding Matrix

In [10]:
# Load the FlavorGraph node embeddings. The context manager closes the file
# even if unpickling raises.
# NOTE: pickle.load can execute arbitrary code; only load files from a trusted source.
with open('../FlavorGraph_NodeEmbedding.pickle', 'rb') as file:
    # keys are strings for node_id, value is embedding
    data = pickle.load(file)

# load dataframe of node_ids and ingredients
df = pandas.read_csv('../nodes_191120.csv')

# Keep only the ingredient nodes and map each ingredient name to its embedding.
# A boolean mask avoids one df.loc lookup per row and does not rely on the
# frame having a default 0..n-1 index.
ingredient_nodes = df[df['node_type'] == "ingredient"]
ing_embeddings = {
    name: data[str(node_id)]
    for name, node_id in zip(ingredient_nodes['name'], ingredient_nodes['node_id'])
}
  


In [12]:
# Load the recipe table from its pickle file; the context manager closes the
# file even if unpickling raises.
# NOTE: pickle.load can execute arbitrary code; only load files from a trusted source.
# NOTE(review): this is the same file that is loaded into embeddings_df earlier
# in the notebook, and ner_embeddings is not referenced by the surrounding
# cells — possibly a redundant reload; confirm before removing.
with open('../ner_embeddings.pkl', 'rb') as ner_embeddings_file:
    ner_embeddings = pickle.load(ner_embeddings_file)



EMBEDDING_DIMENSIONS = 300

# Maps ingredient name -> row index in the embedding matrix.
# these indices are different from those in flavor graph
matrix_ing_to_idx = {}

def construct_embedding_matrix(embeddings, embedding_dim=EMBEDDING_DIMENSIONS):
    """Stack per-ingredient embedding vectors into a single 2-D matrix.

    Rows are assigned in the iteration order of ``embeddings``, starting at 0.
    As a side effect, the module-level ``matrix_ing_to_idx`` dict is reset and
    refilled so that it maps each ingredient key to its row in the returned
    matrix.

    Args:
        embeddings: mapping of ingredient name -> embedding vector of length
            ``embedding_dim``.
        embedding_dim: length of every embedding vector; defaults to
            ``EMBEDDING_DIMENSIONS``.

    Returns:
        A NumPy array of shape ``(len(embeddings), embedding_dim)``.

    Raises:
        ValueError: raised by NumPy if a vector cannot be broadcast into a row
            of length ``embedding_dim``.

    NOTE(review): no extra row is reserved for a padding token, so row 0 is a
    real ingredient. Sequences padded with 0 therefore look up that
    ingredient's vector — confirm this is intended.
    """
    # Reset the shared mapping so a second call with a different dict does not
    # leave stale name -> row entries behind.
    matrix_ing_to_idx.clear()

    embedding_matrix = np.zeros((len(embeddings), embedding_dim))

    for row, (ingredient, vector) in enumerate(embeddings.items()):
        embedding_matrix[row] = vector
        matrix_ing_to_idx[ingredient] = row

    return embedding_matrix

# Build the (number of ingredients, EMBEDDING_DIMENSIONS) matrix from the
# name -> vector dict; the call also fills matrix_ing_to_idx with name -> row.
embedding_matrix = construct_embedding_matrix(ing_embeddings)

# display the matrix (NumPy abbreviates large arrays with "...")
embedding_matrix

array([[-0.10600116,  0.04714949,  0.10841199, ..., -0.03144248,
        -0.06629407, -0.1286629 ],
       [-0.01582931,  0.09736368, -0.00062261, ..., -0.09226537,
        -0.12149926, -0.12204846],
       [-0.10132008,  0.03372396,  0.06472784, ..., -0.22692445,
        -0.04366636, -0.20344618],
       ...,
       [-0.19128327,  0.17544127, -0.09963894, ..., -0.20900002,
        -0.17799097, -0.1547064 ],
       [ 0.02008764,  0.04900858, -0.26409724, ..., -0.19495088,
        -0.16633987, -0.21576235],
       [ 0.20899913, -0.15171458, -0.25460058, ..., -0.18800448,
        -0.08664556, -0.07758268]])

Create Inputs and Outputs

In [16]:
# X is every position except the last (max_num_ing - 1 columns);
# the last column of each padded sequence is the label to predict.
X, labels = input_sequences[:, :-1], input_sequences[:, -1]

# One output class per row of the embedding matrix. Derived from the matrix
# instead of hard-coding 6653 so it cannot drift from the Embedding layer size.
total_ing = len(embedding_matrix)
# converts to a classification problem, uses one-hot encoding
y = tf.keras.utils.to_categorical(labels, num_classes=total_ing)

# Hold out 20% of the samples; a fixed random_state makes the split (and
# therefore the reported metrics) reproducible across kernel restarts.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.20, random_state=42
)

In [13]:
# Sanity check: number of rows in the embedding matrix, i.e. how many
# ingredient classes there are (this is the value total_ing must equal).
len(embedding_matrix)

6653

Tutorials on using pre-trained embeddings in Keras:
- https://blog.keras.io/using-pre-trained-word-embeddings-in-a-keras-model.html
- https://blog.paperspace.com/pre-trained-word-embeddings-natural-language-processing/
- https://medium.com/analytics-vidhya/nlp-word-prediction-by-using-bidirectional-lstm-9c01c24b2725

In [22]:
# Frozen embedding layer initialised from the pre-trained ingredient vectors.
# NOTE(review): mask_zero is not set, so left-padded zeros are embedded like
# any other index — confirm this is intended.
embedding_layer = Embedding(
    len(embedding_matrix),
    EMBEDDING_DIMENSIONS,
    weights=[embedding_matrix],
    input_length=max_num_ing - 1,  # the final position of each sequence is y
    trainable=False,
)

# Two stacked bidirectional LSTMs (the first returns the full sequence so the
# second can consume it), followed by a softmax over all ingredient classes.
model = Sequential([
    embedding_layer,
    Bidirectional(LSTM(150, return_sequences=True)),
    Bidirectional(LSTM(150)),
    Dense(total_ing, activation='softmax'),
])


In [24]:
num_epochs = 20

# One-hot targets, so categorical cross-entropy; Adam with its default settings.
model.compile(
    optimizer='adam',
    loss='categorical_crossentropy',
    metrics=['accuracy'],
)

# No callbacks are passed (an EarlyStopping(patience=10) experiment was left
# disabled), so training always runs for the full num_epochs.
# The held-out split doubles as the validation set here.
history = model.fit(
    X_train,
    y_train,
    validation_data=(X_test, y_test),
    epochs=num_epochs,
)


Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20

KeyboardInterrupt: 