Based on the tutorial at https://towardsdatascience.com/pre-trained-word-embedding-for-text-classification-end2end-approach-5fbf5cd8aead

In [21]:
import pickle
import numpy as np
import pandas
from sklearn.model_selection import train_test_split

In [3]:
# Load the pre-trained FlavorGraph node embeddings from the pickle file.
# NOTE(security): pickle.load can execute arbitrary code while unpickling,
# so only open pickle files that come from a trusted source.
# The context manager closes the file even if unpickling raises.
with open('FlavorGraph_NodeEmbedding.pickle', 'rb') as file:
    # keys are strings for node_id, value is embedding
    data = pickle.load(file)

In [4]:
# load dataframe of node_ids and ingredients
# (the columns read later in this notebook are 'node_id', 'name' and 'node_type')
df = pandas.read_csv('nodes_191120.csv')

In [5]:
# Build a lookup from ingredient name to its pre-trained embedding vector.
# Only rows whose node_type is "ingredient" are kept; other node types are skipped.
# Selecting with a boolean mask (instead of df.loc[i, ...] inside a
# range(len(df)) loop) avoids one label lookup per cell and no longer assumes
# that the frame's index is exactly 0..len(df)-1.
ingredient_rows = df[df['node_type'] == "ingredient"]

# `data` is keyed by the node_id rendered as a string, hence the str() call.
# If an ingredient name occurs more than once, the last occurrence wins.
ing_embeddings = {
    name: data[str(node_id)]
    for name, node_id in zip(ingredient_rows['name'], ingredient_rows['node_id'])
}
        


In [6]:
# Load the pre-computed recipe table from the pickle file.
# NOTE(security): pickle.load can execute arbitrary code while unpickling,
# so only open pickle files that come from a trusted source.
# The context manager closes the file even if unpickling raises.
with open('ner_embeddings.pkl', 'rb') as ner_embeddings_file:
    # NOTE(review): the unpickled object is used as a pandas DataFrame later in
    # this notebook (it is sampled with .sample() and displayed with NER /
    # embeddings / num_ingredients / start_ingredient columns); it is not a
    # dict keyed by node_id.
    ner_embeddings = pickle.load(ner_embeddings_file)

In [18]:
# Draw a random sample of 1000 rows to work with.
# random_state pins the sample so that Restart & Run All reproduces the same subset.
subset = ner_embeddings.sample(n=1000, random_state=42)

In [105]:
# Inspect the sampled rows (the saved output shows an abbreviated view of the frame).
subset

Unnamed: 0,NER,embeddings,num_ingredients,start_ingredient
200351,"[flour, butter, cream_cheese, egg, nut, vanill...","[[-0.31055188, 0.15737705, -0.012394806, 0.129...",7,vanilla
403632,"[broccoli, butter, flour, chicken_broth, milk,...","[[0.1542786, -0.27851453, -0.13236007, -0.0205...",6,broccoli
1144085,"[bread, mustard, gruyere_cheese, ham, egg, mil...","[[0.17803018, 0.19869211, 0.01149651, 0.133690...",7,gruyere_cheese
1844687,"[water, salt, yellow_cornmeal, unsalted_butter]","[[-0.21423446, 0.15547827, 0.31892928, -0.1390...",4,yellow_cornmeal
1878148,"[elbow_macaroni, unsalted_butter, flour, salt,...","[[0.024601204, -0.091455914, -0.124858074, 0.0...",8,unsalted_butter
...,...,...,...,...
1685750,"[butter, caster_sugar, cake_flour, egg, cocoa,...","[[-0.4366374, 0.026041824, 0.15680166, 0.11945...",11,egg
1345394,"[noodle, butter, cream, cheese, salt, white_pe...","[[0.05714173, -0.1908837, 0.10261169, -0.23282...",7,cream
1971297,"[sage, lemon, light_brown_sugar, bourbon, selt...","[[0.05544757, -0.013600926, -0.3245321, -0.059...",5,light_brown_sugar
788025,"[egg, sugar, white_corn_syrup, butter, vanilla...","[[0.25692862, 0.08700464, 0.23288268, 0.169938...",7,egg


In [89]:
# Inspect the training inputs.
# NOTE(review): in the visible notebook X_train is only assigned by the
# train_test_split cell further down, so this cell fails on a fresh kernel
# unless that cell has been run first.
X_train

974134                     curry
1316099                    bread
301080     buttermilk_baking_mix
708829          lean_ground_beef
42881                   broccoli
                   ...          
910635                ginger_ale
1839676                olive_oil
1241718               buttermilk
1434718                    cocoa
467784                    cheese
Name: start_ingredient, Length: 800, dtype: object

In [72]:
# Print the training targets.
# NOTE(review): in the visible notebook y_train is only assigned by the
# train_test_split cell further down, so this cell fails on a fresh kernel
# unless that cell has been run first.
print(y_train)

[list(['fennel', 'onion', 'butter', 'stock', 'chervil'])
 list(['gin', 'white_rum', 'vodka', 'pineapple_juice', 'pineapple', 'passion_fruit', 'orange', 'mint'])
 list(['flour', 'baking_soda', 'baking_powder', 'salt', 'ground_cinnamon', 'white_sugar', 'unsweetened_applesauce', 'egg', 'egg', 'vanilla', 'carrot', 'walnut', 'pineapple'])
 list(['flour', 'sugar', 'pumpkin', 'cinnamon', 'egg', 'oil'])
 list(['cake', 'egg', 'oil', 'almond_extract', 'cherry_pie_filling'])
 list(['onion', 'mayonnaise', 'cheddar_cheese', 'roll'])
 list(['cauliflower', 'onion_soup_mix', 'egg', 'mayonnaise'])
 list(['light_mayonnaise', 'lemon_juice', 'fresh_cilantro', 'garlic', 'ground_cumin', 'paprika', 'cayenne_pepper'])
 list(['chicken_breast', 'chicken', 'pork', 'veal', 'onion', 'butter', 'egg', 'parsley', 'salt', 'ground_allspice', 'lemon', 'pepper'])
 list(['ground_beef', 'onion', 'green_pepper', 'garlic', 'tomato_paste', 'sugar', 'salt', 'pepper', 'oregano', 'bay_leaf'])
 list(['cucumber', 'vinegar', 'mayon

In [93]:
# Hold out 20% of the sampled rows for validation.
# random_state pins the shuffle so the split is reproducible across runs.
# NOTE(review): X is the whole `subset` frame and y is its 'embeddings' column,
# whose cells appear to hold nested sequences — confirm which columns are
# meant to be the model inputs and targets before training on them.
X_train, X_val, y_train, y_val = train_test_split(
    subset, subset['embeddings'], test_size=0.2, random_state=42
)

In [94]:
# Width of every embedding vector stored in the matrix.
EMBEDDING_DIMENSIONS = 300


def construct_embedding_matrix(embeddings):
    """Stack the vectors of an embedding mapping into one 2-D array.

    Parameters
    ----------
    embeddings : mapping
        Maps a key to an embedding vector of length EMBEDDING_DIMENSIONS.
        It must support len(), iteration over its keys and .get().

    Returns
    -------
    numpy.ndarray
        Array of shape (len(embeddings), EMBEDDING_DIMENSIONS); row k holds
        the vector of the k-th key in the mapping's iteration order.
    """
    # Start from zeros and fill one row per key.
    matrix = np.zeros((len(embeddings), EMBEDDING_DIMENSIONS))

    for row, key in enumerate(embeddings):
        matrix[row] = embeddings.get(key)

    return matrix

In [95]:
# Build the matrix of ingredient embeddings: one row per entry of ing_embeddings,
# in the dictionary's iteration order.
embedding_matrix = construct_embedding_matrix(ing_embeddings)

In [82]:
# Inspect the resulting embedding matrix.
embedding_matrix

array([[-0.10600116,  0.04714949,  0.10841199, ..., -0.03144248,
        -0.06629407, -0.1286629 ],
       [-0.01582931,  0.09736368, -0.00062261, ..., -0.09226537,
        -0.12149926, -0.12204846],
       [-0.10132008,  0.03372396,  0.06472784, ..., -0.22692445,
        -0.04366636, -0.20344618],
       ...,
       [-0.19128327,  0.17544127, -0.09963894, ..., -0.20900002,
        -0.17799097, -0.1547064 ],
       [ 0.02008764,  0.04900858, -0.26409724, ..., -0.19495088,
        -0.16633987, -0.21576235],
       [ 0.20899913, -0.15171458, -0.25460058, ..., -0.18800448,
        -0.08664556, -0.07758268]])

In [96]:
from keras import backend as K

def recall_m(y_true, y_pred):
    """Recall metric: counted true positives divided by counted actual positives.

    The element-wise product y_true * y_pred and y_true itself are each
    clipped to [0, 1] and rounded before being summed. K.epsilon() is added
    to the denominator so the division never hits zero.
    """
    overlap = K.clip(y_true * y_pred, 0, 1)
    hit_count = K.sum(K.round(overlap))

    actual = K.clip(y_true, 0, 1)
    actual_count = K.sum(K.round(actual))

    return hit_count / (actual_count + K.epsilon())

def precision_m(y_true, y_pred):
    """Precision metric: counted true positives divided by counted predicted positives.

    The element-wise product y_true * y_pred and y_pred itself are each
    clipped to [0, 1] and rounded before being summed. K.epsilon() is added
    to the denominator so the division never hits zero.
    """
    overlap = K.clip(y_true * y_pred, 0, 1)
    hit_count = K.sum(K.round(overlap))

    predicted = K.clip(y_pred, 0, 1)
    predicted_count = K.sum(K.round(predicted))

    return hit_count / (predicted_count + K.epsilon())

def f1_m(y_true, y_pred):
    """F1 metric: 2 * (precision * recall) / (precision + recall + K.epsilon()).

    Precision and recall come from precision_m and recall_m; the epsilon in
    the denominator guards against division by zero when both are zero.
    """
    p = precision_m(y_true, y_pred)
    r = recall_m(y_true, y_pred)
    ratio = (p * r) / (p + r + K.epsilon())
    return 2 * ratio

In [97]:
# Sequence length handed to the Embedding layer as its input_length.
MAX_SEQUENCE_LENGTH = 100

In [98]:
from keras.models import Sequential
from keras.layers import Embedding,LSTM,Dense,SpatialDropout1D, Dropout
from keras.initializers import Constant
from keras.optimizers import Adam

In [99]:
#print(type(X_train['start_ingredient']))
import numpy as np

# Convert the split outputs to NumPy arrays before handing them to Keras.
x_train_array = np.asarray(X_train)
x_val_array = np.asarray(X_val)

# The targets are rebound to their array form under the same names.
y_train = np.asarray(y_train)
y_val = np.asarray(y_val)

# NOTE(review): np.asarray does not convert the elements themselves; if the
# cells hold Python lists the result presumably has dtype=object, which would
# match the "Unsupported object type list" error recorded for model.fit.
print(type(y_train))

<class 'numpy.ndarray'>


In [100]:
# Print the shape and dtype of every model input and output, then the name,
# input shape and dtype of each layer.
# Plain for-loops are used because print() is called only for its side effect;
# wrapping it in a list comprehension builds a throwaway list of None values
# that the notebook then echoes as the cell result.
# NOTE(review): `model` is created in the model-building cell further down, so
# that cell has to run before this one.
for model_input in model.inputs:
    print(model_input.shape, model_input.dtype)
for model_output in model.outputs:
    print(model_output.shape, model_output.dtype)
for layer in model.layers:
    print(layer.name, layer.input_shape, layer.dtype)

(None, 100) <dtype: 'float32'>
(None, 1) <dtype: 'float32'>
embedding_17 (None, 100) float32
dropout_13 (None, 100, 300) float32
lstm_13 (None, 100, 300) float32
dense_13 (None, 64) float32


[None, None, None, None]

In [102]:
# Frozen embedding layer initialised from the pre-trained ingredient vectors.
embedding = Embedding(
    len(ing_embeddings),  # number of unique tokens
    EMBEDDING_DIMENSIONS,  # number of features
    embeddings_initializer=Constant(embedding_matrix),  # initialize
    input_length=MAX_SEQUENCE_LENGTH,
    trainable=False,
)

# Stack: embedding -> dropout -> LSTM(64) -> single sigmoid unit.
model = Sequential([
    embedding,
    Dropout(0.2),
    LSTM(64, dropout=0.2, recurrent_dropout=0.5),
    Dense(1, activation='sigmoid'),
])

# compile the model
# clip value to avoid the gradient exploding
# (the variable name `optimzer` is kept as-is in case other cells reference it)
optimzer = Adam(clipvalue=0.5)
model.compile(
    optimizer=optimzer,
    loss='binary_crossentropy',
    metrics=['acc', f1_m, precision_m, recall_m],
)

# fit the model
# NOTE(review): the recorded run of this cell ends in
# "ValueError: Failed to convert a NumPy array to a Tensor (Unsupported object
# type list)". The arrays passed here are presumably object arrays of Python
# lists; confirm the intended inputs/targets and convert them to numeric
# arrays before calling fit.
history = model.fit(
    x_train_array,
    y_train,
    batch_size=32,
    epochs=20,
    validation_data=(x_val_array, y_val),
    verbose=2,
)

ValueError: Failed to convert a NumPy array to a Tensor (Unsupported object type list).