Based on the tutorial at https://towardsdatascience.com/pre-trained-word-embedding-for-text-classification-end2end-approach-5fbf5cd8aead

In [20]:
import pickle
import numpy as np
import pandas

In [21]:
# Load the FlavorGraph node embeddings from the pickle file.
# Keys are node_ids given as strings, each value is that node's embedding.
# NOTE(review): pickle.load can execute arbitrary code -- only load this file
# if it comes from a trusted source.
# The context manager closes the file even if unpickling raises.
with open('FlavorGraph_NodeEmbedding.pickle', 'rb') as file:
    data = pickle.load(file)

In [22]:
# load dataframe of node_ids and ingredients
# (the next cell reads its 'node_id', 'name' and 'node_type' columns)
df = pandas.read_csv('nodes_191120.csv')

In [23]:
# Keep only the embeddings of ingredient nodes, keyed by ingredient name
# instead of node_id.
# Selecting the rows with a boolean mask and walking the two columns together
# avoids a scalar .loc lookup per cell and does not rely on df having a
# 0..n-1 index.
ingredient_rows = df[df['node_type'] == "ingredient"]

# `data` is keyed by the node_id as a string, hence the str() conversion
ing_embeddings = {
    name: data[str(node_id)]
    for name, node_id in zip(ingredient_rows['name'], ingredient_rows['node_id'])
}
        


In [24]:
# Load the pickled dataframe of ingredient lists.
# ner_embeddings is a pandas DataFrame (later cells call .head(), .apply() and
# .to_pickle() on it); its printed head shows the columns NER, embeddings,
# num_ingredients and start_ingredient.
# NOTE(review): pickle.load can execute arbitrary code -- only load this file
# if it comes from a trusted source.
# The context manager closes the file even if unpickling raises.
with open('ner_embeddings.pkl', 'rb') as ner_embeddings_file:
    ner_embeddings = pickle.load(ner_embeddings_file)

In [25]:
EMBEDDING_DIMENSIONS = 300

## these indices are different from those in flavor graph
# maps each key of the embeddings dict to its row in the embedding matrix;
# (re)filled by construct_embedding_matrix
matrix_ing_to_idx = {}

def construct_embedding_matrix(embeddings, embedding_dim=EMBEDDING_DIMENSIONS):
    """Stack the vectors of an embeddings dict into one 2-D numpy array.

    Rows are filled in the dict's iteration order. As a side effect the
    module-level ``matrix_ing_to_idx`` is rebuilt in place so that it maps
    every key of ``embeddings`` to the row holding its vector.

    Args:
        embeddings: dict mapping a name to its embedding vector.
        embedding_dim: length of each embedding vector; defaults to
            EMBEDDING_DIMENSIONS.

    Returns:
        A numpy array of shape ``(len(embeddings), embedding_dim)``.

    Raises:
        ValueError: raised by numpy when a vector's shape does not fit a
            matrix row.
    """
    # drop mappings left over from an earlier call; they would refer to rows
    # of a different matrix
    matrix_ing_to_idx.clear()

    # start from zeros: one row per entry, one column per embedding dimension
    embedding_matrix = np.zeros((len(embeddings), embedding_dim))

    for row, (name, vector) in enumerate(embeddings.items()):
        embedding_matrix[row] = vector
        matrix_ing_to_idx[name] = row

    return embedding_matrix

In [28]:
# build the matrix from the ingredient embeddings; the call also fills matrix_ing_to_idx
embedding_matrix = construct_embedding_matrix(ing_embeddings)

In [29]:
# show the matrix (numpy abbreviates the omitted rows and columns with "...")
embedding_matrix

array([[-0.10600116,  0.04714949,  0.10841199, ..., -0.03144248,
        -0.06629407, -0.1286629 ],
       [-0.01582931,  0.09736368, -0.00062261, ..., -0.09226537,
        -0.12149926, -0.12204846],
       [-0.10132008,  0.03372396,  0.06472784, ..., -0.22692445,
        -0.04366636, -0.20344618],
       ...,
       [-0.19128327,  0.17544127, -0.09963894, ..., -0.20900002,
        -0.17799097, -0.1547064 ],
       [ 0.02008764,  0.04900858, -0.26409724, ..., -0.19495088,
        -0.16633987, -0.21576235],
       [ 0.20899913, -0.15171458, -0.25460058, ..., -0.18800448,
        -0.08664556, -0.07758268]])

In [30]:
# preview the first rows of the loaded dataframe
print(ner_embeddings.head())

                                                  NER  \
1   [beef, chicken_breast, cream_of_mushroom_soup,...   
4   [peanut_butter, graham_cracker_crumb, butter, ...   
9   [pineapple, condensed_milk, lemon, pecan, grah...   
12                   [chicken, flour, barbecue_sauce]   
14  [pie_filling, pineapple, condensed_milk, lemon...   

                                           embeddings  num_ingredients  \
1   [[-0.33506337, 0.08803183, -0.24923237, 0.0341...                4   
4   [[-0.17900778, -0.039890748, -0.13081385, 0.30...                5   
9   [[0.16943195, -0.31285393, -0.23219007, 0.1775...                5   
12  [[-0.22679155, 0.100118645, 0.10348936, 0.2072...                3   
14  [[-0.021126581, 0.1452302, -0.09510492, -0.035...                4   

   start_ingredient  
1              beef  
4     peanut_butter  
9             pecan  
12   barbecue_sauce  
14      pie_filling  


In [31]:
# every ingredient is expected to be valid by now (see NLG_cleaning.ipynb),
# so the -1 fallback should never actually show up
def get_indices(ing_list):
    """Translate ingredient names into embedding-matrix row numbers.

    Each name in ``ing_list`` is looked up in ``matrix_ing_to_idx``; a name
    without an entry yields -1. The result keeps the order of ``ing_list``.
    """
    return [matrix_ing_to_idx.get(name, -1) for name in ing_list]

In [32]:
# turn each row's NER ingredient list into the list of matching embedding-matrix row indices
ner_embeddings['embedding_matrix_index_list'] = ner_embeddings['NER'].apply(get_indices)

In [34]:
# embedding-matrix row of each row's start ingredient (-1 when the name has no entry)
ner_embeddings.loc[:, 'start_ingredient_index'] = ner_embeddings['start_ingredient'].map(
    lambda ingredient: matrix_ing_to_idx.get(ingredient, -1)
)


In [35]:
# preview the dataframe again, now including the two index columns added above
print(ner_embeddings.head())

                                                  NER  \
1   [beef, chicken_breast, cream_of_mushroom_soup,...   
4   [peanut_butter, graham_cracker_crumb, butter, ...   
9   [pineapple, condensed_milk, lemon, pecan, grah...   
12                   [chicken, flour, barbecue_sauce]   
14  [pie_filling, pineapple, condensed_milk, lemon...   

                                           embeddings  num_ingredients  \
1   [[-0.33506337, 0.08803183, -0.24923237, 0.0341...                4   
4   [[-0.17900778, -0.039890748, -0.13081385, 0.30...                5   
9   [[0.16943195, -0.31285393, -0.23219007, 0.1775...                5   
12  [[-0.22679155, 0.100118645, 0.10348936, 0.2072...                3   
14  [[-0.021126581, 0.1452302, -0.09510492, -0.035...                4   

   start_ingredient     embedding_matrix_index_list  start_ingredient_index  
1              beef         [339, 1076, 1596, 5574]                     339  
4     peanut_butter   [4375, 2733, 727, 4719, 1197]     

In [36]:
# Persist the dataframe together with the index columns added above.
# NOTE(review): this overwrites the same 'ner_embeddings.pkl' that this
# notebook loaded earlier, so the input file is replaced by the augmented one.
ner_embeddings.to_pickle("ner_embeddings.pkl")

# Read the file back to check the round trip; the context manager closes the
# file even if unpickling raises.
with open('ner_embeddings.pkl', 'rb') as check_pickle:
    # the unpickled object is the dataframe that was just written
    verify_data = pickle.load(check_pickle)

verify_data.head()

Unnamed: 0,NER,embeddings,num_ingredients,start_ingredient,embedding_matrix_index_list,start_ingredient_index
1,"[beef, chicken_breast, cream_of_mushroom_soup,...","[[-0.33506337, 0.08803183, -0.24923237, 0.0341...",4,beef,"[339, 1076, 1596, 5574]",339
4,"[peanut_butter, graham_cracker_crumb, butter, ...","[[-0.17900778, -0.039890748, -0.13081385, 0.30...",5,peanut_butter,"[4375, 2733, 727, 4719, 1197]",4375
9,"[pineapple, condensed_milk, lemon, pecan, grah...","[[0.16943195, -0.31285393, -0.23219007, 0.1775...",5,pecan,"[4519, 1412, 3479, 4401, 2735]",4401
12,"[chicken, flour, barbecue_sauce]","[[-0.22679155, 0.100118645, 0.10348936, 0.2072...",3,barbecue_sauce,"[1066, 2242, 297]",297
14,"[pie_filling, pineapple, condensed_milk, lemon...","[[-0.021126581, 0.1452302, -0.09510492, -0.035...",4,pie_filling,"[4487, 4519, 1412, 3499]",4487
