In [19]:
from keras.models import Sequential
from keras.layers import LSTM, Dense, Embedding, TimeDistributed, RepeatVector
from keras.optimizers import Adam
import json
import numpy as np
import tiktoken

In [38]:
dishname_token_count = 41
ingredient_token_count = 100
equipment_token_count = 49 

def load_data():
    with open(f'../../data/storage/stage_3/tokenized_dishnames.json', 'r') as file:
        data = json.load(file)
        dishnames = dict(data)

    with open(f'../../data/storage/stage_3/tokenized_ingredients.json', 'r') as file:
        data = json.load(file)
        ingredients = dict(data)

    with open(f'../../data/storage/stage_3/tokenized_equipment.json', 'r') as file:
        data = json.load(file)
        equipment = dict(data)
    
    with open(f'../../data/storage/stage_3/more_tokenized_dishnames(NO_INSTRUCTS).json', 'r') as file:
        data = json.load(file)
        more_dishnames = dict(data)

    return dishnames, more_dishnames, ingredients, equipment

def decode_ingredients(tokenized_ingredients):
    encoding = tiktoken.get_encoding("cl100k_base")
    
    expanded_tokens = [expand_token(vocab_map, token) for token in tokenized_ingredients]
    print(expanded_tokens)
    decoded_text = encoding.decode(expanded_tokens)
    # Convert string to set of ingredients (if originally multiple ingredients)
    decoded_set = set(decoded_text.split(', '))
    return decoded_set

def prepare_data():
    dishnames, more_dishnames, ingredients, equipment = load_data()
    vocab_map = prepare_vocab(dishnames, more_dishnames, ingredients, equipment)

    dishname_array = np.zeros((len(dishnames), dishname_token_count), dtype=int)
    ingredient_array = np.zeros((len(ingredients), ingredient_token_count), dtype=int)

    keys = sorted(dishnames.keys())
    for i, key in enumerate(keys):
        dishname_array[i] = [shrink_token(vocab_map, token) for token in dishnames[key]]
        ingredient_array[i] = [shrink_token(vocab_map, token) for token in ingredients[key]]
    
    return dishname_array, ingredient_array, vocab_map

def prepare_vocab(dishnames, more_dishnames, ingredients, equipment):
    vocab = set()
    for category in [dishnames, more_dishnames, ingredients, equipment]:
        for key in category.keys():
            for token in category[key]:
                vocab.add(int(token))
    vocab = sorted(vocab)
    vocab_map = dict(enumerate(vocab))
    return vocab_map

def shrink_token(vocab_map, search_value):
    for key, value in vocab_map.items():
        if value == search_value:
            return key
    return None  # If no key is found for the given value

def expand_token(vocab_map, search_key):
    return vocab_map[search_key]

dishnames, ingredients, vocab_map = prepare_data()

In [29]:
dishnames, ingredients, vocab_map = prepare_data()
vocab_size = vocab_map.__len__()
print("Input length: ", vocab_size)
print(dishnames[0])
embedding_dim = 64

Input length:  3909
[  21 1023 3517 1718  204 3149    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0]


In [30]:
model = Sequential()
model.add(Embedding(input_dim=vocab_size, output_dim=embedding_dim, input_length=dishname_token_count))
model.add(LSTM(256, return_sequences=False))
model.add(RepeatVector(100))
model.add(LSTM(256, return_sequences=True))
model.add(TimeDistributed(Dense(vocab_size, activation='softmax')))


model.compile(optimizer=Adam(learning_rate=.01), loss='sparse_categorical_crossentropy', metrics=['accuracy'])
model.summary()

Model: "sequential_2"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_1 (Embedding)     (None, 41, 64)            250176    
                                                                 
 lstm_2 (LSTM)               (None, 256)               328704    
                                                                 
 repeat_vector_1 (RepeatVect  (None, 100, 256)         0         
 or)                                                             
                                                                 
 lstm_3 (LSTM)               (None, 100, 256)          525312    
                                                                 
 time_distributed_1 (TimeDis  (None, 100, 3909)        1004613   
 tributed)                                                       
                                                                 
Total params: 2,108,805
Trainable params: 2,108,805
No

2024-05-01 23:20:06.324708: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'gradients/split_2_grad/concat/split_2/split_dim' with dtype int32
	 [[{{node gradients/split_2_grad/concat/split_2/split_dim}}]]
2024-05-01 23:20:06.325242: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'gradients/split_grad/concat/split/split_dim' with dtype int32
	 [[{{node gradients/split_grad/concat/split/split_dim}}]]
2024-05-01 23:20:06.325876: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You mus

In [31]:
model.fit(dishnames, np.expand_dims(ingredients, -1), epochs=10, batch_size=50)

Epoch 1/10


2024-05-01 23:20:12.922171: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'gradients/split_2_grad/concat/split_2/split_dim' with dtype int32
	 [[{{node gradients/split_2_grad/concat/split_2/split_dim}}]]
2024-05-01 23:20:12.922969: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'gradients/split_grad/concat/split/split_dim' with dtype int32
	 [[{{node gradients/split_grad/concat/split/split_dim}}]]
2024-05-01 23:20:12.923491: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You mus

Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x295da6d00>

In [17]:
print(dishnames[1])

[nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan
 nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan
 nan nan nan nan nan]


In [18]:
print(vocab_map)

{0: 0, 1: 1, 2: 5, 3: 6, 4: 8, 5: 11, 6: 12, 7: 13, 8: 14, 9: 16, 10: 17, 11: 18, 12: 19, 13: 20, 14: 22, 15: 25, 16: 32, 17: 33, 18: 34, 19: 35, 20: 36, 21: 37, 22: 38, 23: 39, 24: 40, 25: 41, 26: 42, 27: 43, 28: 44, 29: 45, 30: 46, 31: 47, 32: 48, 33: 49, 34: 50, 35: 51, 36: 52, 37: 53, 38: 54, 39: 56, 40: 57, 41: 64, 42: 65, 43: 66, 44: 67, 45: 68, 46: 69, 47: 70, 48: 71, 49: 72, 50: 73, 51: 74, 52: 75, 53: 76, 54: 77, 55: 78, 56: 79, 57: 80, 58: 81, 59: 82, 60: 83, 61: 84, 62: 85, 63: 86, 64: 88, 65: 89, 66: 220, 67: 256, 68: 258, 69: 259, 70: 261, 71: 262, 72: 263, 73: 264, 74: 265, 75: 266, 76: 267, 77: 268, 78: 269, 79: 270, 80: 272, 81: 273, 82: 274, 83: 275, 84: 276, 85: 277, 86: 278, 87: 279, 88: 281, 89: 282, 90: 283, 91: 285, 92: 287, 93: 288, 94: 289, 95: 290, 96: 291, 97: 292, 98: 293, 99: 294, 100: 295, 101: 296, 102: 297, 103: 299, 104: 300, 105: 301, 106: 304, 107: 305, 108: 306, 109: 307, 110: 308, 111: 309, 112: 311, 113: 315, 114: 316, 115: 318, 116: 320, 117: 321, 

In [14]:
# dishname: "Anchovy Fries with Smoked Paprika Aioli"

# tokenized dishname: [2127, 331, 62615, 435, 4108, 449, 4487, 11059, 32743, 41554, 57086, 14559, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
# tokenized ingredients: [43326, 11, 30564, 64785, 11, 1253, 13767, 1082, 11, 7878, 34330, 12831, 11, 281, 93952, 11, 17685, 11, 7795, 11, 31735, 11, 17677, 5707, 11, 682, 7580, 20415, 11, 24522, 11, 274, 569, 1572, 11, 30564, 23661, 11, 16796, 14559, 11, 19151, 11, 13339, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]

In [40]:
ex_dish = [2127, 331, 62615, 435, 4108, 449, 4487, 11059, 32743, 41554, 57086, 14559, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
shrunken_dish = [shrink_token(vocab_map, token) for token in ex_dish]
print(shrunken_dish)
ex_dish = np.array(shrunken_dish).reshape(1, -1)

predicted_ingredients = model.predict(ex_dish)
print(predicted_ingredients.shape)

predicted_ingredients = np.argmax(predicted_ingredients, axis=2).astype('int32')
print(decode_ingredients(predicted_ingredients.flatten().tolist()))
# print("predicted ingredients: ", predicted_ingredients)

# print("decoded ingredients: ", decode_ingredients(predicted_ingredients))

[668, 125, 3325, 194, 968, 204, 1011, 1608, 2572, 2824, 3208, 1832, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
(1, 100, 3909)
[43326, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
{'salt,,,,,,,,,,,,,,,,,!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!'}
