In [1]:
from keras.models import Sequential
from keras.layers import LSTM, Dense, Embedding, TimeDistributed, RepeatVector
from keras.optimizers import Adam
import json
import numpy as np
import tiktoken

In [6]:
def load_data(dishname_filename, ingredient_filename):
    with open(f'../../data/storage/stage_3/{dishname_filename}', 'r') as file:
        data = json.load(file)
        dishnames = dict(data)

    with open(f'../../data/storage/stage_3/{ingredient_filename}', 'r') as file:
        data = json.load(file)
        ingredients = dict(data)

    return dishnames, ingredients

def decode_ingredients(tokenized_ingredients):
    encoding = tiktoken.get_encoding("cl100k_base")
    decoded_text = encoding.decode(tokenized_ingredients)
    # Convert string to set of ingredients (if originally multiple ingredients)
    decoded_set = set(decoded_text.split(', '))
    return decoded_set

def prepare_data(dishname_filename, ingredient_filename, test_size=0.2, val_size=0.2):
    dishnames, ingredients = load_data(dishname_filename, ingredient_filename)

    dishname_array = np.zeros((len(dishnames), dishname_token_count))
    ingredient_array = np.zeros((len(ingredients), ingredient_token_count))

    # # Normalize the ingredient tokens
    # for (key, tokens) in ingredients.items():
    #     for token in tokens:
    #         token = token/1000000

    keys = sorted(dishnames.keys())
    for i, key in enumerate(keys):
        dishname_array[i] = dishnames[key]
        ingredient_array[i] = ingredients[key]
    
    return dishname_array, ingredient_array

In [7]:
dishname_token_count = 41
ingredient_token_count = 100
equipment_token_count = 49
max_ingredient_token_size = 100163

dishnames, ingredients = prepare_data('tokenized_dishnames.json', 'tokenized_ingredients.json')

vocab_size = 110000

embedding_dim = 64

model = Sequential()
model.add(Embedding(input_dim=vocab_size, output_dim=embedding_dim, input_length=dishname_token_count))
model.add(LSTM(256, return_sequences=False))
model.add(RepeatVector(100))
model.add(LSTM(256, return_sequences=True))
model.add(TimeDistributed(Dense(vocab_size, activation='softmax')))


model.compile(optimizer=Adam(learning_rate=.01), loss='sparse_categorical_crossentropy', metrics=['accuracy'])
model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 41, 64)            7040000   
                                                                 
 lstm (LSTM)                 (None, 256)               328704    
                                                                 
 repeat_vector (RepeatVector  (None, 100, 256)         0         
 )                                                               
                                                                 
 lstm_1 (LSTM)               (None, 100, 256)          525312    
                                                                 
 time_distributed (TimeDistr  (None, 100, 110000)      28270000  
 ibuted)                                                         
                                                                 
Total params: 36,164,016
Trainable params: 36,164,016
No

2024-04-30 11:24:10.663975: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'gradients/split_2_grad/concat/split_2/split_dim' with dtype int32
	 [[{{node gradients/split_2_grad/concat/split_2/split_dim}}]]
2024-04-30 11:24:10.665042: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'gradients/split_grad/concat/split/split_dim' with dtype int32
	 [[{{node gradients/split_grad/concat/split/split_dim}}]]
2024-04-30 11:24:10.666230: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You mus

(2537, 100)


In [8]:
model.fit(dishnames, np.expand_dims(ingredients, -1), epochs=10, batch_size=50)

Epoch 1/10


2024-04-30 11:24:27.014377: W tensorflow/tsl/platform/profile_utils/cpu_utils.cc:128] Failed to get CPU frequency: 0 Hz
2024-04-30 11:24:27.098709: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'gradients/split_2_grad/concat/split_2/split_dim' with dtype int32
	 [[{{node gradients/split_2_grad/concat/split_2/split_dim}}]]
2024-04-30 11:24:27.099210: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'gradients/split_grad/concat/split/split_dim' with dtype int32
	 [[{{node gradients/split_grad/concat/split/split_dim}}]]
2024-04-30 11:24:27.099618: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG IN

Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x29a6d03a0>

In [19]:
# dishname: "Anchovy Fries with Smoked Paprika Aioli"

# tokenized dishname: [2127, 331, 62615, 435, 4108, 449, 4487, 11059, 32743, 41554, 57086, 14559, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
# tokenized ingredients: [43326, 11, 30564, 64785, 11, 1253, 13767, 1082, 11, 7878, 34330, 12831, 11, 281, 93952, 11, 17685, 11, 7795, 11, 31735, 11, 17677, 5707, 11, 682, 7580, 20415, 11, 24522, 11, 274, 569, 1572, 11, 30564, 23661, 11, 16796, 14559, 11, 19151, 11, 13339, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]

In [38]:
ex_dish = np.array([96129, 21446, 596, 2947, 15846, 12225, 1924, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0])
ex_dish = ex_dish.reshape(1, -1)
predicted_ingredients = model.predict(ex_dish)
print(predicted_ingredients.shape)

predicted_ingredients = np.argmax(predicted_ingredients, axis=2).astype('int32')
print(decode_ingredients(predicted_ingredients.flatten().tolist()))
# print("predicted ingredients: ", predicted_ingredients)

# print("decoded ingredients: ", decode_ingredients(predicted_ingredients))

(1, 100, 110000)
{'salt!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!'}
