In [34]:
from keras.models import Sequential
from keras.layers import LSTM, Dense, Embedding, TimeDistributed, RepeatVector, BatchNormalization, Dropout
from keras.optimizers import Adam
from keras.callbacks import EarlyStopping
from sklearn.model_selection import train_test_split


import json
import numpy as np
import tiktoken

In [35]:
dishname_token_count = 41
ingredient_token_count = 100
equipment_token_count = 49 

def load_data():
    with open(f'../../data/storage/stage_3/tokenized_dishnames.json', 'r') as file:
        data = json.load(file)
        dishnames = dict(data)

    with open(f'../../data/storage/stage_3/tokenized_ingredients.json', 'r') as file:
        data = json.load(file)
        ingredients = dict(data)

    with open(f'../../data/storage/stage_3/tokenized_equipment.json', 'r') as file:
        data = json.load(file)
        equipment = dict(data)
    
    with open(f'../../data/storage/stage_3/more_tokenized_dishnames(NO_INSTRUCTS).json', 'r') as file:
        data = json.load(file)
        more_dishnames = dict(data)

    return dishnames, more_dishnames, ingredients, equipment

def decode_ingredients(tokenized_ingredients):
    encoding = tiktoken.get_encoding("cl100k_base")
    
    expanded_tokens = [expand_token(vocab_map, token) for token in tokenized_ingredients]
    print(expanded_tokens)
    decoded_text = encoding.decode(expanded_tokens)
    # Convert string to set of ingredients (if originally multiple ingredients)
    decoded_set = set(decoded_text.split(', '))
    return decoded_set

def prepare_data():
    dishnames, more_dishnames, ingredients, equipment = load_data()
    vocab_map = prepare_vocab(dishnames, more_dishnames, ingredients, equipment)

    dishname_array = np.zeros((len(dishnames), dishname_token_count), dtype=int)
    ingredient_array = np.zeros((len(ingredients), ingredient_token_count), dtype=int)

    keys = sorted(dishnames.keys())
    for i, key in enumerate(keys):
        dishname_array[i] = [shrink_token(vocab_map, token) for token in dishnames[key]]
        ingredient_array[i] = [shrink_token(vocab_map, token) for token in ingredients[key]]
    
    return dishname_array, ingredient_array, vocab_map

def prepare_vocab(dishnames, more_dishnames, ingredients, equipment):
    vocab = set()
    for category in [dishnames, more_dishnames, ingredients, equipment]:
        for key in category.keys():
            for token in category[key]:
                vocab.add(int(token))
    vocab = sorted(vocab)
    vocab_map = dict(enumerate(vocab))
    return vocab_map

def shrink_token(vocab_map, search_value):
    for key, value in vocab_map.items():
        if value == search_value:
            return key
    return None  # If no key is found for the given value

def expand_token(vocab_map, search_key):
    return vocab_map[search_key]

dishnames, ingredients, vocab_map = prepare_data()

In [36]:
dishnames, ingredients, vocab_map = prepare_data()
vocab_size = vocab_map.__len__()
print("Input length: ", vocab_size)
print(dishnames[0])
embedding_dim = 64

early_stopping = EarlyStopping(monitor='val_loss', patience=5, restore_best_weights=True)


Input length:  3909
[  21 1023 3517 1718  204 3149    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0]


In [37]:
y = np.expand_dims(ingredients, -1)
X_train, X_val, y_train, y_val = train_test_split(dishnames, y, test_size=0.2, random_state=42)

In [38]:
model = Sequential()
model.add(Embedding(input_dim=vocab_size, output_dim=embedding_dim, input_length=dishname_token_count))
model.add(LSTM(64, return_sequences=False, dropout=0.2, recurrent_dropout=0.2))
model.add(RepeatVector(100))
model.add(BatchNormalization())
model.add(LSTM(64, return_sequences=True, dropout=0.2, recurrent_dropout=0.2))
model.add(TimeDistributed(Dense(vocab_size, activation='softmax')))


model.compile(optimizer=Adam(learning_rate=.01), loss='sparse_categorical_crossentropy', metrics=['accuracy'])
model.summary()

Model: "sequential_6"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_6 (Embedding)     (None, 41, 64)            250176    
                                                                 
 lstm_11 (LSTM)              (None, 64)                33024     
                                                                 
 repeat_vector_6 (RepeatVect  (None, 100, 64)          0         
 or)                                                             
                                                                 
 batch_normalization_1 (Batc  (None, 100, 64)          256       
 hNormalization)                                                 
                                                                 
 lstm_12 (LSTM)              (None, 100, 64)           33024     
                                                                 
 time_distributed_5 (TimeDis  (None, 100, 3909)       

In [39]:
history = model.fit(X_train, y_train,
                    epochs=50,
                    batch_size=32,
                    validation_data=(X_val, y_val),
                    callbacks=[early_stopping])

Epoch 1/50

KeyboardInterrupt: 

In [None]:
# dishname: "Anchovy Fries with Smoked Paprika Aioli"

# tokenized dishname: [2127, 331, 62615, 435, 4108, 449, 4487, 11059, 32743, 41554, 57086, 14559, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
# tokenized ingredients: [43326, 11, 30564, 64785, 11, 1253, 13767, 1082, 11, 7878, 34330, 12831, 11, 281, 93952, 11, 17685, 11, 7795, 11, 31735, 11, 17677, 5707, 11, 682, 7580, 20415, 11, 24522, 11, 274, 569, 1572, 11, 30564, 23661, 11, 16796, 14559, 11, 19151, 11, 13339, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]

In [None]:
ex_dish = [2127, 331, 62615, 435, 4108, 449, 4487, 11059, 32743, 41554, 57086, 14559, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
shrunken_dish = [shrink_token(vocab_map, token) for token in ex_dish]
print(shrunken_dish)
ex_dish = np.array(shrunken_dish).reshape(1, -1)

predicted_ingredients = model.predict(ex_dish)
print(predicted_ingredients.shape)

predicted_ingredients = np.argmax(predicted_ingredients, axis=2).astype('int32')
print(decode_ingredients(predicted_ingredients.flatten().tolist()))
# print("predicted ingredients: ", predicted_ingredients)

# print("decoded ingredients: ", decode_ingredients(predicted_ingredients))

[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
(1, 100, 3909)
[43326, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
{'salt,,,,,,,,,,,,,,,,,,,,,,,,,,!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!'}
