In [None]:
# Mount Google Drive inside the Colab runtime at /content/drive.
# The google.colab module is only importable when running on Colab.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
!ls /content/drive/MyDrive/recipe_generation_model.h5

/content/drive/MyDrive/recipe_generation_model.h5


In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Model, load_model
from tensorflow.keras.layers import Input, LSTM, Embedding, Dense
from sklearn.metrics import precision_score, recall_score, f1_score, classification_report

In [None]:
def load_data(file_path):
    """Load ingredient/instruction text pairs from a recipe CSV.

    Only rows missing ``ingredients`` or ``instructions`` are dropped.
    Missing values in any other column do not discard an otherwise usable
    recipe.

    Args:
        file_path: Path (or any source accepted by ``pandas.read_csv``) of a
            CSV that contains ``ingredients`` and ``instructions`` columns.

    Returns:
        Tuple ``(ingredients, instructions)`` of NumPy arrays aligned row by
        row.

    Raises:
        KeyError: If either required column is absent from the CSV.
    """
    required_columns = ['ingredients', 'instructions']

    df = pd.read_csv(file_path)
    # Restrict the NaN filter to the columns that are actually used below.
    df = df.dropna(subset=required_columns)

    ingredients = df['ingredients'].values
    instructions = df['instructions'].values

    return ingredients, instructions

def preprocess_text(texts, tokenizer=None, max_len=50):
    """Encode texts as integer sequences of a fixed length.

    Args:
        texts: Collection of strings to encode.
        tokenizer: Tokenizer to reuse. When ``None`` a new ``Tokenizer`` is
            created and fitted on ``texts``.
        max_len: ``maxlen`` passed to ``pad_sequences``; shorter sequences are
            padded at the end (``padding='post'``).

    Returns:
        Tuple ``(padded_sequences, tokenizer)`` where ``tokenizer`` is the
        one that was used for the encoding.
    """
    active_tokenizer = tokenizer
    if active_tokenizer is None:
        # No tokenizer supplied: build the vocabulary from these texts.
        active_tokenizer = Tokenizer()
        active_tokenizer.fit_on_texts(texts)

    encoded = active_tokenizer.texts_to_sequences(texts)
    return pad_sequences(encoded, maxlen=max_len, padding='post'), active_tokenizer

# Model architecture
def build_model(vocab_size_ing, vocab_size_ins, embedding_dim=128, lstm_units=256):
    """Assemble the encoder-decoder LSTM network.

    Args:
        vocab_size_ing: Input dimension of the encoder embedding.
        vocab_size_ins: Input dimension of the decoder embedding and number of
            units of the softmax output layer.
        embedding_dim: Output dimension of both embedding layers.
        lstm_units: Number of units of both LSTM layers.

    Returns:
        An uncompiled ``Model`` taking ``[encoder_tokens, decoder_tokens]``
        and producing a softmax over ``vocab_size_ins`` for every decoder
        position.
    """
    # Encoder: only the final hidden and cell states are kept.
    enc_tokens = Input(shape=(None,))
    enc_embedded = Embedding(vocab_size_ing, embedding_dim)(enc_tokens)
    enc_layer = LSTM(lstm_units, return_state=True)
    _, hidden_state, cell_state = enc_layer(enc_embedded)

    # Decoder: starts from the encoder states and emits one vector per step.
    dec_tokens = Input(shape=(None,))
    dec_embedded = Embedding(vocab_size_ins, embedding_dim)(dec_tokens)
    dec_layer = LSTM(lstm_units, return_sequences=True, return_state=True)
    dec_sequence, *_ = dec_layer(dec_embedded, initial_state=[hidden_state, cell_state])

    # Per-step distribution over the instruction vocabulary.
    output_layer = Dense(vocab_size_ins, activation='softmax')
    token_probabilities = output_layer(dec_sequence)

    return Model([enc_tokens, dec_tokens], token_probabilities)

# Generate text using the trained model
def generate_recipe(model, tokenizer_ing, tokenizer_ins, input_ingredients, max_len=50):
    """Greedily decode an instruction text for the given ingredients.

    ``model`` is used as a plain two-input network (encoder tokens, decoder
    tokens) that returns per-position token probabilities and keeps no
    decoder state between ``predict`` calls. Each step therefore re-runs the
    model on the whole prefix generated so far and reads the distribution at
    the last position.

    Args:
        model: Network accepting ``[encoder_tokens, decoder_tokens]`` and
            returning an array indexed as ``[batch, position, token]``.
        tokenizer_ing: Tokenizer used to encode ``input_ingredients``.
        tokenizer_ins: Tokenizer whose ``word_index`` contains ``'<start>'``
            (and normally ``'<end>'``) and whose ``index_word`` maps token
            indices back to words.
        input_ingredients: Ingredient text to condition the generation on.
        max_len: Length the encoder input is padded to, and upper bound on
            the number of decoding steps.

    Returns:
        The generated words joined by single spaces (possibly empty).

    Raises:
        KeyError: If ``'<start>'`` is missing from ``tokenizer_ins.word_index``.
    """
    input_seq = tokenizer_ing.texts_to_sequences([input_ingredients])
    input_seq = pad_sequences(input_seq, maxlen=max_len, padding='post')

    start_index = tokenizer_ins.word_index['<start>']
    end_index = tokenizer_ins.word_index.get('<end>')

    target_seq = np.array([[start_index]])
    generated_words = []

    # Bound the number of steps rather than the number of emitted words, so
    # that tokens decoding to an empty string cannot keep the loop alive.
    for _ in range(max_len):
        # verbose=0: avoid printing one progress bar per generated token.
        output_tokens = model.predict([input_seq, target_seq], verbose=0)

        # Greedy choice at the last position of the current prefix.
        sampled_token_index = int(np.argmax(output_tokens[0, -1, :]))
        sampled_word = tokenizer_ins.index_word.get(sampled_token_index, '')

        # Index 0 is the padding value produced by pad_sequences; treat it,
        # like <end>, as the end of the generated text.
        if (
            sampled_token_index == 0
            or sampled_token_index == end_index
            or sampled_word == '<end>'
        ):
            break

        # Skip tokens without a printable word (unknown index or <start>).
        if sampled_word and sampled_token_index != start_index:
            generated_words.append(sampled_word)

        # Extend the prefix: feeding only the last token would discard all
        # decoder context because no state is carried across predict calls.
        target_seq = np.concatenate([target_seq, [[sampled_token_index]]], axis=1)

    return ' '.join(generated_words)

# Main execution
if __name__ == '__main__':
    # End-to-end run: load the recipe CSV, train the encoder-decoder model,
    # save it to Drive, reload it and print one generated recipe.

    # Load data
    file_path = '/content/drive/MyDrive/recipes_combined.csv'
    ingredients, instructions = load_data(file_path)

    # Preprocess text
    max_len = 50
    seq_ing, tokenizer_ing = preprocess_text(ingredients, max_len=max_len)
    seq_ins, tokenizer_ins = preprocess_text(instructions, max_len=max_len)

    # Register the special tokens right after the regular vocabulary. Each
    # token writes word_index and index_word with the same index so the two
    # tables stay inverse of each other.
    start_token = len(tokenizer_ins.word_index) + 1
    tokenizer_ins.word_index['<start>'] = start_token
    tokenizer_ins.index_word[start_token] = '<start>'

    end_token = len(tokenizer_ins.word_index) + 1
    tokenizer_ins.word_index['<end>'] = end_token
    tokenizer_ins.index_word[end_token] = '<end>'

    # Wrap every instruction sequence in <start> ... <end>.
    processed_seq_ins = []
    for seq in seq_ins:
        # seq is already zero-padded by preprocess_text; drop the padding
        # first so that <end> directly follows the last real token, then
        # reserve two positions for the special tokens.
        content = [token for token in seq if token != 0][:max_len - 2]
        processed_seq_ins.append([start_token] + content + [end_token])

    # Pad the wrapped sequences back to a fixed length.
    seq_ins = pad_sequences(processed_seq_ins, maxlen=max_len, padding='post', truncating='post')

    # Split data
    X_train, X_val, y_train, y_val = train_test_split(seq_ing, seq_ins, test_size=0.2, random_state=42)

    # Build and compile model
    vocab_size_ing = len(tokenizer_ing.word_index) + 1
    vocab_size_ins = len(tokenizer_ins.word_index) + 1
    model = build_model(vocab_size_ing, vocab_size_ins)
    model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])

    # Train model with teacher forcing: the decoder reads tokens [0, n-1)
    # and is trained to predict tokens [1, n). y_train / y_val are already
    # padded to max_len above, so no further padding is needed.
    model.fit(
        [X_train, y_train[:, :-1]],
        y_train[:, 1:][:, :, None],
        batch_size=64,
        epochs=10,
        validation_data=([X_val, y_val[:, :-1]], y_val[:, 1:][:, :, None])
    )

    # Save the model
    model.save('/content/drive/MyDrive/recipe_generation_model.h5')

    # Load the model and test generation
    model = load_model('/content/drive/MyDrive/recipe_generation_model.h5')
    test_ingredients = "onions potato tomato"
    generated_recipe = generate_recipe(model, tokenizer_ing, tokenizer_ins, test_ingredients, max_len=max_len)
    print("Generated Recipe:", generated_recipe)



[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 524ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 356ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 37ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 37ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 34ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 34ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 44ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 34ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 36ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 38ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 34ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 35ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 34ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 