# Necessary imports for recipes generation

In [1]:
from transformers import GPT2LMHeadModel, GPT2Tokenizer, GPT2Config, GPT2TokenizerFast
from torch.utils.data import Dataset, DataLoader
from recipes_generator.data.recipe_dataset import RecipeDataset, load_preprocess_raw_json_data, load_preprocess_raw_csv_data
import recipes_generator.model.recipe_model
from torch.optim import AdamW
from torch.nn.utils import clip_grad_norm_
from tqdm import tqdm
from torch.cuda.amp import autocast, GradScaler
import random
import numpy as np

# Dataset creation

In [2]:
recipes1 = load_preprocess_raw_json_data('recipes_generator/data/recipes_raw_nosource_ar.json')
recipes2 = load_preprocess_raw_json_data('recipes_generator/data/recipes_raw_nosource_epi.json')
recipes3 = load_preprocess_raw_json_data('recipes_generator/data/recipes_raw_nosource_fn.json')

In [3]:
recipes4 = load_preprocess_raw_csv_data('recipes_generator/data/recipes_general.csv')

In [4]:
for recipe in recipes4[:10]:
    print(recipe)

<|startoftext|>Prompt: blueberries, granulated sugar, vanilla yogurt, lemon juice
Title: Low-Fat Berry Blue Frozen Dessert
Ingredients: 
- 4 cups granulated sugar
- 1/4 teaspoon vanilla yogurt
- 1 teaspoon lemon juice
- 1 cup blueberries
Servings: 4
Instructions: Toss 2 cups berries with sugar. Let stand for 45 minutes stirring occasionally. Transfer berry-sugar mixture to food processor. Add yogurt and process until smooth. Strain through fine sieve. Pour into baking pan (or transfer to ice cream maker and process according to manufacturers' directions). Freeze uncovered until edges are solid but centre is soft.  Transfer to processor and blend until smooth again. Return to pan and freeze until edges are solid. Transfer to processor and blend until smooth again. 
Fold in remaining 2 cups of blueberries. Pour into plastic mold and freeze overnight. Let soften slightly to serve.
Cook time: 24 h 
Preparation time: 45 min 
Total time: 24 h 45 min <|endoftext|>
<|startoftext|>Prompt: saffr

In [5]:
recipe_list = recipes1 + recipes2 + recipes3 + recipes4

In [6]:
print(len(recipes1))
print(len(recipes2))
print(len(recipes3))
print(len(recipes4))

39441
18145
55781
487622


In [7]:
random.shuffle(recipe_list)
train_list, test_list = recipe_list[:int(.8*len(recipe_list))], recipe_list[int(.8*len(recipe_list))]
print('Number of train data: ', len(train_list))
print('Number of test data: ', len(test_list))

Number of train data:  480791
Number of test data:  966


In [None]:
tokenizer = GPT2Tokenizer.from_pretrained('distilbert/distilgpt2', bos_token='<|startoftext|>', eos_token='<|endoftext|>', pad_token='<|pad|>')
configuration = GPT2Config.from_pretrained('distilbert/distilgpt2', output_hidden_states=False)

# instantiate the model
model = GPT2LMHeadModel.from_pretrained("distilbert/distilgpt2", config=configuration)
model.resize_token_embeddings(len(tokenizer))

In [8]:
tokenizer = GPT2Tokenizer.from_pretrained('recipes_generator/model/recipes_generation_mode', bos_token='<|startoftext|>', eos_token='<|endoftext|>', pad_token='<|pad|>')
configuration = GPT2Config.from_pretrained('recipes_generator/model/recipes_generation_mode', output_hidden_states=False)

# instantiate the model
model = GPT2LMHeadModel.from_pretrained("recipes_generator/model/recipes_generation_mode", config=configuration)
model.resize_token_embeddings(len(tokenizer))

Embedding(50259, 768)

In [9]:
from torch.utils.data import Dataset, DataLoader, random_split, RandomSampler, SequentialSampler

dataset = RecipeDataset(train_list, tokenizer)

# Split into training and validation sets
train_size = int(0.9 * len(dataset))
val_size = len(dataset) - train_size

train_dataset, val_dataset = random_split(dataset, [train_size, val_size])
del dataset

print('{:>5,} training samples'.format(train_size))
print('{:>5,} validation samples'.format(val_size))

batch_size = 8

432,711 training samples
48,080 validation samples


# Model training

In [10]:
# Finetune
import torch
import random

# Tell pytorch to run this model on the GPU.
device = torch.device("cuda")

# Set the seed value all over the place to make this reproducible.
seed_val = 42

random.seed(seed_val)
np.random.seed(seed_val)
torch.manual_seed(seed_val)
torch.cuda.manual_seed_all(seed_val)


# some parameters I cooked up that work reasonably well

epochs = 2

# this produces sample output every 100 steps
sample_every = 1000
# I save the model every 5000 step
save_every = 5000
# save the model to this file name
save_file = 'recipes_generator/model/recipes_generation_model'



training_stats = []
print("Currently using device type: ", device)

model = model.to(device)


Currently using device type:  cuda


In [11]:
# Fine-tune the GPT-2 model on the recipe dataset
from recipes_generator.model.recipe_model import train_model
history = train_model(model, train_dataset, val_dataset, epochs, batch_size, device, save_file=save_file)
print(history)

Total number of steps:  108178
Currently using device type:  cuda

Training...


Batch: 54088, Loss: 0.5402792096138: 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 54089/54089 [9:47:50<00:00,  1.53it/s]



  Average training loss: 0.55
  Perplexity: 1.73

Running Validation...


Validation Loss: 0.4237533509731293: 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 6010/6010 [21:49<00:00,  4.59it/s]


  Validation Loss: 0.48
  Validation perplexity: 1.61

Training...


Batch: 0, Loss: 0.6113799810409546:   0%|                                                                                                                                                              | 1/54089 [00:01<19:10:02,  1.28s/it]


OutOfMemoryError: CUDA out of memory. Tried to allocate 1.05 GiB. GPU 

In [12]:
model.save_pretrained(save_file)
tokenizer.save_pretrained(save_file)

('recipes_generator/model/recipes_generation_model/tokenizer_config.json',
 'recipes_generator/model/recipes_generation_model/special_tokens_map.json',
 'recipes_generator/model/recipes_generation_model/vocab.json',
 'recipes_generator/model/recipes_generation_model/merges.txt',
 'recipes_generator/model/recipes_generation_model/added_tokens.json')

# Training results

In [12]:
tokenizer = GPT2Tokenizer.from_pretrained("recipes_generator/model/recipes_generation_model", bos_token='<|startoftext|>', eos_token='<|endoftext|>', pad_token='<|pad|>')
# I'm not really doing anything with the config buheret
configuration = GPT2Config.from_pretrained(save_file, output_hidden_states=False)

# instantiate the model
model = GPT2LMHeadModel.from_pretrained(save_file, config=configuration)

# this step is necessary because I've added some tokens (bos_token, etc) to the embeddings
# otherwise the tokenizer and model tensors won't match up
model.resize_token_embeddings(len(tokenizer))

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Embedding(50259, 768)

In [15]:
import pandas as pd
from recipes_generator.model.recipe_model import evaluate_model
test_dataset = RecipeDataset(test_list, tokenizer, max_length=768)
print('Testing...')
test_loss, test_perplexity = evaluate_model(model, test_dataset, batch_size, device)
test_eval_df = pd.DataFrame(columns = ["test_loss", "test_perplexity"])
test_eval_df['test_loss'] = test_loss
test_eval_df['test_perplexity'] = test_perplexity
test_eval_df.to_csv("recipes_generator/model/recipes_generation_model/test_eval_1.csv")

Testing...


  0%|                                                                                                                                                                                                               | 0/121 [00:00<?, ?it/s]


OutOfMemoryError: CUDA out of memory. Tried to allocate 1.15 GiB. GPU 

In [None]:
training_eval_df = pd.DataFrame(columns = ["epoch", "training loss", "validation loss", "train perplexity", "validation perplexity"])
train_loss = []
train_perp = []
valid_loss = []
valid_perp = []
for epoch in history:
    train_loss.append(epoch['Training Loss'])
    train_perp.append(epoch['Training Perplexity'])
    valid_loss.append(epoch['Valid. Loss'])
    valid_perp.append(epoch['Valid. Perplexity'])

training_eval_df['epoch'] = [x for x in range(len(training_stats))]
training_eval_df['training loss'] = train_loss
training_eval_df['validation loss'] = valid_loss
training_eval_df['train perplexity'] = train_perp
training_eval_df['validation perplexity'] = valid_perp

training_eval_df.to_csv("recipes_generator/model/recipes_generation_model/train_eval.csv")

# Examples of generated recipes

In [15]:
#model_loaded = GPT2LMHeadModel.from_pretrained("model", config=configuration, ignore_mismatched_sizes=True)
def infer(prompt, model, tokenizer):
    input = f"<|startoftext|>Prompt: {prompt.strip()}"
    input = tokenizer(input, return_tensors="pt")
    input_ids      = input["input_ids"]
    attention_mask = input["attention_mask"]

    output = model.generate(input_ids.to(device),
                            attention_mask=attention_mask.to(device),
                            max_new_tokens=200,
                            num_beams=5, 
                            no_repeat_ngram_size=2, 
                            max_length = 600,
                            num_return_sequences=1,
                            eos_token_id=tokenizer.eos_token_id,
                            do_sample = True, top_k = 100, top_p = 0.85)
    output = tokenizer.decode(output[0], skip_special_tokens=True)
    return output

In [16]:
prompts = ['chocolate,flour,sugar', 'chicken, flour, lemon, parmesan cheese', 'courgette, aubergine,flour,lemon', 'courgette, pepper, flour, mid-seasoned cheese']
generated_recipes = []
for p in prompts:    
    recipe = infer(p, model, tokenizer)
    print(recipe)
    generated_recipes.append(recipe)


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Both `max_new_tokens` (=200) and `max_length`(=600) seem to have been set. `max_new_tokens` will take precedence. Please refer to the documentation for more information. (https://huggingface.co/docs/transformers/main/en/main_classes/text_generation)
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Both `max_new_tokens` (=200) and `max_length`(=600) seem to have been set. `max_new_tokens` will take precedence. Please refer to the documentation for more information. (https://huggingface.co/docs/transformers/main/en/main_classes/text_generation)


Prompt: chocolate,flour,sugar, eggs, vanilla essence, self-raising flour, cocoa powder
Title: Chocolate Mousse Cake
Ingredients: 200 chocolate
Servings: 1
Instructions: Preheat oven to 180°C. Grease and line a 20cm springform cake tin. Melt chocolate in a heatproof bowl over a pan of simmering water. Remove from heat and stir in sugar until dissolved. Add eggs one at a time beating well after each addition. Stir in vanilla. Fold in sifted flour and cocoa. Pour into prepared tin and bake for 30-35 minutes or until a skewer inserted into the centre comes out clean. Cool in the tin for 10 minutes then remove to a wire rack to cool completely. To make the mousse cream the cream until soft peaks form. Gradually add the sugar and continue to beat until stiff and glossy. 
Spread over the top of the cake.
Cook time: PT35M
Preparation time : PT15H
Total time


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Both `max_new_tokens` (=200) and `max_length`(=600) seem to have been set. `max_new_tokens` will take precedence. Please refer to the documentation for more information. (https://huggingface.co/docs/transformers/main/en/main_classes/text_generation)


Prompt: chicken, flour, lemon, parmesan cheese, butter, margarine
Title: Chicken Parmesan
Ingredients: 1 chicken
Servings: 4
Instructions: Preheat oven to 350°F. Wash chicken and pat dry with paper towels. Sprinkle with salt and pepper. Place chicken in a shallow baking dish. In a small bowl combine the flour and the lemon juice. Dredge chicken pieces in flour mixture. Melt the butter and pour over chicken. Bake uncovered for 45 minutes or until chicken is tender.
Cook time: PT45M
Preparation time : PT10M


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Both `max_new_tokens` (=200) and `max_length`(=600) seem to have been set. `max_new_tokens` will take precedence. Please refer to the documentation for more information. (https://huggingface.co/docs/transformers/main/en/main_classes/text_generation)


Prompt: courgette, aubergine,flour,lemon, milk, eggs, parmesan cheese, butter, margarine, olive oil, onion, garlic clove, celery, carrot, parsnip, green beans, zucchini, tomatoes, eggplant, flour, salt, pepper, nutmeg, cayenne pepper
Title: Vegetable Frittata
Ingredients: 1/2 cour Vegie,  1 auornine
Servings: 4
Instructions: Preheat the oven to 180C/350F/gas mark 4. Grease an ovenproof dish with butter and line with baking paper. Heat the oil in a frying pan and add the onion and garlic and cook for a few minutes until soft but not coloured. Add the veg and saute for another minute or two. Pour in the stock and bring to the boil. Reduce the heat and simmer for 10-15 minutes or until all the liquid has evaporated. Whisk the egg yolks with a little of the milk and
Prompt: courgette, pepper, flour, mid-seasoned cheese, butter, margarine
Title: Courgettes Au Gratin
Ingredients: 1/2 courpe,  pepper
Servings: 4
Instructions: Preheat oven to 180°C. Grease a shallow 2 litre baking dish with th

In [17]:
prompts = ['chocolate,flour,sugar', 'chicken, flour, lemon, parmesan cheese', 'courgette, aubergine,flour,lemon', 'courgette, pepper, flour, mid-seasoned cheese']
model_loaded = GPT2LMHeadModel.from_pretrained(save_file, ignore_mismatched_sizes=True).to(device)
tokenizer_loaded = GPT2TokenizerFast.from_pretrained("tokenizer")
generated_recipes_from_loaded = []
for p in prompts:    
    recipe = infer(p, model_loaded, tokenizer_loaded)
    print(recipe)
    generated_recipes_from_loaded.append(recipe)

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Both `max_new_tokens` (=200) and `max_length`(=600) seem to have been set. `max_new_tokens` will take precedence. Please refer to the documentation for more information. (https://huggingface.co/docs/transformers/main/en/main_classes/text_generation)
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Both `max_new_tokens` (=200) and `max_length`(=600) seem to have been set. `max_new_tokens` will take precedence. Please refer to the documentation for more information. (https://huggingface.co/docs/transformers/main/en/main_classes/text_generation)


Prompt: chocolate,flour,sugar, eggs, vanilla essence, self raising flour, cocoa powder
Title: Chocolate Mousse Cake
Ingredients: 200 chocolate
Servings: 1
Instructions: Preheat oven to 180°C. Grease and line the base of a 20cm springform cake tin. Melt the chocolate in a heatproof bowl over a pan of simmering water. Remove from the heat and stir in the sugar. Add the egg yolks one at a time beating well after each addition. Beat the cream until soft peaks form. Fold the melted chocolate into the whipped cream. Sift the self-raising flour and cocoa into a bowl and fold in gently. In a clean bowl beat egg whites until stiff and glossy. Gently fold the meringue into chocolate mixture. Pour the mixture into prepared tin and bake for 30-35 minutes or until a skewer comes out clean. 
Leave to cool for 10 minutes then turn out onto a wire rack and cool completely.
Cook time:


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Both `max_new_tokens` (=200) and `max_length`(=600) seem to have been set. `max_new_tokens` will take precedence. Please refer to the documentation for more information. (https://huggingface.co/docs/transformers/main/en/main_classes/text_generation)


Prompt: chicken, flour, lemon, parmesan cheese, butter, margarine
Title: Chicken Parmesan
Ingredients: 1 chicken
Servings: 4
Instructions: Preheat oven to 350°F. Wash chicken and pat dry. Sprinkle with salt and pepper. Melt butter in baking dish. Place chicken in dish and sprinkle with cheese. Bake for 30 minutes or until chicken is done.
Cook time: PT30M


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Both `max_new_tokens` (=200) and `max_length`(=600) seem to have been set. `max_new_tokens` will take precedence. Please refer to the documentation for more information. (https://huggingface.co/docs/transformers/main/en/main_classes/text_generation)


Prompt: courgette, aubergine,flour,lemon, onion, garlic cloves, cumin seed, mustard seeds, garam masala, turmeric powder, salt, tomatoes, tomato puree, water, green chilies, coriander leaves
Title: Roasted Vegetable Curry
Ingredients: 1/2 - 3/4 cournut,  1 -2 Auberginine
Servings: 4
Instructions: Preheat oven to 180°C. Place all the vegetables in a large roasting tin. Sprinkle with salt and pepper. Roast for 30 minutes. Remove from the oven and set aside. In a frying pan heat the oil and fry the onion until soft. Add the garlic and saute for a further 30 seconds. Stir in the spices and cook for 1 minute. Return the veg to the pan and add the pureed tomatoes and water. Bring to a boil and then reduce the heat and simmer for 10-15 minutes or until the veggies are tender.
Cook time: PT45M
Prompt: courgette, pepper, flour, mid-seasoned cheese, butter, margarine
Title: Courgettes Au Gratin
Ingredients: 1/2 courpe,  1 pepper
Servings: 4
Instructions: Preheat oven to 180°C. Grease a shallow 1

In [11]:
import tensorflow as tf
from transformers import TFGPT2LMHeadModel, GPT2Config, GPT2Tokenizer
config = GPT2Config.from_pretrained("recipes_generation_model")
tokenizer = GPT2Tokenizer.from_pretrained("recipes_generation_model")
model_loaded = TFGPT2LMHeadModel.from_pretrained("recipes_generation_model", ignore_mismatched_sizes=True, config=config)

#model_loaded.build((1, 768))
inputs = tokenizer("<|startoftext|>Prompt:aubergines,courgettes,eggs", return_tensors="tf")
outputs = model_loaded(inputs)
print(model_loaded.inputs)
print(model_loaded.outputs)
converter = tf.lite.TFLiteConverter.from_keras_model(model_loaded)

# For FP16 quantization:
converter.optimizations = [tf.lite.Optimize.DEFAULT]
converter.target_spec.supported_types = [tf.float16]

tflite_model = converter.convert()

open("recipes_generator.tflite", "wb").write(tflite_model)

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
All PyTorch model weights were used when initializing TFGPT2LMHeadModel.

All the weights of TFGPT2LMHeadModel were initialized from the PyTorch model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFGPT2LMHeadModel for predictions without further training.


None
None
INFO:tensorflow:Assets written to: /tmp/tmpu4hogbn7/assets


INFO:tensorflow:Assets written to: /tmp/tmpu4hogbn7/assets
W0000 00:00:1714894416.753534  101219 tf_tfl_flatbuffer_helpers.cc:392] Ignored output_format.
W0000 00:00:1714894416.753551  101219 tf_tfl_flatbuffer_helpers.cc:395] Ignored drop_control_dependency.
2024-05-05 09:33:36.753682: I tensorflow/cc/saved_model/reader.cc:83] Reading SavedModel from: /tmp/tmpu4hogbn7
2024-05-05 09:33:36.759756: I tensorflow/cc/saved_model/reader.cc:52] Reading meta graph with tags { serve }
2024-05-05 09:33:36.759767: I tensorflow/cc/saved_model/reader.cc:147] Reading SavedModel debug info (if present) from: /tmp/tmpu4hogbn7
2024-05-05 09:33:36.798794: I tensorflow/cc/saved_model/loader.cc:236] Restoring SavedModel bundle.
2024-05-05 09:33:36.946764: I tensorflow/cc/saved_model/loader.cc:220] Running initialization op on SavedModel bundle at path: /tmp/tmpu4hogbn7
2024-05-05 09:33:36.994994: I tensorflow/cc/saved_model/loader.cc:462] SavedModel load for tags { serve }; Status: success: OK. Took 241315

163972456