# Loading data

In [None]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
import ast
import re

import numpy as np
import pandas as pd
import tensorflow as tf

pd.options.mode.chained_assignment = None

In [None]:
df_recipe = pd.read_csv("/content/drive/MyDrive/0_Dataset_Recipes/RAW_recipes.csv", sep = ",")

In [None]:
df_recipe.info()
df_recipe.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 231637 entries, 0 to 231636
Data columns (total 12 columns):
 #   Column          Non-Null Count   Dtype 
---  ------          --------------   ----- 
 0   name            231636 non-null  object
 1   id              231637 non-null  int64 
 2   minutes         231637 non-null  int64 
 3   contributor_id  231637 non-null  int64 
 4   submitted       231637 non-null  object
 5   tags            231637 non-null  object
 6   nutrition       231637 non-null  object
 7   n_steps         231637 non-null  int64 
 8   steps           231637 non-null  object
 9   description     226658 non-null  object
 10  ingredients     231637 non-null  object
 11  n_ingredients   231637 non-null  int64 
dtypes: int64(5), object(7)
memory usage: 21.2+ MB


Unnamed: 0,name,id,minutes,contributor_id,submitted,tags,nutrition,n_steps,steps,description,ingredients,n_ingredients
0,arriba baked winter squash mexican style,137739,55,47892,2005-09-16,"['60-minutes-or-less', 'time-to-make', 'course...","[51.5, 0.0, 13.0, 0.0, 2.0, 0.0, 4.0]",11,"['make a choice and proceed with recipe', 'dep...",autumn is my favorite time of year to cook! th...,"['winter squash', 'mexican seasoning', 'mixed ...",7
1,a bit different breakfast pizza,31490,30,26278,2002-06-17,"['30-minutes-or-less', 'time-to-make', 'course...","[173.4, 18.0, 0.0, 17.0, 22.0, 35.0, 1.0]",9,"['preheat oven to 425 degrees f', 'press dough...",this recipe calls for the crust to be prebaked...,"['prepared pizza crust', 'sausage patty', 'egg...",6
2,all in the kitchen chili,112140,130,196586,2005-02-25,"['time-to-make', 'course', 'preparation', 'mai...","[269.8, 22.0, 32.0, 48.0, 39.0, 27.0, 5.0]",6,"['brown ground beef in large pot', 'add choppe...",this modified version of 'mom's' chili was a h...,"['ground beef', 'yellow onions', 'diced tomato...",13
3,alouette potatoes,59389,45,68585,2003-04-14,"['60-minutes-or-less', 'time-to-make', 'course...","[368.1, 17.0, 10.0, 2.0, 14.0, 8.0, 20.0]",11,['place potatoes in a large pot of lightly sal...,"this is a super easy, great tasting, make ahea...","['spreadable cheese with garlic and herbs', 'n...",11
4,amish tomato ketchup for canning,44061,190,41706,2002-10-25,"['weeknight', 'time-to-make', 'course', 'main-...","[352.9, 1.0, 337.0, 23.0, 3.0, 0.0, 28.0]",5,['mix all ingredients& boil for 2 1 / 2 hours ...,my dh's amish mother raised him on this recipe...,"['tomato juice', 'apple cider vinegar', 'sugar...",8


In [None]:
df_recipe.drop(['id', 'minutes', 'contributor_id', 'submitted', 'nutrition', 'description', 'tags', 'n_ingredients', 'n_steps'], axis = 1, inplace = True)


In [None]:
df_recipe.head()

Unnamed: 0,name,steps,ingredients
0,arriba baked winter squash mexican style,"['make a choice and proceed with recipe', 'dep...","['winter squash', 'mexican seasoning', 'mixed ..."
1,a bit different breakfast pizza,"['preheat oven to 425 degrees f', 'press dough...","['prepared pizza crust', 'sausage patty', 'egg..."
2,all in the kitchen chili,"['brown ground beef in large pot', 'add choppe...","['ground beef', 'yellow onions', 'diced tomato..."
3,alouette potatoes,['place potatoes in a large pot of lightly sal...,"['spreadable cheese with garlic and herbs', 'n..."
4,amish tomato ketchup for canning,['mix all ingredients& boil for 2 1 / 2 hours ...,"['tomato juice', 'apple cider vinegar', 'sugar..."


In [None]:
recipe_id = 1
print("Recipe Name :", df_recipe.loc[recipe_id, "name"])
print("Ingredient list: ", df_recipe.loc[recipe_id, "ingredients"])
print(df_recipe.loc[recipe_id, "steps"])

Recipe Name : a bit different  breakfast pizza
Ingredient list:  ['prepared pizza crust', 'sausage patty', 'eggs', 'milk', 'salt and pepper', 'cheese']
['preheat oven to 425 degrees f', 'press dough into the bottom and sides of a 12 inch pizza pan', 'bake for 5 minutes until set but not browned', 'cut sausage into small pieces', 'whisk eggs and milk in a bowl until frothy', 'spoon sausage over baked crust and sprinkle with cheese', 'pour egg mixture slowly over sausage and cheese', 's& p to taste', 'bake 15-20 minutes or until eggs are set and crust is brown']


# 1. Initial Pre Process

All steps in a recipe are stored in a single string. The first pre-processing stage consists of splitting that string into a list of steps.

In [None]:
# define function
def pre_process_recipe(recipe):
    """Turn the serialized step list of one recipe into a list of tagged steps.

    Args:
        recipe: text of a Python list literal holding the steps, e.g.
            "['preheat oven', 'bake 5 minutes']".

    Returns:
        A list with one string per step, in recipe order. Each step is
        lower-cased, has its special characters replaced by spaces and is
        wrapped as "[start_step_N] <text> [end_step_N]", N starting at 1.
    """
    # Parse the literal instead of splitting on "', '": in a Python list repr an
    # element that contains an apostrophe is written with double quotes, so the
    # plain split glues it to its neighbours and the recipe silently loses steps.
    try:
        steps = ast.literal_eval(recipe)
    except (ValueError, SyntaxError, TypeError):
        steps = None
    if not isinstance(steps, (list, tuple)):
        # Not a well-formed list literal: fall back to the plain split.
        steps = recipe.split("', '")

    filtered = []
    for index, step in enumerate(steps, start=1):
        step = str(step).lower()
        # replace special characters and punctuation with spaces
        step = re.sub(r'[\'\,\]\[\-\&\"\/\!\^\#]', ' ', step)
        # collapse whitespace runs and trim, so the text is separated from the
        # markers by exactly one space
        step = re.sub(r'[\s]+', ' ', step).strip()
        filtered.append("[start_step_" + str(index) + "] " + step + " [end_step_" + str(index) + "]")
    return filtered

In [None]:
# Test pre-process function
print("=== Recipe before pre-process===")
print(df_recipe.loc[recipe_id, "steps"])
print("=== Recipe after pre-process ===")
for step in pre_process_recipe(df_recipe.loc[recipe_id, "steps"]):
  print(step)

=== Recipe before pre-process===
['preheat oven to 425 degrees f', 'press dough into the bottom and sides of a 12 inch pizza pan', 'bake for 5 minutes until set but not browned', 'cut sausage into small pieces', 'whisk eggs and milk in a bowl until frothy', 'spoon sausage over baked crust and sprinkle with cheese', 'pour egg mixture slowly over sausage and cheese', 's& p to taste', 'bake 15-20 minutes or until eggs are set and crust is brown']
=== Recipe after pre-process ===
[start_step_1]  preheat oven to 425 degrees f [end_step_1]
[start_step_2] press dough into the bottom and sides of a 12 inch pizza pan [end_step_2]
[start_step_3] bake for 5 minutes until set but not browned [end_step_3]
[start_step_4] cut sausage into small pieces [end_step_4]
[start_step_5] whisk eggs and milk in a bowl until frothy [end_step_5]
[start_step_6] spoon sausage over baked crust and sprinkle with cheese [end_step_6]
[start_step_7] pour egg mixture slowly over sausage and cheese [end_step_7]
[start_st

In [None]:
# define function to remove special characters & punctuations

def pre_process_ingrd(ingredients):
    """Clean the serialized ingredient list and wrap it in ingredient markers.

    Args:
        ingredients: text of the ingredient list, e.g. "['eggs', 'salt and pepper']".

    Returns:
        One string of the form "[start_ingredients] <words> [end_ingredients] ",
        where <words> is the lower-cased ingredient text with special characters
        removed and the standalone word "and" dropped.
    """
    # str.lower() returns a new string, so the result has to be re-assigned
    ingredients = ingredients.lower()
    ingredients = re.sub(r'[\'\,\]\[\-\&\"\/\!\^\#]', ' ', ingredients)
    ingredients = re.sub(r'[\s]+', ' ', ingredients)
    # drop the word "and"; the \b keeps words that merely start with "and" (e.g. "andouille") intact
    ingredients = re.sub(r' and\b', '', ingredients).strip()
    # add the ingredient markers around the cleaned text
    ingredients = '[start_ingredients] ' + ingredients + ' [end_ingredients] '
    return ingredients


In [None]:
# Test pre-process function for ingredients
print("=== Ingredient before pre-processing===")
print(df_recipe.loc[recipe_id, "ingredients"])
print("=== Ingredient after pre-processing ===")
print(pre_process_ingrd(df_recipe.loc[recipe_id, "ingredients"]))


=== Ingredient before pre-processing===
['prepared pizza crust', 'sausage patty', 'eggs', 'milk', 'salt and pepper', 'cheese']
=== Ingredient after pre-processing ===
[start_ingredients]  prepared pizza crust sausage patty eggs milk salt pepper cheese  [end_ingredients] 


In [None]:
    # function to insert ingredient at the beginning of list of steps

def Insert(recipe, ingredients):
  """Prepend `ingredients` to the `recipe` list in place and return that same list."""
  recipe[:0] = [ingredients]
  return recipe

In [None]:
# Apply above functions to the "step" column

df_recipe['steps'] = df_recipe['steps'].apply(lambda steps : pre_process_recipe(steps))


In [None]:
# check one recipe

recipe_id = 1
print("Recipe Name :", df_recipe.loc[recipe_id, "name"])
print("Ingredient list: ", df_recipe.loc[recipe_id, "ingredients"])
print("Recipe steps:")
for step in df_recipe.loc[recipe_id, "steps"]:
  print(step)

Recipe Name : a bit different  breakfast pizza
Ingredient list:  ['prepared pizza crust', 'sausage patty', 'eggs', 'milk', 'salt and pepper', 'cheese']
Recipe steps:
[start_step_1]  preheat oven to 425 degrees f [end_step_1]
[start_step_2] press dough into the bottom and sides of a 12 inch pizza pan [end_step_2]
[start_step_3] bake for 5 minutes until set but not browned [end_step_3]
[start_step_4] cut sausage into small pieces [end_step_4]
[start_step_5] whisk eggs and milk in a bowl until frothy [end_step_5]
[start_step_6] spoon sausage over baked crust and sprinkle with cheese [end_step_6]
[start_step_7] pour egg mixture slowly over sausage and cheese [end_step_7]
[start_step_8] s p to taste [end_step_8]
[start_step_9] bake 15 20 minutes or until eggs are set and crust is brown  [end_step_9]


In [None]:
df_recipe['ingredients'] = df_recipe['ingredients'].apply(lambda ingredients : pre_process_ingrd(ingredients))
df_recipe['steps']= df_recipe.apply(lambda row: Insert(row['steps'], row['ingredients']), axis = 1)

In [None]:
# check one recipe

recipe_id = 11
print("Recipe Name :", df_recipe.loc[recipe_id, "name"])
print("Ingredient list: ", df_recipe.loc[recipe_id, "ingredients"])
print("Recipe steps:")
for step in df_recipe.loc[recipe_id, "steps"]:
  print(step)

Recipe Name : better than sex  strawberries
Ingredient list:  [start_ingredients]  vanilla wafers butter powdered sugar eggs whipping cream strawberry walnuts  [end_ingredients] 
Recipe steps:
[start_ingredients]  vanilla wafers butter powdered sugar eggs whipping cream strawberry walnuts  [end_ingredients] 
[start_step_1]  crush vanilla wafers into fine crumbs and line a square 8 x8 pan [end_step_1]
[start_step_2] mix butter or margarine and sugar [end_step_2]
[start_step_3] add beaten eggs [end_step_3]
[start_step_4] spread the mixture over the wafer crumbs [end_step_4]
[start_step_5] crush strawberries and spread over sugar egg and butter mixture [end_step_5]
[start_step_6] cover strawberries with whipped cream [end_step_6]
[start_step_7] sprinkle with chopped nuts [end_step_7]
[start_step_8] chill 24 hours  [end_step_8]


# 2. Create Input & Target

In [None]:
df_recipe.drop([ 'ingredients'], axis = 1, inplace = True)

In [None]:
df_recipe.head()

Unnamed: 0,name,steps
0,arriba baked winter squash mexican style,[[start_ingredients] winter squash mexican se...
1,a bit different breakfast pizza,[[start_ingredients] prepared pizza crust sau...
2,all in the kitchen chili,[[start_ingredients] ground beef yellow onion...
3,alouette potatoes,[[start_ingredients] spreadable cheese with g...
4,amish tomato ketchup for canning,[[start_ingredients] tomato juice apple cider...


In [None]:
# Number of entries in each `steps` list. Mapping len over the single column
# avoids building a row object for every recipe as apply(axis=1) does.
df_recipe['n_step'] = df_recipe['steps'].map(len)
df_recipe.head()


Unnamed: 0,name,steps,n_step
0,arriba baked winter squash mexican style,[[start_ingredients] winter squash mexican se...,12
1,a bit different breakfast pizza,[[start_ingredients] prepared pizza crust sau...,10
2,all in the kitchen chili,[[start_ingredients] ground beef yellow onion...,7
3,alouette potatoes,[[start_ingredients] spreadable cheese with g...,12
4,amish tomato ketchup for canning,[[start_ingredients] tomato juice apple cider...,3


In [None]:
# Keep only recipes whose `steps` list has more than 4 entries. Entry 0 is the
# ingredient line, so this keeps recipes with at least 4 actual steps.
# .copy() makes df_recipe_4 an independent frame, so the column assignments
# below do not write into a slice of df_recipe.
df_recipe_4 = df_recipe[df_recipe['n_step'] > 4].copy()

# Input  = first 4 entries of `steps` (ingredient line + steps 1 to 3)
# Target = entry 4 (step 4)
df_recipe_4['Input_4'] = df_recipe_4['steps'].map(lambda steps: ' '.join(steps[:4]))
df_recipe_4['Target_4'] = df_recipe_4['steps'].map(lambda steps: steps[4])
df_recipe_4.head()

Unnamed: 0,name,steps,n_step,Input_4,Target_4
0,arriba baked winter squash mexican style,[[start_ingredients] winter squash mexican se...,12,[start_ingredients] winter squash mexican sea...,[start_step_4] for spicy squash drizzle olive ...
1,a bit different breakfast pizza,[[start_ingredients] prepared pizza crust sau...,10,[start_ingredients] prepared pizza crust saus...,[start_step_4] cut sausage into small pieces [...
2,all in the kitchen chili,[[start_ingredients] ground beef yellow onion...,7,[start_ingredients] ground beef yellow onions...,[start_step_4] add kidney beans if you like be...
3,alouette potatoes,[[start_ingredients] spreadable cheese with g...,12,[start_ingredients] spreadable cheese with ga...,[start_step_4] place potatoes in a large bowl ...
5,apple a day milk shake,[[start_ingredients] milk vanilla ice cream f...,5,[start_ingredients] milk vanilla ice cream fr...,[start_step_4] makes about 2 cups [end_step_4]


In [None]:
# Check content of one recipe

recipe_id = 11
print("Input :", df_recipe_4.loc[recipe_id, "Input_4"])
print("Target: ", df_recipe_4.loc[recipe_id, "Target_4"])

Input : [start_ingredients]  vanilla wafers butter powdered sugar eggs whipping cream strawberry walnuts  [end_ingredients]  [start_step_1]  crush vanilla wafers into fine crumbs and line a square 8 x8 pan [end_step_1] [start_step_2] mix butter or margarine and sugar [end_step_2] [start_step_3] add beaten eggs [end_step_3]
Target:  [start_step_4] spread the mixture over the wafer crumbs [end_step_4]


In [None]:
# Saving processed data to disk
df_recipe_4.to_csv(r'/content/drive/MyDrive/GNG_project NLP _ Recipe/df_recipe_4.csv')

## 2.1. Explore number of words and their frequencies

In [None]:
# Limit the number of recipes to keep the total vocabulary size manageable

limit_recipe = 10000
# .copy(): new columns are assigned to df_RecipeLite later on, so it must be an
# independent frame rather than a slice of df_recipe_4
df_RecipeLite = df_recipe_4.iloc[0:limit_recipe].copy()

In [None]:
from collections import Counter

Input_text = ' '.join(df_RecipeLite['Input_4'].tolist())
Target_text = ' '.join(df_RecipeLite['Target_4'].tolist())
Total_text = Input_text + ' ' + Target_text

word_counter = Counter(Total_text.split())

df_word_counter = pd.Series( list(word_counter.values()),index=list(word_counter.keys())).sort_values(ascending=False)

In [None]:
print("Total number of words in Inputs & Target:")
print(len(df_word_counter))

Total number of words in Inputs & Target:
7169


In [None]:
# Count the words that occur at least 3 times (the message now matches the >= test)
print("Number of words with frequency >= 3")
n_vocab = (df_word_counter >= 3).sum()
print(n_vocab)

Number of words with frequency > 3
3546


In [None]:
print("List of 20 most frequent words:")
print(df_word_counter.head(n=20))

List of 20 most frequent words:
and                    22265
the                    17805
in                     14003
a                      12455
to                     10229
[start_ingredients]    10000
[start_step_1]         10000
[end_step_3]           10000
[start_step_3]         10000
[end_step_2]           10000
[start_step_4]         10000
[end_step_1]           10000
[end_step_4]           10000
[start_step_2]         10000
[end_ingredients]      10000
salt                    8116
sugar                   8069
add                     6267
pepper                  6175
butter                  6155
dtype: int64


## 2.2 Examine input sequence and target lengths

In [None]:
df_RecipeLite['len_Input_4'] = df_RecipeLite.apply(lambda row : len(row['Input_4'].split()), axis = 1)
df_RecipeLite['len_target_4'] = df_RecipeLite.apply(lambda row : len(row['Target_4'].split()), axis = 1)
df_RecipeLite.head()

Unnamed: 0,name,steps,n_step,Input_4,Target_4,len_Input_4,len_target_4
0,arriba baked winter squash mexican style,[[start_ingredients] winter squash mexican se...,12,[start_ingredients] winter squash mexican sea...,[start_step_4] for spicy squash drizzle olive ...,38,16
1,a bit different breakfast pizza,[[start_ingredients] prepared pizza crust sau...,10,[start_ingredients] prepared pizza crust saus...,[start_step_4] cut sausage into small pieces [...,46,7
2,all in the kitchen chili,[[start_ingredients] ground beef yellow onion...,7,[start_ingredients] ground beef yellow onions...,[start_step_4] add kidney beans if you like be...,54,12
3,alouette potatoes,[[start_ingredients] spreadable cheese with g...,12,[start_ingredients] spreadable cheese with ga...,[start_step_4] place potatoes in a large bowl ...,54,15
5,apple a day milk shake,[[start_ingredients] milk vanilla ice cream f...,5,[start_ingredients] milk vanilla ice cream fr...,[start_step_4] makes about 2 cups [end_step_4],30,6


In [None]:
print("distribution of input sequence length")
df_RecipeLite['len_Input_4'].describe()


distribution of input sequence length


count    10000.000000
mean        55.198800
std         15.729502
min         21.000000
25%         45.000000
50%         53.000000
75%         63.000000
max        344.000000
Name: len_Input_4, dtype: float64

In [None]:
np.percentile(df_RecipeLite['len_Input_4'].tolist(), 99)

105.0

In [None]:
print("distribution of target sequence length")
df_RecipeLite['len_target_4'].describe()

distribution of target sequence length


count    10000.000000
mean        12.045100
std          7.004225
min          2.000000
25%          8.000000
50%         11.000000
75%         15.000000
max        106.000000
Name: len_target_4, dtype: float64

[ Comment ] : the 99th percentile of the input sequence length is 105 (see the `np.percentile` output above), so a vectorization max length of about 110 tokens is enough to cover 99% of the inputs.



# 3. Text Vectorisation

In [None]:
# Vocabulary cap, passed as max_tokens to both TextVectorization layers
size_vocab = 7000
# Padded/truncated length of the encoded input sequences.
# NOTE(review): 1010 looks like a typo. The length statistics shown earlier in
# this notebook report a maximum input length of 344 tokens and a 99th
# percentile of 105, so this pads every input far beyond its content.
# TODO confirm the intended value (something around 100-110?).
max_input_len = 1010
# Padded/truncated length of the encoded target sequences
max_target_len = 25


In [None]:
# TextVectorization is exposed directly under keras.layers in current
# TensorFlow releases; the experimental path is kept only as a fallback for
# older installations.
try:
    from tensorflow.keras.layers import TextVectorization
except ImportError:
    from tensorflow.keras.layers.experimental.preprocessing import TextVectorization

# Encodes the input text as integer token ids, padded/truncated to max_input_len
input_vectorizer = TextVectorization(
    max_tokens=size_vocab,
    output_mode='int',
    output_sequence_length=max_input_len)


# Encodes the target text as integer token ids, padded/truncated to max_target_len
target_vectorizer = TextVectorization(
    max_tokens=size_vocab,
    output_mode='int',
    output_sequence_length=max_target_len)

In [None]:
# Fit both vectorisers on the combined input and target texts
input_vectorizer.adapt(df_RecipeLite['Input_4'].tolist()+ df_RecipeLite['Target_4'].tolist())
target_vectorizer.adapt(df_RecipeLite['Input_4'].tolist()+ df_RecipeLite['Target_4'].tolist())


In [None]:
# Test vectoriser function
print("=== Example input steps before vectorisation ===")
print(df_RecipeLite.loc[1, 'Input_4'])
print(df_RecipeLite.loc[1, 'Target_4'])
print("=== Vectorisation output ===")
print(input_vectorizer(df_RecipeLite.loc[1, 'Input_4']))
print(target_vectorizer(df_RecipeLite.loc[1, 'Target_4']))

=== Example input steps before vectorisation ===
[start_ingredients]  prepared pizza crust sausage patty eggs milk salt pepper cheese  [end_ingredients]  [start_step_1]  preheat oven to 425 degrees f [end_step_1] [start_step_2] press dough into the bottom and sides of a 12 inch pizza pan [end_step_2] [start_step_3] bake for 5 minutes until set but not browned [end_step_3]
[start_step_4] cut sausage into small pieces [end_step_4]
=== Vectorisation output ===
tf.Tensor([ 11 424 681 ...   0   0   0], shape=(1010,), dtype=int64)
tf.Tensor(
[  7  94 327  44  95 177  12   0   0   0   0   0   0   0   0   0   0   0
   0   0   0   0   0   0   0], shape=(25,), dtype=int64)


In [None]:
# function to configure the data for feeding into encoder-decoder model
def format_dataset(input, target):
    """Encode a batch of (input text, target text) for the encoder-decoder model.

    Returns a `(features, labels)` pair: `features` holds the encoded input
    under "encoder_input" and the encoded target without its last position
    under "decoder_input"; `labels` is the encoded target without its first
    position, i.e. the decoder input shifted by one.
    """
    source_ids = input_vectorizer(input)
    target_ids = target_vectorizer(target)

    # what the encoder and the decoder are fed
    features = {
        "encoder_input": source_ids,
        "decoder_input": target_ids[:, :-1],
    }
    # what the decoder has to predict
    labels = target_ids[:, 1:]
    return features, labels

In [None]:
def create_dataset(dataframe, batch_size=32):
  """Build a batched tf.data.Dataset of encoded (features, labels) pairs.

  Args:
      dataframe: frame with the text columns 'Input_4' and 'Target_4'.
      batch_size: number of examples per batch (default 32, the value that was
          previously hard-coded).

  Returns:
      The dataset of text pairs, batched and mapped through `format_dataset`.
  """
  # Load the raw (input text, target text) pairs into a Dataset object
  dataset_text = tf.data.Dataset.from_tensor_slices((dataframe['Input_4'].tolist(), dataframe['Target_4'].tolist()))
  # then batch them and apply the encoding function, so the data matches the
  # layout the model expects
  return dataset_text.batch(batch_size).map(format_dataset)


In [None]:
df_train = df_RecipeLite.iloc[0: 8000 ]
df_val = df_RecipeLite.iloc[8000:]

dataset_training = create_dataset(df_train)
dataset_validation = create_dataset(df_val )

In [None]:
for input, target in dataset_training.take(count = 1):
  print(target.shape)

(32, 24)


# 4. Define Model

## Model definition & compilation

In [None]:
from tensorflow.keras import layers
import tensorflow.keras as keras

size = size_vocab
embed_dim = 256
unit_gru = 2048

def create_model():
  """Build and compile the GRU encoder-decoder model.

  Reads the module-level settings `size` (embedding input size and width of
  the output layer), `embed_dim` (embedding width) and `unit_gru` (GRU units).
  The model takes the two inputs named "encoder_input" and "decoder_input" and
  outputs, for every decoder position, a softmax distribution of width `size`.

  Returns:
      The compiled keras.Model (rmsprop optimizer, sparse categorical
      cross-entropy loss, accuracy metric).
  """
  # DEFINE ENCODER
  encoder_input = keras.Input(shape = (None,), dtype = "int64", name = "encoder_input")
  # mask_zero = True: index 0 is treated as padding and masked downstream
  x = layers.Embedding(size, embed_dim, mask_zero = True)(encoder_input)
  # return_sequences is not set, so only the final GRU output is kept
  encoder_output = layers.GRU(units = unit_gru)(x)

  # DEFINE DECODER
  decoder_input = keras.Input(shape = (None,), dtype = "int64", name = "decoder_input")
  x = layers.Embedding(size, embed_dim, mask_zero = True)(decoder_input)
  decoder_GRU = layers.GRU(units = unit_gru, return_sequences= True)
  # the decoder starts from the encoder output, which is how the input text conditions the generation
  x = decoder_GRU(x, initial_state = encoder_output)
  x = layers.Dropout(0.5)(x)
  # one softmax over `size` classes per decoder position
  decoder_output = layers.Dense(size, activation = "softmax")(x)
  model = keras.Model([encoder_input, decoder_input], decoder_output)
  model.compile(optimizer = "rmsprop", loss = "sparse_categorical_crossentropy", metrics = ["accuracy"])
  return model



## Model Training

In [None]:
model = create_model()
model.fit(dataset_training, epochs = 10, validation_data = dataset_validation )

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x7f4b2710a440>

In [None]:
model.save("/content/drive/MyDrive/GNG_project NLP _ Recipe/model_08_05")



# MODEL in PRODUCTION

In [None]:
# Prepare the text-generation function

import numpy as np

# define a dictionary mapping each token index to its word
voca_index = target_vectorizer.get_vocabulary()
voca_dic = dict(zip(range(len(voca_index)), voca_index ))
max_length = 20

def decode_sequence(input, start_token="startstep4", end_token="endstep4", max_steps=None):
    """Greedily generate the next recipe step for `input`.

    Starting from `start_token`, the model is asked repeatedly for the most
    likely next word, which is appended to the text generated so far.

    Args:
        input: text with the ingredient line and the preceding steps.
        start_token: word the generated text starts with.
        end_token: word that ends the generation when it is predicted.
        max_steps: maximum number of words to generate; defaults to the
            module-level `max_length`.

    Returns:
        The generated text, beginning with `start_token`.
    """
    if max_steps is None:
        max_steps = max_length
    input_encoded = input_vectorizer([input])
    target = start_token
    for i in range(max_steps):
        target_encoded = target_vectorizer([target])
        # predict() runs once per generated word; verbose=0 keeps its progress
        # output out of the cell
        next_prediction = model.predict([input_encoded, target_encoded], verbose=0)
        if i >= next_prediction.shape[1]:
            # no prediction exists for this position: stop instead of raising IndexError
            break
        # greedy choice: most probable word at the position just after the text so far
        token_index = int(np.argmax(next_prediction[0, i, :]))
        next_token = voca_dic[token_index]
        target += " " + next_token
        if next_token == end_token:
            break
    return target

In [None]:
"startstep1" in voca_dic.values()

True

In [None]:
test_input = "[start_ingredients]  vanilla wafers butter powdered sugar eggs whipping cream strawberry walnuts  [end_ingredients]  startstep1  crush vanilla wafers into fine crumbs and line a square 8 x8 pan endstep1 startstep2 mix butter or margarine and sugar endstep2 startstep3 add beaten eggs endstep3"
target = decode_sequence(test_input)
print(target)

startstep4 add the artichoke hearts and the tomatoes and toss to combine endstep4
