In [None]:
!python -m spacy download en_core_web_lg

Collecting en_core_web_lg==2.2.5
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_lg-2.2.5/en_core_web_lg-2.2.5.tar.gz (827.9 MB)
[K     |████████████████████████████████| 827.9 MB 1.3 MB/s 
[38;5;2m✔ Download and installation successful[0m
You can now load the model via spacy.load('en_core_web_lg')


In [None]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


## Load and Run Model

In [None]:
import gensim
import tensorflow as tf
from keras.models import load_model

model_prediction = load_model('/content/drive/MyDrive/Colab files/final_model.h5', compile=False)
vocab = gensim.models.KeyedVectors.load("/content/drive/MyDrive/Colab files/final_vocab_model.bin")

### Defines functions to translate a generated recipe back to a uniform string format

In [None]:
import re
new_line = ["•", "▪︎"]
headers = ["Ingredients", "Instructions"]
punctuation = [".", ","]
paranthesis = ["(", ")"]
    
def word2idx(word):
    """Return the vocabulary index of ``word``.

    Looks ``word`` up in ``vocab.vocab`` and returns the ``index`` attribute
    of the entry found there. A word that is not in the vocabulary is
    presumably reported by the lookup itself (e.g. ``KeyError``) -- TODO confirm
    against the gensim version in use.
    """
    return vocab.vocab[word].index

def idx2word(idx):
    """Return the vocabulary word stored at position ``idx`` of ``vocab.index2word``."""
    return vocab.index2word[idx]

def indices2sentences(sentence):
    """Render a sequence of vocabulary indices as one formatted recipe string.

    Each index is translated with ``idx2word`` and appended according to
    these rules, checked in order:

    * a word listed in ``headers`` starts after a blank line;
    * a word listed in ``new_line`` (bullet markers) starts on a new line;
    * the literal ``"Title"`` starts on a new line and is followed by a space;
    * a word following ``"."`` or a bullet marker is capitalized;
    * punctuation, and the first word after ``"Title"``, is appended without
      a leading space; every other word is preceded by a space.

    Finally, whitespace directly before ``? , . ! "`` is removed and text
    enclosed in brackets/parentheses is stripped of surrounding whitespace.
    """
    pieces = []
    after_title = False
    last_word = ""

    for idx in sentence:
        raw = idx2word(idx)
        word = str(raw)

        if raw in headers:
            pieces.append("\n\n" + word)
        elif raw in new_line:
            pieces.append("\n" + word)
        elif raw == "Title":
            pieces.append("\n" + word + " ")
            after_title = True
        elif last_word == "." or last_word in new_line:
            pieces.append(" " + word.capitalize())
        elif raw not in punctuation and not after_title:
            pieces.append(" " + word)
        else:
            # Punctuation, or the first word right after "Title": no leading space.
            pieces.append(word)
            after_title = False

        last_word = word

    joined = "".join(pieces)
    return re.sub(r'(\s([?,.!"]))|(?<=\[|\()(.*?)(?=\)|\])', lambda m: m.group().strip(), joined)
    

In [None]:
def sentences2indices(indices):
    """Translate a sequence of word tokens into their vocabulary indices.

    Despite its name, the ``indices`` parameter holds the *tokens*; each one
    is passed to ``word2idx`` and the resulting indices are returned as a
    list in the same order.
    """
    return [word2idx(token) for token in indices]

def tokens2sentences(tokens):
    """Format a list of word tokens as a recipe string.

    The tokens are first mapped to vocabulary indices and then rendered
    with ``indices2sentences``.
    """
    token_indices = sentences2indices(tokens)
    return indices2sentences(token_indices)

### Use the model to generate recipes

In [None]:
def generate_text(model, start_string, temperature, num_generate=140):
    """Sample a token sequence from ``model``, seeded with ``start_string``.

    Args:
        model: Callable model that also exposes ``reset_states()``. It is
            called with a ``(1, n)`` tensor of vocabulary indices.
            NOTE(review): assumed to return unnormalized logits of shape
            (batch, time, vocab) -- confirm against the trained model.
        start_string: List of seed tokens; each must be known to ``word2idx``.
        temperature: Positive number the predictions are divided by before
            sampling. Lower values make sampling more conservative, higher
            values more random.
        num_generate: Number of tokens to sample (defaults to 140, the value
            that used to be hard-coded).

    Returns:
        ``start_string`` followed by the ``num_generate`` sampled tokens.

    Raises:
        ValueError: If ``temperature`` is not positive (dividing by it would
            otherwise yield inf/NaN predictions).
    """
    if temperature <= 0:
        raise ValueError("temperature must be positive, got {}".format(temperature))

    text_generated = []

    # Converting our start string to numbers (vectorizing)
    input_eval = [word2idx(s) for s in start_string]
    input_eval = tf.expand_dims(input_eval, 0)  # add the batch dimension (batch size == 1)

    # Start every generation from a clean recurrent state.
    model.reset_states()

    for _ in range(num_generate):
        predictions = model(input_eval)
        # remove the batch dimension
        predictions = tf.squeeze(predictions, 0)

        # Scale by the temperature, then sample; only the sample drawn for
        # the last time step is used as the next word.
        predictions = predictions / temperature
        predicted_id = tf.random.categorical(predictions, num_samples=1)[-1,0].numpy()

        # Feed the predicted word as the next input to the model
        input_eval = tf.expand_dims([predicted_id], 0)

        text_generated.append(idx2word(predicted_id))

    return (start_string + text_generated)

Initializes a progress bar for debug purposes

In [None]:
from IPython.display import HTML, display
import time

def progress(value, max=100):
    """Build an HTML ``<progress>`` bar showing ``value`` out of ``max``.

    Args:
        value: Current progress value.
        max: Value that corresponds to a full bar (defaults to 100).

    Returns:
        An ``IPython.display.HTML`` object that can be passed to ``display``
        or to the ``update`` method of a display handle.
    """
    # Attributes are separated by whitespace only; the comma that used to
    # follow the ``max`` attribute made the markup malformed.
    return HTML("""
        <progress
            value='{value}'
            max='{max}'
            style='width: 100%'
        >
            {value}
        </progress>
    """.format(value=value, max=max))

Loop to generate multiple recipes and save to file

In [None]:
import pickle
headers = ["Title","Ingredients", "Instructions"]

count = 0


def generateRecipes(temp, k, print_recipe=False):
  """Generate ``k`` recipes at temperature ``temp`` and pickle them to Drive.

  Args:
      temp: Sampling temperature forwarded to ``generate_text``.
      k: Number of generation attempts.
      print_recipe: When true, print every successfully generated recipe.

  Returns:
      The list of formatted recipe strings. It can hold fewer than ``k``
      entries because failed attempts are reported and skipped.

  Side effects:
      Shows a progress bar and writes the list (pickled, despite the
      ``.txt`` suffix) to ``.../Output/generated_recipes<temp>.txt``.
  """
  out = display(progress(0, k), display_id=True)

  generated_recipes = []

  for i in range(0, k):
    recipe_text = None
    try:
      rec = generate_text(model_prediction, start_string=["Title"], temperature=temp)
      recipe_text = tokens2sentences(rec)
    except Exception as err:
      # Best effort: report the failure and move on to the next attempt.
      print("Faied to generate recipe", repr(err))

    if recipe_text is not None:
      generated_recipes.append(recipe_text)
      # Print only what was produced in this iteration; previously a failed
      # attempt re-printed the previous recipe or raised NameError.
      if print_recipe:
        print(recipe_text)
        print("---------------------------------------")

    out.update(progress(i + 1, k)) #Progress bar reaches k after the last attempt

  with open("/content/drive/MyDrive/Colab files/Output/generated_recipes" + str(temp) + ".txt", "wb") as fp:   #Pickling
    pickle.dump(generated_recipes, fp)

  print("Recipes written to file: " + "generated_recipes" + str(temp) + ".txt")
  return generated_recipes

In [None]:
generateRecipes(temp=1.2, k=1000, print_recipe=False)

In [None]:
temp = [1.2, 1.4, 1.0]

for t in temp:
  generateRecipes(temp=t, k=1000)

Recipes written to file: generated_recipes1.2.txt


Recipes written to file: generated_recipes1.4.txt


Faied to generate recipe
Recipes written to file: generated_recipes1.0.txt


# Evaluation

In [None]:
def parseSections(s_list):
    """Split a formatted recipe string into title, ingredients and instructions.

    Args:
        s_list: Recipe text containing the markers ``Title``, ``Ingredients``
            and ``Instructions``.

    Returns:
        ``(title, ingredients, instructions)`` where the title has the
        ``Title`` marker and newlines removed, the ingredients have every
        bullet ``•`` replaced by a comma, and the instructions have newlines
        replaced by spaces. ``(None, None, None)`` is returned when the
        ``Ingredients`` marker is missing or when ``Instructions`` does not
        occur exactly once after it.
    """
    sub1 = s_list.split("Ingredients")
    if len(sub1) < 2:
        # No "Ingredients" marker: previously this raised IndexError.
        return (None, None, None)

    sub2 = sub1[1].split("Instructions")
    if len(sub2) != 2:
        return (None, None, None)

    title = sub1[0].replace("Title", "").replace("\n", "")
    # The pieces produced by split() can no longer contain the separator,
    # so only the bullet / newline clean-up is needed here.
    ingredients = sub2[0].replace("•", ",")
    instructions = sub2[1].replace("\n", " ")
    return (title, ingredients, instructions)

Check if recipe contains Title, Ingredients, Instructions

In [None]:
def evaluateRecipe(recipes):
  """Return the fraction of recipes that pass a simple structural check.

  A recipe scores 1 when ``parseSections`` yields a non-empty title,
  ingredients and instructions section AND the second whitespace-separated
  token of the recipe occurs in the title; otherwise it scores 0 (and, if
  the sections were present, the recipe is printed for inspection).

  NOTE(review): for recipes that start with "Title <word> ..." the second
  token is the first title word, so the substring check is trivially true;
  confirm this is the intended metric.

  Args:
      recipes: Sequence of formatted recipe strings.

  Returns:
      Mean score in ``[0, 1]``; ``0.0`` for an empty sequence (previously a
      ZeroDivisionError).
  """
  if len(recipes) == 0:
    return 0.0

  score = 0
  for rec in recipes:
    temp_score = 0
    title, ingredients, instructions = parseSections(rec)
    if title and ingredients and instructions:
      tokens = rec.split()
      # Guard against recipes with fewer than two tokens before indexing.
      if len(tokens) > 1 and tokens[1] in title:
        temp_score += 1
      else:
        print("MISSING INGREDIENT IN INGREDIENTS")
        print(rec)
    score += temp_score
  return score/len(recipes)

evaluateRecipe(generated_recipes)

1.0

## Evaluation
1. Recipe should contain Title, Ingredients, Instructions
2. Ingredients that appear in the title of a recipe should also exist in its Ingredients section.
3. Ingredients mentioned in the instructions should also exist in the Ingredients section.

Load the ingredients dataset

In [None]:
import spacy
import numpy as np
import en_core_web_lg
import pandas as pd
nlp = en_core_web_lg.load()

#Source https://dominikschmidt.xyz/simplified-recipes-1M/
with np.load('/content/drive/MyDrive/Colab files/simplified-recipes-1M.npz', allow_pickle=True) as data:
    ingredients = data['ingredients']
print(type(ingredients))

<class 'numpy.ndarray'>


#### Convert ingredients from numpy.ndarray to a list of str.

In [None]:
def numpyToList(l):
    """Return a new list holding ``str(item)`` for every item of ``l``."""
    return [str(item) for item in l]

ingredients_list = numpyToList(ingredients)
print(type(ingredients_list))

<class 'list'>


In [None]:
for ing in ingredients_list[:5]:
  print(ing)

salt
pepper
butter
garlic
sugar


#### Disable unused NLP pipelines to increase performance.

In [None]:
disabled = nlp.disable_pipes("ner", "parser")

### Clean ingredients dataset to only contain base-ingredients (nouns) (e.g. only "chicken" and not "boneless chicken").

In [None]:
color = ["red", "brown", "black", "white", "green", "yellow", "blue"] #Colors are identified as NOUNS by the nlp() so a manual list was needed.
# Coarse part-of-speech labels that disqualify a word from being a base ingredient.
# The label for adverbs is "ADV"; the previous value "ADVERB" never matched any token.
bad_pos = ["ADJ", "VERB", "ADV"]
#Bad words are words that are filtered out explicitly, presumably because their part of speech is misclassified by nlp().
bad_words = ["mochi", "confit", "morsels", "carcass", "color", "flavouring", "claws", "calories", "brazil", "alcohol", "splenda", "franks","colors", "season", "dry", "glazed", "smoke", "smoked", "salted", "mix", "fat", "ingredients", "a", "mix", "ingredients", "vegetable", "an", "st", "m", "halfandhalf", "easy", "white", "roasted", "blend", "quick", "asian","half", "soft", "ground", "iron","chile","wild","plastic","blue","clean","dried","extra","yellow","indian","spring","chiles","steamer","mediterranean","swiss","smooth","pink","hand","waxed","instant","rich","prepared","blended","sponge","hands","combined","bottle","smoked","spam","smoking","flavor"]

def filterNonIngredients(ingredients_list):
    """Remove entries that are not single-word base ingredients, in place.

    An entry is dropped when it consists of more than one whitespace-separated
    word, or when any token that ``nlp`` produces for it has a part of speech
    listed in ``bad_pos``, is a colour from ``color``, is listed in
    ``bad_words``, or is a stop word. Dropped single-word entries are printed
    (comma separated) so they can be reviewed.

    The previous implementation called ``list.remove`` while iterating over
    the same list, which skips the element following every removal and could
    call ``remove`` more than once per entry. The survivors are now collected
    in one complete pass and written back, so a single call is sufficient;
    calling it repeatedly until the length stops changing still works and
    yields the same result.

    Args:
        ingredients_list: List of ingredient strings; modified in place.

    Returns:
        The same list object, now holding only the retained ingredients.
    """
    kept = []
    for item in ingredients_list:
        if len(item.split()) > 1:
            # Multi-word entries are never base ingredients.
            continue

        is_rejected = any(
            token.pos_ in bad_pos
            or token.text.lower() in color
            or token.text.lower() in bad_words
            or token.is_stop
            for token in nlp(item)
        )
        if is_rejected:
            print(item, end=", ")
            continue

        kept.append(item)

    # Keep the in-place contract for callers that hold a reference to the list.
    ingredients_list[:] = kept
    return ingredients_list

prev = 0
while len(ingredients_list) != prev:
    prev = len(ingredients_list)
    ingredients_list = filterNonIngredients(ingredients_list)

print("\n\n" + "Ingredients removed: " + str(len(ingredients) - len(ingredients_list)))
print("Updated number of ingredients in list: " + str(len(ingredients_list)))

ground, sliced, brown, vegetable, packed, extra, boneless, black, toasted, green, red, shortening, more, bottle, flavor, other, chiles, unbleached, mix, filling, flavored, concentrate, preserves, chunky, splenda, spring, blend, raw, organic, natural, soft, asian, flaked, distilled, season, wholewheat, franks, fresh, whole, artificial, swiss, wild, clean, no, ingredients, much, hand, indian, pasteurized, plastic, colors, mexicorn, curds, confit, sole, spam, alcohol, mediterranean, bleached, carcass, iron, squirt, mochi, st, m, claws, white, kosher, unsweetened, dressing, instant, roasted, dry, prepared, a, halfandhalf, chile, curry, chopped, quick, salted, smooth, crumbles, blue, pink, batter, imported, leaving, turkish, concentrated, rub, papayas, dehydrated, easy, unrefined, smoke, calories, sicilian, rich, steamer, garnish, yellow, sweetened, half, blanched, any, fat, savory, carbonated, sponge, combined, brazil, citric, color, vegan, morsels, 

Ingredients removed: 2638
Updated numb

list

### Change each ingredient into its singular form
- Also removes false ingredients, e.g. soup, which is most likely the result of a recipe rather than an ingredient in the recipe.

Reads false_ingredients_list from file. These ingredients are manually picked, after producing the frequency list at the end of this file

In [None]:
import csv
false_ingredients_list = []
with open('/content/drive/MyDrive/Colab files/false_ing.csv', 'r') as fd:
    reader = csv.reader(fd)
    for row in reader:
        if row != []:
          false_ingredients_list.append(row[0])

print(false_ingredients_list)
print("Length of false ingredients list:", len(false_ingredients_list))

['accent', 'accompaniment', 'american', 'bagel', 'baguette', 'baker', 'base', 'bread', 'breast', 'broiler', 'broth', 'brownie', 'browning', 'buffalo', 'cake', 'casing', 'chutney', 'coleslaw', 'cookie', 'croissant', 'crostini', 'crust', 'cupcake', 'custard', 'cutlet', 'dairy', 'dough', 'dripping', 'dumpling', 'ear', 'essence', 'fl', 'flavor', 'flavoring', 'flavour', 'frosting', 'glaze', 'gluten', 'granola', 'gravy', 'greens', 'green', 'guacamole', 'hummus', 'iceberg', 'icing', 'leaf', 'loaf', 'marinade', 'mousse', 'msg', 'muffin', 'oatmeal', 'palm', 'pancake', 'paneer', 'partially', 'protein', 'pudding', 'pulp', 'purple', 'roast', 'roll', 'round', 'salsa', 'sandiwch', 'seasoning', 'seed', 'shortbread', 'silver', 'skin', 'slaw', 'sorbet', 'soup', 'spice', 'stew', 'stuffing', 'swede', 'temperature', 'toast', 'topping', 'twist', 'vegetable', 'vegetarian', 'veggie', 'vinaigrette', 'waffle', 'salad', 'fruit', 'pesto', 'crema', 'puree', 'sauce', 'sandwitch', 'meatball', 'meatloaf', 'fiber', '

In [None]:
import inflect
p = inflect.engine()

new=[]
count = 0
exceptions_list = ["asparagus", "chips", "citrus", "couscous", "lemongrass", "oranges", "schnapps", "watercres", "leaves"]

ingredients_list_cleaned = []
def toSingular(token):
  """Return the lower-cased singular form of ``token.text``.

  ``p.singular_noun`` is asked for the singular; when its result stringifies
  to ``"False"`` (no singular form was produced) the lower-cased text itself
  is returned unchanged.
  """
  lowered = token.text.lower()
  singular = str(p.singular_noun(lowered))
  return lowered if singular == "False" else singular

# Build ingredients_list_cleaned: every ingredient in singular form, minus the
# manually curated false ingredients. `count` tracks how many entries changed.
for i, ing in enumerate(ingredients_list):
  # `token` is the result of running the whole ingredient string through nlp();
  # only its `.text` attribute is used below.
  token = nlp(ing)
  if token.text.lower() in exceptions_list:
    # Words in exceptions_list are kept as they are and never singularized.
    ingredient = token.text.lower()
  else:
    ingredient = toSingular(token)
  
  # Skip entries that were manually marked as not being real ingredients.
  if ingredient not in false_ingredients_list:
    ingredients_list_cleaned.append(ingredient)
    # Count only the entries whose text actually changed.
    if ingredient != token.text.lower():
      count +=1


print("Plural words changed to singular:", count)
print("Lenght of old ingredients list:", len(ingredients_list))
print("Lenght of cleaned ingredients list:", len(ingredients_list_cleaned))
ingredients_list_cleaned[:10]

Plural words changed to singular: 154
Lenght of old ingredients list: 862
Lenght of cleaned ingredients list: 757


['salt',
 'pepper',
 'butter',
 'garlic',
 'sugar',
 'flour',
 'onion',
 'water',
 'olive',
 'egg']

#### Disable "tagger" pipeline since it's no longer used.

In [None]:
if len(disabled) == 2:
    disabled += nlp.disable_pipes("tagger")

Load training set


In [None]:
df_train = []
with open("/content/drive/MyDrive/Colab files/df_filtered.txt", 'rb') as fp:
      df_train = pickle.load(fp)

In [None]:
import random
df_train = random.sample(df_train, len(df_train)-1)
len(df_train)

91007

In [None]:
#salads_list = ["lettuce", "spinach", "romaine", "cabbage", "lettuce", "kale", "escarole", "endive"]

#Code for evaluation metric

In [None]:
ingredients_found = []

def checkForIngredients(token):
    """Report whether ``token`` names a known base ingredient.

    Only alphabetic, non-stop-word tokens are considered. The token is
    singularized with ``toSingular`` and looked up in
    ``ingredients_list_cleaned``; on a hit the singular form is also appended
    to the module-level ``ingredients_found`` list.

    Returns:
        ``True`` if the token was recognized as an ingredient, else ``False``.
    """
    global ingredients_found

    if not token.is_alpha or token.is_stop:
        return False

    ingredient = toSingular(token)
    if ingredient not in ingredients_list_cleaned:
        return False

    ingredients_found.append(ingredient)
    return True


def allToSingular(header):
  """Return the lower-cased singular form of every token in ``header``.

  Tokens whose text is exactly one space are skipped. For every other token
  ``p.singular_noun`` is consulted; when its result stringifies to
  ``"False"`` the lower-cased text is used as is.
  """
  singulars = []
  for token in header:
    if token.text == " ":
      continue
    lowered = token.text.lower()
    candidate = str(p.singular_noun(lowered))
    singulars.append(lowered if candidate == "False" else candidate)
  return singulars

ingredients_not_found = []
title_without_ing = 0 #Counts number of recipes that are not considered in the evaluation metric

def checkCond(header1, header2):
    """Score how many ingredients mentioned in ``header1`` also occur in ``header2``.

    Both texts are processed with ``nlp``. Every token of ``header1`` that
    ``checkForIngredients`` recognizes is singularized ("vinaigrette" is
    mapped to "vinegar") and looked up among the singularized words of
    ``header2``. Misses are appended to the module-level
    ``ingredients_not_found`` list.

    Returns:
        ``1`` when ``header1`` mentions no ingredient at all (the module-level
        counter ``title_without_ing`` is incremented in that case), ``0`` when
        none of the mentioned ingredients was found, otherwise the fraction
        of mentioned ingredients that were found.
    """
    global ingredients_found
    global ingredients_not_found
    global title_without_ing

    source_doc = nlp(header1)
    reference_doc = nlp(header2)
    reference_words = allToSingular(reference_doc) #Converts all ingredients into singular format

    matched = 0
    mentioned = 0
    for token in source_doc:
        if token.text == " " or not checkForIngredients(token):
            continue

        name = toSingular(token)
        if name == "vinaigrette":
            name = "vinegar"

        if name in reference_words:
            matched += 1
        else:
            ingredients_not_found.append(name)
        mentioned += 1

    if mentioned == 0:
        title_without_ing += 1
        return 1
    if matched == 0:
        return 0
    return matched / mentioned

Cleans the ingredient list of verbs that were picked up as ingredients. Verbs should only exist together with an ingredient noun, e.g. Green Pepper (VERB, NOUN).

### Try the evaluation where ingredients identified in "Title" are crosschecked with the "Ingredients" section.
Initially use this to identify ingredients in ingredients_list that are not 
base ingredients. For example, "soup" is not really an ingredient, but something that is made with ingredients.

In [None]:
from IPython.display import clear_output

headers = ["Title", "Ingredients", "Instructions"]

def parseSections2(s_list):
    """Split a formatted recipe string into title, ingredients and instructions.

    Args:
        s_list: Recipe text containing the markers ``Title``, ``Ingredients``
            and ``Instructions``.

    Returns:
        ``(title, ingredients, instructions)`` where the title has the
        ``Title`` marker and newlines removed, the ingredients have every
        bullet ``•`` replaced by a comma, and the instructions have newlines
        replaced by spaces. ``(None, None, None)`` is returned when the text
        cannot be split: the ``Ingredients`` marker is missing or the input
        is not a string (the input is printed in that case), or
        ``Instructions`` does not occur exactly once after ``Ingredients``.
        Previously the latter case returned a bare ``None``, which broke
        tuple unpacking in the caller.
    """
    try:
        sub1 = s_list.split("Ingredients")
        sub2 = sub1[1].split("Instructions")
    except (AttributeError, IndexError, TypeError):
        print(s_list)
        return (None, None, None)

    if len(sub2) != 2:
        return (None, None, None)

    title = sub1[0].replace("Title", "").replace("\n", "")
    # The pieces produced by split() can no longer contain the separator,
    # so only the bullet / newline clean-up is needed here.
    ingredients = sub2[0].replace("•", ",")
    instructions = sub2[1].replace("\n", " ")
    return (title, ingredients, instructions)

def countX(lst, x):
    """Return how many elements of ``lst`` compare equal (``==``) to ``x``."""
    return sum(1 for element in lst if element == x)

def checkHeaders(recipe):
    """Score the presence and uniqueness of the section headers in ``recipe``.

    The recipe is split on whitespace and every entry of the module-level
    ``headers`` list is counted with ``countX``.

    Returns:
        ``0.0`` as soon as one header does not occur at all; otherwise the
        number of headers divided by the total number of header occurrences,
        i.e. ``1.0`` when every header appears exactly once and less when
        headers are repeated.
    """
    words = recipe.split()
    headers_present = 0
    occurrences = 0

    for header in headers:
        hits = countX(words, header)
        if hits <= 0:
            return 0.0
        headers_present += 1
        occurrences += hits

    return (headers_present / occurrences)


def evaluateAndFilter(df):
    """Score every recipe in ``df`` and print the average metrics.

    Three scores are computed per recipe:

    * Score 1 -- ``checkHeaders``: all section headers present (and unique).
    * Score 2 -- ingredients mentioned in the title occur in the ingredients
      section (``checkCond(title, ingredients)``).
    * Score 3 -- ingredients mentioned in the instructions occur in the
      ingredients section (``checkCond(instructions, ingredients)``).

    Scores 2 and 3 are reset to 0 for every recipe, so a recipe that fails
    the header check or cannot be scored contributes 0. Previously it
    silently reused the scores of the preceding recipe.

    The module-level lists ``ingredients_found`` and ``ingredients_not_found``
    are reset at the start and filled as a side effect of ``checkCond``.

    Args:
        df: Sequence of formatted recipe strings.

    Returns:
        ``df`` itself, unchanged.
    """
    global ingredients_found
    global ingredients_not_found

    ingredients_found = []
    ingredients_not_found = []

    total = len(df)
    if total == 0:
        # Nothing to average; avoids a division by zero below.
        print("No recipes to evaluate")
        return df

    progress_max = max(total - 1, 1)
    out = display(progress(0, progress_max), display_id=True)

    score_tot, score1_tot, score2_tot, score3_tot = 0, 0, 0, 0
    failed = 0

    for i, row in enumerate(df):
        # Start from zero so nothing leaks over from the previous recipe.
        score2, score3 = 0, 0
        score1 = checkHeaders(row)
        if score1 > 0:
            try:
                title, ingredients_text, instructions = parseSections2(row)
                score2 = checkCond(title, ingredients_text) #Checks that title contains an ingredients, and that this ingredient exists in the ingredients section.
                score3 = checkCond(instructions, ingredients_text) #Checks that any ingredients mentioned in the instruction, exists in the ingredients section.
            except Exception:
                # Best effort: a recipe that cannot be parsed or scored keeps
                # whatever was computed so far (0 for the rest) and is counted.
                failed += 1

        score1_tot += score1
        score2_tot += score2
        score3_tot += score3
        score_tot += (score1 + score2 + score3) / 3

        out.update(progress(i, progress_max)) #Progress bar

    if failed:
        print("Recipes that could not be fully scored:", failed)
    print("Score 1:", round(score1_tot/total, 2))
    print("Score 2:", round(score2_tot/total, 2))
    print("Score 3:", round(score3_tot/total, 2))
    print("Final Score: ", round(score_tot/total, 2))
    return df



In [None]:
import pickle
with open("/content/drive/MyDrive/Colab files/Output/generated_recipes1.4.txt", 'rb') as fp:
      df_generated_1_4 = pickle.load(fp)

with open("/content/drive/MyDrive/Colab files/Output/generated_recipes1.2.txt", 'rb') as fp:
      df_generated_1_2 = pickle.load(fp)

with open("/content/drive/MyDrive/Colab files/Output/generated_recipes1.0.txt", 'rb') as fp:
      df_generated_1_0 = pickle.load(fp)

with open("/content/drive/MyDrive/Colab files/Output/generated_recipes0.6.txt", 'rb') as fp:
      df_generated_0_6 = pickle.load(fp)



print(len(df_generated_1_4))
print(len(df_generated_1_2))
print(len(df_generated_1_0))
print(len(df_generated_0_6))

1000
1000
999
999


In [None]:
print("\nTemperature 1_4")
evaluateAndFilter(df_generated_1_4.copy())

print("Temperature 1.2")
evaluateAndFilter(df_generated_1_2.copy())

print("Temperature 1.0")
evaluateAndFilter(df_generated_1_0.copy())

print("\nTemperature 0.6")
evaluateAndFilter(df_generated_0_6.copy())


Temperature 1_4


Score 1: 0.82
Score 2: 0.42
Score 3: 0.2
Final Score:  0.48
Temperature 1.2


Score 1: 0.89
Score 2: 0.52
Score 3: 0.3
Final Score:  0.57
Temperature 1.0


Score 1: 0.91
Score 2: 0.59
Score 3: 0.39
Final Score:  0.63

Temperature 0.6


Score 1: 0.88
Score 2: 0.63
Score 3: 0.54
Final Score:  0.68


["\n\nTitle mediterranean squid stew\n\nIngredients\n• 1 pound eggplant, 10x15-inch or cut into 1/2-inch pieces\n• 2 tablespoons olive oil\n• 1 cup diced onion\n• 2 garlic cloves, machaca, shallow-fry in 1 or 2 pieces\n• 1 cup crushed tomatoes\n• 1/4 cup chopped fresh oregano leaves\n• 1 tablespoon fresh oregano leaves\n• 1 teaspoon salt\n• 1/2 teaspoon pepper\n• 1 medium white onion, arugula-mango, kadai, d'orschwihr, protrudes out of the tomatoes\n• 1 teaspoon chopped garlic\n• 1 can white beans\n• 2 tablespoons chopped fresh oregano leaves\n• 1 teaspoon salt\n• 2 teaspoons ground black pepper\n• 2 teaspoons ground white pepper\n• 1 teaspoon ground cumin\n• 1 teaspoon ground coriander\n• 1 teaspoon ground cumin\n• 1 tablespoon fresh oregano leaves\n• 1 cup red wine",
 '\n\nTitle linguini with clams and manila clams\n\nIngredients\n• 4 cups linguini\n• 3 tablespoons extra-virgin olive oil\n• 3 cloves garlic, 20-inch long razor clams\n• 4 manila clams\n• 2 pounds mussels\n• 1 teaspoon 

In [None]:
new_df = evaluateAndFilter(df_train[:1000].copy())

Avg Score 1: 1.0
Avg Score 2: 0.8874166666666665
Avg score 3: 0.8590190090678422
Avg Full Score:  0.9154785585781712


In [None]:
print(len(new_df))

NameError: ignored

**NOTE:** BELOW PART IS DONE ONCE TO FILTER CERTAIN INGREDIENTS (PRODUCING THE FALSE_ING.CSV). THE EVALUATION ABOVE IS THEN WHAT IS USED FOR THE FINAL METRIC

### Retrieve the frequency of ingredients that are identified both as negative and positive.
- **Negative Ingredients** = Ingredients found in the title but NOT among the ingredients
- **Positive Ingredients**   = Ingredients found in the title and among the ingredients

In [None]:
from collections import Counter
# Frequency tables, most common first.
negative_ing_count = dict(Counter(ingredients_not_found).most_common())
positive_ing_count = dict(Counter(ingredients_found).most_common())

print("Not evaluated recipes: ", title_without_ing)
print("Length Negative: ", len(negative_ing_count))
print(negative_ing_count)

print("Length Positive: ", len(positive_ing_count))
print(positive_ing_count)

print("Extracting all ingredients identified in recipe titles.")
print("Word, negative, positive ")
for word in negative_ing_count:
  # .get() instead of [] so a word missing from the positive table is skipped
  # rather than raising KeyError.
  if positive_ing_count.get(word) is not None:
    print(str(word) + ", " + str(negative_ing_count[word]), ", " + str(positive_ing_count[word]))

#Extracting ingredients that were only identified in the title but not among the ingredients
for word in negative_ing_count:
  if word not in positive_ing_count:
    print(str(word) + ", " + str(negative_ing_count[word]))


Not evaluated recipes:  2876
Length Negative:  290
{'salad': 662, 'sandwich': 144, 'burger': 69, 'herb': 66, 'pasta': 54, 'fruit': 49, 'tart': 49, 'caramel': 47, 'pesto': 44, 'chip': 43, 'chocolate': 42, 'ice': 42, 'fish': 39, 'biscuit': 39, 'risotto': 38, 'meatloaf': 37, 'citru': 36, 'cream': 35, 'crab': 32, 'chili': 30, 'relish': 29, 'fudge': 29, 'meatball': 29, 'cheese': 26, 'berry': 26, 'nut': 25, 'beef': 25, 'steak': 23, 'corn': 23, 'tea': 23, 'hummu': 20, 'seafood': 19, 'syrup': 19, 'stock': 18, 'lemonade': 17, 'meringue': 17, 'root': 16, 'candy': 16, 'jam': 16, 'sausage': 15, 'puree': 15, 'noodle': 14, 'pickle': 14, 'gluten': 14, 'topping': 14, 'aioli': 13, 'truffle': 13, 'gumbo': 12, 'eggnog': 11, 'rib': 11, 'crouton': 10, 'pastry': 10, 'potato': 10, 'skin': 10, 'coffee': 10, 'crumb': 10, 'tapenade': 10, 'ginger': 10, 'chickpea': 9, 'chicken': 9, 'meat': 9, 'butter': 9, 'adobo': 9, 'hamburger': 8, 'apple': 8, 'spaghetti': 8, 'polenta': 8, 'parmesan': 8, 'rice': 8, 'twist': 7, '

# Extracting Negative Ingredients that do not occur in the Positive Ingredients

In [None]:
#Extracting ingredients that were only identified in the title but not among the ingredients
# Uses the tables built above (negative_ing_count / positive_ing_count); the
# former *_word_counts names are not defined anywhere in this notebook.
for word in negative_ing_count:
  if word not in positive_ing_count:
      print(str(word) + ", " + str(negative_ing_count[word]))


In [None]:
ingredients_list_old = ingredients_list