In [1]:
import nltk
import pandas as pd
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
import pickle
import multiprocessing
import re
from nltk import WordNetLemmatizer
# One lemmatizer instance, created once and used by preprocess_instructions below.
lemmatizer = WordNetLemmatizer()

# Read the recipe table and renumber its rows 0..n-1 in a single expression.
df = pd.read_csv("../data/recipes.csv").reset_index(drop=True)
print(df.columns)

# 'Unnamed: 0' shows up in the printed column list; it is presumably an index
# column written out by an earlier to_csv call (TODO confirm) and is not used here.
df = df.drop(columns=['Unnamed: 0'])

def preprocess_instructions(instructions):
    try:  #remove the square brackets and quotation marks, separate at commas
        instructions = instructions.strip("[]").replace("'", "").split(', ')
    except:
        return []

    cleaned_tokens = []
    for item in instructions:
        text = item.lower()  #lowercase
        text = re.sub(r'[\d½¾¼⅓⅔⅛⅜⅝⅞]+', '', text)  #remove numbers, fraction symbols
        text = re.sub(r'[^a-z\s]', ' ', text)  #replace everything that is not a letter with a space

        tokens = text.split()
        for token in tokens:
            lemma = lemmatizer.lemmatize(token)
            if len(lemma) > 2:
                cleaned_tokens.append(lemma)
    return cleaned_tokens




Index(['Unnamed: 0', 'Title', 'Ingredients', 'Instructions', 'Image_Name',
       'Cleaned_Ingredients'],
      dtype='object')


In [2]:
from gensim.models.doc2vec import TaggedDocument

print("start preprocessing")

# add a column holding the token list of every recipe (lives in memory only)
df['instructions_processed'] = df['Instructions'].apply(preprocess_instructions)

print("finished preprocessing for recipes")

# renumber the rows 0..n-1 so that a recipe's tag equals its row position
df = df.reset_index(drop=True)

# gensim is fed TaggedDocument objects; each one is tagged with the row position as a string
print("create taggeddocuments")
# Iterate the token column directly: iterrows() would build a full Series for
# every row just to read this one field. After the reset above, the enumerate
# counter is the same value as the row index.
tagged_data = [
    TaggedDocument(words=tokens, tags=[str(i)])
    for i, tokens in enumerate(df['instructions_processed'])
]

print(f"   -> {len(tagged_data)} recipes prepared")


start preprocessing
finished preprocessing for recipes
create taggeddocuments
   -> 13501 recipes prepared


In [3]:
cores = multiprocessing.cpu_count()
print("Cores:" , cores)
print(df.columns)

print("Initialize Doc2Vec Model")
# cores-4 presumably leaves headroom for the rest of the system; clamp it to at
# least one worker, because on a machine with four or fewer cores the plain
# subtraction would request zero or a negative number of worker threads.
model = Doc2Vec(vector_size=150, window=5, min_count=5, dm=1, epochs=10, workers=max(1, cores - 4))




Cores: 20
Index(['Title', 'Ingredients', 'Instructions', 'Image_Name',
       'Cleaned_Ingredients', 'instructions_processed'],
      dtype='object')
Initialize Doc2Vec Model


In [4]:
# Step 1: let the model scan the tagged documents and set up its vocabulary.
model.build_vocab(tagged_data)
print("created vocabulary")

# Step 2: fit the model on the same documents, passing the document count and
# the number of epochs that the model object itself already holds.
print("start training")
model.train(
    tagged_data,
    total_examples=model.corpus_count,
    epochs=model.epochs,
)


created vocabulary
start training


In [5]:
import os

folder_path = "../data/models"
os.makedirs(folder_path, exist_ok=True)

# persist the trained model
recipe_model_path = os.path.join(folder_path, "doc2vec.model")
model.save(recipe_model_path)
print(f"model saved to {recipe_model_path}")

# map every row position (int key) to the document vector stored under its string tag
pkl_path = os.path.join(folder_path, "recipe_vectors.pkl")
recipe_vectors = {
    i: model.dv[str(i)]
    for i in range(len(df))
}

# wb -> write binary
with open(pkl_path, 'wb') as f:
    pickle.dump(recipe_vectors, f)

# Report only after the with-block has closed the file. The message previously
# ran the label and the path together without a separator.
print(f"saved vector database to {pkl_path}")

model saved to ../data/models\doc2vec.model
saved vector database../data/models\recipe_vectors.pkl


In [6]:
# Row position of the recipe used as the query for this smoke test.
test_id = 100

title_original = df['Title'].iloc[test_id]

print(f"search similarity #{test_id}: '{title_original}'")
print("-" * 50)

# Documents are addressed in the model by their string tag, hence str(test_id).
sims = model.dv.most_similar(str(test_id), topn=5)

# Each hit is a (tag, score) pair; turn the tag back into a row position to fetch the title.
for doc_id, score in sims:
    idx = int(doc_id)
    title = df['Title'].iloc[idx]
    print(f"Score: {score:.3f} | ID: {idx} | {title}")

search similarity #100: 'Stuffed Eggplants and Zucchini in a Rich Tomato Sauce (Baatingan w Kusaa Bil Banadoura)'
--------------------------------------------------
Score: 0.602 | ID: 7652 | Meatballs: The Spuntino Way
Score: 0.593 | ID: 769 | Kimchi and Miso Noodle Soup
Score: 0.580 | ID: 5062 | Turkey Breast Stuffed with Italian Sausage and Marsala-Steeped Cranberries
Score: 0.565 | ID: 101 | Chicken Meatballs With Molokhieh, Garlic, and Cilantro
Score: 0.558 | ID: 2866 | Jollof Rice


In [7]:
import pickle
import numpy as np
import os

print("load recipe vectors")
folder_path = "../data/models"
file_path = os.path.join(folder_path, "recipe_vectors.pkl")

if os.path.exists(file_path):
    # rb -> read binary
    # NOTE: pickle.load can execute arbitrary code; only load files this
    # notebook (or another trusted source) has written.
    with open(file_path, 'rb') as f:
        loaded_vectors = pickle.load(f)

    print(f"loaded vectors from {file_path}")

    # spot-check a single entry: type, shape and the first few values
    test_idx = 100
    vector = loaded_vectors[test_idx]

    print(f"Check für Rezept ID {test_idx}:")
    print(f" - Typ: {type(vector)}")
    print(f" - Form (Dimension): {vector.shape}")
    print(f" - Erste 5 Werte: {vector[:5]}")
else:
    # Say so explicitly instead of skipping the check without any output.
    print(f"recipe vectors not found at {file_path}")




load recipe vectors
loaded vectors from ../data/models\recipe_vectors.pkl
Check für Rezept ID 100:
 - Typ: <class 'numpy.ndarray'>
 - Form (Dimension): (150,)
 - Erste 5 Werte: [-0.6231667   0.4053907  -0.41491544  0.02189164 -0.93432814]


In [8]:
import pickle
import pandas as pd
# NOTE: this re-reads the raw CSV and rebinds `df`; any column added to `df`
# by earlier cells (e.g. 'instructions_processed') is not present afterwards.
df = pd.read_csv("../data/recipes.csv")

# `rows` holds the names of the columns kept as recipe metadata
rows = ['Title', 'Ingredients', 'Instructions', 'Image_Name']
df_metadata = df[rows].copy()

path = "../data/models/metadata.pkl"

# wb -> write binary
with open(path, 'wb') as f:
    pickle.dump(df_metadata, f)

# This cell writes the file, so the message says "saved" (it used to say "loaded").
print("saved metadata")

print(df_metadata.head())

loaded metadata
                                               Title  \
0  Miso-Butter Roast Chicken With Acorn Squash Pa...   
1                    Crispy Salt and Pepper Potatoes   
2                        Thanksgiving Mac and Cheese   
3                 Italian Sausage and Bread Stuffing   
4                                       Newton's Law   

                                         Ingredients  \
0  ['1 (3½–4-lb.) whole chicken', '2¾ tsp. kosher...   
1  ['2 large egg whites', '1 pound new potatoes (...   
2  ['1 cup evaporated milk', '1 cup whole milk', ...   
3  ['1 (¾- to 1-pound) round Italian loaf, cut in...   
4  ['1 teaspoon dark brown sugar', '1 teaspoon ho...   

                                        Instructions  \
0  Pat chicken dry with paper towels, season all ...   
1  Preheat oven to 400°F and line a rimmed baking...   
2  Place a rack in middle of oven; preheat to 400...   
3  Preheat oven to 350°F with rack in middle. Gen...   
4  Stir together brown sugar a