In [1]:
import os
import pandas as pd
import re
import matplotlib.pyplot as plt

IMG_FOLDER = os.path.join("..", "data", "images", "Food Images")

# Report on the image folder. It is only inspected here (existence + file
# count), so its absence must not prevent the recipe table from loading.
# isdir (rather than exists) avoids calling listdir on a plain file.
if os.path.isdir(IMG_FOLDER):
    print("Images folder exists")
    print(f"Anzahl: {len(os.listdir(IMG_FOLDER))}")
else:
    print("Images folder doesn't exist")

# Load the recipes unconditionally: every later cell reads `df`, and tying
# the load to the image-folder check turned a missing folder into a confusing
# NameError further down. A missing CSV now fails right here with pandas'
# own FileNotFoundError.
df = pd.read_csv("../data/recipes.csv")
print("✅ Daten erfolgreich geladen!")


Images folder exists
Anzahl: 13582
✅ Daten erfolgreich geladen!


In [2]:
import re
from nltk import WordNetLemmatizer
from nltk.corpus import stopwords

print("start Preprocessing...")

# Shared NLP resources used by the ingredient cleaner defined further down.
lemmatizer = WordNetLemmatizer()
stopwords_list = stopwords.words('english')

# Units, sizes, cuts of meat and similar descriptors that should not end up
# as ingredient tokens.
measurements = {
    # volume / weight units
    "cup", "cups", "ts", "tsp", "teaspoon", "tbsp", "tablespoon",
    "oz", "ounce", "lb", "pound", "g", "gram", "kg", "ml", "l", "liter",
    "pint", "quart", "gallon",
    # informal amounts and containers
    "pinch", "dash", "slice", "can", "jar", "package", "stick", "piece", "chunk",
    # size / extent words
    "large", "small", "medium", "whole", "inch", "diameter", "total", "full",
    # preparation state of meat and produce
    "rotisserie", "roast", "roasted", "grilled", "baked", "ground", "minced",
    "ripe", "ripened",
    # cuts and parts
    "wing", "breast", "thigh", "leg", "bone", "boneless", "skinless", "skin",
    "lean", "chuck", "sirloin",
    "drumstick", "drumsticks", "drumette", "drumettes",
}

# Preparation verbs/adverbs and serving notes.
cooking_methods = {
    "chopped", "diced", "minced", "sliced", "grated", "peeled", "cored",
    "cut", "shredded", "cooked", "melted", "softened", "stuffed", "stuffing",
    "patted",
    "finely", "coarsely", "freshly", "ground",
    "divided", "plus", "more", "taste", "exceed",
    "unsalted", "salted",
    "room", "temperature", "high", "low", "medium",
}

# Brand names, variety names and other descriptive words.
# (Each word is listed once; a set ignores repeats anyway.)
other_stopwords = {
    "gala", "pink", "lady", "picholine", "cerignola",
    "lindt", "perugina", "ghirardelli",
    "new", "extra", "sharp", "fat", "virgin", "good", "quality", "sturdy",
    "torn", "storebought", "homemade", "removed", "casing",
    "semisweet", "bittersweet", "unsweetened",
    "halved", "quartered", "pitted", "cured", "brine", "preserved",
    "sun", "dried", "vine", "ripened", "wafer", "chip", "chips",
    "short", "long", "flat", "tiny",
    "preferably", "optional", "garnish", "about",
    "attached", "fed", "grass", "drained", "seasoning",
    "dry", "fresh", "frozen", "canned",
}

# One lookup set for the cleaner: NLTK's English stop words plus the three
# domain-specific lists above.
all_stopwords = set(stopwords_list) | measurements | cooking_methods | other_stopwords


def preprocess_ingredients(ingredients):
    """Turn one stringified ingredient list into a flat list of clean tokens.

    Args:
        ingredients: The text form of a Python list of ingredient strings,
            e.g. "['2 cups flour', '1 egg']". Anything that is not a ``str``
            (such as a NaN float for a missing cell) yields an empty list.

    Returns:
        A single flat list of lemmatized, lowercase, ASCII-only tokens for the
        whole recipe. Tokens found in the module-level ``all_stopwords`` and
        tokens of two characters or fewer are dropped.

    Notes:
        Accented letters are folded to their base letter before filtering
        ("crème fraîche" -> "creme", "fraiche"). Letters that have no Unicode
        decomposition (e.g. "ø") are still replaced by a space by the ASCII
        filter below.
    """
    # Local import so this cell does not depend on an import elsewhere.
    import unicodedata

    # Explicit guard instead of a bare `except:`; only non-string cells are
    # expected to be unparseable, and real errors should stay visible.
    if not isinstance(ingredients, str):
        return []

    # Drop the surrounding brackets and the quote characters, split at commas.
    ingredients = ingredients.strip("[]").replace("'", "").split(', ')

    cleaned_tokens = []
    for item in ingredients:
        # Fold accents: decompose, then drop the combining marks. Without this
        # the ASCII filter would cut words apart at every accented letter.
        text = unicodedata.normalize("NFKD", item)
        text = "".join(ch for ch in text if not unicodedata.combining(ch))
        text = text.lower()

        text = re.sub(r'[\d½¾¼⅓⅔⅛⅜⅝⅞]+', '', text)  # remove numbers and fraction symbols
        text = re.sub(r'[^a-z\s]', ' ', text)  # everything that is not a letter becomes a space

        for token in text.split():
            lemma = lemmatizer.lemmatize(token)
            if lemma not in all_stopwords and len(lemma) > 2:
                cleaned_tokens.append(lemma)
    return cleaned_tokens


# Clean every recipe's raw ingredient string into a token list.
df['ingredients_for_w2v'] = df['Ingredients'].apply(preprocess_ingredients)

print("✅ Preprocessing abgeschlossen!")

# --- Final check: raw vs. cleaned ingredients for the first three recipes ---
print("\n--- Visueller Vergleich nach neuer Bereinigung ---")
separator = "-" * 20
for i in (0, 1, 2):
    raw_entry = df['Ingredients'].iloc[i]
    cleaned_entry = df['ingredients_for_w2v'].iloc[i]
    print(f"RAW:     {raw_entry}")
    print(f"NEU:     {cleaned_entry}")
    print(separator)




start Preprocessing...
✅ Preprocessing abgeschlossen!

--- Visueller Vergleich nach neuer Bereinigung ---
RAW:     ['1 (3½–4-lb.) whole chicken', '2¾ tsp. kosher salt, divided, plus more', '2 small acorn squash (about 3 lb. total)', '2 Tbsp. finely chopped sage', '1 Tbsp. finely chopped rosemary', '6 Tbsp. unsalted butter, melted, plus 3 Tbsp. room temperature', '¼ tsp. ground allspice', 'Pinch of crushed red pepper flakes', 'Freshly ground black pepper', '⅓ loaf good-quality sturdy white bread, torn into 1" pieces (about 2½ cups)', '2 medium apples (such as Gala or Pink Lady; about 14 oz. total), cored, cut into 1" pieces', '2 Tbsp. extra-virgin olive oil', '½ small red onion, thinly sliced', '3 Tbsp. apple cider vinegar', '1 Tbsp. white miso', '¼ cup all-purpose flour', '2 Tbsp. unsalted butter, room temperature', '¼ cup dry white wine', '2 cups unsalted chicken broth', '2 tsp. white miso', 'Kosher salt, freshly ground pepper']
NEU:     ['chicken', 'kosher', 'salt', 'acorn', 'squash'

In [3]:
from gensim.models.phrases import Phrases, Phraser

print("start bigramm modell...")

# NOTE(review): not referenced anywhere in the cells visible here.
common_suffixes = {"powder", "oil", "sauce", "cheese", "cream", "milk", "butter"}

# Learn which neighbouring tokens should be merged into a single phrase.
# NOTE(review): each recipe is one flat token list, so pairs can span two
# different ingredients (the recorded output contains 'sage_rosemary').
phrases_model = Phrases(
    df['ingredients_for_w2v'],
    min_count=5,      # combinations that appear fewer than 5 times are ignored
    threshold=0.2,    # minimum score for a pair to be merged
    scoring="npmi",
)

# Phraser keeps only the parts needed to apply the phrases, so it takes less
# RAM than the full Phrases model above.
bigram_model = Phraser(phrases_model)


def apply_bigrams(tokens):
    """Run one token list through the module-level ``bigram_model``.

    Args:
        tokens: The token list of a single recipe.

    Returns:
        Whatever ``bigram_model[tokens]`` yields for that list.
    """
    merged = bigram_model[tokens]
    return merged


# Apply the trained bigram model to every recipe.
df['ingredients_bigrams'] = df['ingredients_for_w2v'].apply(apply_bigrams)

# Show the first five recipes whose token list was changed by the bigram step.
print("\n--- Vorher / Nachher Vergleich ---")
count = 0
for alt, neu in zip(df['ingredients_for_w2v'], df['ingredients_bigrams']):
    if alt == neu:
        continue  # nothing was merged for this recipe
    print(f"Alt: {alt}")
    print(f"Neu: {neu}")
    print("-" * 30)
    count += 1
    if count >= 5:
        break



start bigramm modell...

--- Vorher / Nachher Vergleich ---
Alt: ['chicken', 'kosher', 'salt', 'acorn', 'squash', 'sage', 'rosemary', 'butter', 'allspice', 'crushed', 'red', 'pepper', 'flake', 'black', 'pepper', 'loaf', 'white', 'bread', 'apple', 'olive', 'oil', 'red', 'onion', 'thinly', 'apple', 'cider', 'vinegar', 'white', 'miso', 'purpose', 'flour', 'butter', 'white', 'wine', 'chicken', 'broth', 'white', 'miso', 'kosher', 'salt', 'pepper']
Neu: ['chicken', 'kosher_salt', 'acorn_squash', 'sage_rosemary', 'butter', 'allspice', 'crushed_red', 'pepper_flake', 'black_pepper', 'loaf', 'white_bread', 'apple', 'olive_oil', 'red_onion', 'thinly', 'apple_cider', 'vinegar', 'white_miso', 'purpose_flour', 'butter', 'white_wine', 'chicken_broth', 'white_miso', 'kosher_salt', 'pepper']
------------------------------
Alt: ['egg', 'white', 'potato', 'kosher', 'salt', 'black', 'pepper', 'rosemary', 'thyme', 'parsley']
Neu: ['egg_white', 'potato', 'kosher_salt', 'black_pepper', 'rosemary_thyme', 'par

In [4]:
from gensim.models import Word2Vec
import multiprocessing
import os

print("start final word2vec training")

# Leave one core free, but never ask for fewer than one worker: on a
# single-core machine `cores - 1` is 0, which would leave gensim without any
# training thread.
cores = multiprocessing.cpu_count()
n_workers = max(1, cores - 1)

# Skip-gram (sg=1) embeddings over the bigram-merged ingredient tokens.
# NOTE(review): the recorded run printed "Exception ignored in
# 'gensim.models.word2vec_inner.our_dot_float'" during training; the cause is
# not visible in this notebook and should be investigated.
model = Word2Vec(df['ingredients_bigrams'], workers=n_workers, vector_size=200, window=10, min_count=5, sg=1)

model_path = "../data/models/recipe_word2vec.model"

# Saving was commented out in the original cell while the log line still
# claimed the model had been saved. Saving stays off by default (so an
# existing file is not overwritten); the message now reports what happened.
SAVE_MODEL = False
if SAVE_MODEL:
    os.makedirs(os.path.dirname(model_path), exist_ok=True)
    model.save(model_path)
    print(f"model trained and saved under: {model_path}")
else:
    print(f"model trained (not saved; set SAVE_MODEL = True to write it to: {model_path})")


start final word2vec training


Exception ignored in: 'gensim.models.word2vec_inner.our_dot_float'
Exception ignored in: 'gensim.models.word2vec_inner.our_dot_float'
Exception ignored in: 'gensim.models.word2vec_inner.our_dot_float'
Exception ignored in: 'gensim.models.word2vec_inner.our_dot_float'


model trained and saved under: ../data/models/recipe_word2vec.model


In [6]:

def check(term):
    """Print the five most similar vocabulary entries for ``term``.

    Looks ``term`` up in the module-level ``model`` and prints each neighbour
    with its similarity rounded to two decimals. If the lookup raises
    ``KeyError``, a "not known" message is printed instead.

    Args:
        term: The token to look up.

    Returns:
        None. The result is only printed.
    """
    try:
        neighbours = model.wv.most_similar(term, topn=5)
    except KeyError:
        print(f"\n❌ '{term}' kenne ich nicht.")
        return

    print(f"\nAlternativen zu '{term}'?")
    for neighbour, similarity in neighbours:
        print(f"  -> {neighbour} ({similarity:.2f})")

# Try out the model trained on the bigram-merged tokens.
# (The probe terms themselves are all single words.)
for term in ("chicken", "beef", "chocolate", "spaghetti", "tomato", "olive"):
    check(term)

Index(['Unnamed: 0', 'Title', 'Ingredients', 'Instructions', 'Image_Name',
       'Cleaned_Ingredients', 'ingredients_for_w2v', 'ingredients_bigrams'],
      dtype='object')

Alternativen zu 'chicken'?
  -> chicken_turkey (0.81)
  -> cajun (0.81)
  -> dark_meat (0.80)
  -> top_round (0.79)
  -> poultry (0.78)

Alternativen zu 'beef'?
  -> hot_spanish (0.85)
  -> slab (0.85)
  -> pork (0.84)
  -> paprika_hot (0.84)
  -> bottle_dark (0.84)

Alternativen zu 'chocolate'?
  -> milk_chocolate (0.94)
  -> white_chocolate (0.92)
  -> dark_chocolate (0.90)
  -> chocolate_cacao (0.90)
  -> bar (0.90)

Alternativen zu 'spaghetti'?
  -> rigatoni (0.94)
  -> capellini (0.93)
  -> bucatini (0.93)
  -> accompaniment_parmigiano (0.93)
  -> pecorino (0.92)

Alternativen zu 'tomato'?
  -> undrained (0.80)
  -> piquillo_pepper (0.79)
  -> san_marzano (0.78)
  -> grape_cherry (0.77)
  -> tomato_seeded (0.77)

Alternativen zu 'olive'?
  -> kalamata (0.85)
  -> caper (0.81)
  -> spanish (0.80)
  -> kalamata