In [1]:
import ast
import re

import nltk
import pandas as pd
from nltk import WordNetLemmatizer
from nltk.corpus import stopwords


df = pd.read_csv("../data/recipes.csv")

lemmatizer = WordNetLemmatizer()
base_stopwords = set(stopwords.words('english'))

units = {
    "cup", "cups", "ts", "tsp", "teaspoon", "tbsp", "tablespoon",
    "oz", "ounce", "lb", "pound", "g", "gram", "kg", "ml", "l", "liter",
    "pint", "quart", "gallon", "inch", "diameter",
    "pinch", "dash", "clove", "sprig", "stick", "head", "bunch",
    "slice", "piece", "chunk", "part", "portion",
    "can", "canned", "jar", "package", "packet", "box", "bag", "bottle"
}

adjectives = {
    "large", "small", "medium", "tiny", "huge", "whole",
    "hot", "cold", "warm", "boiling", "room", "temperature",
    "fresh", "dry", "dried", "frozen", "thawed",
    "organic", "kosher", "virgin", "extra",
    "lean", "fat",
    "good", "best", "quality", "fine", "finely", "coarsely",
    "sturdy", "attached", "flat", "rotten", "ripe", "ripened",
    "storebought", "homemade", "preferably",
    "new", "old", "sharp", "mild", "soft", "hard"
}

methods = {
    "chopped", "diced", "minced", "sliced", "grated", "peeled", "cored", "seeded",
    "shredded", "crushed", "mashed", "ground", "beaten", "whisked", "stirred",
    "cooked", "roasted", "grilled", "baked", "fried", "boiled", "steamed", "poached",
    "melted", "softened", "divided", "separated", "removed", "discarded",
    "patted", "drained", "rinsed", "washed", "stuffed", "dressed", "trimmed",
    "boneless", "skinless", "skin", "cured", "preserved", "pitted", "halved", "quartered",
    "cut", "torn", "broken", "carcass"
}

context_fillers = {
    "optional", "garnish", "serving", "taste", "accompaniment",
    "plus", "more", "total", "about", "approx", "exceed",
    "seasoning", "preparation", "finish", "finishing", "topping"
}

colors = {
     "black", "green", "yellow", "blue", "brown", "orange", "pink"
}

brands_trash = {
    "lindt", "perugina", "ghirardelli", "gala", "lady", "fed", "grass"
}

smart_stopwords = base_stopwords.union(units, adjectives, methods, context_fillers, colors, brands_trash)


def preprocess_ingredients_smart(ingredients):

    try:
        if isinstance(ingredients, str):
            ingredients = ingredients.strip("[]").replace("'", "").split(', ')
        elif isinstance(ingredients, list):
            ingredients = ingredients
        else:
            return []
    except:
        return []

    cleaned_tokens = []
    for item in ingredients:
        text = item.lower()
        text = re.sub(r'\([^)]*\)', '', text)
        #remove numbers, fraction symbols
        text = re.sub(r'[\d½¾¼⅓⅔⅛⅜⅝⅞]+', '', text)
        #remove everything that is not a letter
        text = re.sub(r'[^a-z\s]', ' ', text)

        tokens = nltk.word_tokenize(text)
        tagged_tokens = nltk.pos_tag(tokens)
        #check if the word is a Nomen (NN) or Adjective (JJ) and lemmatize it, if not in stopwords then append to the array
        for word, tag in tagged_tokens:
            if tag.startswith('NN') or tag.startswith('JJ'):
                lemma = lemmatizer.lemmatize(word)
                if lemma not in smart_stopwords and len(lemma) > 2:
                    cleaned_tokens.append(lemma)

    return cleaned_tokens

In [2]:
try:
    df = pd.read_csv("../data/recipes.csv")
    print("✅ Daten erfolgreich geladen!")
    print(f"Datensatz hat {df.shape[0]} Zeilen und {df.shape[1]} Spalten.")
except FileNotFoundError:
    print("⚠️ FEHLER: Die CSV-Datei wurde nicht gefunden. Bitte Pfad überprüfen.")

✅ Daten erfolgreich geladen!
Datensatz hat 13501 Zeilen und 6 Spalten.


In [3]:
#nltk.download('punkt_tab')
#nltk.download('punkt')
#nltk.download('stopwords')
#nltk.download('wordnet')
#nltk.download('averaged_perceptron_tagger')
#nltk.download('averaged_perceptron_tagger_eng')



print("Starte smartes Preprocessing mit POS-Tagging...")

df['ingredients_smart'] = df['Cleaned_Ingredients'].apply(preprocess_ingredients_smart)

print("✅ Smartes Preprocessing abgeschlossen!")

print("\n--- Beispiel-Ergebnis des smarten Preprocessing ---")
for i in range(3):
    print(f"RAW: {df['Ingredients'].iloc[i][:100]}...")
    print(f"SMART: {df['ingredients_smart'].iloc[i]}")
    print("-" * 20)

Starte smartes Preprocessing mit POS-Tagging...
✅ Smartes Preprocessing abgeschlossen!

--- Beispiel-Ergebnis des smarten Preprocessing ---
RAW: ['1 (3½–4-lb.) whole chicken', '2¾ tsp. kosher salt, divided, plus more', '2 small acorn squash (abo...
SMART: ['chicken', 'salt', 'acorn', 'squash', 'sage', 'rosemary', 'butter', 'allspice', 'red', 'pepper', 'flake', 'pepper', 'white', 'bread', 'apple', 'olive', 'oil', 'red', 'onion', 'apple', 'cider', 'vinegar', 'white', 'miso', 'flour', 'butter', 'white', 'wine', 'chicken', 'broth', 'white', 'miso', 'salt', 'pepper']
--------------------
RAW: ['2 large egg whites', '1 pound new potatoes (about 1 inch in diameter)', '2 teaspoons kosher salt',...
SMART: ['egg', 'white', 'potato', 'salt', 'pepper', 'rosemary', 'thyme', 'parsley']
--------------------
RAW: ['1 cup evaporated milk', '1 cup whole milk', '1 tsp. garlic powder', '1 tsp. onion powder', '1 tsp....
SMART: ['milk', 'milk', 'garlic', 'powder', 'onion', 'powder', 'paprika', 'pepper', 'sa

In [4]:
from gensim.models.phrases import Phraser
from gensim.models import Phrases

print("Starte Training des Bigramm-Modells...")

phrases_model_smart = Phrases(df['ingredients_smart'], min_count=10, threshold=0.4, scoring="npmi")
bigram_model = Phraser(phrases_model_smart)


def apply_bigrams(tokens):
    return bigram_model[tokens]

df['ingredients_bigrams'] = df['ingredients_smart'].apply(apply_bigrams)


print("\n--- Beispiel für erkannte Bigramme ---")
for i in range(len(df)):
    if df['ingredients_smart'].iloc[i] != df['ingredients_bigrams'].iloc[i]:
        print(f"Vorher: {df['Ingredients'].iloc[i]}")
        print(f"Nachher: {df['ingredients_bigrams'].iloc[i]}")
        print("-" * 30)
        break

Starte Training des Bigramm-Modells...

--- Beispiel für erkannte Bigramme ---
Vorher: ['1 (3½–4-lb.) whole chicken', '2¾ tsp. kosher salt, divided, plus more', '2 small acorn squash (about 3 lb. total)', '2 Tbsp. finely chopped sage', '1 Tbsp. finely chopped rosemary', '6 Tbsp. unsalted butter, melted, plus 3 Tbsp. room temperature', '¼ tsp. ground allspice', 'Pinch of crushed red pepper flakes', 'Freshly ground black pepper', '⅓ loaf good-quality sturdy white bread, torn into 1" pieces (about 2½ cups)', '2 medium apples (such as Gala or Pink Lady; about 14 oz. total), cored, cut into 1" pieces', '2 Tbsp. extra-virgin olive oil', '½ small red onion, thinly sliced', '3 Tbsp. apple cider vinegar', '1 Tbsp. white miso', '¼ cup all-purpose flour', '2 Tbsp. unsalted butter, room temperature', '¼ cup dry white wine', '2 cups unsalted chicken broth', '2 tsp. white miso', 'Kosher salt, freshly ground pepper']
Nachher: ['chicken', 'salt', 'acorn_squash', 'sage', 'rosemary', 'butter', 'allspice

In [5]:
import os
import multiprocessing
from gensim.models import Word2Vec

print("Starte finales Word2Vec-Training...")

cores = multiprocessing.cpu_count()

model_smart = Word2Vec(df['ingredients_bigrams'],
                       workers=cores-1, vector_size=150, window=10, min_count=5, sg=1)

os.makedirs("../data/models", exist_ok=True)

model_path_smart = "../data/models/recipe_word2vec_smart.model"
model_smart.save(model_path_smart)

print(f"✅ Smartes Modell trainiert und gespeichert unter: {model_path_smart}")

Starte finales Word2Vec-Training...


Exception ignored in: 'gensim.models.word2vec_inner.our_dot_float'


✅ Smartes Modell trainiert und gespeichert unter: ../data/models/recipe_word2vec_smart.model


In [6]:
loaded_model_smart = Word2Vec.load(model_path_smart)

def check_smart(term, model):
    try:
        similar = model.wv.most_similar(term, topn=5)
        print(f"\nAlternativen zu '{term}' (SMART-MODELL):")
        for item, score in similar:
            print(f"  -> {item} ({score:.2f})")
    except KeyError:
        print(f"\n❌ '{term}' kenne ich im smarten Modell nicht.")

check_smart("chicken", loaded_model_smart)
check_smart("beef", loaded_model_smart)
check_smart("chocolate", loaded_model_smart)
check_smart("spaghetti", loaded_model_smart)
check_smart("tomato", loaded_model_smart)
check_smart("oil", loaded_model_smart)


Alternativen zu 'chicken' (SMART-MODELL):
  -> drumstick (0.85)
  -> wing (0.82)
  -> breast (0.81)
  -> carcass (0.81)
  -> backbone (0.81)

Alternativen zu 'beef' (SMART-MODELL):
  -> beef_chuck (0.83)
  -> meaty (0.82)
  -> beef_stock (0.80)
  -> prime (0.80)
  -> roast (0.80)

Alternativen zu 'chocolate' (SMART-MODELL):
  -> bittersweet_semisweet (0.92)
  -> bar (0.91)
  -> semisweet_bittersweet (0.91)
  -> chocolate_chip (0.91)
  -> bittersweet_chocolate (0.90)

Alternativen zu 'spaghetti' (SMART-MODELL):
  -> broccoli_rabe (0.93)
  -> bucatini (0.92)
  -> orecchiette (0.92)
  -> rigatoni (0.91)
  -> perciatelli (0.90)

Alternativen zu 'tomato' (SMART-MODELL):
  -> plum_tomato (0.81)
  -> oregano (0.79)
  -> san_marzano (0.77)
  -> slow_cooker (0.77)
  -> bell (0.77)

Alternativen zu 'oil' (SMART-MODELL):
  -> neutral (0.69)
  -> sodium_tamari (0.65)
  -> canola (0.65)
  -> canola_oil (0.65)
  -> stir (0.65)
