In [1]:
import ast
import re
import unicodedata

import nltk
import pandas as pd
from nltk import WordNetLemmatizer
from nltk.corpus import stopwords
from nltk.corpus.reader import TITLE

# Recipe table; the path is relative to the notebook's working directory.
# The same file is read again (with error handling) in the next cell.
df = pd.read_csv("../data/recipes.csv")

# Shared lemmatizer instance used by preprocess_ingredients_smart.
lemmatizer = WordNetLemmatizer()
# NLTK's English stop-word list as a set, so it can be unioned with the
# recipe-specific word sets defined below.
base_stopwords = set(stopwords.words('english'))

# Recipe-specific noise words. Each set below is merged into the stop-word
# list that is applied to lemmatized ingredient tokens.

# Measurement units, informal amounts and packaging.
units = {
    # spoon / cup measures
    "cup", "cups", "ts", "tsp", "teaspoon", "tbsp", "tablespoon",
    # weights
    "oz", "ounce", "lb", "pound", "g", "gram", "kg",
    # liquid volumes
    "ml", "l", "liter", "pint", "quart", "gallon",
    # lengths
    "inch", "diameter",
    # informal amounts
    "pinch", "dash", "clove", "sprig", "stick", "head", "bunch",
    "slice", "piece", "chunk", "part", "portion",
    # packaging
    "can", "canned", "jar", "package", "packet", "box", "bag", "bottle",
}

# Descriptive words that qualify an ingredient without naming it.
adjectives = {
    # size
    "large", "small", "medium", "tiny", "huge", "whole",
    # temperature
    "hot", "cold", "warm", "boiling", "room", "temperature",
    # freshness / ripeness
    "fresh", "dry", "dried", "frozen", "thawed",
    "rotten", "ripe", "ripened",
    # sourcing / labelling
    "organic", "kosher", "virgin", "extra", "lean", "fat",
    "storebought", "homemade", "preferably",
    # quality and texture
    "good", "best", "quality", "fine", "finely", "coarsely",
    "sturdy", "attached", "flat",
    "new", "old", "sharp", "mild", "soft", "hard",
}

# Preparation and cooking words.
methods = {
    # cutting
    "chopped", "diced", "minced", "sliced", "grated", "shredded",
    "cut", "torn", "broken", "halved", "quartered", "trimmed",
    # removing parts (peel, core, seeds, bones, skin)
    "peeled", "cored", "seeded", "pitted",
    "boneless", "skinless", "skin", "carcass",
    # crushing and mixing
    "crushed", "mashed", "ground", "beaten", "whisked", "stirred",
    # heat
    "cooked", "roasted", "grilled", "baked", "fried", "boiled", "steamed", "poached",
    "melted", "softened",
    # handling
    "divided", "separated", "removed", "discarded",
    "patted", "drained", "rinsed", "washed", "stuffed", "dressed",
    # preservation
    "cured", "preserved",
}

# Words from serving notes and quantity hedges ("plus more", "about ...").
context_fillers = {
    # serving notes
    "optional", "garnish", "serving", "taste", "accompaniment",
    "seasoning", "preparation", "finish", "finishing", "topping",
    # quantity hedges
    "plus", "more", "total", "about", "approx", "exceed",
}

# Colour words treated as noise.
colors = {"black", "green", "yellow", "blue", "brown", "orange", "pink"}

# Brand names and other stray tokens to drop.
brands_trash = {"lindt", "perugina", "ghirardelli", "gala", "lady", "fed", "grass"}

# Final stop-word set: NLTK's English list plus every recipe-specific
# category defined above. A lemma found in this set is dropped.
smart_stopwords = (
    base_stopwords
    | units
    | adjectives
    | methods
    | context_fillers
    | colors
    | brands_trash
)


def _parse_ingredient_items(raw):
    """Turn a stringified Python list of ingredient lines back into a list.

    The text is parsed as a Python literal so that commas inside a single
    ingredient line (e.g. "(about 1 cup, packed)") do not split that line.
    If the text is not a list/tuple literal, the simple bracket-strip and
    comma split is used as a fallback.
    """
    try:
        parsed = ast.literal_eval(raw)
    except (ValueError, SyntaxError):
        parsed = None
    if isinstance(parsed, (list, tuple)):
        return list(parsed)
    return raw.strip("[]").replace("'", "").split(', ')


def _clean_ingredient_text(item):
    """Lower-case one ingredient line and reduce it to a-z letters and whitespace."""
    text = item.lower()
    # drop parenthetical remarks such as "(about 3 lb. total)"
    text = re.sub(r'\([^)]*\)', '', text)
    # remove numbers and fraction symbols
    text = re.sub(r'[\d½¾¼⅓⅔⅛⅜⅝⅞]+', '', text)
    # fold accents (ñ -> n, è -> e) so the letter filter below does not cut
    # accented words into fragments
    decomposed = unicodedata.normalize('NFKD', text)
    text = ''.join(ch for ch in decomposed if not unicodedata.combining(ch))
    # replace everything that is not a letter or whitespace with a space
    return re.sub(r'[^a-z\s]', ' ', text)


def preprocess_ingredients_smart(ingredients):
    """Extract lemmatized ingredient tokens from a recipe's ingredient list.

    Parameters
    ----------
    ingredients : str | list
        Either a list of ingredient lines or the string form of such a list
        (as stored in the CSV). Any other type yields an empty result.

    Returns
    -------
    list[str]
        Lemmas of the nouns and adjectives found in the ingredient lines,
        in order of appearance, excluding lemmas that are in
        ``smart_stopwords`` or shorter than three characters.
    """
    if isinstance(ingredients, str):
        items = _parse_ingredient_items(ingredients)
    elif isinstance(ingredients, list):
        items = ingredients
    else:
        return []

    cleaned_tokens = []
    for item in items:
        # skip non-text entries (e.g. NaN/None inside a list) instead of crashing
        if not isinstance(item, str):
            continue

        text = _clean_ingredient_text(item)
        if not text.strip():
            # nothing left after cleaning, so there is nothing to tokenize
            continue

        tokens = nltk.word_tokenize(text)
        # part-of-speech tag for every token
        tagged_tokens = nltk.pos_tag(tokens)
        # keep only nouns (NN*) and adjectives (JJ*); lemmatize them and keep
        # the lemma unless it is a stop word or too short
        for word, tag in tagged_tokens:
            if not tag.startswith(('NN', 'JJ')):
                continue
            lemma = lemmatizer.lemmatize(word)
            if len(lemma) > 2 and lemma not in smart_stopwords:
                cleaned_tokens.append(lemma)

    return cleaned_tokens

In [2]:
# Read the recipe table and report its size; a missing file is reported
# with a message instead of a traceback.
try:
    df = pd.read_csv("../data/recipes.csv")
except FileNotFoundError:
    print("FEHLER: Die CSV-Datei wurde nicht gefunden.")
else:
    print("✅ Daten erfolgreich geladen!")
    print(f"Datensatz hat {df.shape[0]} Zeilen und {df.shape[1]} Spalten.")

✅ Daten erfolgreich geladen!
Datensatz hat 13501 Zeilen und 6 Spalten.


In [3]:
# One-time NLTK resource downloads (tokenizer, stop words, WordNet, POS
# tagger). Uncomment on an environment where they are not installed yet.
#nltk.download('punkt_tab')
#nltk.download('punkt')
#nltk.download('stopwords')
#nltk.download('wordnet')
#nltk.download('averaged_perceptron_tagger')
#nltk.download('averaged_perceptron_tagger_eng')


# Part-of-speech based cleaning of every recipe's ingredient list
print("start preprocessing with pos tagging...")

df['ingredients_smart'] = df['Cleaned_Ingredients'].apply(preprocess_ingredients_smart)


# Show the first three recipes: raw text (first 100 characters) next to the
# extracted tokens
print("\n--- results  ---")
separator = "-" * 20
for position in range(3):
    raw_preview = df['Ingredients'].iloc[position][:100]
    smart_tokens = df['ingredients_smart'].iloc[position]
    print(f"RAW: {raw_preview}...")
    print(f"SMART: {smart_tokens}")
    print(separator)

start preprocessing with pos tagging...

--- results  ---
RAW: ['1 (3½–4-lb.) whole chicken', '2¾ tsp. kosher salt, divided, plus more', '2 small acorn squash (abo...
SMART: ['chicken', 'salt', 'acorn', 'squash', 'sage', 'rosemary', 'butter', 'allspice', 'red', 'pepper', 'flake', 'pepper', 'white', 'bread', 'apple', 'olive', 'oil', 'red', 'onion', 'apple', 'cider', 'vinegar', 'white', 'miso', 'flour', 'butter', 'white', 'wine', 'chicken', 'broth', 'white', 'miso', 'salt', 'pepper']
--------------------
RAW: ['2 large egg whites', '1 pound new potatoes (about 1 inch in diameter)', '2 teaspoons kosher salt',...
SMART: ['egg', 'white', 'potato', 'salt', 'pepper', 'rosemary', 'thyme', 'parsley']
--------------------
RAW: ['1 cup evaporated milk', '1 cup whole milk', '1 tsp. garlic powder', '1 tsp. onion powder', '1 tsp....
SMART: ['milk', 'milk', 'garlic', 'powder', 'onion', 'powder', 'paprika', 'pepper', 'salt', 'cheddar', 'full', 'cream', 'cheese', 'elbow', 'macaroni']
------------------

In [4]:
from gensim.models.phrases import Phraser
from gensim.models import Phrases

print("start training with bigrams")

# Learn which adjacent tokens belong together, scored with NPMI.
phrases_model_smart = Phrases(
    df['ingredients_smart'],
    min_count=10,
    threshold=0.4,
    scoring="npmi",
)
# Wrap the trained model for applying the learned phrases to token lists.
bigram_model = Phraser(phrases_model_smart)


def apply_bigrams(tokens):
    """Return `tokens` with detected bigrams merged (e.g. 'acorn_squash')."""
    return bigram_model[tokens]


df['ingredients_bigrams'] = df['ingredients_smart'].apply(apply_bigrams)


# Print the first recipe whose token list was changed by the bigram model.
print("\n--- Beispiel für erkannte Bigramme ---")
recipe_rows = zip(df['Ingredients'], df['ingredients_smart'], df['ingredients_bigrams'])
for raw_text, unigram_tokens, bigram_tokens in recipe_rows:
    if unigram_tokens == bigram_tokens:
        continue
    print(f"Vorher: {raw_text}")
    print(f"Nachher: {bigram_tokens}")
    print("-" * 30)
    break

start training with bigrams

--- Beispiel für erkannte Bigramme ---
Vorher: ['1 (3½–4-lb.) whole chicken', '2¾ tsp. kosher salt, divided, plus more', '2 small acorn squash (about 3 lb. total)', '2 Tbsp. finely chopped sage', '1 Tbsp. finely chopped rosemary', '6 Tbsp. unsalted butter, melted, plus 3 Tbsp. room temperature', '¼ tsp. ground allspice', 'Pinch of crushed red pepper flakes', 'Freshly ground black pepper', '⅓ loaf good-quality sturdy white bread, torn into 1" pieces (about 2½ cups)', '2 medium apples (such as Gala or Pink Lady; about 14 oz. total), cored, cut into 1" pieces', '2 Tbsp. extra-virgin olive oil', '½ small red onion, thinly sliced', '3 Tbsp. apple cider vinegar', '1 Tbsp. white miso', '¼ cup all-purpose flour', '2 Tbsp. unsalted butter, room temperature', '¼ cup dry white wine', '2 cups unsalted chicken broth', '2 tsp. white miso', 'Kosher salt, freshly ground pepper']
Nachher: ['chicken', 'salt', 'acorn_squash', 'sage', 'rosemary', 'butter', 'allspice', 'red_pep

In [5]:
import os
import multiprocessing
from gensim.models import Word2Vec
import numpy as np
import pickle
import os

print("start word2vec training...")
cores = multiprocessing.cpu_count()

# Leave four cores free for the rest of the system, but always keep at least
# one worker: on machines with four or fewer cores `cores - 4` would be zero
# or negative and no training thread would be started.
# sg=1 selects the skip-gram architecture.
model = Word2Vec(
    df['ingredients_bigrams'],
    workers=max(1, cores - 4),
    vector_size=150,
    window=10,
    min_count=5,
    sg=1,
)

folder_path = "../data/models"
os.makedirs(folder_path, exist_ok=True)

model_path = "../data/models/word2vec.model"
model.save(model_path)

print(f"model saved under: {model_path}")


# One vector per recipe, keyed by the DataFrame index label: the mean of the
# word vectors of all its tokens that the model knows. Recipes without any
# known token get no entry.
ingredient_vectors = {}

for i, ingredients in df['ingredients_bigrams'].items():
    # model.wv holds the trained word vectors; tokens below min_count are absent
    vectors = [model.wv[word] for word in ingredients if word in model.wv]

    if vectors:
        ingredient_vectors[i] = np.mean(vectors, axis=0)


pkl_path = os.path.join(folder_path, "ingredient_vectors.pkl")

with open(pkl_path, "wb") as f:
    pickle.dump(ingredient_vectors, f)
print(f"model saved under: {pkl_path}")


start word2vec training...


Exception ignored in: 'gensim.models.word2vec_inner.our_dot_float'
Exception ignored in: 'gensim.models.word2vec_inner.our_dot_float'


model saved under: ../data/models/word2vec.model
model saved under: ../data/models\ingredient_vectors.pkl


In [6]:
# Reload the model from the file written by the training cell, so the
# similarity checks below run against what was actually saved to disk.
loaded_model_smart = Word2Vec.load(model_path)

def check_smart(term, model, topn=5):
    """Print the `topn` vocabulary entries most similar to `term`.

    Parameters
    ----------
    term : str
        Token to look up.
    model
        Object exposing word vectors as ``model.wv`` (membership test and
        ``most_similar``), e.g. a trained Word2Vec model.
    topn : int, default 5
        Number of neighbours to print.

    A term that is not in the model's vocabulary is reported with a message
    instead of raising, so one unknown probe does not abort the whole cell.
    """
    if term not in model.wv:
        print(f"\n '{term}' is not in the model vocabulary.")
        return

    similar = model.wv.most_similar(term, topn=topn)
    print(f"\n Similarity or Alternative to '{term}':")
    for item, score in similar:
        print(f"  -> {item} ({score:.2f})")


# Spot-check the learned neighbourhoods for a handful of common ingredients.
for probe_term in ("chicken", "beef", "chocolate", "spaghetti", "tomato", "oil"):
    check_smart(probe_term, loaded_model_smart)
#check_smart("cook", loaded_model_smart)



 Similarity or Alternative to 'chicken':
  -> drumstick (0.84)
  -> wing (0.83)
  -> leg_thigh (0.81)
  -> backbone (0.81)
  -> breast (0.81)

 Similarity or Alternative to 'beef':
  -> beef_chuck (0.82)
  -> roast (0.81)
  -> meaty (0.79)
  -> sirloin (0.77)
  -> chuck (0.77)

 Similarity or Alternative to 'chocolate':
  -> chocolate_chip (0.93)
  -> bittersweet_semisweet (0.91)
  -> semisweet_bittersweet (0.89)
  -> bittersweet_chocolate (0.89)
  -> bar (0.89)

 Similarity or Alternative to 'spaghetti':
  -> rigatoni (0.92)
  -> bucatini (0.92)
  -> linguine (0.92)
  -> orecchiette (0.92)
  -> broccoli_rabe (0.92)

 Similarity or Alternative to 'tomato':
  -> plum_tomato (0.81)
  -> oregano (0.77)
  -> rom_tomato (0.76)
  -> pimiento (0.76)
  -> undrained (0.74)

 Similarity or Alternative to 'oil':
  -> neutral (0.70)
  -> canola (0.69)
  -> grapeseed (0.67)
  -> stir (0.66)
  -> sunflower (0.66)
