# Import libraries

In [2]:
import pandas as pd
import numpy as np
import ast
import re
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk import pos_tag
from sklearn.metrics.pairwise import cosine_similarity
from gensim.models import Word2Vec

In [3]:
# Fetch the NLTK resources used further down in this notebook. Packages that
# are already installed are simply reported as up-to-date.
NLTK_RESOURCES = (
    'punkt_tab',
    'averaged_perceptron_tagger_eng',
    'punkt',
    'stopwords',
    'wordnet',
    'averaged_perceptron_tagger',
)
download_status = [nltk.download(resource) for resource in NLTK_RESOURCES]
# Show the outcome of the final download as the cell result.
download_status[-1]

[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\varun\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger_eng to
[nltk_data]     C:\Users\varun\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger_eng is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\varun\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\varun\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\varun\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\varun\AppData\Roaming\nltk_data...
[nltk_data]   Package average

True

# Load Dataset

In [5]:
# Read the raw recipe table; the path is resolved relative to the working directory.
DATASET_PATH = "Food Ingredients and Recipe Dataset with Image Name Mapping.csv"
df = pd.read_csv(DATASET_PATH)

# Data Pre-processing

In [7]:
# Preview the first rows to check that the columns were parsed as expected.
df.head()

Unnamed: 0.1,Unnamed: 0,Title,Ingredients,Instructions,Image_Name,Cleaned_Ingredients
0,0,Miso-Butter Roast Chicken With Acorn Squash Pa...,"['1 (3½–4-lb.) whole chicken', '2¾ tsp. kosher...","Pat chicken dry with paper towels, season all ...",miso-butter-roast-chicken-acorn-squash-panzanella,"['1 (3½–4-lb.) whole chicken', '2¾ tsp. kosher..."
1,1,Crispy Salt and Pepper Potatoes,"['2 large egg whites', '1 pound new potatoes (...",Preheat oven to 400°F and line a rimmed baking...,crispy-salt-and-pepper-potatoes-dan-kluger,"['2 large egg whites', '1 pound new potatoes (..."
2,2,Thanksgiving Mac and Cheese,"['1 cup evaporated milk', '1 cup whole milk', ...",Place a rack in middle of oven; preheat to 400...,thanksgiving-mac-and-cheese-erick-williams,"['1 cup evaporated milk', '1 cup whole milk', ..."
3,3,Italian Sausage and Bread Stuffing,"['1 (¾- to 1-pound) round Italian loaf, cut in...",Preheat oven to 350°F with rack in middle. Gen...,italian-sausage-and-bread-stuffing-240559,"['1 (¾- to 1-pound) round Italian loaf, cut in..."
4,4,Newton's Law,"['1 teaspoon dark brown sugar', '1 teaspoon ho...",Stir together brown sugar and hot water in a c...,newtons-law-apple-bourbon-cocktail,"['1 teaspoon dark brown sugar', '1 teaspoon ho..."


In [8]:
# Inspect the last rows of the table as well.
df.tail()

Unnamed: 0.1,Unnamed: 0,Title,Ingredients,Instructions,Image_Name,Cleaned_Ingredients
13496,13496,Brownie Pudding Cake,"['1 cup all-purpose flour', '2/3 cup unsweeten...",Preheat the oven to 350°F. Into a bowl sift to...,brownie-pudding-cake-14408,"['1 cup all-purpose flour', '2/3 cup unsweeten..."
13497,13497,Israeli Couscous with Roasted Butternut Squash...,"['1 preserved lemon', '1 1/2 pound butternut s...",Preheat oven to 475°F.\nHalve lemons and scoop...,israeli-couscous-with-roasted-butternut-squash...,"['1 preserved lemon', '1 1/2 pound butternut s..."
13498,13498,Rice with Soy-Glazed Bonito Flakes and Sesame ...,['Leftover katsuo bushi (dried bonito flakes) ...,"If using katsuo bushi flakes from package, moi...",rice-with-soy-glazed-bonito-flakes-and-sesame-...,['Leftover katsuo bushi (dried bonito flakes) ...
13499,13499,Spanakopita,['1 stick (1/2 cup) plus 1 tablespoon unsalted...,Melt 1 tablespoon butter in a 12-inch heavy sk...,spanakopita-107344,['1 stick (1/2 cup) plus 1 tablespoon unsalted...
13500,13500,"Mexican Poblano, Spinach, and Black Bean ""Lasa...",['12 medium to large fresh poblano chiles (2 1...,Lay 4 chiles on their sides on racks of gas bu...,mexican-poblano-spinach-and-black-bean-lasagne...,['12 medium to large fresh poblano chiles (2 1...


In [9]:
# Summarise column dtypes and non-null counts to spot columns with missing values.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 13501 entries, 0 to 13500
Data columns (total 6 columns):
 #   Column               Non-Null Count  Dtype 
---  ------               --------------  ----- 
 0   Unnamed: 0           13501 non-null  int64 
 1   Title                13496 non-null  object
 2   Ingredients          13501 non-null  object
 3   Instructions         13493 non-null  object
 4   Image_Name           13501 non-null  object
 5   Cleaned_Ingredients  13501 non-null  object
dtypes: int64(1), object(5)
memory usage: 633.0+ KB


In [10]:
# Convert the string form of each ingredient list (text starting with '[')
# back into a real Python list; every other value is passed through untouched.
def _parse_ingredient_list(raw_value):
    """Return the literal encoded in ``raw_value`` when it is a string starting with '['."""
    if isinstance(raw_value, str) and raw_value.startswith('['):
        return ast.literal_eval(raw_value)
    return raw_value


df['Ingredients'] = df['Ingredients'].apply(_parse_ingredient_list)

In [11]:
# Store each ingredient list as one space-separated string.
def _ingredients_to_text(value):
    """Flatten one 'Ingredients' cell into plain text.

    * list -> items joined by single spaces (items are stringified first, so
      a stray non-string item cannot break the join)
    * missing scalar (NaN / None / pd.NA) -> '' instead of the literal text
      'nan' / 'None', which would otherwise survive as a bogus token
    * anything else -> str(value)
    """
    if isinstance(value, list):
        return ' '.join(map(str, value))
    # is_scalar guard: pd.isna on an array-like returns an array, not a bool.
    if pd.api.types.is_scalar(value) and pd.isna(value):
        return ''
    return str(value)


df['Ingredients'] = df['Ingredients'].apply(_ingredients_to_text)

In [12]:
# Make sure both free-text columns hold plain strings, with missing entries as ''.
for text_column in ('Ingredients', 'Instructions'):
    df[text_column] = df[text_column].fillna('').astype(str)

# NLP

### WordNetLemmatizer

In [15]:
# Shared NLP helpers, created once at module level.
lemmatizer = WordNetLemmatizer()
english_stopwords = stopwords.words('english')
stop_words = set(english_stopwords)

# clean_text

In [17]:
def clean_text(text):
    """Normalise free text into a space-separated string of lemmatized nouns.

    Pipeline: lowercase -> drop digits and vulgar-fraction glyphs -> turn
    punctuation into spaces -> collapse whitespace -> tokenize -> keep tokens
    whose POS tag starts with 'NN' and that are not English stop words ->
    lemmatize.

    Parameters
    ----------
    text : str
        Raw text. Any non-string input (e.g. NaN or None) is treated as empty.

    Returns
    -------
    str
        Lemmatized noun tokens joined by single spaces; '' if nothing survives.
    """
    if not isinstance(text, str):
        return ''

    text = text.lower()
    # Quantities carry no ingredient identity: strip digits and the fraction
    # glyphs (¼ ½ ¾ and U+2150..U+215E) in one pass.
    text = re.sub(r'[\d\u00BC-\u00BE\u2150-\u215E]+', '', text)
    # Replace punctuation with a space rather than deleting it, so that
    # "extra-virgin" or "salt,pepper" stay two tokens instead of fusing.
    text = re.sub(r'[^\w\s]', ' ', text)
    text = re.sub(r'\s+', ' ', text).strip()

    tagged_tokens = pos_tag(word_tokenize(text))
    # Keep nouns only; the stop-word check also removes fragments left behind
    # by split contractions/possessives (e.g. the "s" of "newton's").
    nouns = [
        word for word, tag in tagged_tokens
        if tag.startswith('NN') and word not in stop_words
    ]
    return ' '.join(lemmatizer.lemmatize(word) for word in nouns)



In [None]:
# Run the text cleaner over both free-text columns, writing to new columns.
for source_column, cleaned_column in (
    ('Ingredients', 'Clean_Ingredients'),
    ('Instructions', 'Clean_Instructions'),
):
    df[cleaned_column] = df[source_column].apply(clean_text)

In [None]:
# Persist the frame, including the cleaned columns, without writing the index.
CLEANED_DATASET_PATH = "Cleaned_Recipe_Dataset.csv"
df.to_csv(CLEANED_DATASET_PATH, index=False)

# Show the title next to both cleaned columns for a quick visual check.
preview_columns = ['Title', 'Clean_Ingredients', 'Clean_Instructions']
df[preview_columns].head()

In [None]:
# One whitespace-split token list per recipe; these are the Word2Vec training sentences.
tokenized_ingredients = list(map(str.split, df['Clean_Ingredients']))

In [None]:
# Train ingredient embeddings on the tokenized recipes.
# NOTE(review): training runs on 4 worker threads, so results may differ
# slightly between runs; confirm whether exact reproducibility is required.
W2V_PARAMS = {
    'vector_size': 100,
    'window': 5,
    'min_count': 1,
    'workers': 4,
}
word2vec_model = Word2Vec(sentences=tokenized_ingredients, **W2V_PARAMS)

In [None]:
def get_vector(ingredients, model):
    """Average the embeddings of the given tokens.

    Tokens missing from ``model.wv`` are skipped. If no token is known, a
    zero vector of length ``model.vector_size`` is returned instead.
    """
    embeddings = model.wv
    known_vectors = []
    for token in ingredients:
        if token in embeddings:
            known_vectors.append(embeddings[token])

    if not known_vectors:
        return np.zeros(model.vector_size)
    return np.mean(known_vectors, axis=0)

In [None]:
# Compute one embedding vector per recipe from its cleaned ingredient tokens.
def _recipe_to_vector(cleaned_ingredients):
    """Split the cleaned ingredient string and average its token embeddings."""
    return get_vector(cleaned_ingredients.split(), word2vec_model)


df['Vector'] = df['Clean_Ingredients'].apply(_recipe_to_vector)

In [None]:
def recommend_recipes(user_ingredients, df, model, top_n=5):
    """Rank recipes by cosine similarity to the user's ingredients.

    Parameters
    ----------
    user_ingredients : iterable of str
        Ingredients on hand. An entry that is already in the model vocabulary
        is used as-is; any other entry is lowercased and split on whitespace,
        so "ground pepper" is looked up as "ground" and "pepper" instead of
        being silently dropped as an unknown token.
    df : pandas.DataFrame
        Needs the columns 'Title', 'Ingredients', 'Instructions' and 'Vector'
        (one embedding array per row). The frame is NOT modified.
    model : object
        Embedding model exposing ``.wv`` and ``.vector_size`` (as used by
        ``get_vector``).
    top_n : int, default 5
        Number of recipes to return.

    Returns
    -------
    pandas.DataFrame
        Columns 'Title', 'Ingredients', 'Instructions', 'Similarity', sorted by
        descending similarity. Empty when ``df`` has no rows or when none of
        the query tokens is in the vocabulary (every similarity would be 0 and
        the ranking meaningless).
    """
    base_columns = ['Title', 'Ingredients', 'Instructions']

    # Normalise the query the same way the corpus tokens look (lowercase,
    # single words), while keeping exact vocabulary hits untouched.
    query_tokens = []
    for ingredient in user_ingredients:
        if ingredient in model.wv:
            query_tokens.append(ingredient)
        else:
            query_tokens.extend(str(ingredient).lower().split())

    has_known_token = any(token in model.wv for token in query_tokens)
    if df.empty or not has_known_token:
        return df[base_columns].head(0).assign(Similarity=0.0)

    user_vector = get_vector(query_tokens, model)
    recipe_vectors = np.vstack(list(df['Vector']))
    similarities = cosine_similarity(user_vector.reshape(1, -1), recipe_vectors)[0]

    # Attach the scores to a copy of the output columns so the caller's frame
    # does not gain a 'Similarity' column as a side effect.
    ranked = (
        df[base_columns]
        .assign(Similarity=similarities)
        .sort_values(by="Similarity", ascending=False)
        .head(top_n)
    )
    return ranked



In [None]:
# Example query: ingredients the user has on hand.
user_input = ["bread", "butter", "chicken", "apple", "onion", "ground pepper"]
recommendations = recommend_recipes(user_input, df, word2vec_model)
# End the cell with the frame itself so Jupyter renders it as a table;
# print() would show a truncated plain-text dump instead.
recommendations