In [114]:
import ast

import numpy as np
import pandas as pd
from scipy.sparse import hstack
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [115]:
# Read both recipe tables in one pass:
#   raw_data - the raw recipes file (its 'id' and 'name' columns are used later
#              to turn recipe IDs back into readable names)
#   pp_data  - the preprocessed file holding the tokenized columns
#              ('ingredient_tokens', 'steps_tokens') keyed by the same 'id'
raw_data, pp_data = (
    pd.read_csv(csv_path)
    for csv_path in ('./data/RAW_recipes.csv', './data/PP_recipes.csv')
)

In [116]:
# Work on a fixed-size random subset of the preprocessed recipes so the
# pairwise similarity matrix stays small; the fixed seed makes the draw
# repeatable across runs
sampled_pp_data = pp_data.sample(n=1000, random_state=42)

# Preview the token columns and recipe IDs for the first few sampled rows
print(sampled_pp_data.head()[['ingredient_tokens', 'steps_tokens', 'id']])

                                        ingredient_tokens  \
124475  [[589, 260, 4001, 17918], [1092, 2507, 6444], ...   
57837   [[1301, 11007], [978, 21453], [2056, 4178, 113...   
19808   [[10837], [13179], [6953], [31578], [6812], [2...   
142947  [[17869, 24176], [21453], [38139], [25905], [2...   
152996  [[22102, 562, 3905, 260, 19093, 7056], [8780],...   

                                             steps_tokens      id  
124475  [40480, 40482, 562, 31757, 40478, 40482, 729, ...  286292  
57837   [40480, 40482, 500, 246, 1220, 4835, 39587, 23...  138801  
19808   [40480, 40482, 14057, 10837, 488, 13179, 500, ...  408706  
142947  [40480, 40482, 500, 246, 1719, 4835, 39587, 24...  279202  
152996  [40480, 40482, 3612, 551, 22102, 504, 1874, 70...   20521  


In [117]:
# The token columns are stored in the CSV as stringified Python lists.
# Parse them with ast.literal_eval, which only accepts literals, instead of
# eval(), which would execute any code embedded in the data file.

# Ingredients are a list of token-ID groups (one group per ingredient):
# flatten every group into a single space-separated string per recipe
ingredient_strings = [
    ' '.join(str(token) for group in ast.literal_eval(ingredient) for token in group)
    for ingredient in sampled_pp_data['ingredient_tokens']
]

# Steps are a flat list of token IDs: join them into one string per recipe
step_strings = [
    ' '.join(map(str, ast.literal_eval(step)))
    for step in sampled_pp_data['steps_tokens']
]

# CountVectorizer's default token_pattern only keeps tokens of two or more
# characters, which would silently discard single-digit token IDs (0-9).
# Use a pattern that keeps every run of word characters instead.
TOKEN_ID_PATTERN = r"(?u)\b\w+\b"

# Vectorize ingredients and steps separately so each gets its own vocabulary
ingredient_vectorizer = CountVectorizer(token_pattern=TOKEN_ID_PATTERN)
ingredient_features = ingredient_vectorizer.fit_transform(ingredient_strings)

step_vectorizer = CountVectorizer(token_pattern=TOKEN_ID_PATTERN)
step_features = step_vectorizer.fit_transform(step_strings)

In [118]:
# Stack the two count matrices column-wise, so every row carries one recipe's
# ingredient counts followed by its step counts
combined_features = hstack((ingredient_features, step_features))

# Pairwise cosine similarity between all rows of the combined matrix;
# entry [i, j] compares sampled recipe i with sampled recipe j
similarity_matrix = cosine_similarity(combined_features)

In [119]:
# Function to retrieve top similar recipes
def get_top_similar_recipes(similarity_matrix, query_index, n=10):
    """Return the ``n`` recipes most similar to the recipe at ``query_index``.

    Relies on two notebook-level names being defined before the call:
    ``sampled_pp_data`` (its 'id' column is read positionally) and
    ``id_to_name`` (maps a recipe ID to its name).

    Parameters
    ----------
    similarity_matrix : 2-D array
        Pairwise similarity scores; row ``query_index`` holds the scores of
        the query against every row, in the same order as ``sampled_pp_data``.
    query_index : int
        Positional index of the query recipe.
    n : int, default 10
        Maximum number of similar recipes to return.

    Returns
    -------
    list of (id, name, score) tuples
        Ordered from most to least similar. The query recipe itself is never
        included.

    Raises
    ------
    KeyError
        If a sampled recipe ID is missing from ``id_to_name``.
    """
    query_scores = similarity_matrix[query_index]

    # Rank all rows from most to least similar
    ranked = np.argsort(query_scores)[::-1]

    # Drop the query row by position rather than assuming it sorts first:
    # with tied scores (e.g. duplicate recipes) or an all-zero feature row the
    # query is not guaranteed to be at rank 0, and slicing [1:n+1] would then
    # return the query itself and lose a genuine neighbour.
    # The modulo maps a negative query_index onto its positional equivalent.
    query_position = query_index % len(query_scores)
    similar_indices = ranked[ranked != query_position][:n]

    # Read the ID column once instead of materialising a full row per hit
    recipe_ids = sampled_pp_data['id'].to_numpy()

    # Return the ID, name, and score for each similar recipe
    similar_recipes = []
    for i in similar_indices:
        similar_id = recipe_ids[i]
        similar_name = id_to_name[similar_id]
        score = query_scores[i]
        similar_recipes.append((similar_id, similar_name, score))

    return similar_recipes

In [121]:
# Lookup table from recipe ID to recipe name, built from the raw recipes table
id_to_name = {
    raw_id: raw_name
    for raw_id, raw_name in zip(raw_data['id'], raw_data['name'])
}

# Positional indices (within the sampled data) of the recipes to query
query_indices = [0, 1, 2]

# For each query recipe, print a header followed by its top 10 matches
for index in query_indices:
    # Resolve the query's own ID and name for the header line
    recipe_id = sampled_pp_data.iloc[index]['id']
    recipe_name = id_to_name[recipe_id]

    similar_recipes = get_top_similar_recipes(similarity_matrix, index)

    # Assemble the whole report first, then emit it in one go
    report_lines = [f"Top similar recipes for '{recipe_name}' (ID: {recipe_id}):"]
    report_lines.extend(
        f"  - {similar_name} (Similarity Score: {score:.4f})"
        for _, similar_name, score in similar_recipes
    )
    print("\n".join(report_lines))
    print("\n")

Top similar recipes for 'apple blackberry pie' (ID: 286292):
  - pecan pie (Similarity Score: 0.8546)
  - blueberry overnight french toast (Similarity Score: 0.8407)
  - pineanna nut cake (Similarity Score: 0.8345)
  - chocolate drizzled shortbread (Similarity Score: 0.8295)
  - razz ma tazz bars (Similarity Score: 0.8256)
  - original nestle toll house chocolate chunk cookies (Similarity Score: 0.8210)
  - pappadeaux blackened oyster and shrimp fondeaux (Similarity Score: 0.8202)
  - raspberry pudding (Similarity Score: 0.8158)
  - very lemon cake with lush lemon frosting (Similarity Score: 0.8131)
  - no knead knot rolls (Similarity Score: 0.8119)


Top similar recipes for 'traci s black bean salsa' (ID: 138801):
  - treat the team milo cookies (Similarity Score: 0.7543)
  - triple lemon ripple cake (Similarity Score: 0.7313)
  - cherry mint iced tea (Similarity Score: 0.7313)
  - tea smoked chicken (Similarity Score: 0.7312)
  - garlic stuffed pork roast with glaze (Similarity Score