In [4]:
import pandas as pd

# Load your recipes dataframe (I'm using the 'recipes' one, not the merged one,
# as we only need to build the "search index" from the recipes)

# Linux
# path = '/home/aleksdraka/.cache/kagglehub/datasets/shuyangli94/food-com-recipes-and-user-interactions/versions/2'

# macOS
path = '/Users/aleksandardrakaliyski/.cache/kagglehub/datasets/shuyangli94/food-com-recipes-and-user-interactions/versions/2'

# --- 1. LOAD ALL THREE FILES ---
pp_recipes_df = pd.read_csv(f'{path}/PP_recipes.csv')
raw_recipes_df = pd.read_csv(f'{path}/RAW_recipes.csv')
ingredient_map = pd.read_pickle(f'{path}/ingr_map.pkl')

In [5]:
import pandas as pd
import ast  # For safely converting string-lists
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# --- 2. MERGE RAW AND PP TO GET REAL NAMES ---
raw_names_df = raw_recipes_df[['id', 'name']]

# Merge the clean PP data with the raw 'name' column
recipes_df = pd.merge(pp_recipes_df, raw_names_df, on='id')

# --- 3. FIX THE INGREDIENT DECODER ---
def decode_ingredients(row_data, ingredient_map):
    """
    Converts a string-list of ingredient IDs (e.g., "[333, 90]")
    into a human-readable list (e.g., ['chicken breast', 'olive oil']).
    """
    try:
        id_list = ast.literal_eval(row_data) # e.g., [333, 90]

        # --- THIS IS THE FIX ---
        # Look up the integer ID directly.
        # We also add a check in case the map keys are strings.
        if isinstance(list(ingredient_map.keys())[0], str):
            # If map keys are strings (e.g., '333')
            return [ingredient_map.get(str(id_num), '?') for id_num in id_list]
        else:
            # If map keys are integers (e.g., 333)
            return [ingredient_map.get(id_num, '?') for id_num in id_list]

    except Exception as e:
        print("Could not find matching ids: ", e)
        return [] # Return empty if it fails

# Apply the fixed decoder
recipes_df['decoded_ingredients'] = recipes_df['ingredient_ids'].apply(
    lambda x: decode_ingredients(x, ingredient_map)
)


# --- 4. PREPARE DATA FOR VECTOR SEARCH (Same as before) ---
def convert_ids_to_string(row_data):
    try:
        id_list = ast.literal_eval(row_data)
        return " ".join([str(i) for i in id_list])
    except:
        return ""

recipes_df['ingredient_str'] = recipes_df['ingredient_ids'].apply(convert_ids_to_string)

tfidf = TfidfVectorizer()
tfidf_matrix = tfidf.fit_transform(recipes_df['ingredient_str'])


# --- 5. UPDATED SEARCH FUNCTION (Returns the REAL name) ---
def find_similar_recipes(recipe_id, top_n=5):
    try:
        recipe_index = recipes_df[recipes_df['id'] == recipe_id].index[0]
    except IndexError:
        return f"Recipe ID {recipe_id} not found."

    recipe_vector = tfidf_matrix[recipe_index]
    sim_scores = cosine_similarity(recipe_vector, tfidf_matrix)[0]
    top_indices = sim_scores.argsort()[-top_n-1:-1][::-1]

    # --- MODIFIED RETURN ---
    # Return the 'name' (from RAW) and 'decoded_ingredients' (from our function)
    return recipes_df.iloc[top_indices][['id', 'name', 'decoded_ingredients']]

# --- 6. TEST IT! ---
# Let's search for a recipe (picking the 10th one in the list)
RECIPE_ID_TO_SEARCH = recipes_df['id'].iloc[10]

# Get the REAL name of the recipe we are searching for
search_recipe_name = recipes_df[recipes_df['id'] == RECIPE_ID_TO_SEARCH]['name'].values[0]

print(f"\n--- Finding recipes similar to: '{search_recipe_name}' (ID: {RECIPE_ID_TO_SEARCH}) ---")

similar_recipes = find_similar_recipes(RECIPE_ID_TO_SEARCH, 10)
similar_recipes


--- Finding recipes similar to: 'spicy cranberry chutney' (ID: 408927) ---


Unnamed: 0,id,name,decoded_ingredients
40628,94087,cranberry nut swirls,"[?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?]"
42898,340227,cranberry sauce with dried cherries,"[?, ?, ?, ?, ?, ?, ?, ?]"
31923,194343,asparagus with mustard seed dressing,"[?, ?, ?, ?, ?]"
74182,143185,cranberry apple ginger sauce,"[?, ?, ?, ?, ?, ?, ?, ?]"
161898,395139,cranberry croquettes or fritters,"[?, ?, ?, ?, ?, ?, ?, ?, ?, ?]"
159138,40878,pungent cranberry orange sauce,"[?, ?, ?, ?, ?, ?]"
79348,257264,cranberry pear and ginger chutney,"[?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?]"
98537,141583,merry morning muffins,"[?, ?, ?, ?, ?, ?, ?, ?, ?, ?]"
87053,4369,homemade cranberry relish,"[?, ?, ?, ?, ?, ?]"
131457,62193,basic cranberry sauce,"[?, ?, ?, ?]"
