In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/food-com-recipes-and-user-interactions/RAW_interactions.csv
/kaggle/input/food-com-recipes-and-user-interactions/ingr_map.pkl
/kaggle/input/food-com-recipes-and-user-interactions/PP_recipes.csv
/kaggle/input/food-com-recipes-and-user-interactions/RAW_recipes.csv
/kaggle/input/food-com-recipes-and-user-interactions/interactions_train.csv
/kaggle/input/food-com-recipes-and-user-interactions/interactions_test.csv
/kaggle/input/food-com-recipes-and-user-interactions/PP_users.csv
/kaggle/input/food-com-recipes-and-user-interactions/interactions_validation.csv


In [1]:
import pandas as pd
from collections import Counter
import ast
import re

# Function to safely evaluate the tags string to a list
def safe_eval_tags(tags_string):
    try:
        # Remove any extra brackets and split the string
        cleaned_string = re.sub(r'\]\[', ', ', tags_string.strip('[]'))
        return ast.literal_eval(f'[{cleaned_string}]')
    except:
        return []

# Simulated function to mimic reading from a dataset with limited rows
def read_limited_csv(file_path, max_rows=10000, chunksize=1000):
    rows_read = 0
    for chunk in pd.read_csv(file_path, chunksize=chunksize):
        if rows_read + len(chunk) > max_rows:
            yield chunk.iloc[:max_rows - rows_read]
            break
        yield chunk
        rows_read += len(chunk)

# Initialize a global tag counter
global_tag_counter = Counter()

# Process the data in chunks
chunk_number = 0
max_rows = 10000
for chunk in read_limited_csv('/kaggle/input/food-com-recipes-and-user-interactions/RAW_recipes.csv', max_rows=max_rows):
    chunk_number += 1

    # Initialize a tag counter for this chunk
    chunk_tag_counter = Counter()

    # Process tags in this chunk
    for tags in chunk['tags']:
        tag_list = safe_eval_tags(tags)
        chunk_tag_counter.update(tag_list)

    # Update global counter
    global_tag_counter.update(chunk_tag_counter)

    # Print chunk-level results
    print(f"\nChunk {chunk_number}:")
    print(f"Total Recipes in Chunk: {len(chunk)}")
    print("Top 5 Tags in Chunk:")
    for tag, count in chunk_tag_counter.most_common(5):
        print(f"{tag}: {count}")

# Get the total number of recipes processed
total_recipes = max_rows

# Print global results
print("\nGlobal Tag Analysis:")
print(f"Total unique tags: {len(global_tag_counter)}")
print("Top 20 tags across all processed rows:")
for tag, count in global_tag_counter.most_common(20):
    percentage = (count / total_recipes) * 100
    print(f"{tag}: {count} ({percentage:.2f}%)")

# Least common tags
print("\nBottom 20 least common tags:")
for tag, count in global_tag_counter.most_common()[-20:]:
    percentage = (count / total_recipes) * 100
    print(f"{tag}: {count} ({percentage:.2f}%)")

# Tags appearing in more than 10% of recipes
common_tags = [tag for tag, count in global_tag_counter.items() if (count / total_recipes) > 0.1]
print(f"\nTags that appear in more than 10% of recipes: {common_tags}")

# Tags appearing in less than 1% of recipes
rare_tags = [tag for tag, count in global_tag_counter.items() if (count / total_recipes) < 0.01]
print(f"\nNumber of tags that appear in less than 1% of recipes: {len(rare_tags)}")



Chunk 1:
Total Recipes in Chunk: 1000
Top 5 Tags in Chunk:
preparation: 993
time-to-make: 965
course: 961
dietary: 711
main-ingredient: 674

Chunk 2:
Total Recipes in Chunk: 1000
Top 5 Tags in Chunk:
preparation: 991
time-to-make: 954
course: 950
dietary: 709
main-ingredient: 676

Chunk 3:
Total Recipes in Chunk: 1000
Top 5 Tags in Chunk:
preparation: 994
time-to-make: 970
course: 931
dietary: 723
main-ingredient: 696

Chunk 4:
Total Recipes in Chunk: 1000
Top 5 Tags in Chunk:
preparation: 998
time-to-make: 981
course: 953
dietary: 692
main-ingredient: 643

Chunk 5:
Total Recipes in Chunk: 1000
Top 5 Tags in Chunk:
preparation: 996
time-to-make: 975
course: 954
main-ingredient: 679
dietary: 671

Chunk 6:
Total Recipes in Chunk: 1000
Top 5 Tags in Chunk:
preparation: 995
time-to-make: 982
course: 934
dietary: 719
main-ingredient: 648

Chunk 7:
Total Recipes in Chunk: 1000
Top 5 Tags in Chunk:
preparation: 999
time-to-make: 983
course: 973
main-ingredient: 856
fruit: 769

Chunk 8:
Total

In [11]:
import re

# Refine the filtering logic to exclude non-time-related tags
def is_time_related(tag):
    # Match tags that explicitly include time units like minutes, hours, or specific patterns like "1-day-or-more"
    time_patterns = [
        r"\b\d+-minutes?\b",  # Matches "15-minutes", "30-minutes", etc.
        r"\b\d+-hours?\b",    # Matches "4-hours", "1-hour", etc.
        r"\b\d+-day(?:s)?-or-more\b",  # Matches "1-day-or-more"
        r"\b\d+-minutes-or-less\b",  # Matches "30-minutes-or-less", "15-minutes-or-less", etc.
        r"\b\d+-hours-or-less\b",    # Matches "4-hours-or-less"
    ]
    return any(re.search(pattern, tag) for pattern in time_patterns)

# Apply the refined logic
time_related_tags = {tag: count for tag, count in global_tag_counter.items() if is_time_related(tag)}

# Print the refined time-related tags
print("Refined Time-related tags:")
for tag, count in sorted(time_related_tags.items(), key=lambda x: x[1], reverse=True):
    print(f"{tag}: {count}")

# Display the top 10 most common refined time-related tags
print("\nTop 10 refined most common time-related tags:")
for tag, count in sorted(time_related_tags.items(), key=lambda x: x[1], reverse=True)[:10]:
    print(f"{tag}: {count}")


Refined Time-related tags:
60-minutes-or-less: 2913
30-minutes-or-less: 2195
4-hours-or-less: 2195
15-minutes-or-less: 2172
1-day-or-more: 113

Top 10 refined most common time-related tags:
60-minutes-or-less: 2913
30-minutes-or-less: 2195
4-hours-or-less: 2195
15-minutes-or-less: 2172
1-day-or-more: 113


In [12]:
# # Categorize tags (this is a manual process, here's a start)
# time_of_day_tags = {'breakfast', 'brunch', 'lunch', 'dinner', 'snack'}
# ingredient_tags = {'fruits', 'vegetables', 'meat', 'poultry', 'seafood', 'dairy'}
# preparation_tags = {'easy', 'quick', '15-minutes-or-less', '30-minutes-or-less', '60-minutes-or-less'}
# occasion_tags = {'christmas', 'thanksgiving', 'halloween', 'easter', 'holiday', 'summer', 'winter', 'fall', 'spring'}

# # Function to check if a recipe has tags from a specific category
# def has_tag_from_category(tags, category_tags):
#     return bool(set(eval(tags)) & category_tags)

# # Add boolean columns for each category
# df['is_breakfast'] = df['tags'].apply(lambda x: has_tag_from_category(x, {'breakfast'}))
# df['is_lunch'] = df['tags'].apply(lambda x: has_tag_from_category(x, {'lunch'}))
# df['is_dinner'] = df['tags'].apply(lambda x: has_tag_from_category(x, {'dinner'}))
# df['is_easy'] = df['tags'].apply(lambda x: has_tag_from_category(x, {'easy'}))
# df['is_quick'] = df['tags'].apply(lambda x: has_tag_from_category(x, preparation_tags))
# df['is_holiday'] = df['tags'].apply(lambda x: has_tag_from_category(x, occasion_tags))
# df['has_fruits'] = df['tags'].apply(lambda x: has_tag_from_category(x, {'fruits'}))

# print(df[['name', 'is_breakfast', 'is_lunch', 'is_dinner', 'is_easy', 'is_quick', 'is_holiday', 'has_fruits']].head())

## Aim is to utilize the tags for content based similarity based on tags maybe levraging a little bit of NLP towards the end
- first goal : get results on RAW data
- second goal : pre-process data group them into more better / informative tags

In [13]:
import pandas as pd
import ast
import re
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import CountVectorizer

# Define a function to extract time-related tags
def extract_time_tags(tags):
    time_patterns = [
        r"\b\d+-minutes?\b",         # Matches "15-minutes", "30-minutes", etc.
        r"\b\d+-hours?\b",           # Matches "4-hours", "1-hour", etc.
        r"\b\d+-day(?:s)?-or-more\b",  # Matches "1-day-or-more"
        r"\b\d+-minutes-or-less\b",  # Matches "30-minutes-or-less", "15-minutes-or-less", etc.
        r"\b\d+-hours-or-less\b"     # Matches "4-hours-or-less"
    ]
    try:
        # Convert string to list
        tag_list = ast.literal_eval(tags)
        # Filter tags matching the time patterns
        return [tag for tag in tag_list if any(re.search(pattern, tag) for pattern in time_patterns)]
    except:
        return []

# Read a sample of the dataset
file_path = '/kaggle/input/food-com-recipes-and-user-interactions/RAW_recipes.csv'
chunk_size = 5000  # Define chunk size for processing
sample_frac = 0.1  # Fraction of data to sample

# Process the dataset in chunks to sample
dfs = []
for chunk in pd.read_csv(file_path, chunksize=chunk_size):
    sampled_chunk = chunk.sample(frac=sample_frac, random_state=42)
    dfs.append(sampled_chunk)

# Combine sampled chunks
recipes_df = pd.concat(dfs)

# Extract time-related tags into a new column
recipes_df['time_tags'] = recipes_df['tags'].apply(extract_time_tags)

# Drop rows with no time tags
recipes_df = recipes_df[recipes_df['time_tags'].apply(len) > 0]

# Convert time_tags to a string format for vectorization
recipes_df['time_tags_str'] = recipes_df['time_tags'].apply(lambda tags: ' '.join(tags))

# Use CountVectorizer to create a binary matrix for the time tags
vectorizer = CountVectorizer(tokenizer=lambda x: x.split(' '))
time_tags_matrix = vectorizer.fit_transform(recipes_df['time_tags_str'])

# Compute cosine similarity on the matrix
similarity_matrix = cosine_similarity(time_tags_matrix)

# Convert the similarity matrix to a DataFrame for easy lookup
similarity_df = pd.DataFrame(similarity_matrix, index=recipes_df.index, columns=recipes_df.index)

# Example: Find top 3 similar recipes for a given recipe index
def get_similar_recipes(recipe_index, top_n=3):
    # Sort recipes by similarity score
    similar_scores = similarity_df[recipe_index].sort_values(ascending=False)
    # Exclude the recipe itself and return top N recommendations
    similar_recipes = similar_scores.drop(recipe_index).head(top_n)
    return similar_recipes

# Test the recommendation system with a sample recipe
recipe_index = recipes_df.index[0]  # Select the first recipe index
recommended_recipes = get_similar_recipes(recipe_index, top_n=3)

# Print recommendations
print(f"Top {len(recommended_recipes)} similar recipes for Recipe {recipe_index} based on time tags:")
print(recommended_recipes)

# Save the filtered and sampled dataframe for future use
recipes_df.to_csv('sampled_recipes_with_time_tags.csv', index=False)




Top 3 similar recipes for Recipe 1501 based on time tags:
110177    1.0
114392    1.0
112509    1.0
Name: 1501, dtype: float64


In [15]:
# Access specific rows by index
rows_of_interest = recipes_df.loc[[110177, 114392, 112509]]
rows_of_interest


Unnamed: 0,name,id,minutes,contributor_id,submitted,tags,nutrition,n_steps,steps,description,ingredients,n_ingredients,time_tags,time_tags_str
110177,i can t wait for chole,25632,35,37636,2002-04-18,"['60-minutes-or-less', 'time-to-make', 'course...","[300.0, 4.0, 16.0, 26.0, 24.0, 1.0, 19.0]",11,"['in large saucepan over medium heat , heat cu...","a nice spicy indian chickpea recipe. i know, i...","['onions', 'fresh ginger', 'garlic', 'cumin se...",14,[60-minutes-or-less],60-minutes-or-less
114392,japanese crispy fried chicken kara age,494787,40,2678032,2013-02-05,"['lactose', '60-minutes-or-less', 'time-to-mak...","[397.4, 26.0, 6.0, 35.0, 47.0, 24.0, 11.0]",7,"['cut chicken in bite size pieces', 'in a plas...",this is a classic japanese kara-age recipe. ch...,"['boneless chicken thighs', 'ginger', 'clove',...",8,[60-minutes-or-less],60-minutes-or-less
112509,italian christmas cookies filled,420915,60,1559190,2010-04-19,"['60-minutes-or-less', 'time-to-make', 'course...","[200.6, 13.0, 39.0, 0.0, 4.0, 9.0, 9.0]",12,"['sift together flours and salt', 'cut shorten...",these are cookies my grandmother used to make ...,"['pastry flour', 'flour', 'salt', 'shortening'...",10,[60-minutes-or-less],60-minutes-or-less


In [17]:
recipes_df.iloc[0]

name                                                 49er flapjacks
id                                                           400885
minutes                                                          35
contributor_id                                              1396198
submitted                                                2009-11-23
tags              ['60-minutes-or-less', 'time-to-make', 'course...
nutrition                [287.6, 24.0, 29.0, 27.0, 24.0, 42.0, 8.0]
n_steps                                                          18
steps             ['the night before your pancake meal , in smal...
description       almost a perfect reproduction of the original ...
ingredients       ['dry yeast', 'milk', 'butter', 'eggs', 'sugar...
n_ingredients                                                     8
time_tags                                      [60-minutes-or-less]
time_tags_str                                    60-minutes-or-less
Name: 1501, dtype: object

In [22]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import ast

# Function to tokenize ingredients
def tokenize_ingredients(ingredients):
    # Flatten ingredient list into a single string
    ingredient_str = ' '.join(ingredients).lower()
    return ingredient_str.split()  # Tokenize into words (no stopword removal)

# Apply tokenization to the ingredients column
recipes_df['processed_ingredients'] = recipes_df['ingredients'].apply(
    lambda x: tokenize_ingredients(ast.literal_eval(x))
)

# Combine time_tags and processed_ingredients
def combine_features(row):
    time_tags_str = ' '.join(row['time_tags'])
    ingredients_str = ' '.join(row['processed_ingredients'])
    return f"{time_tags_str} {ingredients_str}"

recipes_df['combined_features'] = recipes_df.apply(combine_features, axis=1)

# Use CountVectorizer to vectorize the combined_features column
vectorizer = CountVectorizer(tokenizer=lambda x: x.split(' '))
combined_features_matrix = vectorizer.fit_transform(recipes_df['combined_features'])

# Compute cosine similarity on the combined feature matrix
combined_similarity_matrix = cosine_similarity(combined_features_matrix)

# Convert the similarity matrix to a DataFrame for easy lookup
combined_similarity_df = pd.DataFrame(combined_similarity_matrix, index=recipes_df.index, columns=recipes_df.index)

# Function to get top N similar recipes
def get_similar_recipes(recipe_index, top_n=25):
    # Sort recipes by similarity score
    similar_scores = combined_similarity_df[recipe_index].sort_values(ascending=False)
    # Exclude the recipe itself and return top N recommendations
    similar_recipes = similar_scores.drop(recipe_index).head(top_n)
    return similar_recipes

# Test the recommendation system with a sample recipe
recipe_index = recipes_df.index[0]  # Replace with your recipe ID
recommended_recipes = get_similar_recipes(recipe_index, top_n=25)

# Print recommendations
print(f"Top {len(recommended_recipes)} similar recipes for Recipe {recipes_df.iloc[recipe_index]['name']} (Index: {recipe_index}):")
print(recommended_recipes)

# Access details of recommended recipes
print("\nDetails of recommended recipes:")
print(recipes_df.loc[recommended_recipes.index, ['name', 'ingredients', 'time_tags']])





Top 25 similar recipes for Recipe baked shrimp toasts (Index: 1501):
13907     0.782624
139758    0.737865
173899    0.717137
123625    0.711512
128456    0.700000
94923     0.700000
226020    0.700000
28106     0.700000
110691    0.676123
28957     0.676123
151907    0.676123
143181    0.670820
187417    0.667424
48673     0.667424
52225     0.667424
22232     0.667424
102907    0.667424
67483     0.667424
69415     0.667424
212594    0.667424
131973    0.653197
22413     0.653197
199771    0.652929
201647    0.652929
220468    0.652929
Name: 1501, dtype: float64

Details of recommended recipes:
                                                   name  \
13907                 baked cinnamon sugar french toast   
139758          mother s one hour rolls  mors tvebakker   
173899                               rich baked custard   
123625                               light brioche buns   
128456                                        magic pie   
94923                        grandma chin 