In [None]:
import pandas as pd
import numpy as np
from collections import Counter
from rapidfuzz import process, fuzz
from sklearn.preprocessing import MinMaxScaler
import ast
import re
import os
import nltk

In [2]:
# Optional override: add the directory named by the NLTK_DATA environment
# variable to NLTK's resource search path; otherwise keep NLTK's defaults.
nltk_data_path = os.getenv('NLTK_DATA')

if not nltk_data_path:
    print("⚠️ NLTK_DATA environment variable not set. Using default paths.")
else:
    nltk.data.path.append(nltk_data_path)

from nltk.corpus import wordnet as wn
from nltk.stem import WordNetLemmatizer

⚠️ NLTK_DATA environment variable not set. Using default paths.


In [3]:
# Lift pandas' display truncation for both rows and columns so that the
# summary tables rendered below are shown in full.
for option_name in ('display.max_rows', 'display.max_columns'):
    pd.set_option(option_name, None)

In [4]:
# Directory (relative to this notebook) that holds the raw CSV inputs and
# receives the cleaned pickle outputs written at the end of the notebook.
data_dir = "../../data"

In [5]:
# Load the raw datasets from `data_dir` (set once in the configuration cell
# directly above; the redundant second definition was removed).
interactions = pd.read_csv(f"{data_dir}/RAW_interactions.csv")
recipes = pd.read_csv(f"{data_dir}/RAW_recipes.csv")
pp_users = pd.read_csv(f"{data_dir}/PP_users.csv")  # optional
pp_recipes = pd.read_csv(f"{data_dir}/PP_recipes.csv")  # optional

# Cleaning datasets 

In [6]:
def check_missing_values(dataframe, df_name="DataFrame"):
    """
    Check missing values in a dataframe and calculate the percentage of missing values
    """
    
    size = dataframe.size
    missing_values = dataframe.isnull().sum()

    # Calculate missing percentage for each feature
    missing_percentage = (missing_values / len(dataframe)) * 100
    
    missing_summary = pd.DataFrame({
        'Column': missing_values.index,
        'Missing Values': missing_values.values,
        'Missing Percentage': missing_percentage.values.round(2)
    })

    missing_summary = missing_summary.sort_values('Missing Percentage', ascending=False)

    total_missing = missing_values.sum()
    total_missing_percentage = (total_missing / size) * 100

    print(f"\n--- {df_name} Missing Values Analysis ---")
    print(f"Total cells: {size}")
    print(f"Total missing values: {total_missing} ({total_missing_percentage:.2f}%)")
    print("\nColumn-wise missing values:")
    display(missing_summary)

    missing_columns = missing_summary[missing_summary['Missing Values'] > 0]
    if len(missing_columns) > 0:
        print("\nColumns with missing values:")
        display(missing_columns)
    else:
        print("\nNo missing values found in any column.")

In [7]:
# Missing-value report for each of the four loaded datasets.
for frame, label in (
    (interactions, "Interactions"),
    (recipes, "Recipes"),
    (pp_users, "Preprocessed Users"),
    (pp_recipes, "Preprocessed Recipes"),
):
    check_missing_values(frame, label)


--- Interactions Missing Values Analysis ---
Total cells: 5661835
Total missing values: 169 (0.00%)

Column-wise missing values:


Unnamed: 0,Column,Missing Values,Missing Percentage
4,review,169,0.01
0,user_id,0,0.0
1,recipe_id,0,0.0
2,date,0,0.0
3,rating,0,0.0



Columns with missing values:


Unnamed: 0,Column,Missing Values,Missing Percentage
4,review,169,0.01



--- Recipes Missing Values Analysis ---
Total cells: 2779644
Total missing values: 4980 (0.18%)

Column-wise missing values:


Unnamed: 0,Column,Missing Values,Missing Percentage
9,description,4979,2.15
0,name,1,0.0
1,id,0,0.0
2,minutes,0,0.0
3,contributor_id,0,0.0
4,submitted,0,0.0
5,tags,0,0.0
6,nutrition,0,0.0
7,n_steps,0,0.0
8,steps,0,0.0



Columns with missing values:


Unnamed: 0,Column,Missing Values,Missing Percentage
9,description,4979,2.15
0,name,1,0.0



--- Preprocessed Users Missing Values Analysis ---
Total cells: 150456
Total missing values: 0 (0.00%)

Column-wise missing values:


Unnamed: 0,Column,Missing Values,Missing Percentage
0,u,0,0.0
1,techniques,0,0.0
2,items,0,0.0
3,n_items,0,0.0
4,ratings,0,0.0
5,n_ratings,0,0.0



No missing values found in any column.

--- Preprocessed Recipes Missing Values Analysis ---
Total cells: 1426120
Total missing values: 0 (0.00%)

Column-wise missing values:


Unnamed: 0,Column,Missing Values,Missing Percentage
0,id,0,0.0
1,i,0,0.0
2,name_tokens,0,0.0
3,ingredient_tokens,0,0.0
4,steps_tokens,0,0.0
5,techniques,0,0.0
6,calorie_level,0,0.0
7,ingredient_ids,0,0.0



No missing values found in any column.


## Cleaning missing values

In [8]:
# Missing free-text fields get a placeholder string; the recipe row without
# a name is dropped.
interactions = interactions.assign(
    review=interactions['review'].fillna("No review provided")
)
recipes = (
    recipes
    .assign(description=recipes['description'].fillna("No description available"))
    .dropna(subset=['name'])
)

In [9]:
# Re-run the report to confirm that no missing values remain after cleaning.
for frame, label in (
    (interactions, "Interactions (Clean)"),
    (recipes, "Recipes (Clean)"),
):
    check_missing_values(frame, label)


--- Interactions (Clean) Missing Values Analysis ---
Total cells: 5661835
Total missing values: 0 (0.00%)

Column-wise missing values:


Unnamed: 0,Column,Missing Values,Missing Percentage
0,user_id,0,0.0
1,recipe_id,0,0.0
2,date,0,0.0
3,rating,0,0.0
4,review,0,0.0



No missing values found in any column.

--- Recipes (Clean) Missing Values Analysis ---
Total cells: 2779632
Total missing values: 0 (0.00%)

Column-wise missing values:


Unnamed: 0,Column,Missing Values,Missing Percentage
0,name,0,0.0
1,id,0,0.0
2,minutes,0,0.0
3,contributor_id,0,0.0
4,submitted,0,0.0
5,tags,0,0.0
6,nutrition,0,0.0
7,n_steps,0,0.0
8,steps,0,0.0
9,description,0,0.0



No missing values found in any column.


## Recipe Dataset

In [10]:
# Inspect column dtypes and non-null counts before converting column types.
recipes.info()

<class 'pandas.core.frame.DataFrame'>
Index: 231636 entries, 0 to 231636
Data columns (total 12 columns):
 #   Column          Non-Null Count   Dtype 
---  ------          --------------   ----- 
 0   name            231636 non-null  object
 1   id              231636 non-null  int64 
 2   minutes         231636 non-null  int64 
 3   contributor_id  231636 non-null  int64 
 4   submitted       231636 non-null  object
 5   tags            231636 non-null  object
 6   nutrition       231636 non-null  object
 7   n_steps         231636 non-null  int64 
 8   steps           231636 non-null  object
 9   description     231636 non-null  object
 10  ingredients     231636 non-null  object
 11  n_ingredients   231636 non-null  int64 
dtypes: int64(5), object(7)
memory usage: 23.0+ MB


In [11]:
# Parse the stringified list columns into real Python lists.
# Values that are not strings (already parsed) are left untouched.
for list_col in ('ingredients', 'tags', 'nutrition'):
    recipes[list_col] = recipes[list_col].apply(
        lambda raw: ast.literal_eval(raw) if isinstance(raw, str) else raw
    )

In [12]:
# Convert the submission date from text to datetime64.
recipes = recipes.assign(submitted=pd.to_datetime(recipes['submitted']))

### Extract nutrition features from the list values (see the Food.com Kaggle dataset page for reference)

In [13]:
def split_nutrition(nutrition_list):
    """
    Map the first seven entries of a nutrition list onto named fields.

    Parameters
    ----------
    nutrition_list : sequence
        Positional nutrition values; entries beyond the seventh are ignored.

    Returns
    -------
    pandas.Series
        Series indexed by 'calories', 'total_fat_pdv', 'sugar_pdv',
        'sodium_pdv', 'protein_pdv', 'saturated_fat_pdv' and 'carbs_pdv'.

    Raises
    ------
    IndexError
        If ``nutrition_list`` has fewer than seven entries.
    """
    field_names = (
        'calories',
        'total_fat_pdv',
        'sugar_pdv',
        'sodium_pdv',
        'protein_pdv',
        'saturated_fat_pdv',
        'carbs_pdv',
    )
    # Explicit indexing (rather than zip) keeps the IndexError on short input.
    return pd.Series({
        field: nutrition_list[position]
        for position, field in enumerate(field_names)
    })

In [14]:
# Expand each recipe's nutrition list into one column per nutrient.
nutrition_df = recipes['nutrition'].apply(split_nutrition)

# Drop nutrient columns left over from a previous run of this cell before
# concatenating; otherwise re-running it appends duplicate column names,
# which breaks the column selection in the scaling step below.
recipes = pd.concat(
    [recipes.drop(columns=nutrition_df.columns, errors='ignore'), nutrition_df],
    axis=1,
)

### Normalize

In [15]:
# Min-max scale the nutrient percentages, cooking time and step count.
# The single scaler is refit on every call, so each group of columns is
# scaled independently of the others.
scaler = MinMaxScaler()
pdv_cols = ['total_fat_pdv', 'sugar_pdv', 'sodium_pdv', 'protein_pdv', 'saturated_fat_pdv', 'carbs_pdv']
pdv_norm_cols = [f'{col}_norm' for col in pdv_cols]
recipes[pdv_norm_cols] = scaler.fit_transform(recipes[pdv_cols])
for source_col, scaled_col in (
    ('minutes', 'minutes_normalized'),
    ('n_steps', 'n_steps_norm'),
):
    recipes[scaled_col] = scaler.fit_transform(recipes[[source_col]])

### Normalize ingredients

We clean up each ingredient name in the dataset by:

- Lowercasing it
- Removing punctuation
- Removing numbers
- Lemmatizing to fix plural/singular differences (e.g., "onions" → "onion")
- Removing prep words: 'chopped', 'diced', 'sliced', 'minced', 'shredded', 'crushed', 'grated',
    'ground', 'peeled', 'fresh', 'organic', 'frozen', 'raw', 'whole'

In [16]:
# Single shared lemmatizer instance, used by clean_and_normalize below.
lemmatizer = WordNetLemmatizer()

In [17]:
# Preparation/state descriptors that are stripped from ingredient names.
prep_words = (
    "chopped diced sliced minced shredded crushed grated "
    "ground peeled fresh organic frozen raw whole"
).split()

In [18]:
def clean_and_normalize(ingredient):
    """
    Normalize a single ingredient name.

    Steps, in order: lowercase and trim; delete every character that is
    neither a word character nor whitespace; delete digits; drop tokens
    listed in ``prep_words``; lemmatize the remaining tokens with the shared
    ``lemmatizer``; re-join them with single spaces.

    Parameters
    ----------
    ingredient : str
        Raw ingredient name.

    Returns
    -------
    str
        The normalized name (may be empty if every token was removed).
    """
    text = ingredient.lower().strip()
    # Removing punctuation and digits in one pass gives the same result as
    # two sequential substitutions, since both only delete characters.
    text = re.sub(r'[^\w\s]|\d+', '', text)
    kept_tokens = (token for token in text.split() if token not in prep_words)
    return " ".join(lemmatizer.lemmatize(token) for token in kept_tokens)



In [19]:
# Normalize every ingredient name in each recipe's ingredient list.
recipes['cleaned_ingredients'] = recipes['ingredients'].apply(
    lambda ing_list: list(map(clean_and_normalize, ing_list))
)


In [20]:
# Side-by-side preview of raw vs. cleaned ingredients for the first 50 recipes.
recipes.head(50)[['name', 'ingredients', 'cleaned_ingredients']]

Unnamed: 0,name,ingredients,cleaned_ingredients
0,arriba baked winter squash mexican style,"[winter squash, mexican seasoning, mixed spice...","[winter squash, mexican seasoning, mixed spice..."
1,a bit different breakfast pizza,"[prepared pizza crust, sausage patty, eggs, mi...","[prepared pizza crust, sausage patty, egg, mil..."
2,all in the kitchen chili,"[ground beef, yellow onions, diced tomatoes, t...","[beef, yellow onion, tomato, tomato paste, tom..."
3,alouette potatoes,"[spreadable cheese with garlic and herbs, new ...","[spreadable cheese with garlic and herb, new p..."
4,amish tomato ketchup for canning,"[tomato juice, apple cider vinegar, sugar, sal...","[tomato juice, apple cider vinegar, sugar, sal..."
5,apple a day milk shake,"[milk, vanilla ice cream, frozen apple juice c...","[milk, vanilla ice cream, apple juice concentr..."
6,aww marinated olives,"[fennel seeds, green olives, ripe olives, garl...","[fennel seed, green olive, ripe olive, garlic,..."
7,backyard style barbecued ribs,"[pork spareribs, soy sauce, fresh garlic, fres...","[pork sparerib, soy sauce, garlic, ginger, chi..."
8,bananas 4 ice cream pie,"[chocolate sandwich style cookies, chocolate s...","[chocolate sandwich style cooky, chocolate syr..."
9,beat this banana bread,"[sugar, unsalted butter, bananas, eggs, fresh ...","[sugar, unsalted butter, banana, egg, lemon ju..."


In [21]:
# Optional helper: only to be used if time permits, since it would
# complicate the project for now.
def find_similar(base, ingredient_list, threshold=85, limit=10, max_words=3):
    """
    Find ingredient names similar to a base ingredient.

    The top ``limit`` candidates are requested from ``process.extract`` using
    ``fuzz.token_set_ratio`` as scorer; only candidates scoring at least
    ``threshold`` whose name has at most ``max_words`` whitespace-separated
    words are kept. Because the limit is applied before filtering, fewer than
    ``limit`` results may be returned.

    Returns the surviving result tuples in the order produced by
    ``process.extract``.
    """
    candidates = process.extract(
        base, ingredient_list, scorer=fuzz.token_set_ratio, limit=limit
    )
    return [
        candidate
        for candidate in candidates
        if candidate[1] >= threshold and len(candidate[0].split()) <= max_words
    ]


### Cleaning tags

In [22]:
# Flatten every recipe's tag list and print how often each tag occurs.
all_tags = []
for recipe_tags in recipes['tags']:
    all_tags.extend(recipe_tags)

tag_counts = Counter(all_tags)
for tag, count in tag_counts.items():
    print(f"{tag}: {count}")

60-minutes-or-less: 69990
time-to-make: 225325
course: 218147
main-ingredient: 170446
cuisine: 91165
preparation: 230545
occasion: 114145
north-american: 48479
side-dishes: 26902
vegetables: 53814
mexican: 6694
easy: 126061
fall: 7754
holiday-event: 34920
vegetarian: 35651
winter: 7654
dietary: 165090
christmas: 8145
seasonal: 21933
squash: 3696
30-minutes-or-less: 55077
breakfast: 13655
main-dish: 71786
pork: 12692
american: 31179
oven: 31180
kid-friendly: 27074
pizza: 1505
northeastern-united-states: 2566
meat: 56042
equipment: 70436
chili: 1693
crock-pot-slow-cooker: 6608
4-hours-or-less: 49497
eggs-dairy: 30142
potatoes: 10293
dinner-party: 37561
easter: 2368
cheese: 15147
stove-top: 22095
new-years: 2487
thanksgiving: 4357
independence-day: 2359
st-patricks-day: 828
valentines-day: 2680
inexpensive: 32618
brunch: 18927
superbowl: 1865
presentation: 24470
served-hot: 12061
weeknight: 20948
canning: 1013
condiments-etc: 11874
heirloom-historical: 4575
amish-mennonite: 214
number-of-

In [23]:
# Tags to discard from every recipe's tag list.
junk_tags = [
    'low-in-something',
    'high-in-something',
    'dietary',
    'preparation',
    'course',
    'occasion',
    'equipment',
    'main-ingredient',
    'presentation',
    'technique',
]

# Filtering happens before normalization, so comparison is against the raw
# hyphenated tag spelling.
recipes['tags'] = recipes['tags'].apply(
    lambda tags: list(filter(lambda tag: tag not in junk_tags, tags))
)


In [24]:
# Normalize each tag: lowercase it, turn hyphens into underscores, and trim
# surrounding whitespace.
recipes['tags'] = recipes['tags'].apply(
    lambda tags: list(map(lambda tag: tag.lower().replace('-', '_').strip(), tags))
)

## Interaction Dataset

In [25]:
# Inspect column dtypes and non-null counts before converting column types.
interactions.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1132367 entries, 0 to 1132366
Data columns (total 5 columns):
 #   Column     Non-Null Count    Dtype 
---  ------     --------------    ----- 
 0   user_id    1132367 non-null  int64 
 1   recipe_id  1132367 non-null  int64 
 2   date       1132367 non-null  object
 3   rating     1132367 non-null  int64 
 4   review     1132367 non-null  object
dtypes: int64(3), object(2)
memory usage: 43.2+ MB


In [26]:
# Convert the interaction date from text to datetime64.
interactions = interactions.assign(date=pd.to_datetime(interactions['date']))

In [27]:
# Keep only the latest interaction (rating/review) for each user-recipe pair.
# Sorting newest-first lets drop_duplicates(keep='first') retain the most
# recent row; a stable sort makes the result deterministic when dates tie.
#
# The result is bound to `interactions` itself: previously it was stored only
# in `interactions_latest`, which was never clipped or exported, so the
# de-duplication had no effect on the saved dataset.
interactions = (
    interactions
    .sort_values(by='date', ascending=False, kind='mergesort')
    .drop_duplicates(subset=['user_id', 'recipe_id'], keep='first')
)

# Backward-compatible alias for code that refers to the old name.
interactions_latest = interactions

In [28]:
# Force ratings into the closed range [1, 5].
# NOTE(review): any rating of 0 becomes 1 here — confirm that 0 is not used
# to mean "no rating given" in this dataset.
interactions['rating'] = interactions['rating'].clip(1, 5)

It could be useful later to run a sentiment analysis on the review text.

## pp_users dataset

In [29]:
# Inspect column dtypes and non-null counts before converting column types.
pp_users.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 25076 entries, 0 to 25075
Data columns (total 6 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   u           25076 non-null  int64 
 1   techniques  25076 non-null  object
 2   items       25076 non-null  object
 3   n_items     25076 non-null  int64 
 4   ratings     25076 non-null  object
 5   n_ratings   25076 non-null  int64 
dtypes: int64(3), object(3)
memory usage: 1.1+ MB


In [30]:
# Parse the stringified list columns into real Python lists.
# Values that are not strings (already parsed) are left untouched.
for list_col in ('techniques', 'items', 'ratings'):
    pp_users[list_col] = pp_users[list_col].apply(
        lambda raw: ast.literal_eval(raw) if isinstance(raw, str) else raw
    )

In [31]:
# Keep a NumPy-array version of each user's technique list alongside the list.
pp_users['techniques_array'] = pp_users['techniques'].apply(np.array)

In [32]:
# Sanity check: the stored counts must match the lengths of the parsed lists.
assert pp_users['n_items'].eq(pp_users['items'].apply(len)).all()
assert pp_users['n_ratings'].eq(pp_users['ratings'].apply(len)).all()

## pp_recipes dataset

In [33]:
# Inspect column dtypes and non-null counts before converting column types.
pp_recipes.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 178265 entries, 0 to 178264
Data columns (total 8 columns):
 #   Column             Non-Null Count   Dtype 
---  ------             --------------   ----- 
 0   id                 178265 non-null  int64 
 1   i                  178265 non-null  int64 
 2   name_tokens        178265 non-null  object
 3   ingredient_tokens  178265 non-null  object
 4   steps_tokens       178265 non-null  object
 5   techniques         178265 non-null  object
 6   calorie_level      178265 non-null  int64 
 7   ingredient_ids     178265 non-null  object
dtypes: int64(3), object(5)
memory usage: 10.9+ MB


In [34]:
# Sanity check: each recipe id must appear at most once in pp_recipes.
assert pp_recipes['id'].is_unique

In [35]:
# Parse the stringified list columns into real Python lists.
# Values that are not strings (already parsed) are left untouched.
for list_col in (
    'name_tokens',
    'ingredient_tokens',
    'steps_tokens',
    'techniques',
    'ingredient_ids',
):
    pp_recipes[list_col] = pp_recipes[list_col].apply(
        lambda raw: ast.literal_eval(raw) if isinstance(raw, str) else raw
    )

## Export clean data to pkl file 

In [36]:
# Persist each cleaned frame as `<name>_clean.pkl` inside `data_dir`.
for frame, stem in (
    (recipes, 'recipes_clean'),
    (interactions, 'interactions_clean'),
    (pp_users, 'pp_users_clean'),
    (pp_recipes, 'pp_recipes_clean'),
):
    frame.to_pickle(f'{data_dir}/{stem}.pkl')