In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
from sklearn.preprocessing import MinMaxScaler
from datetime import datetime
from textblob import TextBlob
from collections import Counter

In [2]:
data_dir = "../../data"

In [3]:
interactions = pd.read_pickle(f"{data_dir}/interactions_clean.pkl")
recipes = pd.read_pickle(f"{data_dir}/recipes_clean.pkl")
pp_users = pd.read_pickle(f"{data_dir}/PP_users_clean.pkl") 
pp_recipes = pd.read_pickle(f"{data_dir}/PP_recipes_clean.pkl") 

## Feature Engineering for interactions dataset

In [4]:
interactions.info()

<class 'pandas.core.frame.DataFrame'>
Index: 1132367 entries, 982325 to 317939
Data columns (total 5 columns):
 #   Column     Non-Null Count    Dtype         
---  ------     --------------    -----         
 0   user_id    1132367 non-null  int64         
 1   recipe_id  1132367 non-null  int64         
 2   date       1132367 non-null  datetime64[ns]
 3   rating     1132367 non-null  int64         
 4   review     1132367 non-null  object        
dtypes: datetime64[ns](1), int64(3), object(1)
memory usage: 51.8+ MB


In [5]:
interactions['year'] = interactions['date'].dt.year
interactions['month'] = interactions['date'].dt.month
interactions['day_of_week'] = interactions['date'].dt.dayofweek

In [6]:
today = datetime.today()
interactions['days_since'] = (today - interactions['date']).dt.days

In [7]:
#how many interactions each user has
user_interactions_count = interactions.groupby('user_id')['recipe_id'].count().reset_index()
user_interactions_count.rename(columns={'recipe_id': 'interaction_count'}, inplace=True)

interactions = interactions.merge(user_interactions_count, on='user_id', how='left')

In [8]:
scaler = MinMaxScaler()
interactions['rating_scaled'] = scaler.fit_transform(interactions[['rating']])

In [9]:
interactions.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1132367 entries, 0 to 1132366
Data columns (total 11 columns):
 #   Column             Non-Null Count    Dtype         
---  ------             --------------    -----         
 0   user_id            1132367 non-null  int64         
 1   recipe_id          1132367 non-null  int64         
 2   date               1132367 non-null  datetime64[ns]
 3   rating             1132367 non-null  int64         
 4   review             1132367 non-null  object        
 5   year               1132367 non-null  int32         
 6   month              1132367 non-null  int32         
 7   day_of_week        1132367 non-null  int32         
 8   days_since         1132367 non-null  int64         
 9   interaction_count  1132367 non-null  int64         
 10  rating_scaled      1132367 non-null  float64       
dtypes: datetime64[ns](1), float64(1), int32(3), int64(5), object(1)
memory usage: 82.1+ MB


In [10]:
user_avg_rating = interactions.groupby('user_id')['rating_scaled'].mean().reset_index()
user_avg_rating.rename(columns={'rating_scaled': 'user_avg_rating_scaled'}, inplace=True)

interactions = interactions.merge(user_avg_rating, on='user_id', how='left')

Polarity: A measure of the sentiment expressed in the text. It indicates how positive or negative the review is.
Range:
    -1.0 → Very Negative
    0.0 → Neutral
    +1.0 → Very Positive

Subjectivity:A measure of how subjective or objective the text is. Subjective sentences express personal opinions, feelings, and judgments, while objective sentences are more factual and neutral.

Range:
    0.0 → Very Objective (factual information)
    1.0 → Very Subjective (opinion or emotion)

In [11]:
interactions.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1132367 entries, 0 to 1132366
Data columns (total 12 columns):
 #   Column                  Non-Null Count    Dtype         
---  ------                  --------------    -----         
 0   user_id                 1132367 non-null  int64         
 1   recipe_id               1132367 non-null  int64         
 2   date                    1132367 non-null  datetime64[ns]
 3   rating                  1132367 non-null  int64         
 4   review                  1132367 non-null  object        
 5   year                    1132367 non-null  int32         
 6   month                   1132367 non-null  int32         
 7   day_of_week             1132367 non-null  int32         
 8   days_since              1132367 non-null  int64         
 9   interaction_count       1132367 non-null  int64         
 10  rating_scaled           1132367 non-null  float64       
 11  user_avg_rating_scaled  1132367 non-null  float64       
dtypes: datetime64[

In [12]:
interactions['review'] = interactions['review'].str.lower()

In [13]:
#Capture user engagement
interactions['review_length'] = interactions['review'].apply(lambda x: len(x.split()) if pd.notna(x) else 0)

Polarity → approximates positive/negative review.
Subjectivity → tells whether review contains opinion or fact.

In [14]:
sentiment = interactions['review'].apply(
    lambda x: pd.Series({
        'polarity': 0 if pd.isna(x) else TextBlob(x).sentiment.polarity,
        'subjectivity': 0 if pd.isna(x) else TextBlob(x).sentiment.subjectivity
    })
)

interactions = interactions.join(sentiment)

In [15]:
interactions.to_pickle(f"{data_dir}/interactions_extended.pkl")

## Feature Engineering for recipes dataset

In [16]:
recipes.info()

<class 'pandas.core.frame.DataFrame'>
Index: 231636 entries, 0 to 231636
Data columns (total 28 columns):
 #   Column                  Non-Null Count   Dtype         
---  ------                  --------------   -----         
 0   name                    231636 non-null  object        
 1   id                      231636 non-null  int64         
 2   minutes                 231636 non-null  int64         
 3   contributor_id          231636 non-null  int64         
 4   submitted               231636 non-null  datetime64[ns]
 5   tags                    231636 non-null  object        
 6   nutrition               231636 non-null  object        
 7   n_steps                 231636 non-null  int64         
 8   steps                   231636 non-null  object        
 9   description             231636 non-null  object        
 10  ingredients             231636 non-null  object        
 11  n_ingredients           231636 non-null  int64         
 12  calories                231636 non-

In [17]:
recipes['tags'].unique

<bound method Series.unique of 0         [60_minutes_or_less, time_to_make, cuisine, no...
1         [30_minutes_or_less, time_to_make, cuisine, no...
2         [time_to_make, main_dish, chili, crock_pot_slo...
3         [60_minutes_or_less, time_to_make, side_dishes...
4         [weeknight, time_to_make, cuisine, north_ameri...
                                ...                        
231632    [ham, 60_minutes_or_less, time_to_make, cuisin...
231633    [15_minutes_or_less, time_to_make, for_large_g...
231634    [60_minutes_or_less, time_to_make, appetizers,...
231635    [30_minutes_or_less, time_to_make, for_large_g...
231636    [30_minutes_or_less, time_to_make, for_large_g...
Name: tags, Length: 231636, dtype: object>

In [18]:
# Extract useful time features (optional)
recipes['year'] = recipes['submitted'].dt.year
recipes['month'] = recipes['submitted'].dt.month
recipes['weekday'] = recipes['submitted'].dt.weekday

In [19]:
# Discretize preparation time into categories
recipes['time_bin'] = pd.cut(
    recipes['minutes'],
    bins=[0, 15, 30, 60, 120, recipes['minutes'].max()],
    labels=['super_quick', 'quick', 'medium', 'long', 'very_long']
)

In [20]:
# Recipe complexity manually
recipes['complexity'] = recipes['n_steps'] + recipes['n_ingredients']

In [21]:
# Normalize ingredients 
recipes['n_ingredients_norm'] = scaler.fit_transform(recipes[['n_ingredients']])

In [22]:
# dietary preferences tags
dietary_tags = [
    'vegan', 'vegetarian', 'pescatarian', 'flexitarian', 'raw', 'paleo', 'keto',
    'mediterranean', 'macrobiotic', 'zone_diet', 'carnivore_diet', 'alkaline_diet',
    'whole30', 'dash_diet', 'low_carb', 'high_protein', 'low_fat', 'fat_free',
    'clean_eating', 'plant_based',
    'diabetic_friendly', 'heart_healthy', 'renal_diet', 'anti_inflammatory',
    'low_fodmap', 'cholesterol_free', 'low_sodium', 'salt_free', 'weight_watchers',
    'pcos_friendly', 'gallbladder_friendly', 'thyroid_friendly',
    'gluten_free', 'wheat_free', 'nut_free', 'peanut_free', 'dairy_free',
    'lactose_free', 'soy_free', 'egg_free', 'sesame_free', 'corn_free',
    'shellfish_free', 'seafood_free', 'sulfite_free',
    'kosher', 'halal', 'jain', 'lacto_vegetarian', 'ovo_vegetarian'
]

In [23]:
for label in dietary_tags:
    column_name = label.lower()
    
    recipes[column_name] = recipes['tags'].apply(
        lambda tags: 1 if column_name in tags else 0
    )

In [24]:
for label in dietary_tags:
    print(f"{label}: {recipes[label].sum()} recipes flagged")

vegan: 10012 recipes flagged
vegetarian: 35651 recipes flagged
pescatarian: 0 recipes flagged
flexitarian: 0 recipes flagged
raw: 0 recipes flagged
paleo: 0 recipes flagged
keto: 0 recipes flagged
mediterranean: 0 recipes flagged
macrobiotic: 0 recipes flagged
zone_diet: 0 recipes flagged
carnivore_diet: 0 recipes flagged
alkaline_diet: 0 recipes flagged
whole30: 0 recipes flagged
dash_diet: 0 recipes flagged
low_carb: 42189 recipes flagged
high_protein: 7209 recipes flagged
low_fat: 22170 recipes flagged
fat_free: 0 recipes flagged
clean_eating: 0 recipes flagged
plant_based: 0 recipes flagged
diabetic_friendly: 0 recipes flagged
heart_healthy: 0 recipes flagged
renal_diet: 0 recipes flagged
anti_inflammatory: 0 recipes flagged
low_fodmap: 0 recipes flagged
cholesterol_free: 0 recipes flagged
low_sodium: 43348 recipes flagged
salt_free: 0 recipes flagged
weight_watchers: 0 recipes flagged
pcos_friendly: 0 recipes flagged
gallbladder_friendly: 0 recipes flagged
thyroid_friendly: 0 reci

In [25]:
recipes.info()

<class 'pandas.core.frame.DataFrame'>
Index: 231636 entries, 0 to 231636
Data columns (total 84 columns):
 #   Column                  Non-Null Count   Dtype         
---  ------                  --------------   -----         
 0   name                    231636 non-null  object        
 1   id                      231636 non-null  int64         
 2   minutes                 231636 non-null  int64         
 3   contributor_id          231636 non-null  int64         
 4   submitted               231636 non-null  datetime64[ns]
 5   tags                    231636 non-null  object        
 6   nutrition               231636 non-null  object        
 7   n_steps                 231636 non-null  int64         
 8   steps                   231636 non-null  object        
 9   description             231636 non-null  object        
 10  ingredients             231636 non-null  object        
 11  n_ingredients           231636 non-null  int64         
 12  calories                231636 non-

low-sodium : <= 5 FDA 5%
low-fat : <= 5  FDA 5%
sugar-free : 0  FDA 0%
low-carb : <=5   No FDA rule
low-protein : <=5 No FDA rule

In [26]:
recipes['salt_free'] = recipes['sodium_pdv'].apply(lambda x: 1 if x == 0 else 0)
recipes['low_sodium'] = recipes['sodium_pdv'].apply(lambda x: 1 if x < 5 else 0)
recipes['fat_free'] = recipes['total_fat_pdv'].apply(lambda x: 1 if x == 0 else 0)
recipes['low_fat'] = recipes['total_fat_pdv'].apply(lambda x: 1 if x < 5 else 0)
recipes['sugar_free'] = recipes['sugar_pdv'].apply(lambda x: 1 if x == 0 else 0)
recipes['low_sugar'] = recipes['sugar_pdv'].apply(lambda x: 1 if x < 5 else 0)

In [27]:
gluten_ingredients = [
    'wheat', 'flour', 'barley', 'rye', 'bread', 'pasta', 'semolina', 'spelt',
    'bulgur', 'couscous', 'farro', 'graham', 'triticale'
]
peanut_ingredients = [
    'peanuts', 'peanut butter', 'peanut oil'
]
tree_nuts_ingredients = [
    'almonds', 'walnuts', 'cashews', 'pecans', 'hazelnuts', 'pistachios',
    'macadamia nuts', 'brazil nuts'
]
dairy_ingredients = [
    'milk', 'cheese', 'butter', 'cream', 'yogurt', 'casein', 'whey', 'ghee'
]
egg_ingredients = [
    'egg', 'eggs', 'egg yolk', 'egg white', 'albumin'
]
soy_ingredients = [
    'soy', 'soybeans', 'soy flour', 'soy sauce', 'tofu', 'edamame', 'miso', 'tempeh'
]
sesame_ingredients = [
    'sesame', 'sesame seeds', 'tahini', 'sesame oil'
]
corn_ingredients = [
    'corn', 'corn flour', 'corn starch', 'corn syrup', 'masa', 'hominy', 'polenta'
]
shellfish_ingredients = [
    'shrimp', 'prawns', 'crab', 'lobster', 'clams', 'scallops', 'oysters', 'mussels'
]

In [28]:
def is_free_from(ingredients, exclusion_list):
    """
    Return 1 if recipe is free from all ingredients in exclusion_list.
    """
    # Make ingredients lowercase to ensure matching
    ingredients = [ing.lower() for ing in ingredients]
    
    return 0 if any(ex in ing for ex in exclusion_list for ing in ingredients) else 1

In [29]:
recipes['gluten_free'] = recipes['cleaned_ingredients'].apply(lambda x: is_free_from(x, gluten_ingredients))
recipes['peanut_free'] = recipes['cleaned_ingredients'].apply(lambda x: is_free_from(x, peanut_ingredients))
recipes['nut_free'] = recipes['cleaned_ingredients'].apply(lambda x: is_free_from(x, tree_nuts_ingredients + peanut_ingredients))
recipes['dairy_free'] = recipes['cleaned_ingredients'].apply(lambda x: is_free_from(x, dairy_ingredients))
recipes['egg_free'] = recipes['cleaned_ingredients'].apply(lambda x: is_free_from(x, egg_ingredients))
recipes['soy_free'] = recipes['cleaned_ingredients'].apply(lambda x: is_free_from(x, soy_ingredients))
recipes['sesame_free'] = recipes['cleaned_ingredients'].apply(lambda x: is_free_from(x, sesame_ingredients))
recipes['corn_free'] = recipes['cleaned_ingredients'].apply(lambda x: is_free_from(x, corn_ingredients))
recipes['shellfish_free'] = recipes['cleaned_ingredients'].apply(lambda x: is_free_from(x, shellfish_ingredients))

In [30]:
recipes.to_pickle(f"{data_dir}/recipes_extended.pkl")

## Feature Engineering for pp_users dataset

In [31]:
pp_users.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 25076 entries, 0 to 25075
Data columns (total 7 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   u                 25076 non-null  int64 
 1   techniques        25076 non-null  object
 2   items             25076 non-null  object
 3   n_items           25076 non-null  int64 
 4   ratings           25076 non-null  object
 5   n_ratings         25076 non-null  int64 
 6   techniques_array  25076 non-null  object
dtypes: int64(3), object(4)
memory usage: 1.3+ MB


In [32]:
pp_users.head(10)

Unnamed: 0,u,techniques,items,n_items,ratings,n_ratings,techniques_array
0,0,"[8, 0, 0, 5, 6, 0, 0, 1, 0, 9, 1, 0, 0, 0, 1, ...","[1118, 27680, 32541, 137353, 16428, 28815, 658...",31,"[5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 4.0, 4.0, ...",31,"[8, 0, 0, 5, 6, 0, 0, 1, 0, 9, 1, 0, 0, 0, 1, ..."
1,1,"[11, 0, 0, 2, 12, 0, 0, 0, 0, 14, 5, 0, 0, 0, ...","[122140, 77036, 156817, 76957, 68818, 155600, ...",39,"[5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, ...",39,"[11, 0, 0, 2, 12, 0, 0, 0, 0, 14, 5, 0, 0, 0, ..."
2,2,"[13, 0, 0, 7, 5, 0, 1, 2, 1, 11, 0, 1, 0, 0, 1...","[168054, 87218, 35731, 1, 20475, 9039, 124834,...",27,"[3.0, 4.0, 4.0, 4.0, 4.0, 4.0, 4.0, 4.0, 5.0, ...",27,"[13, 0, 0, 7, 5, 0, 1, 2, 1, 11, 0, 1, 0, 0, 1..."
3,3,"[498, 13, 4, 218, 376, 3, 2, 33, 16, 591, 10, ...","[163193, 156352, 102888, 19914, 169438, 55772,...",1513,"[5.0, 5.0, 5.0, 5.0, 4.0, 4.0, 5.0, 5.0, 5.0, ...",1513,"[498, 13, 4, 218, 376, 3, 2, 33, 16, 591, 10, ..."
4,4,"[161, 1, 1, 86, 93, 0, 0, 11, 2, 141, 0, 16, 0...","[72857, 38652, 160427, 55772, 119999, 141777, ...",376,"[5.0, 5.0, 5.0, 5.0, 4.0, 4.0, 5.0, 4.0, 5.0, ...",376,"[161, 1, 1, 86, 93, 0, 0, 11, 2, 141, 0, 16, 0..."
5,5,"[96, 2, 0, 38, 71, 0, 3, 9, 1, 90, 12, 13, 0, ...","[122026, 57553, 176588, 64777, 22746, 13097, 1...",290,"[4.0, 4.0, 1.0, 3.0, 1.0, 4.0, 3.0, 3.0, 3.0, ...",290,"[96, 2, 0, 38, 71, 0, 3, 9, 1, 90, 12, 13, 0, ..."
6,6,"[2, 1, 0, 1, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, ...","[64490, 3, 98141, 49558]",4,"[5.0, 5.0, 5.0, 5.0]",4,"[2, 1, 0, 1, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, ..."
7,7,"[6, 0, 0, 2, 1, 0, 0, 1, 0, 5, 1, 0, 0, 0, 0, ...","[12938, 43057, 84375, 105841, 62768, 155858, 1...",16,"[0.0, 5.0, 0.0, 5.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",16,"[6, 0, 0, 2, 1, 0, 0, 1, 0, 5, 1, 0, 0, 0, 0, ..."
8,8,"[87, 1, 0, 33, 49, 0, 0, 8, 2, 95, 1, 4, 0, 1,...","[139822, 28774, 149235, 127227, 73843, 68663, ...",232,"[4.0, 5.0, 4.0, 5.0, 5.0, 5.0, 5.0, 4.0, 5.0, ...",232,"[87, 1, 0, 33, 49, 0, 0, 8, 2, 95, 1, 4, 0, 1,..."
9,9,"[2, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, ...","[116193, 3, 20238, 9967, 11115, 96404]",6,"[4.0, 5.0, 5.0, 3.0, 5.0, 5.0]",6,"[2, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, ..."


In [33]:
#how generous or critical a user is
pp_users['avg_rating'] = pp_users['ratings'].apply(
    lambda x: np.mean(x) if isinstance(x, list) and len(x) > 0 else 0
)

In [34]:
pp_users['avg_rating_scaled'] = scaler.fit_transform(pp_users[['avg_rating']])

In [35]:
#Variability in rating, consistency vs. randon rates
pp_users['rating_std'] = pp_users['ratings'].apply(
    lambda x: np.std(x) if isinstance(x, list) and len(x) > 0 else 0
)

In [36]:
def interaction_category(n_items):
    if n_items < 10:
        return 'casual'
    elif n_items < 50:
        return 'regular'
    else:
        return 'power_user'


In [37]:
# User interaction level --> categorical feature
pp_users['interaction_level'] = pp_users['n_items'].apply(interaction_category)
pp_users['interaction_level'] = pp_users['interaction_level'].astype('category')

In [38]:
#Numb. of technique use by a user can indicate the mastery of cooking skill --> prefer more complex recipes
pp_users['technique_diversity'] = pp_users['techniques_array'].apply(
    lambda x: len(set(x)) if isinstance(x, list) else 0
)

Look for technique labels to map --> check this 

In [39]:
pp_users['most_common_technique'] = pp_users['techniques_array'].apply(
    lambda techniques: Counter(techniques).most_common(1)[0][0] 
    if isinstance(techniques, list) and len(techniques) > 0 
    else None
)

In [40]:
recipes_indexed = recipes.set_index('id')

### Dietary profile for the user given their recipe interactions

In [41]:
#Build dietary profile for a given user
def build_user_diet_profile(user_items, recipes_df, dietary_columns):
    # recipes_df is indexed by 'id'
    user_recipes = recipes_df.loc[recipes_df.index.intersection(user_items)]
    
    if user_recipes.empty:
        # No recipes found for this user, return zeros
        return pd.Series([0] * len(dietary_columns), index=dietary_columns)
    
    # Mean of dietary tags across the recipes
    return user_recipes[dietary_columns].mean()


In [42]:
diet_profiles = pp_users['items'].apply(
    lambda recipe_ids: build_user_diet_profile(recipe_ids, recipes_indexed, dietary_tags)
)

In [43]:
pp_users = pd.concat([pp_users, diet_profiles], axis=1)

In [44]:
pp_users.to_pickle(f'{data_dir}/pp_users_extended.pkl')