# Load Dataset

In [1]:
import pandas as pd
import numpy as np

# Input files: user profiles and the food catalogue (with prices)
users_csv_path = 'users_profile.csv'
food_csv_path = 'food_dataset_with_price.csv'

users_df = pd.read_csv(users_csv_path)
food_df = pd.read_csv(food_csv_path)

In [2]:
# Preview the first five user profiles
users_df.head(n=5)

Unnamed: 0,user_id,age,weight,height,gender,bmi,activity,goal,budget,profile_type
0,U001,24,58,168,F,20.5,4,lose,45,Young Adult
1,U002,20,65,175,M,21.2,5,gain,50,Young Adult
2,U003,28,70,172,M,23.6,3,maintain,60,Young Adult
3,U004,22,52,162,F,19.8,2,gain,35,Young Adult
4,U005,26,82,170,F,28.3,2,lose,55,Young Adult


In [3]:
# Preview the first five food records
food_df.head(n=5)

Unnamed: 0,food_id,food_name,food_name_id,search_keyword,category,serving_size,serving_unit,typical_serving_g,serving_description,calories,...,fat_per_100g,protein_ratio,carb_ratio,fat_ratio,suitable_breakfast,suitable_lunch,suitable_dinner,suitable_snack,price_per_100g,display_price
0,3092,Egg,Telur,Egg,protein,100.0,g,50,1 butir,147.0,...,9.94,0.0856,0.0052,0.0676,1,0,0,0,3400,1700
1,33797,Fried Egg,Telur Goreng,Egg,protein,100.0,g,50,1 butir,201.0,...,15.31,0.0678,0.0044,0.0762,1,0,0,0,4050,2000
2,3094,Boiled Egg,Telur Rebus,Egg,protein,100.0,g,50,1 butir,155.0,...,10.61,0.0812,0.0072,0.0685,1,0,0,0,3400,1700
3,33793,Egg White,Putih Telur,Egg,protein,100.0,g,50,1 butir,52.0,...,0.17,0.2096,0.014,0.0033,1,0,0,0,2700,1400
4,39866,Scrambled Egg,Telur Orak-arik,Egg,protein,100.0,g,50,1 butir,212.0,...,16.18,0.0653,0.0098,0.0763,1,0,0,0,3900,1900


# Calculate BMR/TDEE & Meal Targets

In [4]:
def calculate_bmr(age, weight, height, gender):
    """Estimate Basal Metabolic Rate with the Mifflin-St Jeor equation.

    Args:
        age: Age of the person.
        weight: Body weight.
        height: Body height.
        gender: 'M' selects the +5 constant; every other value
            (including 'F') selects the -161 constant.

    Returns:
        10 * weight + 6.25 * height - 5 * age, plus the gender constant.

    Note:
        The equation is conventionally stated for kilograms, centimetres and
        years -- TODO confirm the dataset columns use those units.
    """
    # Only the trailing constant differs between the two branches.
    gender_offset = 5 if gender == 'M' else -161
    return 10 * weight + 6.25 * height - 5 * age + gender_offset

# TDEE multiplier for each activity level code found in users_df['activity']
ACTIVITY_MULTIPLIERS = {
    1: 1.2,    # Sedentary
    2: 1.375,  # Lightly active
    3: 1.55,   # Moderately active
    4: 1.725,  # Very active
    5: 1.9     # Extra active
}

# BMR per user: pass each profile's fields to calculate_bmr in signature order
bmr_inputs = users_df[['age', 'weight', 'height', 'gender']]
users_df['bmr'] = [
    calculate_bmr(age, weight, height, gender)
    for age, weight, height, gender in bmr_inputs.itertuples(index=False, name=None)
]

# TDEE per user: BMR scaled by the multiplier of the user's activity level.
# Plain dict indexing is kept on purpose so an unknown activity code still
# raises KeyError instead of silently producing NaN.
users_df['tdee'] = [
    bmr * ACTIVITY_MULTIPLIERS[activity_level]
    for bmr, activity_level in zip(users_df['bmr'], users_df['activity'])
]

# Daily calorie target: TDEE shifted by a fixed amount that depends on the goal
def calculate_target_calories(row):
    """Return the daily calorie target for one user row.

    Args:
        row: Mapping-like object (e.g. a DataFrame row) providing the keys
            'goal' and 'tdee'.

    Returns:
        row['tdee'] - 500 when the goal is 'lose', row['tdee'] + 500 when it
        is 'gain', and row['tdee'] unchanged for any other goal (treated as
        'maintain').
    """
    goal = row['goal']
    tdee = row['tdee']

    # Calorie shift per goal; goals missing from this table leave TDEE as is.
    calorie_shift = {'lose': -500, 'gain': 500}.get(goal)
    if calorie_shift is None:
        return tdee
    return tdee + calorie_shift

# Apply the goal-based adjustment to every user, one row at a time
users_df['target_calories_daily'] = users_df.apply(
    calculate_target_calories,
    axis='columns',
)

# Share of the daily calorie target and daily budget allotted to each meal:
# breakfast 25%, lunch 35%, dinner 30%, snack 10%
MEAL_BREAKDOWN = dict(breakfast=0.25, lunch=0.35, dinner=0.30, snack=0.10)

# Add '<meal>_cal_target' and '<meal>_budget' columns for every meal type
for meal_type, percentage in MEAL_BREAKDOWN.items():
    users_df[f'{meal_type}_cal_target'] = users_df['target_calories_daily'].mul(percentage)
    # The budget is scaled by 1000 before the meal share is applied
    # (presumably it is stored in thousands of the price currency -- TODO confirm).
    users_df[f'{meal_type}_budget'] = users_df['budget'].mul(1000).mul(percentage)

# Generate User-Food Combinations

In [5]:
# Generate all combinations: one row per (user, meal type, food suitable for that meal)
combinations = []
meal_types = ['breakfast', 'lunch', 'dinner', 'snack']

# User columns copied unchanged into every combination row
user_feature_columns = [
    'age', 'weight', 'height', 'bmi', 'activity',
    'goal',          # String: lose/gain/maintain
    'gender',        # String: M/F
    'profile_type',
]

# Output feature name -> source column in food_df
food_feature_sources = {
    'food_category': 'category',
    'food_calories': 'calories',
    'food_protein': 'protein',
    'food_carbs': 'carbs',
    'food_fat': 'fat',
    'food_price': 'display_price',
    'protein_ratio': 'protein_ratio',
    'carb_ratio': 'carb_ratio',
    'fat_ratio': 'fat_ratio',
    'calories_per_100g': 'calories_per_100g',
    'protein_per_100g': 'protein_per_100g',
    'carbs_per_100g': 'carbs_per_100g',
    'fat_per_100g': 'fat_per_100g',
    'price_per_100g': 'price_per_100g',
}

# The food side of a row depends only on the meal type, so filter and walk
# food_df once per meal type instead of once per (user, meal type) pair.
foods_by_meal = {}
for meal_type in meal_types:
    # Filter foods suitable for this meal type
    suitable_foods = food_df[food_df[f'suitable_{meal_type}'] == 1]
    foods_by_meal[meal_type] = [
        (
            food['food_id'],
            food['food_name'],
            {feature: food[column] for feature, column in food_feature_sources.items()},
        )
        for _, food in suitable_foods.iterrows()
    ]

for _, user in users_df.iterrows():
    user_features = {column: user[column] for column in user_feature_columns}

    for meal_type in meal_types:
        meal_context = {
            'meal_type': meal_type,            # String: breakfast/lunch/dinner/snack
            'meal_cal_target': user[f'{meal_type}_cal_target'],
            'meal_budget': user[f'{meal_type}_budget'],
        }

        for food_id, food_name, food_features in foods_by_meal[meal_type]:
            # Key order here fixes the column order of combinations_df:
            # identifiers, user features, meal context, food features.
            combinations.append({
                'user_id': user['user_id'],
                'food_id': food_id,
                'food_name': food_name,
                **user_features,
                **meal_context,
                **food_features,
            })

combinations_df = pd.DataFrame(combinations)
print(f"Generated {len(combinations_df):,} user-food-meal combinations")

Generated 11,340 user-food-meal combinations


# Add Derived Features

In [6]:
# 1. Calorie difference: |food calories - meal target| as a fraction of the meal target
combinations_df['cal_diff_pct'] = (
    combinations_df['food_calories']
    .sub(combinations_df['meal_cal_target'])
    .abs()
    .div(combinations_df['meal_cal_target'])
)

# 2. Price affordable (binary): 1 when the food price does not exceed the meal budget
combinations_df['price_affordable'] = (
    combinations_df['food_price']
    .le(combinations_df['meal_budget'])
    .astype(int)
)

# 3. Calorie match (binary): 1 when the calorie difference is at most 25%
combinations_df['cal_match_25pct'] = (
    combinations_df['cal_diff_pct']
    .le(0.25)
    .astype(int)
)

# 4. Goal alignment (binary)
def check_goal_alignment(row):
    """Return 1 when the food fits the user's goal, otherwise 0.

    Args:
        row: Mapping-like object providing 'goal' and, depending on the goal,
            'protein_ratio' (for 'lose') or 'food_calories' and
            'meal_cal_target' (for 'gain').

    Returns:
        * goal 'lose': 1 if protein_ratio is at least 0.06, else 0.
        * goal 'gain': 1 if food_calories reaches at least 80% of
          meal_cal_target, else 0.
        * any other goal (treated as 'maintain'): always 1.
    """
    goal = row['goal']

    if goal == 'lose':
        # Weight loss: favour high-protein foods
        return int(row['protein_ratio'] >= 0.06)

    if goal == 'gain':
        # Weight gain: favour calorie-dense foods
        return int(row['food_calories'] >= row['meal_cal_target'] * 0.8)

    # Maintenance (and anything unrecognised): every food is acceptable
    return 1

# Evaluate goal alignment row by row (the rule reads several columns at once)
combinations_df['goal_aligned'] = combinations_df.apply(check_goal_alignment, axis='columns')

# Summary of the derived columns created so far
derived_feature_summary = (
    "Added 4 derived features",
    "   • cal_diff_pct: Continuous (calorie difference %)",
    "   • price_affordable: Binary (1=affordable, 0=too expensive)",
    "   • cal_match_25pct: Binary (1=within ±25%, 0=outside)",
    "   • goal_aligned: Binary (1=aligned, 0=not aligned)",
)
for summary_line in derived_feature_summary:
    print(summary_line)

Added 4 derived features
   • cal_diff_pct: Continuous (calorie difference %)
   • price_affordable: Binary (1=affordable, 0=too expensive)
   • cal_match_25pct: Binary (1=within ±25%, 0=outside)
   • goal_aligned: Binary (1=aligned, 0=not aligned)


# Labeling

In [7]:
def auto_label(row):
    """Assign a rule-based suitability label to one user-food-meal row.

    The row must provide 'food_price', 'meal_budget', 'food_calories',
    'meal_cal_target', 'goal_aligned' and 'goal'. Two quantities drive the
    rules: the price as a fraction of the meal budget, and the absolute
    calorie gap as a fraction of the meal calorie target.

    Rules, evaluated in order (first match wins):
        1. price above 110% of budget OR calorie gap above 50%
           -> (0, 100, 'Clearly unsuitable')
        2. price within budget, gap <= 25%, goal aligned
           -> (1, 90, 'Clearly suitable')
        3. price within budget, gap <= 40%, goal aligned
           -> (1, 75, 'Borderline suitable (relaxed)')
        4. price <= 105% of budget, gap <= 50%, goal aligned or goal is
           'maintain' -> (1, 70, 'Borderline suitable (flexible)')
        5. otherwise -> (0, 65, 'Default unsuitable')

    Returns:
        Tuple (label, confidence, rule_applied).
    """
    budget_share = row['food_price'] / row['meal_budget']
    calorie_gap = abs(row['food_calories'] - row['meal_cal_target']) / row['meal_cal_target']

    # Rule 1: far over budget or far off the calorie target
    if budget_share > 1.1 or calorie_gap > 0.50:
        return 0, 100, 'Clearly unsuitable'

    # Rules 2 and 3 share the "within budget" requirement and differ only in
    # how tight the calorie gap must be.
    if budget_share <= 1.0:
        # Rule 2: tight calorie match
        if calorie_gap <= 0.25 and row['goal_aligned'] == 1:
            return 1, 90, 'Clearly suitable'
        # Rule 3: looser calorie match
        if calorie_gap <= 0.40 and row['goal_aligned'] == 1:
            return 1, 75, 'Borderline suitable (relaxed)'

    # Rule 4: small budget overshoot tolerated, widest calorie window
    if budget_share <= 1.05 and calorie_gap <= 0.50:
        if row['goal_aligned'] == 1 or row['goal'] == 'maintain':
            return 1, 70, 'Borderline suitable (flexible)'

    # Rule 5: nothing matched
    return 0, 65, 'Default unsuitable'

In [8]:
# Run the rule-based labeller on every combination. result_type='expand'
# spreads the returned (label, confidence, rule_applied) tuple over the
# integer-named columns 0, 1 and 2.
print("Applying auto-labeling rules...")
results = combinations_df.apply(auto_label, axis='columns', result_type='expand')

# Columns 0 and 1 are cast to int; column 2 is the rule name and is stored as is
for position, column in enumerate(['label', 'confidence']):
    combinations_df[column] = results[position].astype(int)
combinations_df['rule_applied'] = results[2]

Applying auto-labeling rules...


In [9]:
# Show how many combinations received each label, with their share of all rows
label_dist = combinations_df['label'].value_counts()
total_rows = len(combinations_df)
unsuitable_count = label_dist.get(0, 0)
suitable_count = label_dist.get(1, 0)

print("\n Auto-labeling complete!")
print(f"   • Unsuitable (0): {unsuitable_count:,} samples ({unsuitable_count/total_rows:.1%})")
print(f"   • Suitable (1):   {suitable_count:,} samples ({suitable_count/total_rows:.1%})")


 Auto-labeling complete!
   • Unsuitable (0): 10,141 samples (89.4%)
   • Suitable (1):   1,199 samples (10.6%)


# Balanced Sampling

In [10]:
# Separate suitable and unsuitable samples
suitable_samples = combinations_df[combinations_df['label'] == 1]
unsuitable_samples = combinations_df[combinations_df['label'] == 0]

# Balance the dataset (1:1 ratio): both classes are capped at the size of the
# smaller one. Previously only the unsuitable class was capped, so the ratio
# was not 1:1 whenever suitable rows outnumbered unsuitable rows.
n_per_class = min(len(suitable_samples), len(unsuitable_samples))
n_suitable = n_per_class
n_unsuitable = n_per_class

# Sample
if len(suitable_samples) > n_per_class:
    # Suitable is the majority class: downsample it to the minority size
    sampled_suitable = suitable_samples.sample(n=n_per_class, random_state=42)
else:
    # Suitable is the minority (or equal) class: take every row, order untouched
    sampled_suitable = suitable_samples
sampled_unsuitable = unsuitable_samples.sample(n=n_per_class, random_state=42)

# Combine and shuffle (fixed seed so the row order is reproducible)
training_data = pd.concat([sampled_suitable, sampled_unsuitable], ignore_index=True)
training_data = training_data.sample(frac=1, random_state=42).reset_index(drop=True)

In [11]:
# Report the class counts that were actually sampled. The "1:1" claim is only
# printed when the two counts really match; otherwise the true ratio is shown.
n_sampled_suitable = len(sampled_suitable)
n_sampled_unsuitable = len(sampled_unsuitable)

print("Balanced sampling complete:")
print(f"   • Suitable samples: {n_sampled_suitable:,}")
print(f"   • Unsuitable samples: {n_sampled_unsuitable:,}")
print(f"   • Total training samples: {len(training_data):,}")
if n_sampled_suitable == n_sampled_unsuitable:
    print("   • Ratio: 1:1 (perfectly balanced)")
else:
    print(f"   • Ratio: {n_sampled_suitable:,}:{n_sampled_unsuitable:,} (NOT balanced)")

Balanced sampling complete:
   • Suitable samples: 1,199
   • Unsuitable samples: 1,199
   • Total training samples: 2,398
   • Ratio: 1:1 (perfectly balanced)


# Save Datasets

In [12]:
# Version 1: FULL dataset (with identifiers)
# Columns are grouped by role; the groups are flattened in this order to
# produce the column order of the CSV.
full_column_groups = {
    'identifiers': ['user_id', 'food_id', 'food_name'],
    'descriptive info': ['meal_type', 'food_category', 'profile_type'],
    # User features are written as-is (no encoding applied in this notebook)
    'user features': ['age', 'weight', 'height', 'bmi', 'activity', 'goal', 'gender'],
    'meal context': ['meal_cal_target', 'meal_budget'],
    'food features': [
        'food_calories', 'food_protein', 'food_carbs', 'food_fat', 'food_price',
        'protein_ratio', 'carb_ratio', 'fat_ratio',
        'calories_per_100g', 'protein_per_100g', 'carbs_per_100g',
        'fat_per_100g', 'price_per_100g',
    ],
    'derived features': ['cal_diff_pct', 'price_affordable', 'cal_match_25pct', 'goal_aligned'],
    'label info': ['label', 'confidence', 'rule_applied'],
}
full_columns = [
    column
    for group_columns in full_column_groups.values()
    for column in group_columns
]

# Select the columns into an independent copy and write it without the index
training_data_full = training_data.loc[:, full_columns].copy()
training_data_full.to_csv('training_data_FULL.csv', index=False)

full_row_count = len(training_data_full)
print(f"Saved: training_data_FULL.csv ({full_row_count:,} rows × {len(full_columns)} columns)")


Saved: training_data_FULL.csv (2,398 rows × 35 columns)


In [13]:
# Version 2: ML dataset (features only, no identifiers)
# Columns are grouped by role; the groups are flattened in this order to
# produce the column order of the CSV. The target column comes last.
ml_column_groups = {
    # User features are written as-is (no encoding applied in this notebook)
    'user features': ['age', 'weight', 'bmi', 'activity', 'goal', 'gender'],
    'meal context': ['meal_type', 'meal_cal_target', 'meal_budget'],
    'food features': [
        'food_calories', 'food_protein', 'food_carbs', 'food_fat', 'food_price',
        'protein_ratio', 'carb_ratio', 'fat_ratio',
    ],
    'target': ['label'],
}
ml_columns = [
    column
    for group_columns in ml_column_groups.values()
    for column in group_columns
]

# Select the columns into an independent copy and write it without the index
training_data_ml = training_data.loc[:, ml_columns].copy()
training_data_ml.to_csv('training_data.csv', index=False)

ml_row_count = len(training_data_ml)
print(f"Saved: training_data.csv ({ml_row_count:,} rows × {len(ml_columns)} columns)")

Saved: training_data.csv (2,398 rows × 18 columns)
