In [1]:
import pandas as pd
import numpy as np

# Load recipe dataset
recipes_df = pd.read_csv('../data/recipe_nutrients.csv')

print("=" * 60)
print("RECIPE DATASET OVERVIEW")
print("=" * 60)

# Basic info
print(f"\nTotal recipes: {len(recipes_df)}")
print(f"Total columns: {len(recipes_df.columns)}")

# Check data quality
print("\n" + "=" * 60)
print("DATA QUALITY CHECK")
print("=" * 60)

print("\nMissing values in key columns:")
key_cols = ['RecipeName', 'Cuisine', 'Course', 'Diet', 
            'energy_per_serving', 'protein_per_serving', 
            'fat_per_serving', 'carbohydrate_per_serving']
for col in key_cols:
    missing = recipes_df[col].isnull().sum()
    print(f"  {col}: {missing} ({missing/len(recipes_df)*100:.2f}%)")

# Distribution analysis
print("\n" + "=" * 60)
print("RECIPE DISTRIBUTION")
print("=" * 60)

print("\nCuisine breakdown:")
print(recipes_df['Cuisine'].value_counts().head(10))

print("\nCourse breakdown:")
print(recipes_df['Course'].value_counts())

print("\nDiet type breakdown:")
print(recipes_df['Diet'].value_counts())

# Nutrition statistics
print("\n" + "=" * 60)
print("NUTRITION STATISTICS (Per Serving)")
print("=" * 60)

nutrition_cols = ['energy_per_serving', 'protein_per_serving', 
                  'fat_per_serving', 'carbohydrate_per_serving']
print(recipes_df[nutrition_cols].describe())

# Check for unrealistic values
print("\n" + "=" * 60)
print("DATA ANOMALY DETECTION")
print("=" * 60)

# Recipes with extremely high/low calories
print("\nRecipes with <50 calories per serving (suspicious):")
print(len(recipes_df[recipes_df['energy_per_serving'] < 50]))

print("\nRecipes with >1000 calories per serving (very high):")
print(len(recipes_df[recipes_df['energy_per_serving'] > 1000]))

# Check time constraints
print("\n" + "=" * 60)
print("TIME CONSTRAINTS")
print("=" * 60)

print("\nTotal time statistics:")
print(recipes_df['TotalTimeInMins'].describe())

print("\nRecipes by time category:")
print(f"  Quick (<20 min): {len(recipes_df[recipes_df['TotalTimeInMins'] < 20])}")
print(f"  Medium (20-45 min): {len(recipes_df[(recipes_df['TotalTimeInMins'] >= 20) & (recipes_df['TotalTimeInMins'] <= 45)])}")
print(f"  Long (>45 min): {len(recipes_df[recipes_df['TotalTimeInMins'] > 45])}")

# Sample recipes
print("\n" + "=" * 60)
print("SAMPLE RECIPES")
print("=" * 60)

print("\nFirst 3 breakfast recipes:")
breakfast = recipes_df[recipes_df['Course'] == 'Breakfast'].head(3)
for idx, row in breakfast.iterrows():
    print(f"\n{row['RecipeName']}:")
    print(f"  Cuisine: {row['Cuisine']} | Diet: {row['Diet']}")
    print(f"  Time: {row['TotalTimeInMins']} min")
    print(f"  Nutrition: {row['energy_per_serving']:.0f} cal | {row['protein_per_serving']:.1f}g P | {row['carbohydrate_per_serving']:.1f}g C | {row['fat_per_serving']:.1f}g F")


RECIPE DATASET OVERVIEW

Total recipes: 6264
Total columns: 34

DATA QUALITY CHECK

Missing values in key columns:
  RecipeName: 0 (0.00%)
  Cuisine: 0 (0.00%)
  Course: 0 (0.00%)
  Diet: 0 (0.00%)
  energy_per_serving: 0 (0.00%)
  protein_per_serving: 0 (0.00%)
  fat_per_serving: 0 (0.00%)
  carbohydrate_per_serving: 0 (0.00%)

RECIPE DISTRIBUTION

Cuisine breakdown:
Cuisine
Indian                   1012
Continental               997
North Indian Recipes      811
South Indian Recipes      593
Italian Recipes           234
Bengali Recipes           158
Maharashtrian Recipes     155
Kerala Recipes            145
Tamil Nadu                144
Karnataka                 143
Name: count, dtype: int64

Course breakdown:
Course
Lunch                           1528
Side Dish                        882
Snack                            837
Dinner                           701
Dessert                          638
Appetizer                        600
Main Course                      295
World Brea

In [2]:
#Step 1: Remove Extreme Outliers

# Physiologically realistic limits per serving
cleaned_df = recipes_df[
    (recipes_df['energy_per_serving'] >= 50) &      # Min 50 cal
    (recipes_df['energy_per_serving'] <= 1500) &    # Max 1,500 cal
    (recipes_df['protein_per_serving'] >= 0) &
    (recipes_df['protein_per_serving'] <= 100) &    # Max 100g protein
    (recipes_df['fat_per_serving'] >= 0) &
    (recipes_df['fat_per_serving'] <= 100) &        # Max 100g fat
    (recipes_df['carbohydrate_per_serving'] >= 0) &
    (recipes_df['carbohydrate_per_serving'] <= 200) # Max 200g carbs
].copy()

print(f"Original recipes: {len(recipes_df)}")
print(f"After cleaning: {len(cleaned_df)}")
print(f"Removed: {len(recipes_df) - len(cleaned_df)} ({(len(recipes_df) - len(cleaned_df))/len(recipes_df)*100:.1f}%)")



Original recipes: 6264
After cleaning: 4781
Removed: 1483 (23.7%)


In [3]:
#Step 2: Fix Course Categories

# Map misclassified diet categories back to unknown course
diet_terms = ['Vegetarian', 'Non Vegeterian', 'Vegan', 'Eggetarian', 
              'High Protein Vegetarian', 'High Protein Non Vegetarian',
              'No Onion No Garlic', 'Sugar Free', 'Gluten Free']

# If Course contains diet terms, set to 'Main Course' or investigate
cleaned_df['Course'] = cleaned_df['Course'].apply(
    lambda x: 'Main Course' if x in diet_terms else x
)


In [4]:
#Step 3: Standardize Course Names

# Combine breakfast categories
course_mapping = {
    'World Breakfast': 'Breakfast',
    'South Indian Breakfast': 'Breakfast',
    'North Indian Breakfast': 'Breakfast',
    'Indian Breakfast': 'Breakfast'
}

cleaned_df['Course'] = cleaned_df['Course'].replace(course_mapping)

print("\nCleaned course distribution:")
print(cleaned_df['Course'].value_counts())



Cleaned course distribution:
Course
Lunch                           1180
Side Dish                        734
Snack                            620
Dinner                           545
Appetizer                        506
Breakfast                        481
Dessert                          437
Main Course                      253
One Pot Dish                      20
Brunch                             4
No Onion No Garlic (Sattvic)       1
Name: count, dtype: int64


In [5]:
cleaned_df['Course'].unique()

array(['Side Dish', 'Main Course', 'Breakfast', 'Lunch', 'Snack',
       'Dinner', 'Appetizer', 'Dessert', 'No Onion No Garlic (Sattvic)',
       'One Pot Dish', 'Brunch'], dtype=object)

In [6]:
cleaned_df['Diet'].unique()

array(['Diabetic Friendly', 'Vegetarian', 'High Protein Vegetarian',
       'Non Vegeterian', 'High Protein Non Vegetarian', 'Eggetarian',
       'No Onion No Garlic (Sattvic)', 'Gluten Free', 'Vegan',
       'Sugar Free Diet'], dtype=object)

In [8]:
import pandas as pd
import numpy as np

# Load data
recipes_df = pd.read_csv('../data/recipe_nutrients.csv')

print("=" * 60)
print("DATA CLEANING PIPELINE")
print("=" * 60)

original_count = len(recipes_df)

# Step 1: Remove nutritional outliers
print("\n[Step 1] Removing nutritional outliers...")

cleaned_df = recipes_df[
    (recipes_df['energy_per_serving'] >= 50) &
    (recipes_df['energy_per_serving'] <= 1500) &
    (recipes_df['protein_per_serving'] >= 0) &
    (recipes_df['protein_per_serving'] <= 100) &
    (recipes_df['fat_per_serving'] >= 0) &
    (recipes_df['fat_per_serving'] <= 100) &
    (recipes_df['carbohydrate_per_serving'] >= 0) &
    (recipes_df['carbohydrate_per_serving'] <= 200)
].copy()

print(f"  Original: {original_count} recipes")
print(f"  After nutritional filtering: {len(cleaned_df)} recipes")
print(f"  Removed: {original_count - len(cleaned_df)} outliers")

# Step 2: Fix Course category - "No Onion No Garlic" should not be a course
print("\n[Step 2] Fixing misclassified course...")

# This is clearly a diet preference, not a course
cleaned_df.loc[cleaned_df['Course'] == 'No Onion No Garlic (Sattvic)', 'Course'] = 'Main Course'

print(f"  Fixed 'No Onion No Garlic (Sattvic)' in Course column")

# Step 3: Standardize course names (keep useful distinctions)
print("\n[Step 3] Standardizing course categories...")

# Note: Keep Breakfast, Lunch, Dinner, Snack, Dessert, Appetizer, Side Dish
# These are all valid and useful for meal planning

print("  Kept all valid course categories")
print(f"  Valid courses: {sorted(cleaned_df['Course'].unique())}")

# Step 4: Remove invalid time values
print("\n[Step 4] Cleaning time values...")

time_before = len(cleaned_df)
cleaned_df = cleaned_df[
    (cleaned_df['TotalTimeInMins'] > 0) &
    (cleaned_df['TotalTimeInMins'] <= 240)  # Max 4 hours
].copy()
print(f"  Removed {time_before - len(cleaned_df)} recipes with invalid times")

# Step 5: Ensure sodium values are reasonable
print("\n[Step 5] Cleaning sodium values...")

sodium_before = len(cleaned_df)
cleaned_df = cleaned_df[
    (cleaned_df['sodium_per_serving'] >= 0) &
    (cleaned_df['sodium_per_serving'] <= 3000)  # Max 3000mg per serving
].copy()
print(f"  Removed {sodium_before - len(cleaned_df)} recipes with extreme sodium")

# Final statistics
print("\n" + "=" * 60)
print("CLEANED DATASET SUMMARY")
print("=" * 60)

final_count = len(cleaned_df)
print(f"\nOriginal recipes: {original_count}")
print(f"Final recipe count: {final_count}")
print(f"Total removed: {original_count - final_count} ({(original_count - final_count)/original_count*100:.1f}%)")

print("\n✓ Course distribution (After cleaning):")
print(cleaned_df['Course'].value_counts())

print("\n✓ Diet distribution:")
print(cleaned_df['Diet'].value_counts())

print("\n✓ Cuisine distribution (Top 10):")
print(cleaned_df['Cuisine'].value_counts().head(10))

print("\n✓ Cleaned nutrition statistics (per serving):")
nutrition_stats = cleaned_df[['energy_per_serving', 'protein_per_serving', 
                               'fat_per_serving', 'carbohydrate_per_serving']].describe()
print(nutrition_stats)

print("\n✓ Time statistics:")
print(f"  Average: {cleaned_df['TotalTimeInMins'].mean():.1f} minutes")
print(f"  Median: {cleaned_df['TotalTimeInMins'].median():.1f} minutes")
print(f"  Quick (<30 min): {len(cleaned_df[cleaned_df['TotalTimeInMins'] < 30])} recipes")
print(f"  Medium (30-60 min): {len(cleaned_df[(cleaned_df['TotalTimeInMins'] >= 30) & (cleaned_df['TotalTimeInMins'] <= 60)])} recipes")
print(f"  Long (>60 min): {len(cleaned_df[cleaned_df['TotalTimeInMins'] > 60])} recipes")

# Save cleaned dataset
output_file = 'recipe_nutrients_cleaned.csv'
cleaned_df.to_csv(output_file, index=False)
print(f"\n✓ Cleaned dataset saved: {output_file}")

# Show sample recipes from each course
print("\n" + "=" * 60)
print("SAMPLE RECIPES (After Cleaning)")
print("=" * 60)

for course in ['Breakfast', 'Lunch', 'Dinner', 'Snack']:
    course_recipes = cleaned_df[cleaned_df['Course'] == course]
    if len(course_recipes) > 0:
        sample = course_recipes.sample(1).iloc[0]
        print(f"\n{course.upper()}: {sample['RecipeName']}")
        print(f"  Cuisine: {sample['Cuisine']} | Diet: {sample['Diet']}")
        print(f"  Time: {sample['TotalTimeInMins']:.0f} min | Servings: {sample['Servings']}")
        print(f"  Per Serving: {sample['energy_per_serving']:.0f} cal | " +
              f"{sample['protein_per_serving']:.1f}g P | " +
              f"{sample['carbohydrate_per_serving']:.1f}g C | " +
              f"{sample['fat_per_serving']:.1f}g F")

# Check for recipes matching TIER 1 breakfast target (583 cal, 51g protein)
print("\n" + "=" * 60)
print("TIER 1 INTEGRATION TEST")
print("=" * 60)
print("\nTarget: 583 calories, 51g protein (Breakfast)")

breakfast_recipes = cleaned_df[cleaned_df['Course'] == 'Breakfast']
target_cal = 583
target_protein = 51

# Find recipes within 20% of target
tolerance = 0.20
breakfast_matches = breakfast_recipes[
    (breakfast_recipes['energy_per_serving'] >= target_cal * (1 - tolerance)) &
    (breakfast_recipes['energy_per_serving'] <= target_cal * (1 + tolerance)) &
    (breakfast_recipes['protein_per_serving'] >= target_protein * (1 - tolerance)) &
    (breakfast_recipes['protein_per_serving'] <= target_protein * (1 + tolerance))
]

print(f"Found {len(breakfast_matches)} breakfast recipes matching target (±20%)")

if len(breakfast_matches) > 0:
    print("\nTop 3 matches:")
    for idx, (_, recipe) in enumerate(breakfast_matches.head(3).iterrows(), 1):
        cal_diff = abs(recipe['energy_per_serving'] - target_cal)
        protein_diff = abs(recipe['protein_per_serving'] - target_protein)
        print(f"\n{idx}. {recipe['RecipeName']}")
        print(f"   {recipe['energy_per_serving']:.0f} cal (Δ{cal_diff:.0f}) | " +
              f"{recipe['protein_per_serving']:.1f}g P (Δ{protein_diff:.1f}g)")
        print(f"   Diet: {recipe['Diet']} | Time: {recipe['TotalTimeInMins']:.0f} min")

print("\n" + "=" * 60)
print("✅ DATA CLEANING COMPLETE!")
print("=" * 60)


DATA CLEANING PIPELINE

[Step 1] Removing nutritional outliers...
  Original: 6264 recipes
  After nutritional filtering: 4781 recipes
  Removed: 1483 outliers

[Step 2] Fixing misclassified course...
  Fixed 'No Onion No Garlic (Sattvic)' in Course column

[Step 3] Standardizing course categories...
  Kept all valid course categories
  Valid courses: ['Appetizer', 'Brunch', 'Dessert', 'Dinner', 'Eggetarian', 'High Protein Vegetarian', 'Indian Breakfast', 'Lunch', 'Main Course', 'North Indian Breakfast', 'One Pot Dish', 'Side Dish', 'Snack', 'South Indian Breakfast', 'Vegan', 'Vegetarian', 'World Breakfast']

[Step 4] Cleaning time values...
  Removed 143 recipes with invalid times

[Step 5] Cleaning sodium values...
  Removed 417 recipes with extreme sodium

CLEANED DATASET SUMMARY

Original recipes: 6264
Final recipe count: 4221
Total removed: 2043 (32.6%)

✓ Course distribution (After cleaning):
Course
Lunch                      1066
Side Dish                   667
Snack            

In [9]:
# Save cleaned dataset
output_file = 'recipe_nutrients_cleaned.csv'
cleaned_df.to_csv(output_file, index=False)

print(f"✓ Saved {len(cleaned_df)} recipes to: {output_file}")

# Verify file was created
import os
if os.path.exists(output_file):
    file_size = os.path.getsize(output_file) / (1024 * 1024)  # MB
    print(f"✓ File exists: {file_size:.2f} MB")
else:
    print("✗ File NOT created - check write permissions")

✓ Saved 4221 recipes to: recipe_nutrients_cleaned.csv
✓ File exists: 20.18 MB
