In [1]:
import pandas as pd

In [2]:
def generate_price_per_100g(row):
    """
    Generate realistic price per 100g based on category and food type.

    Prices based on:
    - PIHPS (Bank Indonesia) market data - January 2026
    - Real retail prices from Indonesian markets
    - Processing cost adjustments (fried, cooked, etc.)

    Parameters:
    -----------
    row : pandas.Series
        Row from dataframe containing 'category' and 'food_name' columns

    Returns:
    --------
    int : Price per 100g in Rupiah

    Base Prices (Jan 2026):
    - Eggs: Rp 33,850/kg → Rp 3,385/100g
    - Chicken: Rp 42,350/kg → Rp 4,235/100g
    - Beef: Rp 131,400/kg → Rp 13,140/100g
    - Rice (medium): Rp 16,350/kg → Rp 1,635/100g
    """

    category = row['category']
    food_name = row['food_name'].lower()

    # ===== PROTEIN CATEGORY =====
    if category == 'protein':
        # Eggs
        if 'egg' in food_name or 'telur' in food_name:
            base_price = 3385  # Market price Rp 33,850/kg

            if 'white' in food_name or 'putih' in food_name:
                return base_price * 0.8  # Egg white only (no yolk)
            elif 'fried' in food_name or 'goreng' in food_name:
                return base_price * 1.2  # +20% for oil & cooking
            elif 'scrambled' in food_name or 'orak' in food_name:
                return base_price * 1.15  # +15% for oil & cooking
            else:
                return base_price  # Raw/boiled egg

        # Chicken
        elif any(x in food_name for x in ['chicken', 'ayam']):
            base_price = 4235  # Market price Rp 42,350/kg

            if 'breast' in food_name or 'dada' in food_name:
                return base_price * 1.1  # Premium cut
            elif 'thigh' in food_name or 'paha' in food_name:
                return base_price * 1.0  # Standard
            elif 'fried' in food_name or 'goreng' in food_name:
                return base_price * 1.3  # +30% for processing
            elif 'nugget' in food_name:
                return base_price * 1.4  # Processed food
            elif 'soup' in food_name or 'sup' in food_name:
                return base_price * 0.7  # Diluted with water
            else:
                return base_price

        # Beef
        elif any(x in food_name for x in ['beef', 'sapi', 'steak']):
            base_price = 13140  # Market price Rp 131,400/kg (quality II)

            if 'sirloin' in food_name or 'tenderloin' in food_name:
                return base_price * 1.1  # Premium cuts
            elif 'ground' in food_name or 'cincang' in food_name:
                return base_price * 0.9  # Ground meat cheaper
            elif 'soup' in food_name:
                return base_price * 0.6  # Diluted
            else:
                return base_price

        # Fish & Seafood
        elif any(x in food_name for x in ['fish', 'ikan', 'salmon', 'tuna', 'shrimp', 'udang', 'squid', 'cumi']):
            if 'salmon' in food_name:
                return 15000  # Premium fish
            elif 'tuna' in food_name:
                return 8000
            elif 'shrimp' in food_name or 'udang' in food_name:
                return 12000
            elif 'squid' in food_name or 'cumi' in food_name:
                return 5000
            elif 'anchovy' in food_name or 'teri' in food_name:
                return 8000
            else:
                return 6000  # Generic fish

        # Tofu/Tempeh/Soybeans
        elif any(x in food_name for x in ['tofu', 'tahu', 'tempeh', 'soybean', 'kedelai']):
            return 1500

        # Generic soup
        elif 'soup' in food_name or 'sup' in food_name:
            return 2000

        # Default protein
        else:
            return 5000

    # ===== CARBS CATEGORY =====
    elif category == 'carbs':
        # Rice
        if any(x in food_name for x in ['rice', 'nasi']):
            if 'fried' in food_name or 'goreng' in food_name:
                return 2500  # Rice + oil + processing
            elif 'brown' in food_name or 'merah' in food_name:
                return 1725  # Premium rice Rp 17,250/kg
            else:
                return 1635  # Medium rice Rp 16,350/kg

        # Bread
        elif any(x in food_name for x in ['bread', 'roti', 'baguette', 'toast']):
            if 'white' in food_name:
                return 2000
            elif 'whole wheat' in food_name or 'gandum' in food_name:
                return 2500
            else:
                return 2200

        # Potato
        elif any(x in food_name for x in ['potato', 'kentang']):
            if 'fried' in food_name or 'french fries' in food_name:
                return 3000  # Fried = oil cost
            elif 'mashed' in food_name:
                return 1800  # Added butter/milk
            else:
                return 1500  # Plain potato

        # Noodles/Pasta
        elif any(x in food_name for x in ['noodle', 'mie', 'pasta', 'spaghetti']):
            return 1800

        # Cereals
        elif any(x in food_name for x in ['oat', 'cereal', 'granola', 'bran']):
            return 3000

        # Pudding/Desserts
        elif 'pudding' in food_name:
            return 2500

        # Default carbs
        else:
            return 2000

    # ===== FRUITS CATEGORY =====
    elif category == 'fruits':
        if any(x in food_name for x in ['avocado', 'alpukat']):
            return 4000  # Premium fruit
        elif any(x in food_name for x in ['apple', 'apel', 'pear']):
            return 3000
        elif any(x in food_name for x in ['banana', 'pisang']):
            return 2000
        elif any(x in food_name for x in ['grapes', 'anggur']):
            return 5000
        elif any(x in food_name for x in ['orange', 'jeruk', 'citrus']):
            return 2500
        elif any(x in food_name for x in ['watermelon', 'semangka', 'melon']):
            return 1000
        elif any(x in food_name for x in ['strawberry', 'stroberi', 'berries']):
            return 6000
        else:
            return 2500  # Generic fruit

    # ===== VEGETABLES CATEGORY =====
    elif category == 'vegetables':
        if 'soup' in food_name:
            return 1500
        elif any(x in food_name for x in ['potato', 'kentang']):
            return 1500
        elif any(x in food_name for x in ['tomato', 'tomat']):
            return 2000
        elif any(x in food_name for x in ['cabbage', 'kol', 'cauliflower']):
            return 1800
        elif any(x in food_name for x in ['spinach', 'bayam', 'kale']):
            return 2500
        elif any(x in food_name for x in ['eggplant', 'terong']):
            return 2000
        elif 'beans' in food_name:
            return 1500
        else:
            return 1800  # Generic vegetable

    # ===== DAIRY CATEGORY =====
    elif category == 'dairy':
        if any(x in food_name for x in ['milk', 'susu']):
            return 1500  # UHT milk ~Rp 15,000/liter
        elif any(x in food_name for x in ['cheese', 'keju']):
            return 8000  # Cheese expensive
        elif any(x in food_name for x in ['yogurt', 'yoghurt']):
            return 4000
        else:
            return 3000

    # ===== NUTS CATEGORY =====
    elif category == 'nuts':
        if 'almond' in food_name:
            return 15000  # Premium nut
        elif 'cashew' in food_name or 'mete' in food_name:
            return 12000
        elif 'peanut' in food_name or 'kacang' in food_name:
            return 5000
        else:
            return 8000

    # ===== MIXED CATEGORY =====
    elif category == 'mixed':
        if 'soup' in food_name or 'sup' in food_name:
            return 2500
        elif 'sandwich' in food_name or 'burger' in food_name:
            return 4000
        elif 'salad' in food_name:
            return 3000
        else:
            return 3500

    # DEFAULT
    return 2500

In [3]:
def generate_serving_description(row):
    """
    Auto-generate user-friendly serving description based on category and portion size.

    Generates intuitive Indonesian serving descriptions like:
    - "1 butir" for eggs
    - "1 potong sedang" for meat
    - "1 mangkuk" for soup
    - "3/4 piring" for rice

    Parameters:
    -----------
    row : pandas.Series
        Row from dataframe containing 'category', 'food_name', and 'typical_serving_g' columns

    Returns:
    --------
    str : User-friendly serving description in Indonesian
    """

    grams = row['typical_serving_g']
    category = row['category']
    food_name = row['food_name'].lower()

    # ===== PROTEIN CATEGORY =====
    if category == 'protein':
        # Eggs
        if 'egg' in food_name or 'telur' in food_name:
            if grams <= 55:
                return "1 butir"
            else:
                return "2 butir"

        # Meat/Fish (chicken, beef, fish)
        elif any(x in food_name for x in ['chicken', 'beef', 'fish', 'meat', 'steak', 'pork', 'ayam', 'sapi', 'ikan']):
            if grams <= 75:
                return "1 potong kecil"
            elif grams <= 150:
                return "1 potong sedang"
            else:
                return "1 potong besar"

        # Soup
        elif 'soup' in food_name or 'sup' in food_name:
            return "1 mangkuk"

        # Tofu/Tempeh
        elif any(x in food_name for x in ['tofu', 'tahu', 'tempeh']):
            return "1 potong"

        # Default (use grams)
        else:
            return f"{grams}g"

    # ===== CARBS CATEGORY =====
    elif category == 'carbs':
        # Rice
        if any(x in food_name for x in ['rice', 'nasi']):
            if grams <= 100:
                return "1/2 piring"
            elif grams <= 150:
                return "3/4 piring"
            else:
                return "1 piring"

        # Bread/Toast
        elif any(x in food_name for x in ['bread', 'roti', 'toast']):
            if grams <= 40:
                return "1 lembar"
            elif grams <= 80:
                return "2 lembar"
            else:
                return "3 lembar"

        # Baguette
        elif 'baguette' in food_name:
            return "1/2 potong"

        # Potato
        elif any(x in food_name for x in ['potato', 'kentang']):
            if grams <= 100:
                return "1 buah kecil"
            elif grams <= 150:
                return "1 buah sedang"
            else:
                return "1 buah besar"

        # Noodles/Pasta
        elif any(x in food_name for x in ['noodle', 'mie', 'pasta', 'spaghetti']):
            return "1 mangkuk"

        # Cereals
        elif any(x in food_name for x in ['oat', 'cereal', 'granola', 'bran']):
            return "1 mangkuk kecil"

        # Pudding
        elif 'pudding' in food_name:
            return "1 cup"

        # Popcorn
        elif 'popcorn' in food_name:
            return "1 genggam"

        # Default (use grams)
        else:
            return f"{grams}g"

    # ===== FRUITS CATEGORY =====
    elif category == 'fruits':
        # Whole fruits (apple, orange, banana, pear)
        if any(x in food_name for x in ['apple', 'orange', 'pear', 'apel', 'jeruk', 'peach', 'plum']):
            return "1 buah"

        # Banana (can be 1 or 2)
        elif any(x in food_name for x in ['banana', 'pisang']):
            if grams <= 100:
                return "1 buah"
            else:
                return "2 buah"

        # Small fruits (grapes, berries)
        elif any(x in food_name for x in ['grapes', 'anggur', 'berries', 'cherries']):
            return "1 mangkuk kecil"

        # Large fruits (watermelon, melon)
        elif any(x in food_name for x in ['watermelon', 'melon', 'semangka']):
            return "1 potong"

        # Avocado
        elif 'avocado' in food_name or 'alpukat' in food_name:
            return "1/2 buah"

        # Default (use grams)
        else:
            return f"{grams}g"

    # ===== VEGETABLES CATEGORY =====
    elif category == 'vegetables':
        # Soup
        if 'soup' in food_name:
            return "1 mangkuk"
        # Size-based descriptions
        elif grams <= 100:
            return "1 mangkuk"
        elif grams <= 150:
            return "1 mangkuk besar"
        else:
            return "1.5 mangkuk"

    # ===== DAIRY CATEGORY =====
    elif category == 'dairy':
        # Milk
        if any(x in food_name for x in ['milk', 'susu']):
            return "1 gelas"

        # Yogurt
        elif any(x in food_name for x in ['yogurt', 'yoghurt']):
            return "1 cup"

        # Cheese
        elif 'cheese' in food_name or 'keju' in food_name:
            if grams <= 30:
                return "1 slice"
            elif grams <= 60:
                return "2 slices"
            else:
                return f"{grams}g"

        # Default (use grams)
        else:
            return f"{grams}g"

    # ===== NUTS CATEGORY =====
    elif category == 'nuts':
        return "1 genggam kecil"

    # ===== MIXED CATEGORY =====
    elif category == 'mixed':
        # Soup
        if 'soup' in food_name or 'sup' in food_name:
            return "1 mangkuk"

        # Sandwich/Burger
        elif any(x in food_name for x in ['sandwich', 'burger', 'wrap']):
            return "1 porsi"

        # Salad
        elif 'salad' in food_name:
            return "1 mangkuk"

        # Default
        else:
            return "1 porsi"

    # DEFAULT
    return f"{grams}g"

In [4]:
def round_to_nearest(value, base):
    """
    Round value to nearest base.

    Parameters:
    -----------
    value : float
        Value to round
    base : int
        Base to round to (e.g., 50, 100, 500)

    Returns:
    --------
    int : Rounded value

    Examples:
    ---------
    >>> round_to_nearest(3385, 50)
    3400
    >>> round_to_nearest(1692.5, 100)
    1700
    """
    return int(base * round(value / base))


In [5]:
def round_display_price(price):
    """
    Smart rounding for display price based on value range.
    Matches Indonesian pricing conventions.

    Parameters:
    -----------
    price : float
        Raw price to round

    Returns:
    --------
    int : Rounded price

    Rounding Rules:
    ---------------
    - < Rp 1,000: nearest Rp 50 (e.g., 500, 550, 600)
    - Rp 1,000-5,000: nearest Rp 100 (e.g., 1,500, 2,000, 2,500)
    - > Rp 5,000: nearest Rp 500 (e.g., 5,500, 10,000, 12,500)
    """
    if price < 1000:
        return round_to_nearest(price, 50)
    elif price < 5000:
        return round_to_nearest(price, 100)
    else:
        return round_to_nearest(price, 500)

In [6]:
def process_food_dataset(df):
    """
    Complete processing pipeline for food dataset.
    Adds price and serving description columns.

    Parameters:
    -----------
    df : pandas.DataFrame
        Input dataframe with columns: 'category', 'food_name', 'typical_serving_g'

    Returns:
    --------
    pandas.DataFrame : Processed dataframe with added columns:
        - price_per_100g: Standardized price per 100g (rounded to Rp 50)
        - display_price: Actual price per serving (smart rounded)
        - serving_description: User-friendly serving description

    Example:
    --------
    >>> df = pd.read_csv('food_database.csv')
    >>> df_processed = process_food_dataset(df)
    >>> df_processed.to_csv('food_database_with_prices.csv', index=False)
    """

    # Generate raw prices
    df['price_per_100g_raw'] = df.apply(generate_price_per_100g, axis=1)
    df['display_price_raw'] = (df['price_per_100g_raw'] * df['typical_serving_g']) / 100

    # Round prices
    df['price_per_100g'] = df['price_per_100g_raw'].apply(lambda x: round_to_nearest(x, 50))
    df['display_price'] = df['display_price_raw'].apply(round_display_price)

    # Generate serving descriptions
    df['serving_description'] = df.apply(generate_serving_description, axis=1)

    # Clean up temporary columns
    df = df.drop(['price_per_100g_raw', 'display_price_raw'], axis=1)

    return df

In [7]:
# Load dataset
df = pd.read_csv('food_dataset_clean.csv')

# Process dataset
df_processed = process_food_dataset(df)

# Save result
df_processed.to_csv('food_database_ready_for_modeling.csv', index=False)

print("Processing complete!")
print(f"Total foods processed: {len(df_processed)}")
print("\nSample results:")
print(df_processed[['food_name', 'serving_description', 'price_per_100g', 'display_price']].head(10))


Processing complete!
Total foods processed: 200

Sample results:
                           food_name serving_description  price_per_100g  \
0                                Egg             1 butir            3400   
1                          Fried Egg             1 butir            4050   
2                         Boiled Egg             1 butir            3400   
3                          Egg White             1 butir            2700   
4                      Scrambled Egg             1 butir            3900   
5  Vegetable Beef Soup (Home Recipe)      1 potong besar            7900   
6       Vegetable Soup (Home Recipe)           1 mangkuk            2500   
7                        Tomato Soup           1 mangkuk            2500   
8                       Chicken Soup      1 potong besar            2950   
9                   Mixed Beans Soup           1 mangkuk            2500   

   display_price  
0           1700  
1           2000  
2           1700  
3           1400  
4  