# Data Categorization

## Protein Source/type/compexity Classification


In [2]:
import pandas as pd
import re

def classify_protein_type(df):
    """
    Add protein_source, protein_type, and protein_complexity columns:
    - protein_source: Animal-based, Plant-based, Mixed, Unknown
    - protein_type: Specific protein types (Whey, Casein, Pea, Soy, etc.)
    - protein_complexity: Single source or Blend
    """
    
    # Add the new classification columns while keeping all original columns
    df['protein_source'] = 'Unknown'
    df['protein_type'] = 'Unknown'
    df['protein_complexity'] = 'Unknown'
    
    # Clear animal protein indicators with specific types
    animal_keywords = {
        'product_name': {
            'whey': 'Whey',
            'casein': 'Casein', 
            'beef': 'Beef',
            'egg': 'Egg',
            'salmon': 'Salmon'
        },
        'ingredients': {
            'whey protein': 'Whey',
            'casein': 'Casein',
            'milk protein': 'Milk',
            'beef protein': 'Beef',
            'egg protein': 'Egg',
            'egg white': 'Egg',
            'salmon protein': 'Salmon'
        }
    }
    
    # Clear plant protein indicators with specific types
    plant_keywords = {
        'product_name': {
            'pea': 'Pea',
            'soy': 'Soy',
            'soya': 'Soy',
            'rice': 'Rice',
            'hemp': 'Hemp',
            'vegan': 'Plant Blend',
            'plant': 'Plant Blend'
        },
        'ingredients': {
            'pea protein': 'Pea',
            'soy protein': 'Soy',
            'soya protein': 'Soy',
            'rice protein': 'Rice',
            'hemp protein': 'Hemp',
            'spirulina': 'Spirulina'
        }
    }
    
    # Words that indicate ambiguity (should be marked as unknown)
    ambiguous_terms = ['protein blend', 'blend']
    
    for idx, row in df.iterrows():
        product_name = str(row['product_name']).lower()
        ingredients = str(row['ingredients']).lower()
        
        # Step 1: Check product name first for clear indicators
        product_classification = None
        product_type = None
        
        # Check for animal proteins in product name
        for keyword, ptype in animal_keywords['product_name'].items():
            if keyword in product_name:
                product_classification = 'Animal-based'
                product_type = ptype
                break
        
        # Check for plant proteins in product name (only if not already classified as animal)
        if product_classification is None:
            for keyword, ptype in plant_keywords['product_name'].items():
                if keyword in product_name:
                    product_classification = 'Plant-based'
                    product_type = ptype
                    break
        
        # If product name gives clear classification, use it
        if product_classification:
            df.at[idx, 'protein_source'] = product_classification
            df.at[idx, 'protein_type'] = product_type
            continue
        
        # Step 2: Product name unclear, check ingredients carefully
        
        # Check for specific animal proteins in ingredients
        has_animal_ingredient = False
        animal_types_found = []
        for keyword, ptype in animal_keywords['ingredients'].items():
            if keyword in ingredients:
                has_animal_ingredient = True
                animal_types_found.append(ptype)
        
        # Check for specific plant proteins in ingredients (be careful about soy lecithin)
        has_plant_ingredient = False
        plant_types_found = []
        for keyword, ptype in plant_keywords['ingredients'].items():
            if keyword in ingredients:
                # Special handling for soy: make sure it's soy protein, not soy lecithin
                if 'soy' in keyword:
                    # Look for "soy protein" or "soya protein" specifically
                    if re.search(r'\bsoy\s+protein\b|\bsoya\s+protein\b', ingredients):
                        has_plant_ingredient = True
                        plant_types_found.append(ptype)
                        break
                    # Also check for "soy protein isolate" etc
                    elif re.search(r'\bsoy.*protein\b|\bsoya.*protein\b', ingredients):
                        has_plant_ingredient = True
                        plant_types_found.append(ptype)
                        break
                else:
                    has_plant_ingredient = True
                    plant_types_found.append(ptype)
        
        # Check if we found specific proteins even if it says "blend"
        if has_animal_ingredient or has_plant_ingredient:
            # We found specific proteins, so classify based on what we found
            pass  # Continue to classification below
        else:
            # No specific proteins found AND it mentions blend = truly unknown
            is_ambiguous = any(term in ingredients for term in ambiguous_terms)
            if is_ambiguous:
                df.at[idx, 'protein_source'] = 'Unknown'
                df.at[idx, 'protein_type'] = 'Protein Blend (Unspecified)'
                continue
        
        # Final classification based on ingredients
        if has_animal_ingredient and not has_plant_ingredient:
            df.at[idx, 'protein_source'] = 'Animal-based'
            df.at[idx, 'protein_type'] = ', '.join(list(set(animal_types_found)))  # Remove duplicates
        elif has_plant_ingredient and not has_animal_ingredient:
            df.at[idx, 'protein_source'] = 'Plant-based'
            df.at[idx, 'protein_type'] = ', '.join(list(set(plant_types_found)))  # Remove duplicates
        elif has_animal_ingredient and has_plant_ingredient:
            df.at[idx, 'protein_source'] = 'Mixed'
            all_types = list(set(animal_types_found + plant_types_found))
            df.at[idx, 'protein_type'] = ', '.join(all_types)
        else:
            # Step 3: Check for special cases before marking as unknown
            
            # Special case: Spirulina Powder (plant-based protein)
            if 'spirulina' in product_name or 'spirulina' in ingredients:
                df.at[idx, 'protein_source'] = 'Plant-based'
                df.at[idx, 'protein_type'] = 'Spirulina'
                continue
            
            # Step 4: Check for 'plant' in product name (missed in earlier steps)
            if 'plant' in product_name:
                df.at[idx, 'protein_source'] = 'Plant-based'
                df.at[idx, 'protein_type'] = 'Plant Blend'
                continue
            
            # Step 5: Check for mixed animal+plant combinations in product name
            # Look for both animal and plant indicators in the same product name
            has_animal_in_name = any(keyword in product_name for keyword in animal_keywords['product_name'].keys())
            has_plant_in_name = any(keyword in product_name for keyword in plant_keywords['product_name'].keys())
            
            if has_animal_in_name and has_plant_in_name:
                # Find the specific types from product name
                animal_name_types = [ptype for keyword, ptype in animal_keywords['product_name'].items() if keyword in product_name]
                plant_name_types = [ptype for keyword, ptype in plant_keywords['product_name'].items() if keyword in product_name]
                all_name_types = animal_name_types + plant_name_types
                
                df.at[idx, 'protein_source'] = 'Mixed'
                df.at[idx, 'protein_type'] = ', '.join(all_name_types)
                continue
            
            # Step 6: Special case for Colostrum (milk product - animal-based)
            if 'colostrum' in product_name or 'colostrum' in ingredients:
                df.at[idx, 'protein_source'] = 'Animal-based'
                df.at[idx, 'protein_type'] = 'Milk'
                continue
            
            # If none of the above, mark as unknown
            df.at[idx, 'protein_source'] = 'Unknown'
            df.at[idx, 'protein_type'] = 'Unidentified'
    
    # Final step: Determine protein complexity (Single source vs Blend)
    for idx, row in df.iterrows():
        protein_type = str(row['protein_type'])
        
        # Check if protein_type contains multiple types (separated by commas)
        if ',' in protein_type:
            df.at[idx, 'protein_complexity'] = 'Blend'
        elif protein_type in ['Unknown', 'Unidentified', 'Protein Blend (Unspecified)']:
            df.at[idx, 'protein_complexity'] = 'Unknown'
        elif protein_type in ['Plant Blend']:
            df.at[idx, 'protein_complexity'] = 'Blend'
        else:
            df.at[idx, 'protein_complexity'] = 'Single source'
    
    return df

# Main execution
if __name__ == "__main__":
    # Load data
    df = pd.read_csv('cleaned_product_data.csv')
    print(f"Loaded {len(df)} protein powder products")
    
    # Classify protein types with improved logic
    df_classified = classify_protein_type(df)
    
    # Show results
    print("\nProtein Source Distribution:")
    source_distribution = df_classified['protein_source'].value_counts()
    print(source_distribution)
    
    print("\nProtein Type Distribution:")
    type_distribution = df_classified['protein_type'].value_counts()
    print(type_distribution)
    
    print("\nProtein Complexity Distribution:")
    complexity_distribution = df_classified['protein_complexity'].value_counts()
    print(complexity_distribution)
    
    # Show examples by source with their specific types
    print("\nExamples by source, type, and complexity:")
    for protein_source in df_classified['protein_source'].unique():
        print(f"\n=== {protein_source} ===")
        examples = df_classified[df_classified['protein_source'] == protein_source].head(5)
        for _, row in examples.iterrows():
            print(f"Product: {row['product_name']}")
            print(f"Type: {row['protein_type']}")
            print(f"Complexity: {row['protein_complexity']}")
            print(f"Ingredients: {row['ingredients'][:60]}...")
            print("---")
    
    # Check specific cases mentioned
    print("\nSpecific case checks:")
    
    # Check products with "whey" in name
    whey_products = df_classified[df_classified['product_name'].str.contains('Whey', case=False, na=False)]
    print(f"\nProducts with 'Whey' in name ({len(whey_products)} found):")
    for _, row in whey_products.head(5).iterrows():
        print(f"  {row['product_name']} -> {row['protein_type']}")
    
    # Check products with "plant" in name
    plant_products = df_classified[df_classified['product_name'].str.contains('Plant', case=False, na=False)]
    print(f"\nProducts with 'Plant' in name ({len(plant_products)} found):")
    for _, row in plant_products.iterrows():
        print(f"  {row['product_name']} -> {row['protein_type']}")
    
    # Check Spirulina products
    spirulina_products = df_classified[
        df_classified['product_name'].str.contains('Spirulina', case=False, na=False) |
        df_classified['ingredients'].str.contains('spirulina', case=False, na=False)
    ]
    print(f"\nSpirulin products ({len(spirulina_products)} found):")
    for _, row in spirulina_products.iterrows():
        print(f"  {row['product_name']} -> {row['protein_source']}, {row['protein_type']}")
    
    # Check Colostrum products
    colostrum_products = df_classified[
        df_classified['product_name'].str.contains('Colostrum', case=False, na=False) |
        df_classified['ingredients'].str.contains('colostrum', case=False, na=False)
    ]
    print(f"\nColostrum products ({len(colostrum_products)} found):")
    for _, row in colostrum_products.iterrows():
        print(f"  {row['product_name']} -> {row['protein_source']}, {row['protein_type']}")
    
    # Check the specific mixed product mentioned
    mixed_salmon = df_classified[df_classified['product_name'].str.contains('Salmon.*Plant|Plant.*Salmon', case=False, na=False, regex=True)]
    print(f"\nPlant + Salmon mixed products ({len(mixed_salmon)} found):")
    for _, row in mixed_salmon.iterrows():
        print(f"  {row['product_name']} -> {row['protein_source']}, {row['protein_type']}")
        print(f"  Ingredients: {row['ingredients']}")
    
    # Check "Protein Hot Chocolate" specifically
    hot_chocolate = df_classified[df_classified['product_name'].str.contains('Hot Chocolate', case=False, na=False)]
    if len(hot_chocolate) > 0:
        print(f"\nProtein Hot Chocolate check:")
        for _, row in hot_chocolate.iterrows():
            print(f"  {row['product_name']} -> {row['protein_type']}")
            print(f"  Ingredients: {row['ingredients']}")
    
    # Check for soy lecithin vs soy protein handling
    soy_products = df_classified[df_classified['ingredients'].str.contains('soy', case=False, na=False)]
    print(f"\nProducts with 'soy' in ingredients ({len(soy_products)} found) - checking lecithin vs protein:")
    for _, row in soy_products.head(5).iterrows():
        has_lecithin = 'lecithin' in str(row['ingredients']).lower()
        has_soy_protein = any(term in str(row['ingredients']).lower() for term in ['soy protein', 'soya protein'])
        print(f"  {row['product_name']} -> {row['protein_type']}")
        print(f"    Has lecithin: {has_lecithin}, Has soy protein: {has_soy_protein}")
    
    # Save results
    df_classified.to_csv('protein_data_cateogrized.csv', index=False)
    print(f"\nResults saved to 'protein_data_with_types.csv'")
    print(f"Total columns: {len(df_classified.columns)} (original + protein_source + protein_type + protein_complexity)")
    
    # Show summary of protein types by source and complexity
    print("\n=== PROTEIN TYPES BY SOURCE ===")
    for source in df_classified['protein_source'].unique():
        source_data = df_classified[df_classified['protein_source'] == source]
        unique_types = source_data['protein_type'].unique()
        print(f"\n{source} ({len(source_data)} products):")
        for ptype in unique_types:
            count = len(source_data[source_data['protein_type'] == ptype])
            print(f"  • {ptype}: {count} products")
    
    # Show complexity breakdown
    print("\n=== COMPLEXITY BREAKDOWN ===")
    for complexity in df_classified['protein_complexity'].unique():
        complexity_data = df_classified[df_classified['protein_complexity'] == complexity]
        print(f"\n{complexity} ({len(complexity_data)} products):")
        
        # Show examples of each complexity type
        examples = complexity_data[['product_name', 'protein_type']].head(5)
        for _, row in examples.iterrows():
            print(f"  • {row['product_name']} ({row['protein_type']})")
    
    # Show final sample with all three columns
    print(f"\nSample with all classification columns:")
    sample_cols = ['product_name', 'brand', 'protein_source', 'protein_type', 'protein_complexity', 'price_per_100g']
    print(df_classified[sample_cols].head(10).to_string(index=False))

Loaded 103 protein powder products

Protein Source Distribution:
Animal-based    64
Plant-based     34
Mixed            3
Unknown          2
Name: protein_source, dtype: int64

Protein Type Distribution:
Whey                           43
Plant Blend                    22
Casein                         10
Pea                             5
Soy                             3
Rice                            2
Milk, Whey, Casein              2
Salmon, Beef, Rice, Pea         2
Milk, Whey, Egg                 2
Milk, Whey                      2
Protein Blend (Unspecified)     2
Beef                            1
Hemp                            1
Milk, Whey, Egg, Casein         1
Salmon                          1
Spirulina                       1
Whey, Soy, Pea                  1
Egg                             1
Milk                            1
Name: protein_type, dtype: int64

Protein Complexity Distribution:
Single source    69
Blend            32
Unknown           2
Name: protein_complexit

## Organic/ GMO free Mapping

In [3]:
# Define keywords for organic and GMO-free identification
organic_keywords = ['organic', 'organically', 'bio']
gmo_free_keywords = ['gmo free', 'non-gmo', 'non gmo', 'gmo-free']

def check_organic(row):
    """Check if product is organic based on ingredients and product_tags"""
    ingredients = str(row['ingredients']).lower() if pd.notna(row['ingredients']) else ''
    tags = str(row['product_tags']).lower() if pd.notna(row['product_tags']) else ''
    
    return any(keyword in ingredients or keyword in tags for keyword in organic_keywords)

def check_gmo_free(row):
    """Check if product is GMO-free based on ingredients and product_tags"""
    ingredients = str(row['ingredients']).lower() if pd.notna(row['ingredients']) else ''
    tags = str(row['product_tags']).lower() if pd.notna(row['product_tags']) else ''
    
    return any(keyword in ingredients or keyword in tags for keyword in gmo_free_keywords)

# Create the new boolean columns as requested
df_classified['organic_true'] = df_classified.apply(check_organic, axis=1)
df_classified['gmofree_true'] = df_classified.apply(check_gmo_free, axis=1)

print("\n" + "="*50)
print("CATEGORIZATION RESULTS")
print("="*50)

# Display which products were found
organic_products = df[df['organic_true']]
gmo_free_products = df[df['gmofree_true']]

print(f"\nOrganic products found ({len(organic_products)}):")
for idx, product in organic_products.iterrows():
    print(f"  - {product['product_name']}")

print(f"\nGMO-free products found ({len(gmo_free_products)}):")
for idx, product in gmo_free_products.iterrows():
    print(f"  - {product['product_name']}")

# Summary
total_products = len(df)
organic_count = df['organic_true'].sum()
gmo_free_count = df['gmofree_true'].sum()
both_count = (df['organic_true'] & df['gmofree_true']).sum()
regular_count = total_products - organic_count - gmo_free_count + both_count

print(f"\nSUMMARY:")
print(f"  Total products: {total_products}")
print(f"  Organic products: {organic_count}")
print(f"  GMO-free products: {gmo_free_count}")
print(f"  Both organic AND GMO-free: {both_count}")
print(f"  Regular products: {regular_count}")

# Show sample of the enhanced dataset
print("\n" + "="*50)
print("ENHANCED DATASET SAMPLE")
print("="*50)

sample_cols = ['product_name', 'price_per_100g', 'organic_true', 'gmofree_true']
print(df[sample_cols].head(10))

print(f"\n✅ TASK COMPLETED:")
print(f"  - Added 'organic_true' column: {organic_count} True values")
print(f"  - Added 'gmofree_true' column: {gmo_free_count} True values")
print(f"  - Original columns preserved intact")
print(f"  - Dataset ready for price effect analysis")

df_classified.to_csv('protein_data_final.csv', index=False)


CATEGORIZATION RESULTS

Organic products found (19):
  - Organic Plant Protein Powder Vanilla
  - Organic Plant Protein Powder – Maple Cinnamon French Toast
  - Vegan Protein Extreme
  - Organic Plant Protein Powder Chocolate Peanut Butter
  - Dr. Formulated Plant & Sustainable Salmon MD Protein Mediterranean Diet Approved
  - Low Carb 100% Whey Protein Isolate
  - Plant Based Active Protein
  - Warrior Blend Organic Non GMO Plant Based Vegan Protein
  - LIFE Complete Meal Replacement Powder
  - Raw Organic Protein Powder – Plant
  - Vegan Protein Powder
  - SPORT Certified Grass Fed Whey Protein Powder
  - Performance Plant Based Vegan Protein
  - Planta Plant Protein
  - Planta Plant Protein
  - Organic Plant Based Protein Powder
  - Organic Pure Whey Protein™
  - Organic Master Blend Plant Based Protein For Optimal Health and Recovery
  - Keto Friendly Plant Based Collagen with Hyaluronic Acid and Biotin

GMO-free products found (10):
  - Vegan Protein Powder
  - Diet Whey Protein 