# Week 3: Exploratory Data Analysis (EDA)

This notebook covers:
1. Statistical analysis of datasets
2. Data visualization and patterns
3. Correlation analysis
4. Feature distribution analysis
5. Data quality assessment

In [None]:
# Import required libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path
import warnings
# NOTE: this silences *all* warnings for the rest of the session (including
# deprecation notices from the plotting libraries). Narrow or remove the
# filter when debugging unexpected behaviour.
warnings.filterwarnings('ignore')

# Set plotting style
plt.style.use('seaborn-v0_8')
sns.set_palette("husl")
plt.rcParams['figure.figsize'] = (12, 8)
plt.rcParams['font.size'] = 12

# Set up paths (all relative to the notebook's working directory)
DATA_RAW = Path('../data/raw')
DATA_PROCESSED = Path('../data/processed')
REPORTS = Path('../reports/figures')
# parents=True: REPORTS is two levels deep, and on a fresh checkout the
# intermediate '../reports' directory may not exist yet. Without it, mkdir()
# raises FileNotFoundError instead of creating the directory tree.
REPORTS.mkdir(parents=True, exist_ok=True)

print("✅ Libraries imported and paths set up")

In [None]:
# Load all available datasets
print("📊 Loading datasets...")


def _load_optional_csv(path, label, missing_name):
    """Read an optional CSV file, returning ``None`` when it cannot be loaded.

    Parameters
    ----------
    path : path-like
        Location of the CSV file.
    label : str
        Human-readable dataset name used in the success message.
    missing_name : str
        File name used in the warning messages.

    Returns
    -------
    pandas.DataFrame or None
        The loaded frame, or ``None`` if the file is missing or unreadable.
    """
    try:
        frame = pd.read_csv(path)
    except FileNotFoundError:
        # The expected "optional dataset is absent" case.
        print(f"⚠️ {missing_name} not found")
        return None
    except Exception as exc:
        # Still best-effort (the notebook keeps running), but report the real
        # cause instead of mislabelling parse/permission errors as "not found".
        # Unlike a bare `except:`, this does not swallow KeyboardInterrupt.
        print(f"⚠️ {missing_name} could not be read ({type(exc).__name__}: {exc})")
        return None
    print(f"✅ {label}: {frame.shape}")
    return frame


# Load ingredient toxicity database. This one is required by every later
# cell, so a failure here is deliberately allowed to raise.
ingredient_db = pd.read_csv(DATA_PROCESSED / 'ingredient_toxicity_db.csv')
print(f"✅ Ingredient DB: {ingredient_db.shape}")

# Load Kaggle food datasets (optional: later cells check for None)
food_df = _load_optional_csv(DATA_RAW / 'food.csv', 'Food dataset', 'Food.csv')
food1_df = _load_optional_csv(DATA_RAW / 'food1.csv', 'Food1 dataset', 'Food1.csv')

print("\n📋 Available datasets for analysis:")
print(f"  - Ingredient Toxicity DB: {len(ingredient_db)} ingredients")
if food_df is not None:
    print(f"  - Food Dataset: {len(food_df)} records")
if food1_df is not None:
    print(f"  - Food1 Dataset: {len(food1_df)} records")

In [None]:
# Overview of the ingredient database: shape, columns, dtypes, missing
# values, and summary statistics (rendered as the cell's output).
print("🔍 INGREDIENT DATABASE ANALYSIS")
print("=" * 50)

# Shape and column names
print("\n📊 Basic Information:")
print(f"Shape: {ingredient_db.shape}")
print(f"Columns: {ingredient_db.columns.tolist()}")

# Per-column dtypes
print("\n🏷️ Data Types:")
print(ingredient_db.dtypes)

# Per-column null counts; only columns that actually have gaps are listed
print("\n❓ Missing Values:")
missing = ingredient_db.isna().sum()
if missing.sum() > 0:
    print(missing[missing > 0])
else:
    print("No missing values")

# Summary statistics: left as the last expression so the notebook renders
# the describe() table with its rich display.
print("\n📈 Summary Statistics:")
ingredient_db.describe()

In [None]:
# Frequency table (count and share) for each categorical column that is
# present in the ingredient database.
print("🏷️ CATEGORICAL ANALYSIS")
print("=" * 40)

categorical_cols = ['category', 'health_impact', 'risk_level', 'allergen_risk']

for col in categorical_cols:
    # Skip columns this version of the database does not have
    if col not in ingredient_db.columns:
        continue

    print(f"\n📊 {col.upper()} Distribution:")
    counts = ingredient_db[col].value_counts()
    # Share of non-null rows, in percent (same values as normalize=True)
    percentages = counts / counts.sum() * 100

    # counts and percentages share the same index order, so walk them together
    for (category, count), pct in zip(counts.items(), percentages):
        print(f"  {category}: {count} ({pct:.1f}%)")

In [None]:
# Create comprehensive visualizations: a 3x3 grid of panels summarising the
# ingredient database, saved to REPORTS / 'week3_comprehensive_eda.png'.
print("📊 Creating visualizations...")

# Canonical risk-level ordering (the same labels the box plot uses) with one
# fixed colour per level, so a colour always means the same risk level no
# matter which level happens to be the most frequent.
RISK_ORDER = ['Safe', 'Low Risk', 'Medium Risk', 'High Risk']
RISK_COLORS = dict(zip(RISK_ORDER, ['green', 'yellow', 'orange', 'red']))

# Set up the figure with subplots
fig = plt.figure(figsize=(20, 15))

# 1. Toxicity Score Distribution
plt.subplot(3, 3, 1)
plt.hist(ingredient_db['toxicity_score'], bins=15, alpha=0.7, color='red', edgecolor='black')
plt.title('Toxicity Score Distribution', fontweight='bold')
plt.xlabel('Toxicity Score (0-100)')
plt.ylabel('Frequency')
plt.grid(True, alpha=0.3)

# 2. Risk Level Pie Chart
plt.subplot(3, 3, 2)
risk_counts = ingredient_db['risk_level'].value_counts()
# value_counts() sorts by frequency, so positional colours would paint the
# most common level green even if it were 'High Risk'. Re-order by severity
# (known levels first, any unexpected labels afterwards) and colour by label.
known_levels = [level for level in RISK_ORDER if level in risk_counts.index]
other_levels = [level for level in risk_counts.index if level not in RISK_COLORS]
risk_counts = risk_counts.reindex(known_levels + other_levels)
plt.pie(risk_counts.values, labels=risk_counts.index, autopct='%1.1f%%',
        colors=[RISK_COLORS.get(level, 'gray') for level in risk_counts.index],
        startangle=90)
plt.title('Risk Level Distribution', fontweight='bold')

# 3. Category Bar Chart
plt.subplot(3, 3, 3)
category_counts = ingredient_db['category'].value_counts()
bars = plt.bar(range(len(category_counts)), category_counts.values,
               color=sns.color_palette("husl", len(category_counts)))
plt.title('Ingredient Categories', fontweight='bold')
plt.xlabel('Category')
plt.ylabel('Count')
plt.xticks(range(len(category_counts)), category_counts.index, rotation=45, ha='right')
plt.grid(True, alpha=0.3, axis='y')

# 4. Toxicity Score by Risk Level Box Plot
plt.subplot(3, 3, 4)
sns.boxplot(data=ingredient_db, x='risk_level', y='toxicity_score',
            order=RISK_ORDER)
plt.title('Toxicity Score by Risk Level', fontweight='bold')
plt.xticks(rotation=45)

# 5. Health Impact Distribution
plt.subplot(3, 3, 5)
health_counts = ingredient_db['health_impact'].value_counts()
# NOTE(review): colours here follow frequency order, not severity. The set of
# health_impact labels is not visible in this notebook, so the green-to-red
# ramp cannot be tied to specific labels without confirming them first.
plt.bar(health_counts.index, health_counts.values,
        color=['green', 'lightgreen', 'yellow', 'orange', 'red'][:len(health_counts)])
plt.title('Health Impact Distribution', fontweight='bold')
plt.xlabel('Health Impact')
plt.ylabel('Count')
plt.xticks(rotation=45)
plt.grid(True, alpha=0.3, axis='y')

# 6. Allergen Risk Analysis
plt.subplot(3, 3, 6)
allergen_counts = ingredient_db['allergen_risk'].value_counts()
plt.pie(allergen_counts.values, labels=allergen_counts.index, autopct='%1.1f%%', startangle=90)
plt.title('Allergen Risk Distribution', fontweight='bold')

# 7. Toxicity Score vs Category Scatter
plt.subplot(3, 3, 7)
categories = ingredient_db['category'].unique()
for i, cat in enumerate(categories):
    cat_data = ingredient_db[ingredient_db['category'] == cat]
    # One vertical strip of points per category, placed at x = i
    plt.scatter([i] * len(cat_data), cat_data['toxicity_score'],
               alpha=0.6, s=50, label=cat)
plt.title('Toxicity Scores by Category', fontweight='bold')
plt.xlabel('Category')
plt.ylabel('Toxicity Score')
plt.xticks(range(len(categories)), categories, rotation=45, ha='right')
plt.grid(True, alpha=0.3)

# 8. Binary Classification Distribution
plt.subplot(3, 3, 8)
# Pin the order to [non-toxic, toxic] so the counts always line up with the
# labels below. A frequency-sorted value_counts() swaps them whenever toxic
# rows are the majority, and yields a single bar (shape mismatch in plt.bar)
# when only one class is present.
# Assumes is_toxic is stored as bool or 0/1 - TODO confirm against the CSV.
toxic_counts = (
    ingredient_db['is_toxic']
    .dropna()
    .astype(bool)
    .value_counts()
    .reindex([False, True], fill_value=0)
)
labels = ['Non-Toxic (≤50)', 'Toxic (>50)']
plt.bar(labels, toxic_counts.values, color=['green', 'red'], alpha=0.7)
plt.title('Binary Toxicity Classification', fontweight='bold')
plt.ylabel('Count')
for i, v in enumerate(toxic_counts.values):
    # Count annotation just above each bar
    plt.text(i, v + 0.5, str(v), ha='center', fontweight='bold')

# 9. Top 10 Most Toxic Ingredients
plt.subplot(3, 3, 9)
top_toxic = ingredient_db.nlargest(10, 'toxicity_score')
plt.barh(range(len(top_toxic)), top_toxic['toxicity_score'], color='red', alpha=0.7)
plt.title('Top 10 Most Toxic Ingredients', fontweight='bold')
plt.xlabel('Toxicity Score')
plt.yticks(range(len(top_toxic)), top_toxic['ingredient_name'])
# Highest score at the top of the chart
plt.gca().invert_yaxis()

plt.tight_layout()
plt.savefig(REPORTS / 'week3_comprehensive_eda.png', dpi=300, bbox_inches='tight')
plt.show()

print("✅ Comprehensive EDA visualizations created")

In [None]:
# Pairwise correlation of the numeric columns, drawn as an annotated heatmap
# (saved to REPORTS / 'correlation_matrix.png') and printed as a table.
print("🔗 CORRELATION ANALYSIS")
print("=" * 30)

# Only numeric columns can take part in the correlation
numeric_cols = ingredient_db.select_dtypes(include=[np.number]).columns
print(f"Numeric columns: {numeric_cols.tolist()}")

if len(numeric_cols) <= 1:
    # A correlation matrix needs at least two numeric columns
    print("⚠️ Not enough numeric columns for correlation analysis")
else:
    corr_matrix = ingredient_db.loc[:, numeric_cols].corr()

    # Heatmap styling: diverging palette centred on zero, square cells
    heatmap_options = {
        'annot': True,
        'cmap': 'RdYlBu_r',
        'center': 0,
        'square': True,
        'fmt': '.3f',
        'cbar_kws': {'shrink': 0.8},
    }

    plt.figure(figsize=(10, 8))
    sns.heatmap(corr_matrix, **heatmap_options)
    plt.title('Correlation Matrix - Ingredient Database', fontweight='bold', pad=20)
    plt.tight_layout()
    plt.savefig(REPORTS / 'correlation_matrix.png', dpi=300, bbox_inches='tight')
    plt.show()

    print("\n📊 Correlation Matrix:")
    print(corr_matrix)

In [None]:
# Analyze the optional food dataset (food_df is None when it was not loaded):
# shape, preview, dtypes, missing values, and numeric summary statistics.
if food_df is not None:
    print("🍎 FOOD DATASET ANALYSIS")
    print("=" * 40)

    # Basic info
    print(f"\n📊 Food Dataset Shape: {food_df.shape}")
    print(f"Columns: {list(food_df.columns)}")

    # First few rows
    print("\n🔍 First 5 rows:")
    display(food_df.head())

    # Data types and non-null counts
    print("\n🏷️ Data Info:")
    # DataFrame.info() writes its report to stdout itself and returns None,
    # so wrapping it in print() would append a stray "None" line.
    food_df.info()

    # Missing values analysis: only columns that have at least one null
    missing_food = food_df.isnull().sum()
    if missing_food.sum() > 0:
        print("\n❓ Missing Values:")
        missing_pct = (missing_food / len(food_df)) * 100
        has_missing = missing_food > 0
        missing_df = pd.DataFrame({
            'Missing Count': missing_food[has_missing],
            'Missing %': missing_pct[has_missing]
        })
        print(missing_df)

    # Numeric columns analysis
    numeric_food_cols = food_df.select_dtypes(include=[np.number]).columns
    if len(numeric_food_cols) > 0:
        print(f"\n📈 Numeric columns: {list(numeric_food_cols)}")
        print("\nSummary Statistics:")
        display(food_df[numeric_food_cols].describe())
else:
    print("⚠️ Food dataset not available for analysis")

In [None]:
# Grouped toxicity statistics plus the five highest- and lowest-scoring
# ingredients.
print("📈 STATISTICAL INSIGHTS")
print("=" * 35)

# Per-category toxicity statistics, highest mean first
print("\n🏷️ Toxicity Statistics by Category:")
category_stats = (
    ingredient_db
    .groupby('category')['toxicity_score']
    .agg(['count', 'mean', 'median', 'std', 'min', 'max'])
    .round(2)
    .sort_values('mean', ascending=False)
)
print(category_stats)

# Per-risk-level toxicity statistics
print("\n⚠️ Risk Level Statistics:")
risk_stats = (
    ingredient_db
    .groupby('risk_level')['toxicity_score']
    .agg(['count', 'mean', 'min', 'max'])
    .round(2)
)
print(risk_stats)

# Columns shown for each listed ingredient
detail_cols = ['ingredient_name', 'toxicity_score', 'category', 'health_impact']

# Five highest toxicity scores
print("\n🔴 TOP 5 MOST TOXIC INGREDIENTS:")
most_toxic = ingredient_db.nlargest(5, 'toxicity_score')[detail_cols]
for name, score, group, impact in most_toxic.itertuples(index=False, name=None):
    print(f"  {name}: {score} ({group}, {impact})")

# Five lowest toxicity scores
print("\n🟢 TOP 5 SAFEST INGREDIENTS:")
safest = ingredient_db.nsmallest(5, 'toxicity_score')[detail_cols]
for name, score, group, impact in safest.itertuples(index=False, name=None):
    print(f"  {name}: {score} ({group}, {impact})")

In [None]:
# Data quality checks on the ingredient database: duplicate rows, IQR-based
# outliers in the toxicity score, per-column completeness, and a combined
# quality score.
print("✅ DATA QUALITY ASSESSMENT")
print("=" * 40)

# Fully duplicated rows
duplicates = ingredient_db.duplicated().sum()
print(f"\n🔄 Duplicate rows: {duplicates}")

# Outlier fences: 1.5 * IQR beyond the first and third quartiles
scores = ingredient_db['toxicity_score']
Q1 = scores.quantile(0.25)
Q3 = scores.quantile(0.75)
IQR = Q3 - Q1
lower_bound = Q1 - 1.5 * IQR
upper_bound = Q3 + 1.5 * IQR

# Rows whose score falls strictly outside the fences
is_outlier = scores.lt(lower_bound) | scores.gt(upper_bound)
outliers = ingredient_db[is_outlier]

print(f"\n📊 Outliers in toxicity scores: {len(outliers)}")
if len(outliers) > 0:
    print("Outlier ingredients:")
    for name, score in zip(outliers['ingredient_name'], outliers['toxicity_score']):
        print(f"  {name}: {score}")

# Per-column share of non-null values, in percent
null_fraction = ingredient_db.isnull().sum() / len(ingredient_db)
completeness = (1 - null_fraction) * 100
print("\n📋 Data Completeness:")
for column_name, pct_complete in completeness.items():
    print(f"  {column_name}: {pct_complete:.1f}%")

# Overall score: mean completeness scaled down by the duplicate-row fraction
avg_completeness = completeness.mean()
duplicate_fraction = duplicates / len(ingredient_db)
quality_score = avg_completeness * (1 - duplicate_fraction)
print(f"\n🎯 Overall Data Quality Score: {quality_score:.1f}%")

In [None]:
# Generate the EDA summary report: print it and save it as JSON next to the
# figures directory. Relies on `duplicates`, `outliers` and `quality_score`
# computed in the data quality assessment cell.
import json  # only needed for this export step

print("📋 EDA SUMMARY REPORT")
print("=" * 30)

toxicity = ingredient_db['toxicity_score']
risk_level = ingredient_db['risk_level']

# pandas/numpy reductions return numpy scalars. numpy integers are not
# JSON-native, so json.dump would fall back to `default=str` and write them
# as strings ("0") rather than numbers. Convert every value to a built-in
# int/float up front so the saved report has proper numeric types.
summary_report = {
    'Dataset Info': {
        'Total Ingredients': len(ingredient_db),
        'Features': len(ingredient_db.columns),
        'Numeric Features': len(ingredient_db.select_dtypes(include=[np.number]).columns),
        'Categorical Features': len(ingredient_db.select_dtypes(include=['object']).columns)
    },
    'Toxicity Analysis': {
        'Mean Toxicity Score': float(toxicity.mean()),
        'Median Toxicity Score': float(toxicity.median()),
        'Std Deviation': float(toxicity.std()),
        'High Risk Ingredients': int((risk_level == 'High Risk').sum()),
        'Safe Ingredients': int((risk_level == 'Safe').sum())
    },
    'Data Quality': {
        'Missing Values': int(ingredient_db.isnull().sum().sum()),
        'Duplicate Rows': int(duplicates),
        'Outliers': len(outliers),
        'Quality Score': f"{quality_score:.1f}%"
    }
}

for section, metrics in summary_report.items():
    print(f"\n📊 {section}:")
    for metric, value in metrics.items():
        # Floats are shown with two decimals; everything else verbatim
        if isinstance(value, float):
            print(f"  {metric}: {value:.2f}")
        else:
            print(f"  {metric}: {value}")

# Save summary to file (REPORTS.parent is the 'reports' directory)
with open(REPORTS.parent / 'eda_summary_report.json', 'w', encoding='utf-8') as f:
    # default=str is kept only as a safety net for any unexpected value type
    json.dump(summary_report, f, indent=2, default=str)

print("\n💾 EDA summary report saved to reports/eda_summary_report.json")
print("\n🎉 Week 3 EDA Analysis Complete!")
print("\n📋 Next Steps (Week 4):")
print("  1. Data preprocessing and cleaning")
print("  2. Feature engineering")
print("  3. Data splitting for ML")
print("  4. Model development preparation")