# Week 2: Data Collection and Initial Exploration

This notebook covers:
1. Loading downloaded Kaggle datasets
2. Creating a custom ingredient toxicity database
3. Initial data exploration
4. Data quality assessment

In [None]:
# Import required libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path
import os

# Set up paths
# Both paths are relative, so they resolve against the kernel's current
# working directory.
DATA_RAW = Path('../data/raw')
DATA_PROCESSED = Path('../data/processed')
# parents=True: a plain mkdir() raises FileNotFoundError when an intermediate
# directory ('../data') is missing, e.g. on a fresh checkout.
# exist_ok=True keeps the cell safe to re-run.
DATA_PROCESSED.mkdir(parents=True, exist_ok=True)

print("✅ Libraries imported successfully")
print(f"📁 Raw data directory: {DATA_RAW}")
print(f"📁 Processed data directory: {DATA_PROCESSED}")

In [None]:
# Inventory of the raw data directory: one line per regular file with its
# size in MB. The glob is non-recursive and sub-directories are skipped.
print("📋 Files in raw data directory:")
bytes_per_mb = 1024 * 1024
regular_files = (entry for entry in DATA_RAW.glob('*') if entry.is_file())
for entry in regular_files:
    print(f"  📄 {entry.name} ({entry.stat().st_size / bytes_per_mb:.1f} MB)")

In [None]:
# Create custom ingredient toxicity database
import sys

# Make the project's data helpers importable. The membership guard keeps the
# cell idempotent: re-running it does not append a duplicate sys.path entry.
# The path is relative, so it resolves against the kernel's working directory.
SRC_DATA_DIR = '../src/data'
if SRC_DATA_DIR not in sys.path:
    sys.path.append(SRC_DATA_DIR)

from create_ingredient_db import create_ingredient_toxicity_db

# Create the database.
# NOTE(review): presumably a pandas DataFrame — later cells call .head(),
# .boxplot() and .to_csv() on it and read the 'toxicity_score', 'risk_level'
# and 'category' columns; confirm against create_ingredient_db.
ingredient_db = create_ingredient_toxicity_db()

# Display first few rows. The bare last expression is what the notebook
# renders as the cell output, so it must stay the final statement.
print("\n🔍 First 10 ingredients:")
ingredient_db.head(10)

In [None]:
# Load Kaggle datasets (if available)
# `datasets` maps file name -> DataFrame and only contains files that were
# both present in DATA_RAW and readable as CSV.
datasets = {}

# File names commonly used by food/nutrition dataset exports, tried in order
candidate_files = [
    'nutrition.csv',
    'food_nutrition.csv',
    'recipes.csv',
    'food_data.csv',
    'ingredients.csv',
]

for csv_name in candidate_files:
    csv_path = DATA_RAW / csv_name

    # Absent files are reported and skipped
    if not csv_path.exists():
        print(f"⚠️  {csv_name} not found")
        continue

    # Best effort: a file that fails to load is reported, not fatal
    try:
        frame = pd.read_csv(csv_path)
        datasets[csv_name] = frame
        print(f"✅ Loaded {csv_name}: {frame.shape}")
    except Exception as load_error:
        print(f"❌ Error loading {csv_name}: {load_error}")

print(f"\n📊 Total datasets loaded: {len(datasets)}")

In [None]:
# Explore our custom ingredient database
# Headline numbers first, then how often each risk level and category occurs.
print("📊 Ingredient Database Analysis")
print("=" * 40)

# Basic statistics
print(f"Total ingredients: {len(ingredient_db)}")
toxicity = ingredient_db['toxicity_score']
print(f"Average toxicity score: {toxicity.mean():.1f}")
print(f"Median toxicity score: {toxicity.median():.1f}")

# Frequency tables; value_counts() lists the most common value first
for heading, column in (
    ("\n📈 Risk Level Distribution:", 'risk_level'),
    ("\n🏷️ Category Distribution:", 'category'),
):
    print(heading)
    print(ingredient_db[column].value_counts())

In [None]:
# Create visualizations
# One 2x2 overview figure of the ingredient database: score histogram,
# risk-level pie, category bar chart and score-by-risk-level boxplot.
# The figure is shown inline and also written to FIGURE_PATH.
FIGURE_PATH = Path('../reports/figures/week2_data_exploration.png')
fig, axes = plt.subplots(2, 2, figsize=(15, 10))

# 1. Toxicity score distribution
axes[0,0].hist(ingredient_db['toxicity_score'], bins=20, alpha=0.7, color='red', edgecolor='black')
axes[0,0].set_title('Distribution of Toxicity Scores')
axes[0,0].set_xlabel('Toxicity Score (0-100)')
axes[0,0].set_ylabel('Number of Ingredients')
axes[0,0].grid(True, alpha=0.3)

# 2. Risk level pie chart
# Order the wedges from lowest to highest mean toxicity score so the
# green -> red palette follows severity. value_counts() on its own orders by
# frequency, which paired the colours with risk levels arbitrarily.
risk_order = (
    ingredient_db.groupby('risk_level')['toxicity_score']
    .mean()
    .sort_values()
    .index
)
risk_counts = ingredient_db['risk_level'].value_counts().reindex(risk_order)
# NOTE(review): assumes at most four risk levels; with more, matplotlib
# cycles through this list again — confirm against the database.
colors = ['green', 'yellow', 'orange', 'red']
axes[0,1].pie(risk_counts.values, labels=risk_counts.index, autopct='%1.1f%%', colors=colors)
axes[0,1].set_title('Risk Level Distribution')

# 3. Category distribution
category_counts = ingredient_db['category'].value_counts()
axes[1,0].bar(range(len(category_counts)), category_counts.values)
axes[1,0].set_title('Ingredient Categories')
axes[1,0].set_xlabel('Category')
axes[1,0].set_ylabel('Count')
axes[1,0].set_xticks(range(len(category_counts)))
axes[1,0].set_xticklabels(category_counts.index, rotation=45, ha='right')

# 4. Toxicity score by risk level boxplot
ingredient_db.boxplot(column='toxicity_score', by='risk_level', ax=axes[1,1])
axes[1,1].set_title('Toxicity Score by Risk Level')
axes[1,1].set_xlabel('Risk Level')
axes[1,1].set_ylabel('Toxicity Score')
# DataFrame.boxplot(by=...) adds its own figure-level
# "Boxplot grouped by ..." suptitle; clear it so it does not sit on top of
# the four panel titles.
fig.suptitle('')

plt.tight_layout()
# savefig does not create missing directories, so make sure the target exists
FIGURE_PATH.parent.mkdir(parents=True, exist_ok=True)
plt.savefig(FIGURE_PATH, dpi=300, bbox_inches='tight')
plt.show()

print("📊 Visualizations saved to reports/figures/")

In [None]:
# Data quality assessment
# Reports null counts per column, the number of fully duplicated rows, the
# column dtypes and the describe() summary of toxicity_score.
print("🔍 Data Quality Assessment")
print("=" * 30)

# Nulls per column
print("Missing values:")
missing_per_column = ingredient_db.isnull().sum()
print(missing_per_column)

# Rows that repeat an earlier row in every column
duplicate_row_count = ingredient_db.duplicated().sum()
print(f"\nDuplicate rows: {duplicate_row_count}")

# Column dtypes as inferred by pandas
print("\nData types:")
column_dtypes = ingredient_db.dtypes
print(column_dtypes)

# Count, mean, spread and quartiles of the score column
print("\nSummary statistics for toxicity_score:")
score_summary = ingredient_db['toxicity_score'].describe()
print(score_summary)

In [None]:
# Save processed data
# Writes the ingredient database and every loaded Kaggle frame to
# DATA_PROCESSED as CSV (without the index), then prints the wrap-up.
print("💾 Saving processed data...")

# Ingredient database
ingredient_db_path = DATA_PROCESSED / 'ingredient_toxicity_db.csv'
ingredient_db.to_csv(ingredient_db_path, index=False)
print(f"✅ Saved ingredient database: {len(ingredient_db)} rows")

# Kaggle datasets keep their original file name with a "processed_" prefix
for source_name, frame in datasets.items():
    target_name = f"processed_{source_name}"
    target_path = DATA_PROCESSED / target_name
    frame.to_csv(target_path, index=False)
    print(f"✅ Saved {target_name}: {frame.shape}")

# Closing summary and pointer to the next notebook's scope
closing_lines = (
    "\n🎉 Week 2 Data Collection Complete!",
    "\n📋 Next Steps (Week 3):",
    "  1. Exploratory Data Analysis (EDA)",
    "  2. Data preprocessing",
    "  3. Feature engineering",
    "  4. Data splitting for ML",
)
for line in closing_lines:
    print(line)