# In [None]:
"""
# ðŸ§¹ Data Cleaning Pipeline

Clean and prepare anime dataset for modeling
"""

# ## 1. Setup
import pandas as pd
import numpy as np
import sys
sys.path.append('..')

from src.data_cleaner import AnimeDataCleaner, clean_anime_data
from config import ANIME_RAW, ANIME_CLEANED

# ## 2. Load Raw Data
anime_df = pd.read_csv(ANIME_RAW)
print(f"Original shape: {anime_df.shape}")

# ## 3. Initialize Cleaner
cleaner = AnimeDataCleaner()


def _run_step(title: str, step, frame, shape_label: str = "Shape after"):
    """Run one cleaning step, announcing it and reporting the resulting shape.

    Args:
        title: Text shown between the ``===`` markers before the step runs.
        step: Callable applied to ``frame``; its result must expose ``.shape``.
        frame: Dataset handed to ``step``.
        shape_label: Prefix of the shape line printed after the step.

    Returns:
        Whatever ``step(frame)`` returned.
    """
    print(f"\n=== {title} ===")
    cleaned = step(frame)
    print(f"{shape_label}: {cleaned.shape}")
    return cleaned


# ## 4. Handle Missing Values
anime_df = _run_step("Handling Missing Values", cleaner.handle_missing_values, anime_df)

# ## 5. Remove Duplicates
anime_df = _run_step("Removing Duplicates", cleaner.remove_duplicates, anime_df)

# ## 6. Normalize Data
anime_df = _run_step("Normalizing Data", cleaner.normalize_data, anime_df)

# ## 7. Remove Outliers
# Last step of the pipeline, so its shape line is labelled as the final one.
anime_df = _run_step(
    "Removing Outliers",
    cleaner.remove_outliers,
    anime_df,
    shape_label="Final shape",
)

# ## 8. Save Cleaned Data
# index=False keeps the DataFrame's row index out of the written CSV.
anime_df.to_csv(ANIME_CLEANED, index=False)
# The status marker is the check-mark emoji; it had been stored as mis-decoded
# UTF-8 bytes and printed as garbage characters.
print(f"\n✅ Cleaned data saved to {ANIME_CLEANED}")

# ## 9. Cleaning Report
# The loops below require get_cleaning_report() to return a mapping whose keys
# are strings (task names) and whose values are themselves mappings of
# statistic name -> value.
report = cleaner.get_cleaning_report()
# The header marker is the bar-chart emoji; it had been stored as mis-decoded
# UTF-8 bytes and printed as garbage characters.
print("\n📊 Cleaning Report:")
for task, stats in report.items():
    # One upper-cased heading per task, followed by its indented statistics.
    print(f"\n{task.upper()}:")
    for key, value in stats.items():
        print(f"  {key}: {value}")