In [2]:
import pandas as pd
import os
from datetime import datetime
from typing import List

def _fill_missing_values(df: pd.DataFrame, skip_columns=("Customer Id",)) -> pd.DataFrame:
    """Return a copy of *df* with missing values imputed column by column.

    Numeric columns are filled with their median (cast to ``int`` for
    integer dtypes); every other column is filled with its most frequent
    value, or the literal ``'MISSING'`` when the column has no non-missing
    values at all. Columns named in *skip_columns* are left untouched.
    """
    filled = df.copy()

    for col in filled.columns:
        if col in skip_columns:
            continue

        series = filled[col]
        # Nothing to impute: skip the median/mode computation entirely.
        if not series.isna().any():
            continue

        if pd.api.types.is_numeric_dtype(series):
            fill_val = series.median()
            if pd.isna(fill_val):
                # Column is entirely missing, so there is no median to use.
                continue
            if pd.api.types.is_integer_dtype(series):
                fill_val = int(fill_val)
        else:
            modes = series.mode()
            fill_val = modes.iloc[0] if not modes.empty else 'MISSING'
            # A categorical column rejects fill values that are not already
            # categories (e.g. 'MISSING' for an all-missing column), so the
            # value has to be registered as a category first.
            if (
                isinstance(series.dtype, pd.CategoricalDtype)
                and fill_val not in series.cat.categories
            ):
                series = series.cat.add_categories([fill_val])

        filled[col] = series.fillna(fill_val)

    return filled


def process_insurance_files(file_paths: List[str]) -> None:
    """Clean a batch of insurance CSV files and write a text report.

    For each path the file is loaded, missing values are imputed (median for
    numeric columns, mode or ``'MISSING'`` otherwise; ``'Customer Id'`` is
    never imputed), duplicate rows are dropped, and the result is saved as
    ``cleaned_data/cleaned_<basename>``. Progress and results are printed and
    written to a timestamped ``data_cleaning_log_*.txt`` file in the current
    working directory.

    Args:
        file_paths: Paths of the CSV files to process.

    Returns:
        None. All results are delivered as files and console output.

    Note:
        A failure while processing one file is caught, logged as
        ``Status: FAILED`` and printed; the remaining files are still
        processed.
    """
    # Create cleaned data directory
    output_dir = "cleaned_data"
    os.makedirs(output_dir, exist_ok=True)

    # One log file per run, named after the start time.
    log_file = f"data_cleaning_log_{datetime.now().strftime('%Y%m%d_%H%M%S')}.txt"

    # Explicit UTF-8 so non-ASCII paths or values never depend on the
    # platform's default encoding.
    with open(log_file, 'w', encoding='utf-8') as log:
        log.write(f"Data Cleaning Report - {datetime.now()}\n\n")

        for file_path in file_paths:
            try:
                log.write(f"{'='*50}\n")
                log.write(f"Processing: {file_path}\n")

                # 1. Load data
                raw_df = pd.read_csv(
                    file_path,
                    na_values=["NA", "N/A", "?", "Unknown", "", "-", "NaN", "null"],
                    engine='python',
                    dtype={'Geo_Code': 'category'}  # Example type handling
                )

                # 2. Initial analysis
                log.write(f"\nInitial Shape: {raw_df.shape}\n")
                log.write("Missing Values:\n")
                missing_counts = raw_df.isna().sum()
                try:
                    missing_table = missing_counts.to_markdown()
                except ImportError:
                    # to_markdown() needs the optional 'tabulate' package;
                    # fall back to plain text rather than failing the file.
                    missing_table = missing_counts.to_string()
                log.write(missing_table)
                log.write(f"\nDuplicates: {raw_df.duplicated().sum()}\n")

                # 3. Clean data: impute first, then drop rows that are (or
                # have become) exact duplicates.
                cleaned_df = _fill_missing_values(raw_df)

                initial_count = len(cleaned_df)
                cleaned_df = cleaned_df.drop_duplicates()
                removed_count = initial_count - len(cleaned_df)

                # 4. Save results
                base_name = os.path.basename(file_path)
                output_path = os.path.join(output_dir, f"cleaned_{base_name}")
                cleaned_df.to_csv(output_path, index=False)

                # 5. Log results
                log.write("\nCleaning Results:\n")
                log.write(f"- Removed duplicates: {removed_count}\n")
                log.write(f"- Final shape: {cleaned_df.shape}\n")
                log.write(f"- Output file: {output_path}\n")
                log.write("Status: SUCCESS\n")

                print(f"Processed {file_path} → {output_path}")

            except Exception as e:
                # Batch boundary: record the failure and move on to the
                # next file instead of aborting the whole run.
                log.write(f"\nERROR processing {file_path}: {str(e)}\n")
                log.write("Status: FAILED\n")
                print(f"Error processing {file_path}: {e}")

        log.write("\nBatch processing complete")

if __name__ == "__main__":
    # Input CSVs for the batch run, expressed as dataset names under
    # sample_data/ (edit the names or the path template to match your data).
    dataset_names = (
        "home_insurance",
        "car_co2",
        "car_insurance",
        "synthetic_insurance_data",
    )
    files_to_process = [f"sample_data/{dataset}.csv" for dataset in dataset_names]

    process_insurance_files(files_to_process)
    print("\nBatch processing complete. Check generated log file.")

Processed sample_data/home_insurance.csv → cleaned_data/cleaned_home_insurance.csv
Processed sample_data/car_co2.csv → cleaned_data/cleaned_car_co2.csv
Processed sample_data/car_insurance.csv → cleaned_data/cleaned_car_insurance.csv
Processed sample_data/synthetic_insurance_data.csv → cleaned_data/cleaned_synthetic_insurance_data.csv

Batch processing complete. Check generated log file.
