# In [None]:
# ==============================================================================
# FILE: donations_eda.py
# PURPOSE: Exploratory Data Analysis for DONATIONS table
# AUTHOR: Greg Sullivan
# DATE: January 2026
# DESCRIPTION: Comprehensive EDA to identify data quality issues, patterns,
#              and anomalies in the donations dataset before conducting formal
#              SQL-based data quality analysis.
# ==============================================================================

# Import required libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime

# Pandas display configuration used by every printed frame in this report
pd.options.display.max_columns = None
pd.options.display.max_rows = 100
pd.options.display.width = None

# Plot defaults shared by all charts below
sns.set_style(style="whitegrid")
plt.rcParams.update({'figure.figsize': (12, 6)})

# ==============================================================================
# SECTION 1: Data Loading and Initial Inspection
# ==============================================================================

def load_and_inspect_data(file_path):
    """
    Load the donations CSV file and print an initial inspection report.

    NOTE: The file is read with keep_default_na=False and na_values=[''], so
    only truly empty fields become NaN. Literal strings such as 'N/A' are kept
    as text, which is intended to match Snowflake's behavior.

    Parameters:
    -----------
    file_path : str
        Path to the donations.csv file

    Returns:
    --------
    pd.DataFrame
        Loaded donations data
    """
    print("=" * 80)
    print("SECTION 1: DATA LOADING AND INITIAL INSPECTION")
    print("=" * 80)

    # Load data - prevent automatic N/A conversion to match Snowflake behavior
    df = pd.read_csv(file_path, keep_default_na=False, na_values=[''])

    print(f"\n1.1 Dataset Shape: {df.shape[0]} rows × {df.shape[1]} columns")

    print("\n1.2 Column Names and Data Types:")
    print(df.dtypes)

    print("\n1.3 First 10 Records:")
    print(df.head(10))

    print("\n1.4 Last 10 Records:")
    print(df.tail(10))

    print("\n1.5 Random Sample of 10 Records:")
    # sample() raises ValueError when asked for more rows than the frame has,
    # so cap the sample size for small files.
    sample_size = min(10, len(df))
    print(df.sample(sample_size, random_state=42))

    print("\n1.6 Basic Info:")
    # info() writes its report to stdout itself and returns None; wrapping it
    # in print() would append a stray "None" line.
    df.info()

    return df


# ==============================================================================
# SECTION 2: Missing Data Analysis
# ==============================================================================

def analyze_missing_data(df):
    """
    Report missing (null) values per column, plus empty and whitespace-only
    strings in object columns, and plot the missing percentages.

    Parameters:
    -----------
    df : pd.DataFrame
        The donations dataframe

    Returns:
    --------
    None
        Results are printed; a bar chart is shown when any value is missing.
    """
    print("\n" + "=" * 80)
    print("SECTION 2: MISSING DATA ANALYSIS")
    print("=" * 80)

    # Count missing values (computed once and reused for the percentages)
    missing_counts = df.isnull().sum()
    missing_percentages = (missing_counts / len(df) * 100).round(2)

    missing_df = (
        pd.DataFrame({
            'column': missing_counts.index,
            'missing_count': missing_counts.values,
            'missing_percentage': missing_percentages.values
        })
        .sort_values('missing_count', ascending=False)
        .reset_index(drop=True)
    )

    print("\n2.1 Missing Values Summary:")
    print(missing_df)

    # Check for empty strings or whitespace-only values
    print("\n2.2 Empty String or Whitespace-Only Values:")
    for col in df.select_dtypes(include=['object']).columns:
        values = df[col]
        # The two counts are kept disjoint: '' is not whitespace according to
        # str.isspace(), and whitespace-only values are not equal to ''.
        empty_count = (values == '').sum()
        whitespace_count = values.str.isspace().sum()

        if empty_count > 0 or whitespace_count > 0:
            print(f"   {col}:")
            print(f"      - Empty strings: {empty_count}")
            print(f"      - Whitespace only: {whitespace_count}")

    # Visualize missing data (only columns that actually have missing values)
    missing_df_plot = missing_df[missing_df['missing_count'] > 0]
    if not missing_df_plot.empty:
        plt.figure(figsize=(10, 6))
        plt.barh(missing_df_plot['column'], missing_df_plot['missing_percentage'])
        plt.xlabel('Missing Percentage (%)')
        plt.title('Missing Data by Column')
        plt.tight_layout()
        plt.show()


# ==============================================================================
# SECTION 3: Descriptive Statistics
# ==============================================================================

def generate_descriptive_statistics(df):
    """
    Print descriptive statistics for numeric columns and, for each object
    column, its unique count, most common value and top-10 value counts.

    Parameters:
    -----------
    df : pd.DataFrame
        The donations dataframe

    Returns:
    --------
    None
        Results are printed.
    """
    print("\n" + "=" * 80)
    print("SECTION 3: DESCRIPTIVE STATISTICS")
    print("=" * 80)

    # Numeric columns
    print("\n3.1 Numeric Column Statistics:")
    numeric_cols = df.select_dtypes(include=[np.number]).columns.tolist()
    if numeric_cols:
        print(df[numeric_cols].describe())
    else:
        # describe() raises ValueError on a frame without columns
        print("   (no numeric columns)")

    # Categorical columns
    print("\n3.2 Categorical Column Statistics:")
    categorical_cols = df.select_dtypes(include=['object']).columns.tolist()

    for col in categorical_cols:
        # mode() is empty when the column holds no non-null values
        modes = df[col].mode()
        most_common = modes.iloc[0] if not modes.empty else 'N/A'

        print(f"\n   {col}:")
        print(f"      - Unique values: {df[col].nunique()}")
        print(f"      - Most common: {most_common}")
        print(f"      - Value counts (top 10):")
        print(df[col].value_counts().head(10))


# ==============================================================================
# SECTION 4: Data Quality Issues - Name Format Analysis
# ==============================================================================

def analyze_name_formats(df):
    """
    Report how NAME values are formatted (comma vs. no comma) and how long
    they are, then chart both.

    Parameters:
    -----------
    df : pd.DataFrame
        The donations dataframe
    """
    rule = "=" * 80
    print("\n" + rule)
    print("SECTION 4: NAME FORMAT ANALYSIS")
    print(rule)

    # Work on a derived frame so the caller's dataframe is left untouched
    names = df.assign(
        has_comma=lambda frame: frame['NAME'].str.contains(',', na=False),
    )
    total_rows = len(df)

    # A comma is taken as the marker of the "Last, First" layout
    with_comma = names['has_comma'].sum()
    without_comma = len(names) - with_comma

    print("\n4.1 Name Format Distribution:")
    print(f"   - Names with comma (Last, First): {with_comma} ({with_comma/total_rows*100:.1f}%)")
    print(f"   - Names without comma (First Last): {without_comma} ({without_comma/total_rows*100:.1f}%)")

    comma_mask = names['has_comma']

    print("\n4.2 Sample Names WITH Comma:")
    print(names.loc[comma_mask, 'NAME'].head(10).tolist())

    print("\n4.3 Sample Names WITHOUT Comma:")
    print(names.loc[~comma_mask, 'NAME'].head(10).tolist())

    # Character length of each name
    names['name_length'] = names['NAME'].str.len()

    print("\n4.4 Name Length Statistics:")
    print(names['name_length'].describe())

    # Charts: format split on the left, length histogram on the right
    fig, (format_ax, length_ax) = plt.subplots(1, 2, figsize=(14, 5))

    format_ax.pie(
        [with_comma, without_comma],
        labels=['Last, First', 'First Last'],
        autopct='%1.1f%%',
        startangle=90
    )
    format_ax.set_title('Name Format Distribution')

    length_ax.hist(names['name_length'], bins=20, edgecolor='black')
    length_ax.set_xlabel('Name Length')
    length_ax.set_ylabel('Frequency')
    length_ax.set_title('Distribution of Name Lengths')

    plt.tight_layout()
    plt.show()


# ==============================================================================
# SECTION 5: Data Quality Issues - Category Analysis
# ==============================================================================

def analyze_categories(df):
    """
    Report CATEGORY quality problems: null/blank values, the placeholder
    strings 'N/A' and 'Unknown', and single- vs multi-word naming; then chart
    the ten most frequent categories.

    Parameters:
    -----------
    df : pd.DataFrame
        The donations dataframe
    """
    rule = "=" * 80
    print("\n" + rule)
    print("SECTION 5: CATEGORY ANALYSIS")
    print(rule)

    category = df['CATEGORY']
    total_rows = len(df)

    print("\n5.1 Category Value Counts:")
    print(category.value_counts(dropna=False))

    # Blank means NULL or empty after trimming (the Snowflake check is
    # NULL OR TRIM(category) = '')
    is_blank = category.isnull() | category.str.strip().eq('')
    blank_count = is_blank.sum()

    print("\n5.2 Missing/Blank Category Analysis (matches Snowflake):")
    print(f"   - NULL or empty after TRIM: {blank_count} ({blank_count/total_rows*100:.1f}%)")

    # Placeholder strings are matched exactly; 'N/A' is a real string here
    na_count = category.eq('N/A').sum()
    unknown_count = category.eq('Unknown').sum()
    placeholder_count = na_count + unknown_count

    print("\n5.3 Placeholder Values:")
    print(f"   - 'N/A': {na_count}")
    print(f"   - 'Unknown': {unknown_count}")
    print(f"   - Total placeholders: {placeholder_count} ({placeholder_count/total_rows*100:.1f}%)")

    total_problematic = blank_count + placeholder_count
    print("\n5.4 Total Problematic Categories:")
    print(f"   - Missing/blank + placeholders: {total_problematic} ({total_problematic/total_rows*100:.1f}%)")

    # A space inside the value marks a multi-word category
    multi_word_count = category.str.contains(' ', na=False).sum()
    single_word_count = total_rows - multi_word_count - blank_count

    print("\n5.5 Category Naming Convention:")
    print(f"   - Multi-word categories: {multi_word_count}")
    print(f"   - Single-word categories: {single_word_count}")

    # Horizontal bar chart of the ten most frequent categories
    top_categories = category.value_counts().head(10)

    plt.figure(figsize=(12, 6))
    plt.barh(top_categories.index, top_categories.values)
    plt.xlabel('Count')
    plt.title('Top 10 Categories by Frequency')
    plt.tight_layout()
    plt.show()


# ==============================================================================
# SECTION 6: Data Quality Issues - Age and Date of Birth Analysis
# ==============================================================================

def analyze_age_dob(df, current_year=2024):
    """
    Analyze AGE and DATE_OF_BIRTH columns for mismatches.

    DATE_OF_BIRTH is parsed as MM/DD/YY. The age implied by the birth date is
    computed from the year difference only (month and day are ignored) and
    compared with the reported AGE; differences above 5 years are flagged.

    Parameters:
    -----------
    df : pd.DataFrame
        The donations dataframe
    current_year : int, optional
        Reference year for the age calculation and for resolving two-digit
        birth years. Defaults to 2024, the value that was previously
        hard-coded.
    """
    print("\n" + "=" * 80)
    print("SECTION 6: AGE AND DATE OF BIRTH ANALYSIS")
    print("=" * 80)

    df_analysis = df.copy()

    # Parse dates
    df_analysis['dob_parsed'] = pd.to_datetime(
        df_analysis['DATE_OF_BIRTH'],
        format='%m/%d/%y',
        errors='coerce'
    )

    # errors='coerce' turns unparseable values into NaT, which then drop out
    # of every comparison below. Report them instead of losing them silently.
    unparsed_count = (
        df_analysis['dob_parsed'].isna() & df_analysis['DATE_OF_BIRTH'].notna()
    ).sum()
    if unparsed_count > 0:
        print(f"\n   NOTE: {unparsed_count} DATE_OF_BIRTH value(s) could not be parsed "
              f"as MM/DD/YY and are excluded from the age comparison")

    # Handle 2-digit year ambiguity: a parsed birth year later than
    # current_year cannot be right, so it is moved back one century.
    birth_year = df_analysis['dob_parsed'].dt.year
    df_analysis['birth_year'] = birth_year
    df_analysis['birth_year_adjusted'] = birth_year.mask(
        birth_year > current_year, birth_year - 100
    )

    # Calculate age from DOB (year difference only)
    df_analysis['calculated_age'] = current_year - df_analysis['birth_year_adjusted']

    # Absolute gap between reported and calculated age
    df_analysis['age_difference'] = (
        df_analysis['AGE'] - df_analysis['calculated_age']
    ).abs()

    print("\n6.1 Age Statistics:")
    print(df_analysis['AGE'].describe())

    print("\n6.2 Calculated Age Statistics:")
    print(df_analysis['calculated_age'].describe())

    print("\n6.3 Age Difference Statistics:")
    print(df_analysis['age_difference'].describe())

    # Mismatches (>5 years difference)
    mismatched = df_analysis[df_analysis['age_difference'] > 5]
    print(f"\n6.4 Age/DOB Mismatches (>5 years difference):")
    print(f"   - Count: {len(mismatched)} ({len(mismatched)/len(df)*100:.1f}%)")

    print(f"\n6.5 Sample Mismatched Records:")
    print(mismatched[['DONATION_ID', 'NAME', 'AGE', 'DATE_OF_BIRTH', 'birth_year_adjusted',
                      'calculated_age', 'age_difference']].head(10))

    # Check for round ages
    round_ages = df_analysis[df_analysis['AGE'] % 5 == 0]
    print(f"\n6.6 Round Ages (multiples of 5):")
    print(f"   - Count: {len(round_ages)} ({len(round_ages)/len(df)*100:.1f}%)")
    print(f"   - This suggests possible age estimation rather than calculation")

    # Visualizations
    fig, axes = plt.subplots(2, 2, figsize=(14, 10))

    # Age distribution
    axes[0, 0].hist(df_analysis['AGE'], bins=20, edgecolor='black')
    axes[0, 0].set_xlabel('Age')
    axes[0, 0].set_ylabel('Frequency')
    axes[0, 0].set_title('Distribution of Reported Ages')

    # Calculated age distribution
    axes[0, 1].hist(df_analysis['calculated_age'].dropna(), bins=20, edgecolor='black')
    axes[0, 1].set_xlabel('Calculated Age')
    axes[0, 1].set_ylabel('Frequency')
    axes[0, 1].set_title('Distribution of Calculated Ages (from DOB)')

    # Age difference
    axes[1, 0].hist(df_analysis['age_difference'].dropna(), bins=30, edgecolor='black')
    axes[1, 0].axvline(x=5, color='red', linestyle='--', label='5-year threshold')
    axes[1, 0].set_xlabel('Age Difference (years)')
    axes[1, 0].set_ylabel('Frequency')
    axes[1, 0].set_title('Distribution of Age/DOB Differences')
    axes[1, 0].legend()

    # Scatter: Reported vs Calculated, with the y = x reference line
    axes[1, 1].scatter(df_analysis['calculated_age'], df_analysis['AGE'], alpha=0.5)
    axes[1, 1].plot([0, 100], [0, 100], 'r--', label='Perfect match')
    axes[1, 1].set_xlabel('Calculated Age')
    axes[1, 1].set_ylabel('Reported Age')
    axes[1, 1].set_title('Reported Age vs Calculated Age')
    axes[1, 1].legend()

    plt.tight_layout()
    plt.show()


# ==============================================================================
# SECTION 7: Data Quality Issues - Phone Number Analysis
# ==============================================================================

def analyze_phone_numbers(df):
    """
    Analyze PHONE column for format inconsistencies.

    Counts the digits in each phone value, flags values that do not contain
    exactly 10 digits, and tallies formatting markers (parentheses, dashes,
    dots, 'x' extensions, leading plus sign).

    Parameters:
    -----------
    df : pd.DataFrame
        The donations dataframe
    """
    print("\n" + "=" * 80)
    print("SECTION 7: PHONE NUMBER ANALYSIS")
    print("=" * 80)

    df_analysis = df.copy()

    # Extract digits only
    df_analysis['phone_digits'] = df_analysis['PHONE'].str.replace(r'\D', '', regex=True)
    df_analysis['phone_digit_count'] = df_analysis['phone_digits'].str.len()
    df_analysis['phone_length'] = df_analysis['PHONE'].str.len()

    print("\n7.1 Phone Number Length Statistics:")
    print(df_analysis['phone_length'].describe())

    print("\n7.2 Digit Count Statistics:")
    print(df_analysis['phone_digit_count'].describe())

    print("\n7.3 Digit Count Distribution:")
    print(df_analysis['phone_digit_count'].value_counts().sort_index())

    # Identify format patterns
    df_analysis['has_parentheses'] = df_analysis['PHONE'].str.contains(r'\(', na=False)
    df_analysis['has_dash'] = df_analysis['PHONE'].str.contains('-', na=False)
    df_analysis['has_dot'] = df_analysis['PHONE'].str.contains(r'\.', na=False)
    df_analysis['has_extension'] = df_analysis['PHONE'].str.contains('x', na=False)
    df_analysis['has_plus'] = df_analysis['PHONE'].str.contains(r'\+', na=False)

    # Computed once and reused for both the printed summary and the chart
    format_counts = pd.Series({
        'Parentheses': df_analysis['has_parentheses'].sum(),
        'Dashes': df_analysis['has_dash'].sum(),
        'Dots': df_analysis['has_dot'].sum(),
        'Extensions': df_analysis['has_extension'].sum(),
        'Plus/Intl': df_analysis['has_plus'].sum()
    })

    print("\n7.4 Phone Format Patterns:")
    print(f"   - With parentheses: {format_counts['Parentheses']}")
    print(f"   - With dashes: {format_counts['Dashes']}")
    print(f"   - With dots: {format_counts['Dots']}")
    print(f"   - With extensions: {format_counts['Extensions']}")
    print(f"   - With plus sign (intl): {format_counts['Plus/Intl']}")

    # Invalid phone numbers (not 10 digits); a missing PHONE has a NaN digit
    # count and is therefore also counted as invalid.
    invalid_phones = df_analysis[df_analysis['phone_digit_count'] != 10]
    print(f"\n7.5 Invalid Phone Numbers (not 10 digits):")
    print(f"   - Count: {len(invalid_phones)} ({len(invalid_phones)/len(df)*100:.1f}%)")
    print(f"\n   Sample Invalid Phone Numbers:")
    print(invalid_phones[['DONATION_ID', 'NAME', 'PHONE', 'phone_digit_count']].head(10))

    # Visualize
    fig, axes = plt.subplots(1, 2, figsize=(14, 5))

    # Digit count distribution. The bin edges grow with the data so numbers
    # with 15+ digits (e.g. country code plus extension) are not silently
    # dropped; the range never shrinks below the original 0-14.
    digit_counts = df_analysis['phone_digit_count'].dropna()
    max_digits = digit_counts.max()
    upper_edge = 15 if pd.isna(max_digits) else max(15, int(max_digits) + 2)
    axes[0].hist(digit_counts, bins=range(0, upper_edge), edgecolor='black')
    axes[0].axvline(x=10, color='red', linestyle='--', label='Expected (10 digits)')
    axes[0].set_xlabel('Number of Digits')
    axes[0].set_ylabel('Frequency')
    axes[0].set_title('Distribution of Phone Number Digit Counts')
    axes[0].legend()

    # Format patterns
    axes[1].barh(format_counts.index, format_counts.values)
    axes[1].set_xlabel('Count')
    axes[1].set_title('Phone Number Format Patterns')

    plt.tight_layout()
    plt.show()


# ==============================================================================
# SECTION 8: Data Quality Issues - Donation Amount Analysis
# ==============================================================================

def analyze_donation_amounts(df):
    """
    Analyze AMOUNT column for outliers and patterns.

    Prints summary statistics, selected percentiles, the records above
    $1,000,000 and their share of the total, then draws four charts of the
    amount distribution.

    Parameters:
    -----------
    df : pd.DataFrame
        The donations dataframe
    """
    print("\n" + "=" * 80)
    print("SECTION 8: DONATION AMOUNT ANALYSIS")
    print("=" * 80)

    df_analysis = df.copy()
    amounts = df_analysis['AMOUNT']

    print("\n8.1 Amount Statistics:")
    print(amounts.describe())

    print("\n8.2 Percentile Analysis:")
    percentiles = [0.5, 0.75, 0.90, 0.95, 0.99, 1.0]
    for p in percentiles:
        value = amounts.quantile(p)
        print(f"   - {p*100:.0f}th percentile: ${value:,.2f}")

    # Outliers (>$1M)
    outliers = df_analysis[amounts > 1000000]
    print(f"\n8.3 Extreme Outliers (>$1,000,000):")
    print(f"   - Count: {len(outliers)} ({len(outliers)/len(df)*100:.1f}%)")
    print(f"   - Total value: ${outliers['AMOUNT'].sum():,.2f}")
    print(f"\n   Outlier Records:")
    print(outliers[['DONATION_ID', 'NAME', 'ORGANIZATION', 'AMOUNT']].sort_values('AMOUNT', ascending=False))

    # Impact on statistics
    total_with_outliers = amounts.sum()
    total_without_outliers = amounts[amounts <= 1000000].sum()

    print(f"\n8.4 Impact of Outliers:")
    print(f"   - Total donations (with outliers): ${total_with_outliers:,.2f}")
    print(f"   - Total donations (without outliers): ${total_without_outliers:,.2f}")
    print(f"   - Outliers represent {(total_with_outliers - total_without_outliers)/total_with_outliers*100:.1f}% of total value")

    # Visualizations. Missing amounts are dropped before plotting: boxplot
    # draws an empty box when the data contains NaN, and the other age/DOB
    # charts in this file drop NaN the same way.
    plot_amounts = amounts.dropna()

    fig, axes = plt.subplots(2, 2, figsize=(14, 10))

    # Full distribution (with outliers)
    axes[0, 0].hist(plot_amounts, bins=50, edgecolor='black')
    axes[0, 0].set_xlabel('Amount ($)')
    axes[0, 0].set_ylabel('Frequency')
    axes[0, 0].set_title('Donation Amount Distribution (Full Range)')

    # Distribution restricted to amounts up to $1,000
    normal_range = plot_amounts[plot_amounts <= 1000]
    axes[0, 1].hist(normal_range, bins=50, edgecolor='black')
    axes[0, 1].set_xlabel('Amount ($)')
    axes[0, 1].set_ylabel('Frequency')
    axes[0, 1].set_title('Donation Amount Distribution (≤$1,000)')

    # Box plot
    axes[1, 0].boxplot(plot_amounts, vert=False)
    axes[1, 0].set_xlabel('Amount ($)')
    axes[1, 0].set_title('Box Plot of Donation Amounts')

    # Log scale histogram; +1 keeps zero-dollar amounts defined under log10
    axes[1, 1].hist(np.log10(plot_amounts + 1), bins=50, edgecolor='black')
    axes[1, 1].set_xlabel('log10(Amount + 1)')
    axes[1, 1].set_ylabel('Frequency')
    axes[1, 1].set_title('Log-Scale Distribution of Donation Amounts')

    plt.tight_layout()
    plt.show()


# ==============================================================================
# SECTION 9: Data Quality Issues - ZIP Code Analysis
# ==============================================================================

def analyze_zip_codes(df):
    """
    Analyze ZIP column for incomplete codes.

    ZIP values are converted to text and measured; anything shorter than five
    characters is reported as incomplete. A missing ZIP is treated as an
    empty string (length 0), so it is reported as incomplete as well.

    Parameters:
    -----------
    df : pd.DataFrame
        The donations dataframe
    """
    print("\n" + "=" * 80)
    print("SECTION 9: ZIP CODE ANALYSIS")
    print("=" * 80)

    df_analysis = df.copy()

    # Convert to string and get length. A plain astype(str) is not enough:
    # when the column is float-typed (as happens when a numeric column
    # contains NaN) it renders 12345 as '12345.0' and NaN as 'nan', which
    # would distort every length. Strip the '.0' suffix and blank out missing
    # values instead.
    zip_raw = df_analysis['ZIP']
    zip_text = zip_raw.astype(str).str.replace(r'\.0$', '', regex=True)
    df_analysis['zip_str'] = zip_text.where(zip_raw.notna(), '')
    df_analysis['zip_length'] = df_analysis['zip_str'].str.len()

    length_counts = df_analysis['zip_length'].value_counts().sort_index()

    print("\n9.1 ZIP Code Length Distribution:")
    print(length_counts)

    # Incomplete ZIPs
    incomplete_zips = df_analysis[df_analysis['zip_length'] < 5]
    print(f"\n9.2 Incomplete ZIP Codes (<5 digits):")
    print(f"   - Count: {len(incomplete_zips)} ({len(incomplete_zips)/len(df)*100:.1f}%)")
    print(f"\n   Sample Incomplete ZIPs:")
    print(incomplete_zips[['DONATION_ID', 'NAME', 'CITY', 'STATE', 'ZIP', 'zip_length']].head(10))

    # State distribution
    print("\n9.3 Records by State (Top 10):")
    print(df_analysis['STATE'].value_counts().head(10))

    # Visualize
    plt.figure(figsize=(10, 6))
    length_counts.plot(kind='bar', edgecolor='black')
    plt.xlabel('ZIP Code Length')
    plt.ylabel('Count')
    plt.title('Distribution of ZIP Code Lengths')

    # Bars are drawn at positions 0..k-1, not at the length values, so the
    # marker has to be placed relative to the position of the 5-digit bar.
    bar_positions = {length: pos for pos, length in enumerate(length_counts.index)}
    if 5 in bar_positions:
        plt.axvline(x=bar_positions[5] - 0.5, color='red', linestyle='--',
                    label='Expected (5 digits)')
        plt.legend()

    plt.tight_layout()
    plt.show()


# ==============================================================================
# SECTION 10: Data Quality Issues - Street Address Analysis
# ==============================================================================

def analyze_street_addresses(df):
    """
    Report STREET_ADDRESS length statistics, list addresses longer than 50
    characters, count 'Apt' / 'Suite' occurrences and chart the lengths.

    Parameters:
    -----------
    df : pd.DataFrame
        The donations dataframe
    """
    rule = "=" * 80
    print("\n" + rule)
    print("SECTION 10: STREET ADDRESS ANALYSIS")
    print(rule)

    # Derived frame with the character length of each address
    addresses = df.assign(address_length=df['STREET_ADDRESS'].str.len())
    lengths = addresses['address_length']

    print("\n10.1 Address Length Statistics:")
    print(lengths.describe())

    # Anything over 50 characters is treated as excessively long
    too_long = addresses.loc[lengths > 50]
    long_share = len(too_long) / len(df) * 100
    print("\n10.2 Excessively Long Addresses (>50 characters):")
    print(f"   - Count: {len(too_long)} ({long_share:.1f}%)")
    print("\n   Sample Long Addresses:")
    print(too_long[['DONATION_ID', 'NAME', 'STREET_ADDRESS', 'address_length']].head(10))

    # Unit designators, matched case-sensitively
    street = addresses['STREET_ADDRESS']
    apt_total = street.str.contains('Apt', na=False).sum()
    suite_total = street.str.contains('Suite', na=False).sum()

    print("\n10.3 Address Patterns:")
    print(f"   - With 'Apt': {apt_total}")
    print(f"   - With 'Suite': {suite_total}")

    # Histogram of lengths with the 50-character threshold marked
    plt.figure(figsize=(10, 6))
    plt.hist(lengths, bins=30, edgecolor='black')
    plt.axvline(x=50, color='red', linestyle='--', label='Threshold (50 chars)')
    plt.xlabel('Address Length')
    plt.ylabel('Frequency')
    plt.title('Distribution of Street Address Lengths')
    plt.legend()
    plt.tight_layout()
    plt.show()


# ==============================================================================
# SECTION 11: Overall Data Quality Summary
# ==============================================================================

def generate_overall_summary(df):
    """
    Print a table of data quality issue counts (with percentages of all
    records) sorted from most to least frequent, and chart the same counts.

    Parameters:
    -----------
    df : pd.DataFrame
        The donations dataframe
    """
    rule = "=" * 80
    print("\n" + rule)
    print("SECTION 11: OVERALL DATA QUALITY SUMMARY")
    print(rule)

    total_records = len(df)

    # --- Age/DOB mismatch: age implied by the birth year vs. reported AGE ---
    def _shift_century(year):
        # A two-digit year parsed into the future (after 2024) is moved back
        # one century.
        return year - 100 if year > 2024 else year

    parsed_dob = pd.to_datetime(df['DATE_OF_BIRTH'], format='%m/%d/%y', errors='coerce')
    adjusted_birth_year = parsed_dob.dt.year.apply(_shift_century)
    implied_age = 2024 - adjusted_birth_year
    age_gap = (df['AGE'] - implied_age).abs()
    age_mismatch_total = (age_gap > 5).sum()

    # --- Phone format: anything that does not contain exactly 10 digits ---
    digit_totals = df['PHONE'].str.replace(r'\D', '', regex=True).str.len()
    invalid_phone_total = digit_totals.ne(10).sum()

    # --- Category problems ---
    category = df['CATEGORY']
    blank_category_total = (category.isnull() | (category.str.strip() == '')).sum()

    # Key order is significant: it decides how ties are ordered in the table
    # and is the starting order for the chart.
    dq_issues = {
        'Reversed Names (Last, First)': df['NAME'].str.contains(',', na=False).sum(),
        'Missing/Blank Categories': blank_category_total,
        'Placeholder N/A': category.eq('N/A').sum(),
        'Placeholder Unknown': category.eq('Unknown').sum(),
        'Age/DOB Mismatch (>5 years)': age_mismatch_total,
        'Invalid Phone Format': invalid_phone_total,
        'Outlier Amounts (>$1M)': df['AMOUNT'].gt(1000000).sum(),
        'Incomplete ZIP Codes': df['ZIP'].astype(str).str.len().lt(5).sum(),
        'Long Addresses (>50 chars)': df['STREET_ADDRESS'].str.len().gt(50).sum(),
    }

    print("\n11.1 Data Quality Issues Summary:")
    print(f"\n{'Issue':<40} {'Count':<10} {'Percentage'}")
    print("-" * 60)

    ranked_issues = sorted(dq_issues.items(), key=lambda pair: pair[1], reverse=True)
    for issue, count in ranked_issues:
        pct = count / total_records * 100
        print(f"{issue:<40} {count:<10} {pct:>6.1f}%")

    print("\n" + rule)
    print(f"TOTAL RECORDS: {total_records}")
    print(rule)

    # Summary chart: one horizontal bar per issue, smallest count first
    plt.figure(figsize=(12, 8))
    issues_df = pd.DataFrame(list(dq_issues.items()), columns=['Issue', 'Count'])
    issues_df = issues_df.sort_values('Count', ascending=True)

    plt.barh(issues_df['Issue'], issues_df['Count'])
    plt.xlabel('Number of Records Affected')
    plt.title('Data Quality Issues - Records Affected by Issue Type')
    plt.tight_layout()
    plt.show()


# ==============================================================================
# MAIN EXECUTION
# ==============================================================================

def main():
    """
    Run the full EDA: load 'donations.csv', then execute every analysis
    section in order and print a completion banner.
    """
    donations = load_and_inspect_data('donations.csv')

    # Sections run in report order; each one takes the loaded dataframe
    analysis_steps = (
        analyze_missing_data,
        generate_descriptive_statistics,
        analyze_name_formats,
        analyze_categories,
        analyze_age_dob,
        analyze_phone_numbers,
        analyze_donation_amounts,
        analyze_zip_codes,
        analyze_street_addresses,
        generate_overall_summary,
    )
    for run_step in analysis_steps:
        run_step(donations)

    rule = "=" * 80
    print("\n" + rule)
    print("EDA COMPLETE")
    print(rule)


# Run the analysis only when executed as a script, not on import
if __name__ == "__main__":
    main()
