# Credit Risk Scoring Model - Exploratory Data Analysis

This notebook performs comprehensive exploratory data analysis on the loan dataset.

## Objectives:
1. Load and inspect the dataset
2. Analyze data quality (missing values, duplicates)
3. Explore feature distributions
4. Analyze correlations and relationships
5. Identify patterns in loan defaults
6. Detect outliers
7. Assess class imbalance

In [None]:
# Import libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings

# Configure plotting
# NOTE(review): the 'seaborn-v0_8-darkgrid' style name presumably requires a
# recent Matplotlib release — confirm against the project's pinned version.
plt.style.use('seaborn-v0_8-darkgrid')
sns.set_palette('husl')
# Silences every warning category for the rest of the session, which also
# hides deprecation notices from the plotting/data libraries.
warnings.filterwarnings('ignore')

# Display settings
# max_columns=None lifts the column limit so wide frames are shown in full;
# row output is capped at 100.
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', 100)

# Import custom utilities
# The parent directory is appended to the import path so the `src` package
# resolves; this assumes the notebook runs from a direct subdirectory of the
# project root — TODO confirm.
import sys
sys.path.append('../')
from src.utils import load_config, check_missing_values, detect_outliers_iqr

print("Libraries imported successfully!")

## 1. Load Dataset

In [None]:
# Project configuration, read through the helper imported from src.utils.
config = load_config('../config/model_config.yaml')

# Raw loan data location (relative to the current working directory).
data_path = '../data/raw/loan_data.csv'

try:
    df = pd.read_csv(data_path)
except FileNotFoundError:
    # Leave df as None so the `if df is not None` guards in later cells
    # skip their work instead of failing.
    df = None
    print(
        "✗ Data file not found. Please download dataset to data/raw/loan_data.csv",
        "  Recommended datasets:",
        "  - Kaggle: Loan Prediction Dataset",
        "  - Kaggle: Give Me Some Credit",
        sep="\n",
    )
else:
    row_count, column_count = df.shape
    print("✓ Data loaded successfully!")
    print(f"  Shape: {row_count:,} rows × {column_count} columns")

## 2. Initial Data Inspection

In [None]:
if df is not None:
    # Show both ends of the table: caption first, then the rendered preview.
    previews = (
        ("First 5 rows:", df.head()),
        ("\nLast 5 rows:", df.tail()),
    )
    for caption, preview in previews:
        print(caption)
        display(preview)

In [None]:
if df is not None:
    # Dataset info: df.info() prints its summary itself, so it is called as a
    # plain statement rather than being passed to display().
    print("Dataset Information:")
    df.info()

In [None]:
if df is not None:
    # Descriptive statistics as computed by DataFrame.describe().
    print("Statistical Summary:")
    summary_stats = df.describe()
    display(summary_stats)

In [None]:
if df is not None:
    # Per-column dtypes, followed by the numeric / categorical split.
    print("Data Types:")
    print(df.dtypes)

    numeric_names = df.select_dtypes(include=[np.number]).columns.tolist()
    print("\nNumeric columns:", numeric_names)

    categorical_names = df.select_dtypes(include=['object', 'category']).columns.tolist()
    print("Categorical columns:", categorical_names)

## 3. Data Quality Assessment

In [None]:
if df is not None:
    # Count rows that exactly repeat an earlier row.
    duplicate_mask = df.duplicated()
    duplicates = duplicate_mask.sum()
    print(f"Duplicate rows: {duplicates}")

    # The count is never negative, so truthiness is the same test as "> 0".
    if duplicates:
        duplicate_share = duplicates / len(df) * 100
        print(f"Percentage of duplicates: {duplicate_share:.2f}%")

In [None]:
if df is not None:
    # Missing-value table produced by the project helper.
    missing_summary = check_missing_values(df)

    if missing_summary.empty:
        print("✓ No missing values found!")
    else:
        print("Missing Values Summary:")
        display(missing_summary)

        # Bar chart of the missing percentage for each reported column.
        fig, ax = plt.subplots(figsize=(10, 6))
        missing_summary.plot.bar(x='Column', y='Missing_Percentage', ax=ax, color='coral')
        ax.set_title('Missing Values by Column', fontsize=14, fontweight='bold')
        ax.set_xlabel('Column', fontsize=12)
        ax.set_ylabel('Missing Percentage (%)', fontsize=12)
        plt.xticks(rotation=45, ha='right')
        plt.tight_layout()
        plt.show()

## 4. Target Variable Analysis

In [None]:
if df is not None and 'loan_status' in df.columns:
    # Target distribution.
    # value_counts() orders classes by frequency, so sort by class label
    # instead; otherwise the fixed "0 = Paid / 1 = Default" colours and labels
    # are attached to the wrong class whenever defaults are the majority.
    target_counts = df['loan_status'].value_counts().sort_index()
    print("Loan Status Distribution:")
    print(target_counts)
    print(f"\nDefault rate: {target_counts.get(1, 0) / len(df) * 100:.2f}%")

    # Build labels and colours from the classes actually present so the pie
    # chart also works for a single-class or unexpectedly multi-class target
    # (a fixed two-item label list raises when the lengths differ).
    class_labels = {0: 'Paid (0)', 1: 'Default (1)'}
    class_colors = {0: 'green', 1: 'red'}
    labels = [class_labels.get(value, str(value)) for value in target_counts.index]
    colors = [class_colors.get(value, 'gray') for value in target_counts.index]

    # Visualize target distribution
    fig, axes = plt.subplots(1, 2, figsize=(14, 5))

    # Bar plot
    target_counts.plot(kind='bar', ax=axes[0], color=colors)
    axes[0].set_title('Loan Status Distribution', fontsize=14, fontweight='bold')
    axes[0].set_xlabel('Loan Status (0=Paid, 1=Default)', fontsize=12)
    axes[0].set_ylabel('Count', fontsize=12)
    axes[0].tick_params(axis='x', rotation=0)

    # Pie chart
    axes[1].pie(target_counts.values, labels=labels,
                autopct='%1.1f%%', colors=colors, startangle=90)
    axes[1].set_title('Loan Status Proportion', fontsize=14, fontweight='bold')

    plt.tight_layout()
    plt.show()

    # Class imbalance assessment
    if len(target_counts) < 2:
        # max/min over a single class is always 1.0, which would wrongly
        # read as "perfectly balanced".
        print("\n⚠ Only one class present in loan_status; imbalance ratio is not meaningful.")
    else:
        imbalance_ratio = target_counts.max() / target_counts.min()
        print(f"\nClass imbalance ratio: {imbalance_ratio:.2f}:1")

        if imbalance_ratio > 3:
            print("⚠ Significant class imbalance detected. Consider using SMOTE or class weights.")

## 5. Numeric Feature Distributions

In [None]:
if df is not None:
    # Numeric feature columns, excluding the target.
    numeric_cols = df.select_dtypes(include=[np.number]).columns.tolist()
    if 'loan_status' in numeric_cols:
        numeric_cols.remove('loan_status')

    if numeric_cols:
        # Lay the histograms out on a grid three panels wide.
        n_cols = 3
        n_rows = (len(numeric_cols) + n_cols - 1) // n_cols

        # squeeze=False always returns a 2-D array of Axes, so flatten() works
        # for a single row too. Wrapping a one-row result in a list (the old
        # approach) produced a list holding the whole array, and the first
        # `.hist` call then failed whenever there were three or fewer features.
        fig, axes = plt.subplots(n_rows, n_cols, figsize=(15, n_rows*4), squeeze=False)
        axes = axes.flatten()

        for panel, col in zip(axes, numeric_cols):
            panel.hist(df[col].dropna(), bins=50, color='skyblue', edgecolor='black')
            panel.set_title(f'Distribution of {col}', fontsize=12, fontweight='bold')
            panel.set_xlabel(col, fontsize=10)
            panel.set_ylabel('Frequency', fontsize=10)
            panel.grid(alpha=0.3)

        # Hide unused subplots in the last row.
        for panel in axes[len(numeric_cols):]:
            panel.axis('off')

        plt.tight_layout()
        plt.show()
    else:
        # A zero-row grid cannot be created, so report instead of plotting.
        print("No numeric feature columns found.")

## 6. Categorical Feature Analysis

In [None]:
if df is not None:
    # Categorical (object / category dtype) columns.
    categorical_cols = df.select_dtypes(include=['object', 'category']).columns.tolist()

    if categorical_cols:
        # Lay the bar charts out on a grid two panels wide.
        n_cols = 2
        n_rows = (len(categorical_cols) + n_cols - 1) // n_cols

        # squeeze=False always returns a 2-D array of Axes, so flatten() works
        # for a single row too. Wrapping a one-row result in a list (the old
        # approach) produced a list holding the whole array, and the first
        # `.bar` call then failed whenever there were one or two columns.
        fig, axes = plt.subplots(n_rows, n_cols, figsize=(14, n_rows*4), squeeze=False)
        axes = axes.flatten()

        for panel, col in zip(axes, categorical_cols):
            value_counts = df[col].value_counts()
            positions = range(len(value_counts))
            panel.bar(positions, value_counts.values, color='coral')
            panel.set_title(f'Distribution of {col}', fontsize=12, fontweight='bold')
            panel.set_xlabel(col, fontsize=10)
            panel.set_ylabel('Count', fontsize=10)
            panel.set_xticks(positions)
            panel.set_xticklabels(value_counts.index, rotation=45, ha='right')
            panel.grid(axis='y', alpha=0.3)

        # Hide unused subplots in the last row.
        for panel in axes[len(categorical_cols):]:
            panel.axis('off')

        plt.tight_layout()
        plt.show()
    else:
        print("No categorical columns found.")

## 7. Correlation Analysis

In [None]:
if df is not None:
    # Pairwise correlations between the numeric columns.
    numeric_df = df.select_dtypes(include=[np.number])
    correlation_matrix = numeric_df.corr()

    # Annotated heatmap of the full matrix.
    fig, ax = plt.subplots(figsize=(12, 10))
    sns.heatmap(
        correlation_matrix,
        annot=True,
        fmt='.2f',
        cmap='coolwarm',
        center=0,
        square=True,
        linewidths=1,
        cbar_kws={"shrink": 0.8},
        ax=ax,
    )
    ax.set_title('Feature Correlation Heatmap', fontsize=14, fontweight='bold')
    plt.tight_layout()
    plt.show()

    # Scan the upper triangle (each unordered pair once, no self-pairs) for
    # strongly related features.
    print("\nHighly Correlated Feature Pairs (|correlation| > 0.7):")
    feature_names = correlation_matrix.columns
    high_corr = []
    for row_pos, first in enumerate(feature_names):
        for col_pos, second in enumerate(feature_names[row_pos + 1:], start=row_pos + 1):
            strength = correlation_matrix.iloc[row_pos, col_pos]
            if abs(strength) > 0.7:
                high_corr.append((first, second, strength))

    if not high_corr:
        print("  None found.")
    else:
        for feat1, feat2, corr in high_corr:
            print(f"  {feat1} <-> {feat2}: {corr:.3f}")

## 8. Target vs. Features Analysis

In [None]:
if df is not None and 'loan_status' in df.columns:
    # Key numeric features to compare across loan outcomes; only those that
    # exist in this dataset are plotted.
    key_features = ['person_income', 'loan_amnt', 'loan_int_rate', 'loan_percent_income']
    available_features = [f for f in key_features if f in df.columns]

    if available_features:
        fig, axes = plt.subplots(2, 2, figsize=(14, 10))
        axes = axes.flatten()

        # zip stops at the shorter sequence, so at most four panels are used.
        for panel, feature in zip(axes, available_features):
            df.boxplot(column=feature, by='loan_status', ax=panel)
            panel.set_title(f'{feature} by Loan Status', fontsize=12, fontweight='bold')
            panel.set_xlabel('Loan Status (0=Paid, 1=Default)', fontsize=10)
            panel.set_ylabel(feature, fontsize=10)

        # Remove the automatic figure-level title added by the grouped
        # boxplot; all panels share one figure, so clearing it once suffices.
        fig.suptitle('')

        # Hide panels left over when fewer than four key features exist,
        # matching the other grid plots in this notebook.
        for panel in axes[len(available_features):]:
            panel.axis('off')

        plt.tight_layout()
        plt.show()

## 9. Outlier Detection

In [None]:
if df is not None:
    # Numeric feature columns, with the target dropped when it is among them.
    numeric_cols = df.select_dtypes(include=[np.number]).columns.tolist()
    try:
        numeric_cols.remove('loan_status')
    except ValueError:
        pass  # target is not in the numeric column list

    # Per-feature outlier counts from the project's IQR helper.
    outlier_counts = detect_outliers_iqr(df, numeric_cols, threshold=1.5)

    # Tabulate counts and their share of all rows, largest share first.
    feature_names = list(outlier_counts.keys())
    count_values = list(outlier_counts.values())
    share_values = [count / len(df) * 100 for count in count_values]
    outlier_summary = pd.DataFrame({
        'Feature': feature_names,
        'Outlier_Count': count_values,
        'Outlier_Percentage': share_values,
    })
    outlier_summary = outlier_summary.sort_values(by='Outlier_Percentage', ascending=False)

    print("Outlier Summary (IQR method):")
    display(outlier_summary)

    # Horizontal bars of the outlier share per feature.
    if not outlier_summary.empty:
        fig, ax = plt.subplots(figsize=(10, 6))
        ax.barh(outlier_summary['Feature'], outlier_summary['Outlier_Percentage'], color='orange')
        ax.set_xlabel('Outlier Percentage (%)', fontsize=12)
        ax.set_ylabel('Feature', fontsize=12)
        ax.set_title('Outlier Percentage by Feature', fontsize=14, fontweight='bold')
        ax.grid(axis='x', alpha=0.3)
        plt.tight_layout()
        plt.show()

## 10. Key Insights Summary

In [None]:
if df is not None:
    divider = "=" * 60
    print(divider)
    print("KEY INSIGHTS FROM EDA")
    print(divider)

    # 1. Size
    row_total = len(df)
    print(f"\n1. Dataset Size: {row_total:,} rows, {len(df.columns)} columns")

    # 2. Missing values (recomputed so this cell does not rely on earlier ones)
    missing_summary = check_missing_values(df)
    if missing_summary.empty:
        print("\n2. Missing Values: None ✓")
    else:
        print(f"\n2. Missing Values: Found in {len(missing_summary)} columns")
        print("   Action needed: Imputation or removal")

    # 3. Duplicates
    duplicates = df.duplicated().sum()
    if duplicates > 0:
        duplicate_share = duplicates / len(df) * 100
        print(f"\n3. Duplicates: {duplicates} rows ({duplicate_share:.2f}%)")
    else:
        print("\n3. Duplicates: None ✓")

    # 4. Class balance (only when the target column exists)
    if 'loan_status' in df.columns:
        target_counts = df['loan_status'].value_counts()
        default_rate = target_counts.get(1, 0) / len(df) * 100
        print(f"\n4. Default Rate: {default_rate:.2f}%")

        imbalance_ratio = target_counts.max() / target_counts.min()
        print(f"   Class imbalance: {imbalance_ratio:.2f}:1")
        if imbalance_ratio > 3:
            print("   Recommendation: Apply SMOTE or class weights")

    # 5. Outliers over the numeric features, target excluded
    numeric_cols = df.select_dtypes(include=[np.number]).columns.tolist()
    try:
        numeric_cols.remove('loan_status')
    except ValueError:
        pass  # target is not in the numeric column list
    outlier_counts = detect_outliers_iqr(df, numeric_cols, threshold=1.5)
    total_outliers = sum(outlier_counts.values())
    print(f"\n5. Outliers: {total_outliers} detected across {len(numeric_cols)} features")
    print("   Recommendation: Cap outliers at IQR boundaries")

    # Closing checklist
    print("\n" + divider)
    print("NEXT STEPS:")
    print(divider)
    next_steps = (
        "1. Run data preprocessing (src/data_preprocessing.py)",
        "2. Engineer risk tiers (Low, Medium, High)",
        "3. Apply feature engineering and selection",
        "4. Train and evaluate models",
    )
    for step in next_steps:
        print(step)
    print(divider)

## 11. Save EDA Report

In [None]:
if df is not None:
    # Local import keeps this cell self-contained.
    import os

    # Headline dataset facts collected into a single-row report.
    report = {
        'total_rows': len(df),
        'total_columns': len(df.columns),
        'numeric_features': len(df.select_dtypes(include=[np.number]).columns),
        'categorical_features': len(df.select_dtypes(include=['object', 'category']).columns),
        'missing_values': df.isnull().sum().sum(),
        'duplicate_rows': df.duplicated().sum(),
    }

    # Target statistics are added only when the target column exists.
    if 'loan_status' in df.columns:
        target_counts = df['loan_status'].value_counts()
        report['default_rate'] = target_counts.get(1, 0) / len(df) * 100
        report['class_imbalance_ratio'] = target_counts.max() / target_counts.min()

    # Save report
    report_df = pd.DataFrame([report])
    report_path = '../outputs/reports/eda_summary.csv'
    # to_csv does not create missing parent directories, so make sure the
    # output folder exists; on a fresh checkout the save would otherwise fail.
    os.makedirs(os.path.dirname(report_path), exist_ok=True)
    report_df.to_csv(report_path, index=False)

    print(f"✓ EDA report saved to: {report_path}")