# Credit Risk - Exploratory Data Analysis (EDA)

This notebook provides an exploratory data analysis of the credit risk dataset.

## 1. Setup and Imports

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import sys
from pathlib import Path

# Add parent directory to path to import src modules
sys.path.append('..')

from src.config_loader import load_config, get_absolute_path
from src.data_loader import DataLoader

# Plot appearance: seaborn grid theme and a wide default figure size
sns.set_style('whitegrid')
plt.rcParams['figure.figsize'] = (12, 6)

# pandas display limits: no cap on columns, at most 100 rows
for option_name, option_value in (
    ('display.max_columns', None),
    ('display.max_rows', 100),
):
    pd.set_option(option_name, option_value)

print("Setup complete!")

## 2. Load Configuration and Data

In [None]:
# Read the project configuration and hand it to the data loader
config = load_config()
loader = DataLoader(config)

# Attempt the load. A missing file is reported and leaves `data` as None,
# which is the value the later cells test before doing any work.
try:
    data = loader.load_data()
except FileNotFoundError:
    print("Error: Please place 'base_historica.csv' in the data/raw/ directory")
    data = None
else:
    print(f"Data loaded successfully! Shape: {data.shape}")

## 3. Basic Data Information

In [None]:
if data is not None:
    # Dimensions first, then the dtype of every column
    print(f"Dataset Shape: {data.shape}")
    print("\nColumn Names and Types:", data.dtypes, sep="\n")

    # Rich preview of the leading rows
    print("\nFirst few rows:")
    display(data.head())

In [None]:
if data is not None:
    # Heading, then the describe() table rendered through rich display
    print("Statistical Summary:")
    stats_table = data.describe()
    display(stats_table)

## 4. Missing Values Analysis

In [None]:
if data is not None:
    # Null count per column, and the same figure as a share of all rows
    missing_values = data.isnull().sum()
    missing_percentage = (missing_values / len(data)) * 100

    # One row per column, highest missing percentage first
    missing_df = pd.DataFrame(
        {'Missing Count': missing_values, 'Percentage': missing_percentage}
    ).sort_values('Percentage', ascending=False)

    # Boolean mask selecting the columns that have at least one null
    has_missing = missing_df['Missing Count'] > 0

    print("Missing Values Summary:")
    display(missing_df[has_missing])

    if not has_missing.any():
        print("No missing values found!")
    else:
        # Horizontal bar chart limited to the columns with gaps
        missing_data = missing_df[has_missing]
        plt.figure(figsize=(10, 6))
        plt.barh(missing_data.index, missing_data['Percentage'])
        plt.xlabel('Percentage of Missing Values')
        plt.title('Missing Values by Column')
        plt.tight_layout()
        plt.show()

## 5. Target Variable Distribution

In [None]:
if data is not None:
    # Target name is read from the config; 'default' is the fallback name
    target_col = config['features'].get('target_column', 'default')

    if target_col not in data.columns:
        print(f"Target column '{target_col}' not found in dataset.")
        print("Available columns:", list(data.columns))
    else:
        # Absolute and relative class frequencies, computed once and reused
        class_counts = data[target_col].value_counts()
        class_shares = data[target_col].value_counts(normalize=True) * 100

        print(f"Target Variable: {target_col}")
        print("\nValue Counts:")
        print(class_counts)
        print("\nPercentage Distribution:")
        print(class_shares)

        # Side-by-side view: bar chart of counts, pie chart of proportions
        fig, (bar_ax, pie_ax) = plt.subplots(1, 2, figsize=(14, 5))

        class_counts.plot(kind='bar', ax=bar_ax, color='skyblue')
        bar_ax.set_title(f'Distribution of {target_col}')
        bar_ax.set_ylabel('Count')
        bar_ax.set_xlabel(target_col)

        class_counts.plot(kind='pie', ax=pie_ax, autopct='%1.1f%%')
        pie_ax.set_title(f'Proportion of {target_col}')
        pie_ax.set_ylabel('')

        plt.tight_layout()
        plt.show()

## 6. Numerical Features Analysis

In [None]:
if data is not None:
    # Numeric columns; this list is also read by the later correlation
    # and box-plot cells
    numerical_cols = data.select_dtypes(include=[np.number]).columns.tolist()

    print(f"Number of numerical features: {len(numerical_cols)}")
    print("Numerical features:", numerical_cols)

    if len(numerical_cols) > 0:
        # Three histograms per row; ceiling division gives the row count
        n_cols = 3
        n_rows = (len(numerical_cols) + n_cols - 1) // n_cols

        # squeeze=False makes plt.subplots return a 2-D array for every grid
        # shape, so flatten() always yields one Axes per grid slot. Wrapping
        # the single-row result in a list instead gave a one-element list
        # holding the whole Axes array, which broke datasets with 1-3
        # numeric columns.
        fig, axes = plt.subplots(n_rows, n_cols, figsize=(15, 4*n_rows),
                                 squeeze=False)
        axes = axes.flatten()

        # One histogram per numeric column (zip stops at the shorter input)
        for ax, col in zip(axes, numerical_cols):
            data[col].hist(bins=30, ax=ax, edgecolor='black')
            ax.set_title(f'Distribution of {col}')
            ax.set_xlabel(col)
            ax.set_ylabel('Frequency')

        # Hide the grid slots left over in the last row
        for ax in axes[len(numerical_cols):]:
            ax.set_visible(False)

        plt.tight_layout()
        plt.show()

## 7. Correlation Analysis

In [None]:
if data is not None and len(numerical_cols) > 0:
    # Pairwise correlation matrix over the numeric columns
    correlation = data[numerical_cols].corr()

    # Annotated heatmap with the colour scale centred on zero
    heatmap_options = dict(
        annot=True,
        fmt='.2f',
        cmap='coolwarm',
        center=0,
        square=True,
        linewidths=1,
    )
    plt.figure(figsize=(12, 10))
    sns.heatmap(correlation, **heatmap_options)
    plt.title('Correlation Heatmap of Numerical Features')
    plt.tight_layout()
    plt.show()

    # If the configured target is numeric, list its correlations,
    # largest value first
    target_col = config['features'].get('target_column', 'default')
    if target_col in correlation.columns:
        print(f"\nCorrelations with {target_col}:")
        target_corr = correlation[target_col].sort_values(ascending=False)
        display(target_corr)

## 8. Categorical Features Analysis

In [None]:
if data is not None:
    # Columns stored as object or pandas 'category' dtype
    categorical_cols = data.select_dtypes(include=['object', 'category']).columns.tolist()

    print(f"Number of categorical features: {len(categorical_cols)}")
    print("Categorical features:", categorical_cols)

    # Report the cardinality of every categorical column; the full value
    # breakdown is printed only when there are at most 10 distinct values
    for col in categorical_cols:
        n_unique = data[col].nunique()
        print(f"\n{col}: {n_unique} unique values")
        if n_unique > 10:
            continue
        print(data[col].value_counts())

## 9. Outlier Detection

In [None]:
if data is not None and len(numerical_cols) > 0:
    # Three box plots per row; ceiling division gives the row count
    n_cols = 3
    n_rows = (len(numerical_cols) + n_cols - 1) // n_cols

    # squeeze=False makes plt.subplots return a 2-D array for every grid
    # shape, so flatten() always yields one Axes per grid slot. Wrapping the
    # single-row result in a list instead gave a one-element list holding
    # the whole Axes array, which broke datasets with 1-3 numeric columns.
    fig, axes = plt.subplots(n_rows, n_cols, figsize=(15, 4*n_rows),
                             squeeze=False)
    axes = axes.flatten()

    # One box plot per numeric column (zip stops at the shorter input)
    for ax, col in zip(axes, numerical_cols):
        data.boxplot(column=col, ax=ax)
        ax.set_title(f'Box Plot of {col}')
        ax.set_ylabel(col)

    # Hide the grid slots left over in the last row
    for ax in axes[len(numerical_cols):]:
        ax.set_visible(False)

    plt.tight_layout()
    plt.show()

## 10. Feature Relationships with Target

In [None]:
if data is not None:
    # Target name is read from the config; 'default' is the fallback name
    target_col = config['features'].get('target_column', 'default')

    if target_col in data.columns and len(numerical_cols) > 0:
        # Plot every numeric feature except the target itself
        plot_cols = [col for col in numerical_cols if col != target_col]

        if len(plot_cols) > 0:
            # Three box plots per row; ceiling division gives the row count
            n_cols = 3
            n_rows = (len(plot_cols) + n_cols - 1) // n_cols

            # squeeze=False makes plt.subplots return a 2-D array for every
            # grid shape, so flatten() always yields one Axes per grid slot.
            # Wrapping the single-row result in a list instead gave a
            # one-element list holding the whole Axes array, which broke
            # datasets with 1-3 plottable features.
            fig, axes = plt.subplots(n_rows, n_cols, figsize=(15, 4*n_rows),
                                     squeeze=False)
            axes = axes.flatten()

            # One grouped box plot per feature, split by target value
            for ax, col in zip(axes, plot_cols):
                data.boxplot(column=col, by=target_col, ax=ax)
                ax.set_title(f'{col} by {target_col}')
                ax.set_xlabel(target_col)
                ax.set_ylabel(col)

            # Hide the grid slots left over in the last row
            for ax in axes[len(plot_cols):]:
                ax.set_visible(False)

            plt.suptitle('')  # Remove default title
            plt.tight_layout()
            plt.show()

## 11. Summary and Conclusions

Based on the exploratory analysis above, document key findings:

1. **Data Quality**: Note any missing values, duplicates, or data quality issues
2. **Target Distribution**: Is the dataset balanced or imbalanced?
3. **Feature Characteristics**: What types of features are present?
4. **Correlations**: Which features are most correlated with the target?
5. **Outliers**: Are there significant outliers that need attention?
6. **Next Steps**: What preprocessing or feature engineering is needed?

Add your observations here after running the analysis.