# 01 – Data Understanding

This notebook performs initial data understanding and exploratory analysis for the **FinTech Loan Default Prediction System**.

**Goals**
- Load dataset and inspect shape, schema, and missingness
- Review class balance for the target (`loan_status`)
- Summarize numeric and categorical features
- Plot quick distributions and correlations

> Dataset: `synthetic_lending_club.csv` (generated for demo/testing)

In [None]:
# Imports
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from pathlib import Path

# Raise pandas' display limit to 100 columns so wide frames render without column elision.
pd.set_option('display.max_columns', 100)


In [None]:
# Load data
# Candidate locations are probed in order; the first one that exists is used.
DATA_PATHS = [
    Path('/mnt/data/loan-default-prediction/data/synthetic_lending_club.csv'),
    Path('data/synthetic_lending_club.csv'),
]

# next() stops at the first hit, so later candidates are never touched once one is found.
data_path = next((candidate for candidate in DATA_PATHS if candidate.exists()), None)
if data_path is None:
    raise FileNotFoundError('synthetic_lending_club.csv not found. Place it under data/.')

print('Using data at:', data_path)
df = pd.read_csv(data_path)
df.head()


In [None]:
# Basic shape & schema
# Report (rows, columns) first, then the per-column dtypes under a blank-line-separated header.
frame_shape = df.shape
print('Shape:', frame_shape)
print('\nDtypes:', df.dtypes, sep='\n')


In [None]:
# Missingness overview
# Fraction of missing values per column, worst offenders first.
miss = df.isna().mean().sort_values(ascending=False)
# Turn the Series into a two-column frame: the column name and its missing rate.
miss_df = miss.rename_axis('column').reset_index(name='missing_rate')
miss_df.head(20)


In [None]:
# Target distribution
# DataFrame.get returns None for an absent column, which lets us fail with a clear message.
target = df.get('loan_status')
if target is None:
    raise KeyError('Expected target column loan_status not found')

# Count every class (missing values included) and order the classes by label.
class_counts = target.value_counts(dropna=False).sort_index()
# Share of all rows per class, rounded to three decimals for display.
class_rate = class_counts.div(len(df)).round(3)
summary = pd.DataFrame({'count': class_counts, 'rate': class_rate})
summary


In [None]:
# Numeric vs categorical feature lists
# Numeric columns are whatever pandas classifies under np.number; everything else is
# treated as categorical, preserving the original column order.
numeric_cols = df.select_dtypes(include=[np.number]).columns.tolist()
numeric_lookup = set(numeric_cols)
categorical_cols = [name for name in df.columns if name not in numeric_lookup]

for header, cols in (
    ('Numeric columns (n=%d):', numeric_cols),
    ('\nCategorical columns (n=%d):', categorical_cols),
):
    print(header % len(cols))
    print(cols)


In [None]:
# Histograms for a sample of numeric columns (excluding target)
# Only the first 12 non-target numeric columns are drawn to keep the output short.
feature_cols = [c for c in numeric_cols if c != 'loan_status']
cols_to_plot = feature_cols[:12]
for c in cols_to_plot:
    fig, ax = plt.subplots(figsize=(6, 4))
    # Missing values are dropped before binning.
    ax.hist(df[c].dropna(), bins=30)
    ax.set_title(f'Distribution: {c}')
    ax.set_xlabel(c)
    ax.set_ylabel('Count')
    fig.tight_layout()
    plt.show()


In [None]:
# Correlation heatmap (numeric features)
# Pairwise correlation between the non-target numeric columns, drawn as a colour grid.
num_for_corr = [c for c in numeric_cols if c != 'loan_status']
if len(num_for_corr) > 1:
    corr = df[num_for_corr].corr(numeric_only=True)
    # Take tick labels from the matrix itself so they always match the plotted cells.
    corr_labels = corr.columns.tolist()
    tick_positions = list(range(len(corr_labels)))

    fig, ax = plt.subplots(figsize=(8, 6))
    # Correlation is bounded to [-1, 1]. Pinning the colour limits to that range and using
    # a diverging colormap makes the sign and strength readable, and keeps colours
    # comparable across datasets instead of being rescaled to each matrix's min/max.
    image = ax.imshow(corr.to_numpy(), aspect='auto', cmap='coolwarm', vmin=-1, vmax=1)
    fig.colorbar(image, ax=ax, label='Correlation')
    ax.set_title('Correlation Heatmap (numeric features)')
    ax.set_xticks(tick_positions)
    ax.set_xticklabels(corr_labels, rotation=90)
    ax.set_yticks(tick_positions)
    ax.set_yticklabels(corr_labels)
    fig.tight_layout()
    plt.show()
else:
    print('Not enough numeric features for a correlation matrix.')


In [None]:
# Top categories for a few categorical columns
# Preview at most six categorical columns, showing the ten most frequent values of each.
cats_preview = categorical_cols[:6]
for c in cats_preview:
    top_counts = df[c].value_counts().iloc[:10]
    fig, ax = plt.subplots(figsize=(7, 4))
    # Labels are cast to str so non-string categories are placed on a categorical axis.
    ax.bar(top_counts.index.astype(str), top_counts.values)
    ax.set_title(f'Top categories: {c}')
    plt.setp(ax.get_xticklabels(), rotation=45, ha='right')
    ax.set_ylabel('Count')
    fig.tight_layout()
    plt.show()


In [None]:
# Default rate by selected categorical features
# Only the listed columns that actually exist in the frame are analysed.
by_cols = [c for c in ['grade','term','purpose','verification_status','home_ownership'] if c in df.columns]

# The per-group mean is only meaningful for a numeric target. Checking up front replaces
# an opaque pandas aggregation failure with an actionable message.
# NOTE(review): reading the mean as a "default rate" assumes loan_status is 0/1 with
# 1 = default -- confirm against the dataset.
if by_cols and not pd.api.types.is_numeric_dtype(df['loan_status']):
    raise TypeError(
        'loan_status must be numeric (e.g. 0/1) to compute default rates; '
        'got dtype %s' % df['loan_status'].dtype
    )

for c in by_cols:
    # Mean target per category, highest first.
    rate = df.groupby(c)['loan_status'].mean().sort_values(ascending=False)
    fig, ax = plt.subplots(figsize=(7, 4))
    ax.bar(rate.index.astype(str), rate.values)
    ax.set_title(f'Default Rate by {c}')
    plt.setp(ax.get_xticklabels(), rotation=45, ha='right')
    ax.set_ylabel('Default Rate (mean loan_status)')
    fig.tight_layout()
    plt.show()


---
### Notes
- All plots use **matplotlib** only (no seaborn).
- This notebook is for quick orientation. Detailed cleaning & encoding live in `02_data_preparation.ipynb`.
