# Credit Default Analysis

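This notebook performs exploratory data analysis (EDA) and builds a machine‑learning model to predict whether a credit‑card client will default on their payment in the following month. If the full dataset is present at `../data/default_of_credit_card_clients.csv`, the notebook loads it; otherwise a synthetic dataset with the same column layout is generated for demonstration purposes. Note that the synthetic default flag is drawn at random, independently of the features, so the model metrics obtained on the fallback data only demonstrate the workflow and are not meaningful.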

## 1. Setup

Import necessary libraries.

In [None]:

import pandas as pd
import numpy as np
import os
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score, roc_curve
import matplotlib.pyplot as plt


## 2. Load data

Attempt to load the credit card default dataset from the `../data` folder.  If the file is not found, a synthetic dataset is generated as a fallback.

In [None]:

# Path to the dataset. It is resolved relative to the current working
# directory: the CSV is expected one level up, inside a `data` folder.
data_path = os.path.join('..', 'data', 'default_of_credit_card_clients.csv')

# Name of the binary label column (1 = default, 0 = no default in the
# synthetic data generated below).
target_col = 'default payment next month'

if os.path.exists(data_path):
    print(f"Loading dataset from {data_path}...")
    df = pd.read_csv(data_path)
else:
    print("Dataset file not found. Generating synthetic dataset for demonstration.")
    # Generate a synthetic dataset with a similar column structure.
    # The global NumPy seed is fixed so the fallback data is reproducible.
    np.random.seed(42)
    n_samples = 1000
    df = pd.DataFrame({
        # Credit limit: normal around 200k, truncated to whole numbers
        'LIMIT_BAL': np.random.normal(200000, 50000, n_samples).astype(int),
        # Categorical codes drawn uniformly from the listed values
        'SEX': np.random.choice([1, 2], n_samples),
        'EDUCATION': np.random.choice([1, 2, 3, 4], n_samples),
        'MARRIAGE': np.random.choice([1, 2, 3], n_samples),
        # randint excludes the upper bound, so ages span 21..69
        'AGE': np.random.randint(21, 70, n_samples),
        # Repayment status codes span -2..8 (upper bound excluded)
        'PAY_0': np.random.randint(-2, 9, n_samples),
        'PAY_2': np.random.randint(-2, 9, n_samples),
        'PAY_3': np.random.randint(-2, 9, n_samples),
        'PAY_4': np.random.randint(-2, 9, n_samples),
        'PAY_5': np.random.randint(-2, 9, n_samples),
        'PAY_6': np.random.randint(-2, 9, n_samples),
        # Bill amounts: uniform on [0, 100000)
        'BILL_AMT1': np.random.uniform(0, 1e5, n_samples),
        'BILL_AMT2': np.random.uniform(0, 1e5, n_samples),
        'BILL_AMT3': np.random.uniform(0, 1e5, n_samples),
        'BILL_AMT4': np.random.uniform(0, 1e5, n_samples),
        'BILL_AMT5': np.random.uniform(0, 1e5, n_samples),
        'BILL_AMT6': np.random.uniform(0, 1e5, n_samples),
        # Payment amounts: uniform on [0, 10000)
        'PAY_AMT1': np.random.uniform(0, 1e4, n_samples),
        'PAY_AMT2': np.random.uniform(0, 1e4, n_samples),
        'PAY_AMT3': np.random.uniform(0, 1e4, n_samples),
        'PAY_AMT4': np.random.uniform(0, 1e4, n_samples),
        'PAY_AMT5': np.random.uniform(0, 1e4, n_samples),
        'PAY_AMT6': np.random.uniform(0, 1e4, n_samples),
    })
    # Assign a synthetic default flag with a ~20% positive rate. The flag is
    # drawn independently of the features, so it carries no real signal.
    df[target_col] = (np.random.rand(n_samples) < 0.2).astype(int)

# Fail early with an actionable message if the loaded file does not contain
# the label column that the summary below (and later cells) rely on.
if target_col not in df.columns:
    raise KeyError(
        f"Expected label column {target_col!r} not found; "
        f"available columns: {list(df.columns)}"
    )

# Display the first few rows
print(df.head())
# The newlines must be written as \n escapes: a quoted string literal cannot
# span several physical lines.
print("\nClass distribution:\n", df[target_col].value_counts(normalize=True))


## 3. Prepare data for modelling

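Select the feature columns, split the data into stratified training and test sets, and then standardise the features. The scaler is fitted on the training set only and applied to both sets, so no information from the test set leaks into the preprocessing.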

In [None]:

# Predictors are every column except the row identifier and the label.
# 'ID' is absent from some inputs, so a missing label is ignored by the drop.
X = df.drop(columns=['ID', 'default payment next month'], errors='ignore')
feature_columns = list(X.columns)
y = df['default payment next month']

# Hold out 20% of the rows for testing; stratifying on y keeps the class
# ratio the same in both partitions, and the fixed seed makes it repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

# Learn the scaling statistics from the training rows only, then apply that
# same transformation to both partitions.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)


## 4. Train logistic regression model

Fit a logistic regression classifier and evaluate its performance.

In [None]:

# Train logistic regression on the standardised training features,
# allowing the solver up to 1000 iterations.
model = LogisticRegression(max_iter=1000)
model.fit(X_train_scaled, y_train)

# Predictions on the held-out test set:
#   y_pred - hard class labels
#   y_prob - second column of predict_proba, i.e. the probability of the
#            second entry of model.classes_ (label 1 when the target is
#            coded 0/1)
y_pred = model.predict(X_test_scaled)
y_prob = model.predict_proba(X_test_scaled)[:, 1]

# Evaluation metrics. The newlines must be written as \n escapes: a quoted
# string literal cannot span several physical lines.
print("Classification Report:\n", classification_report(y_test, y_pred))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))
# ROC AUC is computed from the probabilities, not the hard labels
print("ROC AUC:", roc_auc_score(y_test, y_prob))


## 5. Plot ROC curve

Visualise the trade‑off between the true positive rate and the false positive rate.

In [None]:

# Compute the false/true positive rates over the range of decision thresholds
fpr, tpr, thresholds = roc_curve(y_test, y_prob)

# Draw the curve on an explicit figure/axes pair
fig, ax = plt.subplots()
ax.plot(fpr, tpr)
ax.set_xlabel('False Positive Rate')
ax.set_ylabel('True Positive Rate')
ax.set_title('ROC Curve')
plt.show()
