# 01 | Baseline Model  
*Train a simple logistic regression on the engineered data to establish a performance benchmark.*

In [25]:
import sqlite3
import pandas as pd
import numpy as np
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score, precision_recall_curve, auc


## Load Engineered Dataset  
We’ll pull from our `credit_risk_engineered` table in the SQLite DB.

In [26]:
conn = sqlite3.connect("../data/loanvet.db")
df = pd.read_sql_query("SELECT * FROM credit_risk_baseline", conn)
conn.close()
df.head()

Unnamed: 0,SeriousDlqin2yrs,RevolvingUtilizationOfUnsecuredLines,age,NumberOfTime30-59DaysPastDueNotWorse,DebtRatio,MonthlyIncome,NumberOfOpenCreditLinesAndLoans,NumberOfTimes90DaysLate,NumberRealEstateLoansOrLines,NumberOfTime60-89DaysPastDueNotWorse,...,NumberOfTime30-59DaysPastDueNotWorse_log,NumberOfTimes90DaysLate_log,NumberOfTime60-89DaysPastDueNotWorse_log,TotalDelinquencies_log,HighUtilizationFlag,IncomePerCreditLine,AgeGroup_MidAge,AgeGroup_Senior,DependentsGroup_Small,DependentsGroup_Large
0,1,0.766127,45,2,0.802982,9120.0,13,0,6,0,...,1.098612,0.0,0.0,1.098612,0,651.428571,1,0,1,0
1,0,0.957151,40,0,0.121876,2600.0,4,0,0,0,...,0.0,0.0,0.0,0.0,1,520.0,1,0,1,0
2,0,0.65818,38,1,0.085113,3042.0,2,1,0,0,...,0.693147,0.693147,0.0,1.098612,0,1014.0,1,0,0,0
3,0,0.23381,30,0,0.03605,3300.0,5,0,0,0,...,0.0,0.0,0.0,0.0,0,550.0,1,0,0,0
4,0,0.907239,49,1,0.024926,50000.0,7,0,1,0,...,0.693147,0.0,0.0,0.693147,1,6250.0,1,0,0,0


## Drop Redundant Columns

In [27]:
drop_cols = [
    'RevolvingUtilizationOfUnsecuredLines',
    'MonthlyIncome',
    'DebtRatio',
    'NumberOfTime30-59DaysPastDueNotWorse',
    'NumberOfTimes90DaysLate',
    'NumberOfTime60-89DaysPastDueNotWorse',
    'NumberOfTime30-59DaysPastDueNotWorse_log',
    'NumberOfTimes90DaysLate_log',
    'NumberOfTime60-89DaysPastDueNotWorse_log',
    'TotalDelinquencies'
]
df = df.drop(columns=drop_cols, errors="ignore")

## Train/Test Split
Stratify on the target to preserve imbalance.

In [28]:
X = df.drop(columns=["SeriousDlqin2yrs"])
y = df["SeriousDlqin2yrs"]
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42
)

## Baseline Logistic Regression
Train with class weights to handle imbalance.

In [29]:
pipeline = make_pipeline(
    StandardScaler(),
    LogisticRegression(class_weight="balanced", max_iter=1000)
)

pipeline.fit(X_train, y_train)
y_pred_prob = pipeline.predict_proba(X_test)[:, 1]

## Performance Metrics  
Calculate ROC‑AUC and PR‑AUC.

In [30]:
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

roc_auc_scores = []
pr_auc_scores = []

for train_idx, test_idx in skf.split(X, y):
    X_train, X_test = X.iloc[train_idx], X.iloc[test_idx]
    y_train, y_test = y.iloc[train_idx], y.iloc[test_idx]

    pipeline = make_pipeline(
        StandardScaler(),
        LogisticRegression(class_weight="balanced", max_iter=1000, random_state=42)
    )
    pipeline.fit(X_train, y_train)
    y_pred_prob = pipeline.predict_proba(X_test)[:, 1]

    roc_auc = roc_auc_score(y_test, y_pred_prob)
    precision, recall, _ = precision_recall_curve(y_test, y_pred_prob)
    pr_auc = auc(recall, precision)

    roc_auc_scores.append(roc_auc)
    pr_auc_scores.append(pr_auc)

print(f"Mean ROC-AUC: {np.mean(roc_auc_scores):.4f} ± {np.std(roc_auc_scores):.4f}")
print(f"Mean PR-AUC: {np.mean(pr_auc_scores):.4f} ± {np.std(pr_auc_scores):.4f}")

Mean ROC-AUC: 0.8556 ± 0.0032
Mean PR-AUC: 0.3706 ± 0.0113


## Feature Importance (Coefficients)  
Examine which features carry the most weight.

In [31]:
logreg_model = pipeline.steps[-1][1]

coef_df = pd.DataFrame({
    "feature": X_train.columns,
    "coef": logreg_model.coef_[0]
})

coef_df = coef_df.assign(abs_coef=lambda d: d.coef.abs()).sort_values("abs_coef", ascending=False)

print(coef_df.head(10))


                                     feature      coef  abs_coef
9                     TotalDelinquencies_log  0.788048  0.788048
6   RevolvingUtilizationOfUnsecuredLines_log  0.689448  0.689448
8                              DebtRatio_log -0.314060  0.314060
0                                        age -0.291139  0.291139
4                 MonthlyIncome_missing_flag  0.265094  0.265094
1            NumberOfOpenCreditLinesAndLoans  0.190339  0.190339
2               NumberRealEstateLoansOrLines  0.136626  0.136626
7                          MonthlyIncome_log -0.135310  0.135310
10                       HighUtilizationFlag  0.040241  0.040241
13                           AgeGroup_Senior -0.035346  0.035346


## Conclusions  
- Mean Baseline ROC‑AUC: 0.8552 ± 0.0039
- Mean PR‑AUC: 0.3713 ± 0.0116