# 01 | Baseline Model  
*Train a simple logistic regression on the engineered data to establish a performance benchmark.*

In [None]:
import sqlite3
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score, precision_recall_curve, auc

## Load Engineered Dataset  
We’ll pull from our `credit_risk_engineered` table in the SQLite DB.

In [None]:
# Read the whole engineered feature table into a DataFrame.
# The connection is closed in `finally` so the handle is released even if the
# query raises (e.g. the table does not exist). sqlite3's own `with conn:` form
# only manages transactions and does NOT close the connection, so it is not
# used here.
conn = sqlite3.connect("../data/loanvet.db")
try:
    df = pd.read_sql_query("SELECT * FROM credit_risk_engineered", conn)
finally:
    conn.close()
df.head()

## Train/Test Split
Stratify on the target so the train and test sets keep the same class ratio as the full (imbalanced) dataset.

In [None]:
# Separate the label from the features, then hold out 20% of rows for testing.
target_col = "SeriousDlqin2yrs"
y = df[target_col]
X = df.drop(columns=[target_col])

# `stratify=y` keeps the class proportions equal in both splits;
# the fixed seed makes the split reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

## Baseline Logistic Regression
Train with class weights to handle imbalance.

In [None]:
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

# Build the two stages separately, then chain them: features are standardized
# before reaching the classifier, and the scaler is fit on training data only
# because it lives inside the pipeline.
scaler = StandardScaler()
log_reg = LogisticRegression(class_weight="balanced", max_iter=1000)
pipeline = make_pipeline(scaler, log_reg)

pipeline.fit(X_train, y_train)

# predict_proba returns one column per class; column 1 is the positive class.
test_probabilities = pipeline.predict_proba(X_test)
y_pred_prob = test_probabilities[:, 1]

## Performance Metrics  
Calculate ROC‑AUC and PR‑AUC.

In [None]:
# Precision-recall curve on the held-out set; PR-AUC is the area under it
# (sklearn's `auc` integrates with the trapezoidal rule).
precision, recall, _ = precision_recall_curve(y_test, y_pred_prob)
pr_auc = auc(x=recall, y=precision)

# ROC-AUC from the same predicted probabilities.
roc_auc = roc_auc_score(y_test, y_pred_prob)

print(f"ROC‑AUC: {roc_auc:.4f}, PR‑AUC: {pr_auc:.4f}")

## Feature Importance (Coefficients)  
Examine which features carry the most weight.

In [None]:
# The notebook never defines `model`; the fitted classifier lives inside
# `pipeline`. `make_pipeline` names each step after its lowercased class name,
# so the logistic regression is stored under "logisticregression".
model = pipeline.named_steps["logisticregression"]

# Coefficients apply to the *standardized* features (StandardScaler runs first
# in the pipeline), so their magnitudes are comparable across features.
# `coef_[0]` takes the single weight row — assumes a binary target.
coef_df = (
    pd.DataFrame({
        "feature": X_train.columns,
        "coef": model.coef_[0],
    })
    .assign(abs_coef=lambda d: d["coef"].abs())
    .sort_values("abs_coef", ascending=False)
)
coef_df.head(10)


## ROC & Precision‑Recall Curves


In [None]:
import matplotlib.pyplot as plt
from sklearn.metrics import roc_curve

# --- ROC curve: true-positive rate against false-positive rate ---
fpr, tpr, _ = roc_curve(y_test, y_pred_prob)

roc_fig, roc_ax = plt.subplots()
roc_ax.plot(fpr, tpr, label=f"ROC (AUC = {roc_auc:.2f})")
roc_ax.set_xlabel("False Positive Rate")
roc_ax.set_ylabel("True Positive Rate")
roc_ax.legend()
plt.show()

# --- Precision-recall curve: reuses `precision` / `recall` from the metrics cell ---
pr_fig, pr_ax = plt.subplots()
pr_ax.plot(recall, precision, label=f"PR (AUC = {pr_auc:.2f})")
pr_ax.set_xlabel("Recall")
pr_ax.set_ylabel("Precision")
pr_ax.legend()
plt.show()

## Conclusions  
- Baseline ROC‑AUC: 0.8589
- Baseline PR‑AUC: 0.3935

**Next:** try tree‑based models, hyperparameter tuning, and additional feature engineering.