# 04 - Model Pipeline Baselines (V3)

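- Train/test split (80/20, stratified on the target)
- ColumnTransformer (standard-scale numeric columns, one-hot encode non-numeric columns)
- Baselines: Majority class (implicit reference only; no majority model is fitted below), LogisticRegression (class_weight='balanced'), DecisionTree (class_weight='balanced')
- Metrics: Accuracy, Precision, Recall (positive class at a 0.5 probability threshold), F1 macro/weighted, ROC-AUC, PR-AUC, Brier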


In [None]:
from pathlib import Path
import numpy as np
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import (accuracy_score, precision_score, recall_score, f1_score,
                             roc_auc_score, average_precision_score, brier_score_loss)
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.tree import DecisionTreeClassifier

# Input feature table and output directory for artifacts; both paths are
# relative to the working directory the notebook is run from.
INP = Path('../v3_data/employee_promotion_features.csv')
ART = Path('../v3_artifacts')
# parents=True so the cell also works when the directory tree does not exist yet.
ART.mkdir(parents=True, exist_ok=True)

df = pd.read_csv(INP)
TARGET = 'Promotion_Eligible'
X = df.drop(columns=[TARGET])
y = df[TARGET]

# Stratified 80/20 hold-out split with a fixed seed so the class ratio is kept
# in both parts and the split is reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42
)

# Columns are routed by dtype: numeric -> scaling, everything else -> one-hot.
num_cols = X.select_dtypes(include=np.number).columns.tolist()
cat_cols = X.select_dtypes(exclude=np.number).columns.tolist()

# handle_unknown='ignore' keeps prediction from failing on categories that
# appear in the test split but were not seen during fitting.
pre = ColumnTransformer([
    ('num', StandardScaler(), num_cols),
    ('cat', OneHotEncoder(handle_unknown='ignore'), cat_cols)
])

# Baseline estimators; both use class_weight='balanced'.
candidates = {
    'logreg': LogisticRegression(max_iter=2000, class_weight='balanced'),
    'dt': DecisionTreeClassifier(random_state=42, class_weight='balanced')
}

rows = []
for name, model in candidates.items():
    # NOTE: `pre` is shared by reference, so every pipeline re-fits the same
    # transformer object (always on X_train here).
    pipe = Pipeline([('pre', pre), ('model', model)])
    pipe.fit(X_train, y_train)

    # Take the label values from the fitted model instead of assuming the
    # target is encoded as integer 0/1; predict_proba columns follow classes_.
    classes = pipe.classes_
    if len(classes) != 2:
        raise ValueError(
            f'{TARGET!r} must be binary; found {len(classes)} classes: {list(classes)}'
        )
    neg_label, pos_label = classes

    # Probability of the positive class, hard-thresholded at 0.5.
    proba = pipe.predict_proba(X_test)[:, 1]
    pred = np.where(proba >= 0.5, pos_label, neg_label)

    rows.append({
        'model': name,
        'accuracy': accuracy_score(y_test, pred),
        'precision': precision_score(y_test, pred, pos_label=pos_label),
        'recall': recall_score(y_test, pred, pos_label=pos_label),
        'f1_macro': f1_score(y_test, pred, average='macro'),
        'f1_weighted': f1_score(y_test, pred, average='weighted'),
        'rocauc': roc_auc_score(y_test, proba),
        'prauc': average_precision_score(y_test, proba, pos_label=pos_label),
        'brier': brier_score_loss(y_test, proba, pos_label=pos_label)
    })

# One row per candidate model; persisted for later comparison and displayed
# as the cell output.
res = pd.DataFrame(rows)
res.to_csv(ART / 'baseline_metrics.csv', index=False)
res
