In [None]:
import pandas as pd


In [None]:
import sys

# Make the project's own packages (`preprocessing`, `models`) importable.
# NOTE(review): this absolute path is machine-specific — consider making it
# configurable.
project_root = "/Emilys_code/"

# Prepend only when it is not already the first entry, so re-running this cell
# does not stack duplicate entries on sys.path.
if sys.path[:1] != [project_root]:
    sys.path.insert(0, project_root)
print(sys.path[0])

from preprocessing.split_format2 import split_and_format_data
from models.xgboost_anne4 import train_xgboost_model_random


In [None]:
# --- Run configuration ---------------------------------------------------
# Input file and the label used to identify this cohort.
DATA_PATH = "/cohort65_to_69.parquet"
COHORT_NAME = "Male_65-69"

# Model label and the metric name for model selection.
MODEL_NAME = "Model5"
MAXIMIZE_METRIC = "f2"

# Seed passed to the train/test split.
RANDOM_STATE = 42

In [None]:
# Load the cohort file, restrict it to males, and split it into train/test
# parts (30% test), stratifying on the "year" column.

# Columns handed to split_and_format_data as `drop_cols`.
excluded_columns = [
    "pnr",
    "family_id",
    "in_dk",
    "de_age",
    "alive",
    "de_parish",
    "de_region",
    "de_municipality",
    "de_time_to_death",
    "de_age_at_death",
    "se_educ_date",
    "de_sex",
]

X_train_raw, X_test_raw, y_train, y_test, id_train, id_test, preprocessor = (
    split_and_format_data(
        data_path=DATA_PATH,
        drop_cols=excluded_columns,
        sex_filter=["Male"],
        target_col="early_death",
        test_size=0.3,
        random_state=RANDOM_STATE,
        stratify_on_year=True,
        year_col="year",
    )
)

# Quick sanity check: partition sizes and the target mean in each part.
print("Train shape:", X_train_raw.shape)
print("Test shape:", X_test_raw.shape)
print("Train death rate:", y_train.mean())
print("Test death rate:", y_test.mean())

In [None]:
# Scale pos weight: (1 - mean) / mean of the training target, i.e. the
# negative-to-positive ratio for a 0/1 target. It is passed to the model
# search as `scale_pos_weight`.
positive_rate = y_train.mean()

# A rate of 0 (or NaN from an empty target) would make the ratio inf/NaN, and a
# rate of 1 would make it 0. Fail loudly instead of training with such a weight.
if not 0 < positive_rate < 1:
    raise ValueError(
        "Cannot derive scale_pos_weight: training target mean is "
        f"{positive_rate!r}; both classes must be present in y_train."
    )

spw = (1 - positive_rate) / positive_rate
print(spw)

In [None]:
# Hyperparameters: candidate values per setting, handed to
# train_xgboost_model_random as `param_grid`.
param_grid = dict(
    # Tree size and boosting schedule
    max_depth=[2, 3, 4],
    learning_rate=[0.01, 0.03, 0.05, 0.08],
    n_estimators=[400, 800, 1200],
    # Row / column subsampling
    subsample=[0.5, 0.6, 0.7, 0.8],
    colsample_bytree=[0.3, 0.5, 0.7, 1.0],
    # Split and leaf constraints
    gamma=[0, 0.5, 1, 2, 5, 10],
    min_child_weight=[1, 5, 10, 20],
    # L2 / L1 regularisation
    reg_lambda=[1, 5, 10, 20, 40],
    reg_alpha=[0, 0.1, 0.5, 1.0],
)

In [None]:
# Derive a cohort-specific seed for the hyperparameter search.
# The built-in hash() of a str is salted per interpreter process
# (PYTHONHASHSEED), so it produced a different seed after every kernel restart
# and the search was not reproducible. CRC32 of the cohort name is stable
# across runs and is already an unsigned 32-bit integer.
import zlib

search_seed = zlib.crc32(COHORT_NAME.encode("utf-8"))

# Run the search on the training data. The call returns the fitted model, its
# chosen parameters and a decision threshold.
model5_best_model, model5_best_params, model5_thr = train_xgboost_model_random(
    X_train_raw,
    y_train,
    preprocessor=preprocessor,
    param_grid=param_grid,
    cv_folds=3,
    random_state=search_seed,
    maximize=MAXIMIZE_METRIC,  # "f2", from the configuration cell
    scale_pos_weight=spw,
    n_iter=80,
)

In [None]:
# Find optimal threshold restricted to 0.3–0.7
from sklearn.metrics import precision_recall_curve
import numpy as np

# Re-select the decision threshold from the precision-recall curve, overwriting
# the `model5_thr` returned by the training call.
# NOTE(review): the curve is computed on predictions for the same rows that
# were passed to the training call, so the F2 reported here is presumably
# optimistic — consider using held-out or cross-validated predictions.
y_prob_train = model5_best_model.predict_proba(X_train_raw)[:, 1]
precision, recall, thresholds = precision_recall_curve(y_train, y_prob_train)

# precision/recall carry one trailing element with no matching threshold;
# drop it so all three arrays line up.
beta = 2
p = precision[:-1]
r = recall[:-1]
# F-beta with beta=2; the small epsilon avoids 0/0 where p == r == 0.
f2 = (1 + beta**2) * (p * r) / (beta**2 * p + r + 1e-12)

# Only thresholds inside [low, high] are eligible.
low, high = 0.3, 0.7
mask = (thresholds >= low) & (thresholds <= high)
if not mask.any():
    # Without this guard the .max() below fails with an opaque
    # "zero-size array" reduction error.
    raise ValueError(
        f"No candidate threshold falls inside [{low}, {high}]; predicted "
        f"probabilities range from {y_prob_train.min():.4f} to "
        f"{y_prob_train.max():.4f}."
    )

# Take the first (i.e. lowest) eligible threshold whose F2 is within
# `f2_tolerance` of the best F2 in the window.
f2_tolerance = 0.002
best_f2 = f2[mask].max()
best_idx = np.where(f2[mask] >= best_f2 - f2_tolerance)[0][0]
model5_thr = float(thresholds[mask][best_idx])

print("Chosen threshold:", model5_thr)
print("Precision / Recall:", p[mask][best_idx], r[mask][best_idx])
print("F2:", f2[mask][best_idx])

In [None]:
# Evaluation on test set
from sklearn.metrics import (
    f1_score,
    precision_score,
    recall_score,
    average_precision_score,
    balanced_accuracy_score,
    confusion_matrix,
    accuracy_score,
    fbeta_score,
    roc_auc_score,
)

# Score the test set: column 1 of predict_proba is used as the score, and a
# row is labelled 1 when its score reaches the chosen threshold.
y_prob = model5_best_model.predict_proba(X_test_raw)[:, 1]
y_pred = (y_prob >= model5_thr).astype(int)

# Each entry is (label, zero-argument metric). Metrics are evaluated one at a
# time inside the loop so that output appears in this order, line by line.
metric_report = (
    ("F1:", lambda: f1_score(y_test, y_pred)),
    ("F2:", lambda: fbeta_score(y_test, y_pred, beta=2)),
    ("Precision:", lambda: precision_score(y_test, y_pred)),
    ("Recall:", lambda: recall_score(y_test, y_pred)),
    # The two ranking metrics use the raw scores, not the thresholded labels.
    ("ROC-AUC:", lambda: roc_auc_score(y_test, y_prob)),
    ("PR-AUC:", lambda: average_precision_score(y_test, y_prob)),
    ("Balanced acc:", lambda: balanced_accuracy_score(y_test, y_pred)),
    ("Accuracy:", lambda: accuracy_score(y_test, y_pred)),
)
for metric_label, compute_metric in metric_report:
    print(metric_label, compute_metric())

In [None]:
# Confusion matrix
# labels=[0, 1] pins the matrix to 2x2 even when only one class appears in
# y_test / y_pred; without it the 4-way unpacking below fails on a 1x1 matrix.
cm = confusion_matrix(y_test, y_pred, labels=[0, 1])
tn, fp, fn, tp = cm.ravel()

# Specificity = TN / (TN + FP); undefined (NaN) when there are no negatives.
negatives = tn + fp
specificity = tn / negatives if negatives else float("nan")

print(cm)
print(f"TN:{tn}, FP:{fp}, FN:{fn}, TP:{tp}")
print("Specificity:", specificity)

In [None]:
# Predicted probability gap: compare the mean model score (column 1 of
# predict_proba) between test rows with target 0 and test rows with target 1.
X_survivors = X_test_raw[y_test == 0].copy()
X_deaths = X_test_raw[y_test == 1].copy()

survivor_scores = model5_best_model.predict_proba(X_survivors)[:, 1]
p_survivors = survivor_scores.mean()

death_scores = model5_best_model.predict_proba(X_deaths)[:, 1]
p_deaths = death_scores.mean()

print("Survivors mean p:", p_survivors)
print("Deaths mean p:", p_deaths)
print("Gap:", p_deaths - p_survivors)