In [1]:
import pandas as pd
bcsc_paths = [
    "../data/bcsc_risk_factors_summarized1_092020.csv",
    "../data/bcsc_risk_factors_summarized2_092020.csv",
    "../data/bcsc_risk_factors_summarized3_092020.csv",
]
df_list = []
for path in bcsc_paths:
    bcsc = pd.read_csv(path)
    df_list.append(bcsc)
bcsc_df = pd.concat(df_list, ignore_index=True)

bcsc_df_clean = (bcsc_df[~(bcsc_df.iloc[:, 2:12] == 9).any(axis=1)].reset_index(drop=True))
bcsc_df_clean.to_csv("../data/bcsc_concatenated_no_9.csv", index=False)
print(f"saved into the data folder")

saved into the data folder


In [2]:
!pip install xgboost




In [3]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.metrics import classification_report, roc_auc_score, confusion_matrix
import xgboost as xgb
from xgboost import DMatrix
import joblib
from imblearn.pipeline import Pipeline
from imblearn.over_sampling import SMOTE
# Load data
df = pd.read_csv("../data/bcsc_concatenated_no_9.csv")
X = df.drop(columns="breast_cancer_history")
y = df["breast_cancer_history"]

# Split off a fixed 20% test set
X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.20,     
    stratify=y,
    random_state=42
)
print(f"Training shape: {X_train.shape}, Test shape: {X_test.shape}")

# Compute imbalance weight on the full train set
neg, pos = np.bincount(y_train)
scale_pos_weight = neg / pos

# Hyperparameter search on training set only
base_clf = xgb.XGBClassifier(
    objective="binary:logistic",
    eval_metric="auc",
    scale_pos_weight=scale_pos_weight,
    random_state=42
)
param_dist = {
    "n_estimators":     [100, 300, 500],
    "max_depth":        [3, 5, 7],
    "learning_rate":    [0.01, 0.05, 0.1],
    "subsample":        [0.6, 0.8, 1.0],
    "colsample_bytree": [0.6, 0.8, 1.0],
    "gamma":            [0, 1, 5],
}

search = RandomizedSearchCV(
    estimator=base_clf,
    param_distributions=param_dist,
    n_iter=20,
    scoring="recall",
    cv=3,
    verbose=2,
    random_state=42,
    n_jobs=-1
)
search.fit(X_train, y_train)
print("Best hyperparameters:", search.best_params_)

# Retrain best estimator on the full training set
best = search.best_estimator_
best.set_params(scale_pos_weight=scale_pos_weight)
best.fit(X_train, y_train)

# Final evaluation on the test set
y_prob = best.predict_proba(X_test)[:, 1]
y_pred = (y_prob > 0.5).astype(int)

print(classification_report(y_test, y_pred))
print("Test ROC AUC:", roc_auc_score(y_test, y_prob))

cm = confusion_matrix(y_test, y_pred, labels=best.classes_)
print("\nConfusion Matrix:\n", cm)

Training shape: (276051, 12), Test shape: (69013, 12)
Fitting 3 folds for each of 20 candidates, totalling 60 fits
Best hyperparameters: {'subsample': 0.6, 'n_estimators': 100, 'max_depth': 7, 'learning_rate': 0.1, 'gamma': 0, 'colsample_bytree': 0.6}
              precision    recall  f1-score   support

           0       0.97      0.78      0.86     58427
           1       0.42      0.88      0.57     10586

    accuracy                           0.79     69013
   macro avg       0.69      0.83      0.71     69013
weighted avg       0.89      0.79      0.82     69013

Test ROC AUC: 0.9079809613266547

Confusion Matrix:
 [[45420 13007]
 [ 1290  9296]]


In [4]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.metrics import classification_report, roc_auc_score, confusion_matrix, recall_score
import xgboost as xgb
from xgboost import DMatrix
import joblib
from imblearn.pipeline import Pipeline
from imblearn.over_sampling import SMOTE
# Load data
df = pd.read_csv("../data/bcsc_concatenated_no_9.csv")
X = df.drop(columns="breast_cancer_history")
y = df["breast_cancer_history"]

# Split off a fixed 20% test set
X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.20,     
    stratify=y,
    random_state=42
)
print(f"Training shape: {X_train.shape}, Test shape: {X_test.shape}")

pipe = Pipeline([
    ("smote", SMOTE(random_state=42)),
    ("clf", xgb.XGBClassifier(
        objective="binary:logistic",
        eval_metric="auc",
        random_state=42,
    )),
])
param_dist = {
    "clf__n_estimators":     [100, 300, 500, 800],
    "clf__max_depth":        [3, 5, 7],
    "clf__learning_rate":    [0.01, 0.05, 0.1],
    "clf__subsample":        [0.6, 0.8, 1.0],
    "clf__colsample_bytree": [0.6, 0.8, 1.0],
    "clf__gamma":            [0, 1, 5]
}

search = RandomizedSearchCV(
    pipe,
    param_distributions=param_dist,
    n_iter=30,
    scoring="recall",
    cv=3,
    verbose=2,
    random_state=42,   n_jobs=-1
)
search.fit(X_train, y_train)

# 3) Retrain with early stopping
best_pipe = search.best_estimator_
best_pipe.named_steps['clf'].set_params(
    **{"scale_pos_weight":(neg/pos)}
)
best_pipe.named_steps['clf'].fit(
    X_train, y_train,
    eval_set=[(X_test, y_test)],
    verbose=0
)

# 4) Threshold tuning
probs = best_pipe.predict_proba(X_test)[:,1]
best_thresh, best_recall = 0.5, 0
for thresh in np.linspace(0.1, 0.9, 81):
    preds = (probs >= thresh).astype(int)
    r = recall_score(y_test, preds)
    if r > best_recall:
        best_recall, best_thresh = r, thresh

print(f"Best threshold for recall: {best_thresh:.2f} → recall = {best_recall:.3f}")

# Then use it:
y_pred = (probs >= best_thresh).astype(int)
print(classification_report(y_test, y_pred))
cm = confusion_matrix(y_test, y_pred, labels=best.classes_)
print("\nConfusion Matrix:\n", cm)

Training shape: (276051, 12), Test shape: (69013, 12)
Fitting 3 folds for each of 30 candidates, totalling 90 fits


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


Best threshold for recall: 0.10 → recall = 1.000
              precision    recall  f1-score   support

           0       0.00      0.00      0.00     58427
           1       0.15      1.00      0.27     10586

    accuracy                           0.15     69013
   macro avg       0.08      0.50      0.13     69013
weighted avg       0.02      0.15      0.04     69013


Confusion Matrix:
 [[    0 58427]
 [    0 10586]]


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [15]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.metrics import (
    classification_report,
    roc_auc_score,
    confusion_matrix,
    roc_curve
)

import xgboost as xgb
from xgboost import DMatrix
import joblib
from imblearn.pipeline import Pipeline
from imblearn.over_sampling import SMOTE
# Load data
df = pd.read_csv("../data/bcsc_concatenated_no_9.csv")
X = df.drop(columns="breast_cancer_history")
y = df["breast_cancer_history"]

# Split off a fixed 20% test set
X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.20,     
    stratify=y,
    random_state=42
)
print(f"Training shape: {X_train.shape}, Test shape: {X_test.shape}")

# Compute imbalance weight on the full train set
neg, pos = np.bincount(y_train)
scale_pos_weight = neg / pos
# Hyperparameter search on training set only
base_clf = xgb.XGBClassifier(
    objective="binary:logistic",
    eval_metric="auc",
    scale_pos_weight=scale_pos_weight,
    random_state=42
)
param_dist = {
    "n_estimators":     [100, 300, 500],
    "max_depth":        [3, 5, 7],
    "learning_rate":    [0.01, 0.05, 0.1],
    "subsample":        [0.6, 0.8, 1.0],
    "colsample_bytree": [0.6, 0.8, 1.0],
    "gamma":            [0, 1, 5],
}

search = RandomizedSearchCV(
    estimator=base_clf,
    param_distributions=param_dist,
    n_iter=20,
    scoring="recall",
    cv=3,
    verbose=2,
    random_state=42,
    n_jobs=-1
)
search.fit(X_train, y_train)
print("Best hyperparameters:", search.best_params_)

# Retrain best estimator on the full training set
best = search.best_estimator_
best.set_params(scale_pos_weight=scale_pos_weight)
best.fit(X_train, y_train)
# Final evaluation on the test set
y_prob = best.predict_proba(X_test)[:, 1]
# Compute G-mean for each ROC threshold
fpr, tpr, thresholds = roc_curve(y_test, y_prob)
gmeans = np.sqrt(tpr * (1 - fpr))
best_idx = np.argmax(gmeans)
best_thresh_gmean = thresholds[best_idx]
best_gmean = gmeans[best_idx]

print(f"Best threshold by G-mean: {best_thresh_gmean:.3f} → G-mean = {best_gmean:.3f}")

# Compare performance at default (0.5) vs. G-mean threshold

y_pred = (y_prob >= 0.5).astype(int)
print(f"\n=== Threshold: {label} ({thresh:.3f}) ===")
print(classification_report(y_test, y_pred))
cm = confusion_matrix(y_test, y_pred, labels=best.classes_)
print("Confusion Matrix:\n", cm)

# 3) Also print overall ROC AUC
print("\nTest ROC AUC:", roc_auc_score(y_test, y_prob))

Training shape: (276051, 12), Test shape: (69013, 12)
Fitting 3 folds for each of 20 candidates, totalling 60 fits
Best hyperparameters: {'subsample': 0.6, 'n_estimators': 100, 'max_depth': 7, 'learning_rate': 0.1, 'gamma': 0, 'colsample_bytree': 0.6}
Best threshold by G-mean: 0.527 → G-mean = 0.828

=== Threshold: G-mean (0.527) ===
              precision    recall  f1-score   support

           0       0.97      0.78      0.86     58427
           1       0.42      0.88      0.57     10586

    accuracy                           0.79     69013
   macro avg       0.69      0.83      0.71     69013
weighted avg       0.89      0.79      0.82     69013

Confusion Matrix:
 [[45420 13007]
 [ 1290  9296]]

Test ROC AUC: 0.9079809613266547
