In [1]:
import pandas as pd
import xgboost as xgb
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import roc_auc_score, confusion_matrix
import matplotlib.pyplot as plt
import numpy as np

In [2]:
# Load the source dataset into a DataFrame. The path is relative, so it is
# resolved against the kernel's current working directory.
df = pd.read_csv(
    'ANON36_MACE_EKG_output.csv',
)

In [3]:
# Replace the raw export column labels with shorter display names.
# Only the labels change; the values in each column are left untouched,
# and columns not listed here keep their current names.
df = df.rename(
    columns={
        # Columns carrying the 'Risk- ' prefix: drop the prefix
        'Risk- Hypertension': 'Hypertension',
        'Risk- Hypercholesteremia': 'Hypercholesteremia',
        'Risk- Hypertriglyceridemia': 'Hypertriglyceridemia',
        'Risk- High HDL': 'High HDL',
        'Risk- Low HDL': 'Low HDL',
        'Risk- Diabetes': 'Diabetes',
        'Risk- Family Hx': 'Family Hx',
        'Risk- Smoke': 'Smoke',
        # Risk-score columns
        'ASCVD PCE Risk Score': 'ASCVD PCE Risk',
        'PREVENT Score - ASCVD 10-year risk': 'PREVENT ASCVD Risk',
        'CVD PREVENT 10yr': 'PREVENT CVD Risk',
        'HF PREVENT 10-year': 'PREVENT HF Risk',
        # CAC score columns (total and per vessel)
        'Total A': 'Total CAC Score',
        'LM A': 'CAC Score (Left Main)',
        'LAD A': 'CAC Score (LAD)',
        'Cx': 'CAC Score (LCx)',
        'RCA': 'CAC Score (RCA)',
        # Lesion summary columns
        'Num Lesions': 'Number of Total Lesions',
        'Lesion Vol Sum': 'Total Volume Score',
        'Lesion Max D': 'Peak CAC Density',
        'Lesion Ave D': 'Mean CAC Density',
        # Demographic columns
        'Male': 'Gender',
        'Race_encoded': 'Race',
    }
)


In [4]:
# Split the frame into the outcome label and the feature matrix
# (every column except the identifier and the label itself).
y = df['result']
X = df.drop(columns=['CONFIRM ID', 'result'])

# Feature columns at 0-based positions 2-16 and 45 (counted after the drop
# above) are cast to the pandas 'category' dtype.
categorical_cols = X.columns[[*range(2, 17), 45]].tolist()
for col in categorical_cols:
    X[col] = X[col].astype('category')

In [5]:
# 10-fold stratified cross-validation; shuffling is seeded so the fold
# assignment is reproducible.
cv = StratifiedKFold(n_splits=10, shuffle=True, random_state=94)

# Histogram-based XGBoost classifier with native handling of the columns
# that were cast to 'category' dtype.
clf = xgb.XGBClassifier(tree_method="hist", enable_categorical=True, random_state=42)

# Out-of-fold positive-class probability for every row; each entry is
# written exactly once, by the fold in which that row is held out.
stacked_probabilities = np.zeros(y.shape)

# Cross-validation loop
for train_index, test_index in cv.split(X, y):
    X_train, X_test = X.iloc[train_index], X.iloc[test_index]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]

    # Train on the training fold only. The held-out fold is deliberately NOT
    # passed as eval_set: no early stopping is configured, so it would not
    # change the fitted model, but it would expose test labels to fit() and
    # turn into silent leakage as soon as early stopping is enabled.
    clf.fit(X_train, y_train)

    # Score the held-out fold and store the positive-class probabilities
    y_pred_proba = clf.predict_proba(X_test)[:, 1]
    stacked_probabilities[test_index] = y_pred_proba

In [6]:
# Turn the out-of-fold probabilities into hard 0/1 labels: a row is called
# positive when its probability is at or above the cut-off.
threshold = 0.5
y_pred = np.greater_equal(stacked_probabilities, threshold).astype(int)

In [7]:
# Confusion matrix for the pooled out-of-fold predictions. For two classes
# scikit-learn returns [[TN, FP], [FN, TP]], unpacked row by row here.
(tn, fp), (fn, tp) = confusion_matrix(y, y_pred)

# Number of rows that were scored
total = tn + fp + fn + tp

# Rates over the actual positives (tp + fn)
tp_rate = tp / (tp + fn)  # Recall/Sensitivity
fn_rate = fn / (tp + fn)

# Rates over the actual negatives (tn + fp)
tn_rate = tn / (tn + fp)
fp_rate = fp / (fp + tn)

Results for all samples:

In [9]:
# Print Results
# The AUC is computed from the pooled out-of-fold probabilities and does not
# depend on the classification threshold.
print(f"Final AUC-ROC: {roc_auc_score(y, stacked_probabilities):.4f}")
# The counts and rates below come from the thresholded predictions, so they
# change if the threshold is changed.
print("\nConfusion Matrix:")
print(f"TP: {tp}, FP: {fp}, TN: {tn}, FN: {fn}")

# Each rate is formatted as a percentage with two decimals.
print("\nRates/Percentages:")
print(f"TP Rate (Recall): {tp_rate:.2%}")
print(f"FP Rate: {fp_rate:.2%}")
print(f"TN Rate: {tn_rate:.2%}")
print(f"FN Rate: {fn_rate:.2%}")

Final AUC-ROC: 0.7551

Confusion Matrix:
TP: 21, FP: 34, TN: 26617, FN: 536

Rates/Percentages:
TP Rate (Recall): 3.77%
FP Rate: 0.13%
TN Rate: 99.87%
FN Rate: 96.23%


By Gender

In [9]:
# Store the out-of-fold predictions next to the original columns so rows can
# be sliced by any attribute. Note this adds two columns to df in place.
df['predicted'] = y_pred
df['predicted_proba'] = stacked_probabilities

# Split the rows by the 'Gender' indicator
male_data = df.loc[df['Gender'].eq(1)]  # Assuming 1 = Male
female_data = df.loc[df['Gender'].eq(0)]  # Assuming 0 = Female

def _safe_rate(numerator, denominator):
    """Return numerator / denominator, or NaN when the denominator is zero."""
    return numerator / denominator if denominator else float('nan')


# Function to calculate confusion matrix and rates
def calculate_metrics(group, group_name):
    """Print confusion-matrix counts and rates for one subgroup.

    Parameters
    ----------
    group : mapping of column name to label sequence (e.g. a DataFrame)
        Must provide 'result' (true labels) and 'predicted' (predicted
        labels). Labels are assumed to be binary 0/1.
    group_name : str
        Name shown in the printed header.

    Returns
    -------
    None
        The metrics are printed, not returned.
    """
    # Pin the label order so the matrix is always 2x2, even when the subgroup
    # contains only one class; otherwise the 4-way unpack raises ValueError.
    tn, fp, fn, tp = confusion_matrix(
        group['result'], group['predicted'], labels=[0, 1]
    ).ravel()

    actual_pos = tp + fn
    actual_neg = tn + fp

    # A subgroup with no actual positives (or negatives) yields NaN for the
    # affected rates instead of a divide-by-zero warning.
    tp_rate = _safe_rate(tp, actual_pos)
    fp_rate = _safe_rate(fp, actual_neg)
    tn_rate = _safe_rate(tn, actual_neg)
    fn_rate = _safe_rate(fn, actual_pos)

    print(f"\n{group_name} Metrics:")
    print(f"TP: {tp}, FP: {fp}, TN: {tn}, FN: {fn}")
    print(f"TP Rate (Recall): {tp_rate:.2%}")
    print(f"FP Rate: {fp_rate:.2%}")
    print(f"TN Rate: {tn_rate:.2%}")
    print(f"FN Rate: {fn_rate:.2%}")

# Report the same metrics for each gender subgroup, males first
for subgroup, label in ((male_data, "Male"), (female_data, "Female")):
    calculate_metrics(subgroup, label)


Male Metrics:
TP: 12, FP: 22, TN: 14949, FN: 351
TP Rate (Recall): 3.31%
FP Rate: 0.15%
TN Rate: 99.85%
FN Rate: 96.69%

Female Metrics:
TP: 9, FP: 12, TN: 10914, FN: 185
TP Rate (Recall): 4.64%
FP Rate: 0.11%
TN Rate: 99.89%
FN Rate: 95.36%
