In [1]:
import pandas as pd
from collections import Counter
import ast
import numpy as np
import matplotlib.pyplot as plt
import datetime
# from skimage.io import imread
import os
import statistics
from sklearn.model_selection import train_test_split
# from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.impute import KNNImputer
from sklearn.neighbors import KNeighborsClassifier

In [2]:
train_data = pd.read_csv('final_train_data.csv')

In [3]:
# One-hot encode the categorical columns
train_data = pd.get_dummies(train_data, columns=['Sex'])
train_data = pd.get_dummies(train_data, columns=['Frontal/Lateral'])
train_data = pd.get_dummies(train_data, columns=['AP/PA'])

In [4]:
train_data

Unnamed: 0,Path,Age,No Finding,Enlarged Cardiomediastinum,Cardiomegaly,Lung Opacity,Lung Lesion,Edema,Consolidation,Pneumonia,...,Asymmetry,Sex_Female,Sex_Male,Sex_Unknown,Frontal/Lateral_Frontal,Frontal/Lateral_Lateral,AP/PA_AP,AP/PA_LL,AP/PA_PA,AP/PA_RL
0,CheXpert-v1.0-small/train/patient00001/study1/...,68,1.0,,,,,,,,...,0.384764,True,False,False,True,False,True,False,False,False
1,CheXpert-v1.0-small/train/patient00002/study2/...,87,,,-1.0,1.0,,-1.0,-1.0,,...,0.596201,True,False,False,True,False,True,False,False,False
2,CheXpert-v1.0-small/train/patient00002/study1/...,83,,,,1.0,,,-1.0,,...,0.482320,True,False,False,True,False,True,False,False,False
3,CheXpert-v1.0-small/train/patient00002/study1/...,83,,,,1.0,,,-1.0,,...,,True,False,False,False,True,False,False,False,False
4,CheXpert-v1.0-small/train/patient00003/study1/...,41,,,,,,1.0,,,...,0.417489,False,True,False,True,False,True,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
223409,CheXpert-v1.0-small/train/patient64537/study2/...,59,,,,-1.0,,,,,...,0.368473,False,True,False,True,False,True,False,False,False
223410,CheXpert-v1.0-small/train/patient64537/study1/...,59,,,,-1.0,,,,0.0,...,0.319442,False,True,False,True,False,True,False,False,False
223411,CheXpert-v1.0-small/train/patient64538/study1/...,0,,,,,,-1.0,,,...,0.557926,True,False,False,True,False,True,False,False,False
223412,CheXpert-v1.0-small/train/patient64539/study1/...,0,,,1.0,1.0,,,,-1.0,...,0.704381,True,False,False,True,False,True,False,False,False


In [55]:
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.decomposition import PCA
from sklearn.impute import SimpleImputer

def prepare_data(data, feature_cols, pca=False, pca_components=0.8):
    """
    Prepares data for cross-validation (no train-test split).
    Returns:
        X: Processed features (numpy array)
        y: Encoded labels (numpy array)
        encoder: LabelEncoder (for inverse_transform if needed)
        imputer: SimpleImputer (for new data)
        scaler: StandardScaler (if PCA=True)
        pca: PCA object (if PCA=True)
    """
    # Drop rows with NaN in target and select features
    data = data.dropna(subset=['Pleural Effusion'])
    X = data[feature_cols]
    y = data['Pleural Effusion']

    # Encode target
    encoder = LabelEncoder()
    y_encoded = encoder.fit_transform(y)

    # Impute missing values
    imputer = SimpleImputer(strategy='mean')
    X_imputed = imputer.fit_transform(X)

    # Optional PCA pipeline
    if pca:
        scaler = StandardScaler()
        X_scaled = scaler.fit_transform(X_imputed)
        pca_obj = PCA(n_components=pca_components)
        X_processed = pca_obj.fit_transform(X_scaled)
        print(f"Explained Variance: {pca_obj.explained_variance_ratio_.sum():.2f}")
        return X_processed, y_encoded, encoder, imputer, scaler, pca_obj

    return X_imputed, y_encoded, encoder, imputer, None, None

In [61]:
import numpy as np
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import roc_auc_score, accuracy_score, brier_score_loss

def multiclass_brier_score(y_true, y_prob):
    """Compute Brier score for multi-class problems (one-vs-rest)."""
    n_classes = y_prob.shape[1]
    brier_scores = []
    for class_idx in range(n_classes):
        y_true_binary = (y_true == class_idx).astype(int)
        brier_scores.append(brier_score_loss(y_true_binary, y_prob[:, class_idx]))
    return np.mean(brier_scores)  # Average across classes

def compute_calibration_metrics(y_true, y_prob, n_bins=10):
    """Compute ECE, MCE, and Brier for multi-class."""
    bin_edges = np.linspace(0, 1, n_bins + 1)
    ece, mce = 0.0, 0.0
    
    # Confidence is max probability, accuracy is whether prediction was correct
    confidences = y_prob.max(axis=1)
    predictions = np.argmax(y_prob, axis=1)
    accuracies = (predictions == y_true).astype(float)
    
    # Bin by confidence
    bin_indices = np.digitize(confidences, bin_edges, right=True) - 1
    bin_indices = np.clip(bin_indices, 0, n_bins - 1)
    
    for i in range(n_bins):
        mask = bin_indices == i
        if np.sum(mask) == 0:
            continue
        acc = np.mean(accuracies[mask])
        conf = np.mean(confidences[mask])
        weight = np.sum(mask) / len(y_true)
        ece += weight * np.abs(acc - conf)
        mce = max(mce, np.abs(acc - conf))
    
    brier = multiclass_brier_score(y_true, y_prob)  # Updated Brier calculation
    return ece, mce, brier

In [62]:
from xgboost import XGBClassifier
from sklearn.metrics import roc_auc_score

def run_xgboost_cv(X, y, n_folds=5, num_class=3):
    skf = StratifiedKFold(n_splits=n_folds, shuffle=True, random_state=42)
    auc_scores, acc_scores, ece_scores, mce_scores, brier_scores = [], [], [], [], []
    
    for train_idx, val_idx in skf.split(X, y):
        X_train, X_val = X[train_idx], X[val_idx]
        y_train, y_val = y[train_idx], y[val_idx]
        
        model = XGBClassifier(
            use_label_encoder=False,
            eval_metric='mlogloss',
            objective='multi:softprob',
            num_class=num_class
        )
        model.fit(X_train, y_train)
        y_prob = model.predict_proba(X_val)  # Shape: (n_samples, n_classes)
        
        # Compute metrics
        auc = roc_auc_score(y_val, y_prob, multi_class='ovr')
        acc = accuracy_score(y_val, np.argmax(y_prob, axis=1))
        ece, mce, brier = compute_calibration_metrics(y_val, y_prob)
        
        # Store results
        auc_scores.append(auc)
        acc_scores.append(acc)
        ece_scores.append(ece)
        mce_scores.append(mce)
        brier_scores.append(brier)
    
    return {
        'AUC': (np.mean(auc_scores), np.std(auc_scores)),
        'Accuracy': (np.mean(acc_scores), np.std(acc_scores)),
        'ECE': (np.mean(ece_scores), np.std(ece_scores)),
        'MCE': (np.mean(mce_scores), np.std(mce_scores)),
        'Brier': (np.mean(brier_scores), np.std(brier_scores))
    }

In [94]:
import numpy as np
import tensorflow as tf
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import roc_auc_score, accuracy_score, brier_score_loss
from tensorflow.keras.layers import Input, Dense, BatchNormalization, ReLU, GlobalAveragePooling2D, Reshape, Conv2D, Add
from tensorflow.keras.models import Model
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.mixed_precision import set_global_policy

# Enable mixed precision for faster training (if GPU available)
set_global_policy('mixed_float16')

def build_fast_model(input_shape, num_classes=3):
    """Optimized architecture for speed and performance"""
    inputs = Input(shape=input_shape)
    
    # Input processing
    if len(input_shape) == 1:  # Flattened features
        x = Dense(128)(inputs)  # Reduced from 256
        x = BatchNormalization()(x)
        x = ReLU()(x)
        x = Reshape((8, 8, 2))(x)  # Smaller spatial dimensions
    else:  # Image input
        x = inputs
    
    # Simplified convolutional blocks
    x = Conv2D(32, 3, padding='same')(x)  # Reduced filters
    x = BatchNormalization()(x)
    x = ReLU()(x)
    
    # Stack of efficient residual blocks
    for filters in [32, 64, 128]:  # Reduced filter sizes
        # Skip connection
        shortcut = Conv2D(filters, 1)(x) if x.shape[-1] != filters else x
        
        # Main path
        x = Conv2D(filters, 3, padding='same')(x)
        x = BatchNormalization()(x)
        x = ReLU()(x)
        x = Conv2D(filters, 3, padding='same')(x)
        x = BatchNormalization()(x)
        
        x = Add()([x, shortcut])
        x = ReLU()(x)
    
    # Classification head
    x = GlobalAveragePooling2D()(x)
    x = Dense(64, activation='relu')(x)  # Reduced units
    outputs = Dense(num_classes, activation='softmax', dtype='float32')(x)
    
    return Model(inputs=inputs, outputs=outputs)

def run_fast_cv(X, y, num_classes=3, n_folds=5):
    """Optimized 5-fold CV with all metrics"""
    skf = StratifiedKFold(n_splits=n_folds, shuffle=True, random_state=42)
    results = {
        'AUC': [], 'Accuracy': [], 
        'ECE': [], 'MCE': [], 'Brier': []
    }
    
    for fold, (train_idx, val_idx) in enumerate(skf.split(X, y)):
        print(f"\nFold {fold+1}/{n_folds}")
        
        # Data split
        X_train, X_val = X[train_idx], X[val_idx]
        y_train, y_val = y[train_idx], y[val_idx]
        
        # Build and train
        model = build_fast_model(X_train.shape[1:], num_classes)
        model.compile(
            optimizer=Adam(0.002),  # Increased learning rate
            loss='sparse_categorical_crossentropy',
            metrics=['accuracy']
        )
        
        history = model.fit(
            X_train, y_train,
            validation_data=(X_val, y_val),
            epochs=30,  # Reduced epochs
            batch_size=64,  # Increased batch size
            callbacks=[EarlyStopping(patience=3, restore_best_weights=True)],
            verbose=1
        )
        
        # Predict and evaluate
        y_prob = model.predict(X_val, verbose=0, batch_size=128)  # Larger prediction batch
        y_pred = np.argmax(y_prob, axis=1)
        
        # Calculate metrics
        results['AUC'].append(roc_auc_score(y_val, y_prob, multi_class='ovr'))
        results['Accuracy'].append(accuracy_score(y_val, y_pred))
        
        ece, mce, brier = compute_calibration_metrics(y_val, y_prob)
        results['ECE'].append(ece)
        results['MCE'].append(mce)
        results['Brier'].append(brier)
        
        print(f"Fold {fold+1} | AUC: {results['AUC'][-1]:.4f} | Time: {history.history['time'][-1]:.1f}s")
    
    return {
        metric: (np.mean(values), np.std(values))
        for metric, values in results.items()
    }

def compute_calibration_metrics(y_true, y_prob, n_bins=5):  # Reduced bins for speed
    """Faster calibration metrics calculation"""
    bin_edges = np.linspace(0, 1, n_bins + 1)
    ece, mce = 0.0, 0.0
    
    conf = np.max(y_prob, axis=1)
    pred = np.argmax(y_prob, axis=1)
    acc = (pred == y_true).astype(float)
    
    bin_indices = np.digitize(conf, bin_edges, right=True) - 1
    bin_indices = np.clip(bin_indices, 0, n_bins - 1)
    
    for i in range(n_bins):
        mask = bin_indices == i
        if mask.sum() == 0:
            continue
        bin_acc = np.mean(acc[mask])
        bin_conf = np.mean(conf[mask])
        weight = mask.sum() / len(y_true)
        ece += weight * np.abs(bin_acc - bin_conf)
        mce = max(mce, np.abs(bin_acc - bin_conf))
    
    # Multi-class Brier score
    brier = np.mean([
        brier_score_loss((y_true == i).astype(int), y_prob[:, i])
        for i in range(y_prob.shape[1])
    ])
    
    return ece, mce, brier

In [64]:
train_data.columns[-20:]

Index(['corner_192', 'corner_193', 'corner_194', 'corner_195', 'corner_196',
       'corner_197', 'corner_198', 'corner_199', 'Right_Lung_Intensity',
       'Left_Lung_Intensity', 'Asymmetry', 'Sex_Female', 'Sex_Male',
       'Sex_Unknown', 'Frontal/Lateral_Frontal', 'Frontal/Lateral_Lateral',
       'AP/PA_AP', 'AP/PA_LL', 'AP/PA_PA', 'AP/PA_RL'],
      dtype='object')

In [65]:
# Define the subset of columns for an experiment
# Correct way to drop multiple columns
feature_columns =  ['Age', 'No Finding', 'Enlarged Cardiomediastinum',
       'Cardiomegaly', 'Lung Opacity', 'Lung Lesion', 'Edema', 'Consolidation',
       'Pneumonia', 'Atelectasis', 'Pneumothorax',
       'Pleural Other', 'Fracture', 'Support Devices', 'Sex_Female', 'Sex_Male',
       'Sex_Unknown', 'Frontal/Lateral_Frontal', 'Frontal/Lateral_Lateral',
       'AP/PA_AP', 'AP/PA_LL', 'AP/PA_PA', 'AP/PA_RL'] # Modify as needed

# Prepare full dataset
X, y, encoder, imputer, scaler, pca_obj = prepare_data(
    train_data, 
    feature_cols=feature_columns, 
    pca=False  # Set to True if using PCA
)

# Run 5-fold CV (using the function from earlier)
cv_metrics = run_xgboost_cv(X, y, num_class=3)
print("CV Results:", cv_metrics)

Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


CV Results: {'AUC': (0.8762353007975443, 0.0015492341947760415), 'Accuracy': (0.845770994853045, 0.0013957687128646706), 'ECE': (0.0049342679151966615, 0.0012160366759434128), 'MCE': (0.05664789311022626, 0.04180863883596941), 'Brier': (0.08071883831944418, 0.0005621965577145979)}


In [95]:
# Correct way to drop multiple columns
feature_columns =  ['Age', 'No Finding', 'Enlarged Cardiomediastinum',
       'Cardiomegaly', 'Lung Opacity', 'Lung Lesion', 'Edema', 'Consolidation',
       'Pneumonia', 'Atelectasis', 'Pneumothorax',
       'Pleural Other', 'Fracture', 'Support Devices', 'Sex_Female', 'Sex_Male',
       'Sex_Unknown', 'Frontal/Lateral_Frontal', 'Frontal/Lateral_Lateral',
       'AP/PA_AP', 'AP/PA_LL', 'AP/PA_PA', 'AP/PA_RL'] # Modify as needed

# 1. Prepare your data
X, y, encoder, imputer, scaler, pca_obj = prepare_data(
    train_data,
    feature_cols=feature_columns,
    pca=False
)

# 2. Run optimized CV
cv_results = run_fast_cv(X, y, num_classes=3)

# 3. View results
print("\nOptimized Model Results:")
for metric, (mean, std) in cv_results.items():
    print(f"{metric}: {mean:.4f} ± {std:.4f}")


Fold 1/5
Epoch 1/30
[1m1666/1666[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m149s[0m 87ms/step - accuracy: 0.7364 - loss: 0.6834 - val_accuracy: 0.7012 - val_loss: 0.8291
Epoch 2/30
[1m 200/1666[0m [32m━━[0m[37m━━━━━━━━━━━━━━━━━━[0m [1m1:55[0m 78ms/step - accuracy: 0.8007 - loss: 0.5688

# TRAIN AND TEST WITHOUT CLINICAL FEATURES

In [66]:
# Define the subset of columns for an experiment
# Correct way to drop multiple columns
train_data_no_clinical = train_data.drop(['Pleural Effusion', 'Path','Right_Lung_Intensity', 'Left_Lung_Intensity', 'Asymmetry', ], axis=1)
feature_columns = train_data_no_clinical.columns  # Modify as needed

# Prepare the data
X, y, encoder, imputer, scaler, pca_obj = prepare_data(
    train_data, 
    feature_cols=feature_columns, 
    pca=False  # Set to True if using PCA
)

# Run 5-fold CV (using the function from earlier)
cv_metrics = run_xgboost_cv(X, y, num_class=3)
print("CV Results:", cv_metrics)


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


CV Results: {'AUC': (0.8643145485188984, 0.0006820547305408676), 'Accuracy': (0.8389021932890671, 0.0011131276740131738), 'ECE': (0.01010382077930666, 0.0011881417048674743), 'MCE': (0.07259024703723263, 0.04467180402437322), 'Brier': (0.0841902629498664, 0.0004480080338447924)}


In [None]:
train_data_no_clinical = train_data.drop(['Pleural Effusion', 'Path','Right_Lung_Intensity', 'Left_Lung_Intensity', 'Asymmetry', ], axis=1)
feature_columns = train_data_no_clinical.columns  # Modify as needed

X, y, encoder, imputer, scaler, pca_obj = prepare_data(
    train_data,
    feature_cols=feature_columns,
    pca=False
)

# 2. Run robust 5-fold cross-validation
cv_results = run_robust_cv(X, y, num_classes=3)

# 3. View comprehensive results
print("\nFinal Robust Model CV Results:")
for metric, (mean, std) in sorted(cv_results.items()):
    print(f"{metric}: {mean:.4f} ± {std:.4f}")


Fold 1/5
Epoch 1/50
[1m3331/3331[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 2ms/step - accuracy: 0.6283 - loss: 2.5718 - val_accuracy: 0.6470 - val_loss: 0.8434
Epoch 2/50
[1m3331/3331[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 2ms/step - accuracy: 0.6509 - loss: 0.8407 - val_accuracy: 0.6470 - val_loss: 0.8440
Epoch 3/50
[1m3331/3331[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 2ms/step - accuracy: 0.6518 - loss: 0.8357 - val_accuracy: 0.6470 - val_loss: 0.8350
Epoch 4/50
[1m3331/3331[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 2ms/step - accuracy: 0.6503 - loss: 0.8393 - val_accuracy: 0.6470 - val_loss: 0.8232
Epoch 5/50
[1m3331/3331[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 2ms/step - accuracy: 0.6509 - loss: 0.8302 - val_accuracy: 0.6470 - val_loss: 0.8211
Epoch 6/50
[1m3331/3331[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 2ms/step - accuracy: 0.6512 - loss: 0.8301 - val_accuracy: 0.6470 - val_loss: 0.8155
Epoch

# TRAIN AND TEST WITH CLINICAL FEATURES

In [67]:
# Define the subset of columns for an experiment
# Correct way to drop multiple columns
train_data_clinical = train_data.drop(['Pleural Effusion', 'Path'], axis=1)
feature_columns = train_data_clinical.columns  # Modify as needed

# Prepare the data
X, y, encoder, imputer, scaler, pca_obj = prepare_data(
    train_data, 
    feature_cols=feature_columns, 
    pca=False  # Set to True if using PCA
)

# Run 5-fold CV (using the function from earlier)
cv_metrics = run_xgboost_cv(X, y, num_class=3)
print("CV Results:", cv_metrics)

Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


CV Results: {'AUC': (0.8701672046997343, 0.0010127902693060672), 'Accuracy': (0.8418373838498505, 0.0011894381820696546), 'ECE': (0.009779595511506619, 0.0014650473499585096), 'MCE': (0.037831661027525644, 0.008845112620963042), 'Brier': (0.08273710357479687, 0.0005258635607662919)}


In [None]:
# Define the subset of columns for an experiment
# Correct way to drop multiple columns
train_data_clinical = train_data.drop(['Pleural Effusion', 'Path'], axis=1)
feature_columns = train_data_clinical.columns  # Modify as needed

X, y, encoder, imputer, scaler, pca_obj = prepare_data(
    train_data,
    feature_cols=feature_columns,
    pca=False
)

# 2. Run robust 5-fold cross-validation
cv_results = run_robust_cv(X, y, num_classes=3)

# 3. View comprehensive results
print("\nFinal Robust Model CV Results:")
for metric, (mean, std) in sorted(cv_results.items()):
    print(f"{metric}: {mean:.4f} ± {std:.4f}")


Fold 1/5
Epoch 1/50
[1m3331/3331[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 2ms/step - accuracy: 0.6292 - loss: 2.4352 - val_accuracy: 0.6608 - val_loss: 0.8186
Epoch 2/50
[1m3331/3331[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 2ms/step - accuracy: 0.6576 - loss: 0.8287 - val_accuracy: 0.6470 - val_loss: 0.8175
Epoch 3/50
[1m3331/3331[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 2ms/step - accuracy: 0.6657 - loss: 0.8103 - val_accuracy: 0.6777 - val_loss: 0.7923
Epoch 4/50
[1m3331/3331[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 2ms/step - accuracy: 0.6690 - loss: 0.7975 - val_accuracy: 0.6811 - val_loss: 0.7887
Epoch 5/50
[1m3331/3331[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 2ms/step - accuracy: 0.6715 - loss: 0.7933 - val_accuracy: 0.6816 - val_loss: 0.7817
Epoch 6/50
[1m3331/3331[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 2ms/step - accuracy: 0.6726 - loss: 0.7864 - val_accuracy: 0.6792 - val_loss: 0.7774
Epoch

# Stats + Clinical

In [74]:
columns_to_drop = [col for col in train_data.columns if col.startswith(('hist', 'cornr'))]
train_data_stats_clinical = train_data.drop(columns=columns_to_drop)

# Correct way to drop multiple columns
train_data_stats_clinical = train_data_stats_clinical.drop(['Pleural Effusion', 'Path'], axis=1)
feature_columns = train_data_stats_clinical.columns  # Modify as needed

# Prepare the data
X, y, encoder, imputer, scaler, pca_obj = prepare_data(
    train_data, 
    feature_cols=feature_columns, 
    pca=False  # Set to True if using PCA
)

# Run 5-fold CV (using the function from earlier)
cv_metrics = run_xgboost_cv(X, y, num_class=3)
print("CV Results:", cv_metrics)

Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


CV Results: {'AUC': (0.8722005686867729, 0.0013585147939188939), 'Accuracy': (0.8429784277739873, 0.0016597529627462371), 'ECE': (0.005762527009482558, 0.0010061351053880696), 'MCE': (0.05721774783752951, 0.0364422951008462), 'Brier': (0.08213836406735485, 0.0006741073160065653)}


In [None]:
columns_to_drop = [col for col in train_data.columns if col.startswith(('hist', 'cornr'))]
train_data_stats_clinical = train_data.drop(columns=columns_to_drop)

# Correct way to drop multiple columns
train_data_stats_clinical = train_data_stats_clinical.drop(['Pleural Effusion', 'Path'], axis=1)
feature_columns = train_data_stats_clinical.columns  # Modify as needed

X, y, encoder, imputer, scaler, pca_obj = prepare_data(
    train_data,
    feature_cols=feature_columns,
    pca=False
)

# 2. Run robust 5-fold cross-validation
cv_results = run_robust_cv(X, y, num_classes=3)

# 3. View comprehensive results
print("\nFinal Robust Model CV Results:")
for metric, (mean, std) in sorted(cv_results.items()):
    print(f"{metric}: {mean:.4f} ± {std:.4f}")


Fold 1/5
Epoch 1/50
[1m3331/3331[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m13s[0m 2ms/step - accuracy: 0.6279 - loss: 2.3950 - val_accuracy: 0.6667 - val_loss: 0.8244
Epoch 2/50
[1m3331/3331[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 2ms/step - accuracy: 0.6538 - loss: 0.8326 - val_accuracy: 0.6470 - val_loss: 0.8155
Epoch 3/50
[1m3331/3331[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 2ms/step - accuracy: 0.6589 - loss: 0.8207 - val_accuracy: 0.6490 - val_loss: 0.8070
Epoch 4/50
[1m3331/3331[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 2ms/step - accuracy: 0.6655 - loss: 0.8025 - val_accuracy: 0.6470 - val_loss: 0.7858
Epoch 5/50
[1m3331/3331[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 2ms/step - accuracy: 0.6740 - loss: 0.7966 - val_accuracy: 0.6863 - val_loss: 0.7826
Epoch 6/50
[1m3331/3331[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 2ms/step - accuracy: 0.6721 - loss: 0.7881 - val_accuracy: 0.6846 - val_loss: 0.7728
Epoch

# PCA

In [40]:
# Define the subset of columns for an experiment
# Correct way to drop multiple columns
train_data_clinical = train_data.drop(['Pleural Effusion', 'Path'], axis=1)
feature_columns = train_data_clinical.columns  # Modify as needed

# Prepare the data
X_train, X_test, y_train, y_test = prepare_data(train_data, feature_columns, pca=True)

# Run XGBoost experiment
xgb_auc = run_xgboost_experiment(X_train, y_train, X_test, y_test)

# For ResNet, ensure you have the appropriate setup or use another suitable model
# resnet_auc = run_resnet_experiment(X_train, y_train, X_test, y_test)

print("XGBoost AUC:", xgb_auc)
# print("ResNet AUC:", resnet_auc)

Explained Variance Ratio: 0.8007593809582833


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


XGBoost AUC: 0.8036028588339875


In [None]:
auc_score = train_predict_evaluate_resnet(X_train, y_train, X_test, y_test)
print("ResNet Model AUC Score:", auc_score)

In [None]:
# Define the subset of columns for an experiment
# Correct way to drop multiple columns
train_data_no_clinical = train_data.drop(['Pleural Effusion', 'Path','Right_Lung_Intensity', 'Left_Lung_Intensity', 'Asymmetry', ], axis=1)
feature_columns = train_data_no_clinical.columns  # Modify as needed

# Prepare the data
X_train, X_test, y_train, y_test = prepare_data(train_data, feature_columns, pca=True)

# Run XGBoost experiment
xgb_auc = run_xgboost_experiment(X_train, y_train, X_test, y_test)

print("XGBoost AUC:", xgb_auc)
# print("ResNet AUC:", resnet_auc)


In [None]:
auc_score = train_predict_evaluate_resnet(X_train, y_train, X_test, y_test)
print("ResNet Model AUC Score:", auc_score)