### Import necessary libraries

In [None]:
! pip install optuna

In [None]:
import os,os.path
import re
import warnings

import matplotlib.pyplot as plt
import numpy as np
import optuna
import pandas as pd
import seaborn as sns
import xgboost as xgb
from scipy.stats import bootstrap
from sklearn.datasets import make_classification
from sklearn.ensemble import RandomForestClassifier
from sklearn.exceptions import ConvergenceWarning
from sklearn.feature_selection import RFECV
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.metrics import roc_auc_score, accuracy_score, precision_score, recall_score, f1_score, classification_report, confusion_matrix
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import MinMaxScaler
from sklearn.utils import resample
from xgboost import XGBClassifier

%matplotlib inline

In [None]:
import sklearn
import xgboost

# Report the library versions in use so results can be tied to an environment.
for _lib_label, _lib_module in (("Scikit-learn", sklearn), ("XGBoost", xgboost)):
    print(f"{_lib_label} version: {_lib_module.__version__}")


In [None]:
# Pin scikit-learn to 1.5.2: remove whatever build is installed, then install the pinned one.
# NOTE(review): scikit-learn has already been imported by an earlier cell, so the pinned
# version presumably only takes effect after the runtime/kernel is restarted — confirm.
!pip uninstall scikit-learn -y
!pip install scikit-learn==1.5.2

### Load the datasets

In [None]:
# Read the metabolite table from the Excel workbook and display it.
# Later cells read a 'Group' column from this table and use it as the class label.
mtb = pd.read_excel("cc metabolites.xlsx")
mtb

In [None]:
# Read the validation metabolite table from the Excel workbook and display it.
# Later cells read a 'Group' column from this table and use it as the class label.
mtb_cc = pd.read_excel("CC-validation mtb.xlsx")
mtb_cc

In [None]:
# Min-Max scale every feature column of `mtb`, leaving the 'Group' label column untouched.
sample_column = mtb['Group']
data_to_scale = mtb.drop(columns=['Group'])

# Apply Min-Max scaling to the data (excluding the 'Group' column)
# NOTE(review): the scaler is fit on the full table before any train/test split, so
# held-out rows contribute to the min/max — confirm this is acceptable for the analysis.
scaler = MinMaxScaler()
data_scaled = scaler.fit_transform(data_to_scale)

# Rebuild the DataFrame on the ORIGINAL row index. Without `index=...` the scaled frame
# gets a fresh 0..n-1 index, and re-attaching 'Group' (which aligns on index labels)
# would silently produce NaN / shifted labels whenever the source index is not 0..n-1.
mtb = pd.DataFrame(data_scaled, columns=data_to_scale.columns, index=data_to_scale.index)

# Put 'Group' back as the first column
mtb.insert(0, 'Group', sample_column)

print("Data after Min-Max scaling:")
print(mtb)

In [None]:
# Min-Max scale every feature column of the validation table `mtb_cc`,
# leaving the 'Group' label column untouched.
sample_column = mtb_cc['Group']
data_to_scale = mtb_cc.drop(columns=['Group'])

# Apply Min-Max scaling to the data (excluding the 'Group' column)
# NOTE(review): a NEW scaler is fit on the validation table itself, so validation
# features are not on the same scale as the training features the models learn from.
# Confirm this is intended rather than reusing the scaler fitted on the training data.
scaler = MinMaxScaler()
data_scaled = scaler.fit_transform(data_to_scale)

# Rebuild the DataFrame on the ORIGINAL row index. Without `index=...` the scaled frame
# gets a fresh 0..n-1 index, and re-attaching 'Group' (which aligns on index labels)
# would silently produce NaN / shifted labels whenever the source index is not 0..n-1.
mtb_cc = pd.DataFrame(data_scaled, columns=data_to_scale.columns, index=data_to_scale.index)

# Put 'Group' back as the first column
mtb_cc.insert(0, 'Group', sample_column)


print("Data after Min-Max scaling:")
print(mtb_cc)

# XGBOOST

In [None]:
# Baseline XGBoost model: default hyperparameters, 75/25 train/test split.

# Features are every column except the class label.
y = mtb['Group']
X = mtb.drop(columns=['Group'])

# Turn the class labels into integer codes; the fitted encoder stays in `label_encoder`.
label_encoder = LabelEncoder()
y = label_encoder.fit_transform(y)

# Hold out 25% of the rows for testing (fixed seed so the split is repeatable).
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.25, random_state=42
)

# Fit the classifier and predict the held-out rows.
model = xgb.XGBClassifier(eval_metric='logloss', random_state=42)
model.fit(X_train, y_train)
y_pred = model.predict(X_test)

# Accuracy plus support-weighted precision / F1 / recall on the test split.
weighted_avg = {'average': 'weighted'}
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred, **weighted_avg)
f1 = f1_score(y_test, y_pred, **weighted_avg)
recall = recall_score(y_test, y_pred, **weighted_avg)

for metric_label, metric_value in (
    ('Accuracy', accuracy),
    ('Test Precision', precision),
    ('Test F1 Score', f1),
    ('Test Recall', recall),
):
    print(f'{metric_label}: {metric_value:.4f}')

## Random Search

In [None]:
# Randomized hyperparameter search for XGBoost, scored by 5-fold ROC AUC on the training split.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42)

# Define the parameter grid for RandomizedSearchCV
param_dist = {
    'n_estimators': np.arange(50, 200, 10),
    'max_depth': np.arange(3, 10),
    'learning_rate': np.linspace(0.01, 0.3, 10),
    'subsample': np.linspace(0.5, 1.0, 10),
    'colsample_bytree': np.linspace(0.5, 1.0, 10),
    'gamma': np.linspace(0, 0.5, 5),
    'min_child_weight': np.arange(1, 6)
}

# Initialize the XGBoost classifier.
# The estimator is deliberately NOT called `xgb`: that name is the xgboost module alias
# (`import xgboost as xgb`). Rebinding it to an estimator instance makes every later
# `xgb.XGBClassifier(...)` call in the notebook fail with AttributeError.
xgb_clf = XGBClassifier(random_state=42)

# Set up RandomizedSearchCV: 100 sampled configurations, 5-fold CV, all cores.
random_search = RandomizedSearchCV(
    estimator=xgb_clf, param_distributions=param_dist, n_iter=100,
    scoring='roc_auc', cv=5, verbose=1, random_state=42, n_jobs=-1
)


random_search.fit(X_train, y_train)

# Get the best parameters from the random search
best_params = random_search.best_params_
print("Best Parameters:", best_params)

# Predict on the original test set (uses the refitted best estimator)
y_pred = random_search.predict(X_test)

# Calculate evaluation metrics
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred, average='weighted')
recall = recall_score(y_test, y_pred, average='weighted')
f1 = f1_score(y_test, y_pred, average='weighted')

# Print metrics
print(f"Accuracy: {accuracy:.2f}")
print(f"Precision: {precision:.2f}")
print(f"Recall: {recall:.2f}")
print(f"F1 Score: {f1:.2f}")

## Bayesian Optimization

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42)

# Define the objective function for Optuna
def objective(trial):
    """Return the mean 5-fold ROC AUC of an XGBoost classifier on the training split.

    Parameters
    ----------
    trial : optuna.trial.Trial
        Trial used to sample the hyperparameters (booster, learning rate, gamma,
        tree depth, minimum child weight, row/column subsampling, number of trees).

    Returns
    -------
    float
        Mean of the five cross-validated ROC AUC scores on ``X_train`` / ``y_train``.
    """
    params = {
        'booster': trial.suggest_categorical('booster', ['gbtree', 'dart']),
        'objective': 'binary:logistic',
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.3),
        'gamma': trial.suggest_float('gamma', 0, 0.5),
        'max_depth': trial.suggest_int('max_depth', 3, 10),
        'min_child_weight': trial.suggest_float('min_child_weight', 0.5, 5),
        'subsample': trial.suggest_float('subsample', 0.5, 1.0),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.5, 1.0),
        'n_estimators': trial.suggest_int('n_estimators', 50, 300)
    }

    # Use the imported class directly instead of `xgb.XGBClassifier`: the random-search
    # cell rebinds the name `xgb` to an estimator instance, after which the attribute
    # lookup on `xgb` raises AttributeError.
    model = XGBClassifier(**params)

    # Evaluate using cross-validation
    cv_scores = cross_val_score(model, X_train, y_train, cv=5, scoring='roc_auc')
    return cv_scores.mean()

# Create a study that maximizes the objective (mean cross-validated ROC AUC)
study = optuna.create_study(direction='maximize')


# Enqueue the parameters obtained from previous RandomizedSearchCV results.
# Only names that the objective actually suggests are listed; 'objective' is a fixed
# setting inside the objective function, not part of the search space.
study.enqueue_trial({
    'booster': 'gbtree',
    'learning_rate': 0.20333333333333334,
    'gamma': 0.125,
    'max_depth': 3,
    'min_child_weight': 1,
    'subsample': 1.0,
    'colsample_bytree': 1.0,
    'n_estimators': 150
})

# Optimize the study using 50 trials
study.optimize(objective, n_trials=50)

# Print the best parameters and the best score. The objective is scored with
# scoring='roc_auc', so the value is a ROC AUC, not an accuracy.
print(f"Best Parameters: {study.best_params}")
print(f"Best Cross-validation ROC AUC: {study.best_value:.4f}")

# Train the final model with the best parameters on the training data.
# `study.best_params` only holds the suggested values, so the fixed learning objective
# used during tuning is passed explicitly. The class is referenced through its direct
# import because an earlier cell may have rebound the name `xgb`.
best_params = study.best_params
final_model = XGBClassifier(objective='binary:logistic', **best_params)
final_model.fit(X_train, y_train)

# Make predictions on the original test set
y_pred = final_model.predict(X_test)

# Evaluate the final model on the test set
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred, average='weighted')
f1 = f1_score(y_test, y_pred, average='weighted')
recall = recall_score(y_test, y_pred, average='weighted')

# Print test set performance metrics
print(f"Test Accuracy: {accuracy:.4f}")
print(f"Test Precision: {precision:.4f}")
print(f"Test F1 Score: {f1:.4f}")
print(f"Test Recall: {recall:.4f}")

In [None]:
# Final XGBoost model: fixed hyperparameters, 5-fold CV on the training split,
# then evaluation on the held-out test split.
selected_features = X_train.columns.tolist()

# Best parameters from Bayesian Optimization XGBoost
best_params_xg = {
    'learning_rate': 0.0358184060673841,
    'max_depth': 6,
    'n_estimators': 206,
    'gamma': 0.12114522580440171,
    'min_child_weight': 0.7315311842209224,
    'subsample': 0.8936632592369049,
    'colsample_bytree': 0.6296789767705571,
    'objective': 'binary:logistic',
    'booster': 'gbtree',
}

# Create the XGBoost classifier with the best parameters. The class is referenced
# through its direct import because an earlier cell may have rebound the name `xgb`.
final_model_xg = XGBClassifier(**best_params_xg)

# Perform cross-validation. The scorer is 'roc_auc', so the scores are reported as
# ROC AUC (they were previously mislabelled as accuracy).
cv_scores = cross_val_score(final_model_xg, X_train[selected_features], y_train, cv=5, scoring='roc_auc')
print("Cross-validation scores:")
print(cv_scores)
print(f"Mean CV ROC AUC: {np.mean(cv_scores):.4f}")

# Train the model
final_model_xg.fit(X_train[selected_features], y_train)

# Make predictions on the test set (hard labels and positive-class probabilities)
y_pred_xg = final_model_xg.predict(X_test[selected_features])
y_pred_prob_xg = final_model_xg.predict_proba(X_test[selected_features])[:, 1]

# Calculate evaluation metrics
accuracy = accuracy_score(y_test, y_pred_xg)
precision = precision_score(y_test, y_pred_xg, average='weighted')
recall = recall_score(y_test, y_pred_xg, average='weighted')
f1 = f1_score(y_test, y_pred_xg, average='weighted')
roc_auc = roc_auc_score(y_test, y_pred_prob_xg)

# Calculate specificity = TN / (TN + FP) from the 2x2 confusion matrix
tn, fp, fn, tp = confusion_matrix(y_test, y_pred_xg).ravel()
specificity = tn / (tn + fp)

# Print the results
print(f'Test ROC AUC: {roc_auc:.2f}')
print(f'Test Accuracy: {accuracy:.2f}')
print(f'Test Precision: {precision:.2f}')
print(f'Test Recall: {recall:.2f}')
print(f'Test F1-Score: {f1:.2f}')
print(f'Test Specificity: {specificity:.2f}')


## XGBoost Validation

In [None]:
# Score the final XGBoost model on the separate validation table.
y_val_cc = mtb_cc['Group']
X_val_cc = mtb_cc.drop(columns=['Group'])

# NOTE(review): the encoder is re-fit on the validation labels; this matches the
# training encoding only if both tables contain the same set of label values — confirm.
label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(y_val_cc)

# Training features absent from the validation table are added as all-zero columns.
validation_columns = set(X_val_cc.columns)
missing_features = [
    feature for feature in selected_features if feature not in validation_columns
]
missing_df = pd.DataFrame(0.0, index=X_val_cc.index, columns=missing_features)
X_val_cc = pd.concat([X_val_cc, missing_df], axis=1)

# Keep only the training features, in training order.
X_val_cc = X_val_cc.loc[:, selected_features]

# Hard labels and positive-class probabilities for the validation rows.
y_pred_xg_cc = final_model_xg.predict(X_val_cc)
y_pred_prob_xg_cc = final_model_xg.predict_proba(X_val_cc)[:, 1]

# Encoded ground truth used for the label-based metrics.
y_val_cc_encoded = label_encoder.transform(y_val_cc)

weighted_avg = {'average': 'weighted'}
accuracy_val = accuracy_score(y_val_cc_encoded, y_pred_xg_cc)
precision_val = precision_score(y_val_cc_encoded, y_pred_xg_cc, zero_division=1, **weighted_avg)
recall_val = recall_score(y_val_cc_encoded, y_pred_xg_cc, **weighted_avg)
f1_val = f1_score(y_val_cc_encoded, y_pred_xg_cc, **weighted_avg)
roc_auc_val = roc_auc_score(y_encoded, y_pred_prob_xg_cc)

# Specificity = TN / (TN + FP), read off the 2x2 confusion matrix.
tn_val, fp_val, fn_val, tp_val = confusion_matrix(y_val_cc_encoded, y_pred_xg_cc).ravel()
specificity_val = tn_val / (tn_val + fp_val)

for metric_label, metric_value in (
    ('ROC AUC', roc_auc_val),
    ('Accuracy', accuracy_val),
    ('Precision', precision_val),
    ('Recall', recall_val),
    ('F1-Score', f1_val),
    ('Specificity', specificity_val),
):
    print(f'Validation {metric_label}: {metric_value:.2f}')


## 95% CI

In [None]:
n_bootstraps = 1000
def calculate_metrics(y_true, y_pred, y_pred_prob):
    """Compute the validation metrics for one (re)sample.

    Parameters
    ----------
    y_true : array-like
        Encoded ground-truth labels.
    y_pred : array-like
        Predicted labels.
    y_pred_prob : array-like
        Predicted probability of the positive class.

    Returns
    -------
    tuple
        ``(accuracy, precision, recall, f1, roc_auc, specificity)``. Precision,
        recall and F1 are support-weighted. ``roc_auc`` is NaN when ``y_true``
        holds fewer than two classes, and ``specificity`` is NaN when the
        confusion matrix is not 2x2.
    """
    accuracy = accuracy_score(y_true, y_pred)
    precision = precision_score(y_true, y_pred, average="weighted", zero_division=1)
    recall = recall_score(y_true, y_pred, average="weighted")
    f1 = f1_score(y_true, y_pred, average="weighted")

    # ROC AUC is undefined with a single class in y_true (roc_auc_score raises in that
    # case), which can happen in a bootstrap resample; report NaN instead of crashing.
    if np.unique(y_true).size < 2:
        roc_auc = np.nan
    else:
        roc_auc = roc_auc_score(y_true, y_pred_prob)

    conf_matrix = confusion_matrix(y_true, y_pred)
    if conf_matrix.shape == (2, 2):
        tn, fp, fn, tp = conf_matrix.ravel()
        specificity = tn / (tn + fp)
    else:
        specificity = np.nan
    return accuracy, precision, recall, f1, roc_auc, specificity

# Bootstrap 95% confidence intervals for the XGBoost validation metrics.

# Lists of bootstrap values, one per metric (same order as calculate_metrics returns)
bootstrapped_metrics = {
    "accuracy": [],
    "precision": [],
    "recall": [],
    "f1": [],
    "roc_auc": [],
    "specificity": []
}

# Ground truth and predictions produced by the XGBoost validation cell. (The names
# previously used here, `y_encoded_cc` and `*_val_xg`, are never defined.)
y_true_xg_cc = np.asarray(y_val_cc_encoded)
n_val_rows = len(y_true_xg_cc)

# Seeded generator so the intervals are reproducible from run to run
bootstrap_rng = np.random.default_rng(42)

# Bootstrapping loop
for _ in range(n_bootstraps):
    # Resample row positions with replacement
    indices = bootstrap_rng.integers(0, n_val_rows, size=n_val_rows)
    y_true_resampled = y_true_xg_cc[indices]

    # ROC AUC needs both classes; skip the rare resample that contains only one
    if np.unique(y_true_resampled).size < 2:
        continue

    y_pred_resampled = y_pred_xg_cc[indices]
    y_pred_prob_resampled = y_pred_prob_xg_cc[indices]

    # Calculate metrics on resampled data
    metrics = calculate_metrics(y_true_resampled, y_pred_resampled, y_pred_prob_resampled)
    for metric_name, metric_value in zip(bootstrapped_metrics.keys(), metrics):
        bootstrapped_metrics[metric_name].append(metric_value)

# 95% percentile intervals; the nan-aware percentile ignores undefined (NaN) values
conf_intervals = {metric: (np.nanpercentile(values, 2.5), np.nanpercentile(values, 97.5))
                  for metric, values in bootstrapped_metrics.items()}

# Point estimates on the full validation set, computed with the same metric function
point_estimates = dict(zip(
    bootstrapped_metrics.keys(),
    calculate_metrics(y_true_xg_cc, y_pred_xg_cc, y_pred_prob_xg_cc),
))

# Print evaluation metrics with 95% confidence intervals
for metric_label, metric_name in (
    ('ROC AUC', 'roc_auc'),
    ('Accuracy', 'accuracy'),
    ('Precision', 'precision'),
    ('Recall', 'recall'),
    ('F1-Score', 'f1'),
    ('Specificity', 'specificity'),
):
    print(f'Validation {metric_label}: {point_estimates[metric_name]:.2f} (95% CI: {conf_intervals[metric_name]})')

# RANDOM FOREST

In [None]:
# Baseline Random Forest: 100 trees, default settings, 75/25 train/test split.

# Features are every column except the class label.
y = mtb['Group']
X = mtb.drop(columns=['Group'])

# Turn the class labels into integer codes; the fitted encoder stays in `label_encoder`.
label_encoder = LabelEncoder()
y = label_encoder.fit_transform(y)

# Hold out 25% of the rows for testing (fixed seed so the split is repeatable).
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.25, random_state=42
)

# Fit the forest on the training rows and predict the held-out rows.
clf = RandomForestClassifier(n_estimators=100, random_state=42)
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)

# Accuracy, then support-weighted F1 / precision / recall, each printed as computed.
weighted_avg = {'average': 'weighted'}

test_accuracy = accuracy_score(y_test, y_pred)
print(f"Test Accuracy: {test_accuracy:.2f}")

test_f1 = f1_score(y_test, y_pred, **weighted_avg)
print(f"Test F1 Score: {test_f1:.2f}")

test_precision = precision_score(y_test, y_pred, **weighted_avg)
print(f"Test Precision: {test_precision:.2f}")

test_recall = recall_score(y_test, y_pred, **weighted_avg)
print(f"Test Recall: {test_recall:.2f}")



## Random Search

In [None]:
# Randomized hyperparameter search for the Random Forest, scored by stratified
# 5-fold ROC AUC on the training split.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.25, random_state=42
)

# Candidate values; keys carry the 'rf__' prefix because the forest is the 'rf'
# step of the pipeline below.
param_dist = {
    'rf__n_estimators': np.linspace(start=200, stop=2000, num=10).astype(int).tolist(),
    'rf__max_features': ['sqrt', 'log2'],
    'rf__max_depth': np.linspace(10, 300, num=20).astype(int).tolist() + [None],
    'rf__min_samples_split': [2, 5, 10, 15],
    'rf__min_samples_leaf': [1, 2, 4, 6],
    'rf__bootstrap': [True, False],
}

# Single-step pipeline wrapping the classifier.
pipeline = Pipeline([('rf', RandomForestClassifier(random_state=42))])

# 100 sampled configurations, stratified 5-fold CV, all cores.
rf_random = RandomizedSearchCV(
    estimator=pipeline,
    param_distributions=param_dist,
    n_iter=100,
    cv=StratifiedKFold(5),
    verbose=2,
    random_state=42,
    n_jobs=-1,
    scoring='roc_auc',
)
rf_random.fit(X_train, y_train)

print("Best parameters found by RandomizedSearchCV:")
print(rf_random.best_params_)

# Evaluate the refitted best pipeline on the held-out rows (default 0.5 threshold).
best_pipeline = rf_random.best_estimator_
y_pred = best_pipeline.predict(X_test)
y_prob = best_pipeline.predict_proba(X_test)[:, 1]

weighted_avg = {'average': 'weighted'}
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred, **weighted_avg)
recall = recall_score(y_test, y_pred, **weighted_avg)
f1 = f1_score(y_test, y_pred, **weighted_avg)
roc_auc = roc_auc_score(y_test, y_prob)

for metric_label, metric_value in (
    ('Accuracy', accuracy),
    ('Precision', precision),
    ('Recall', recall),
    ('F1 Score', f1),
    ('ROC AUC', roc_auc),
):
    print(f"{metric_label}: {metric_value:.4f}")


## Bayesian Optimization

In [None]:
# Starting point for the Bayesian search: the configuration taken from RandomizedSearchCV.
best_params_random = {
    'n_estimators': 600,
    'min_samples_split': 5,
    'min_samples_leaf': 2,
    'max_features': 'sqrt',
    'max_depth': 132,
    'bootstrap': True,
    'class_weight': 'balanced',
}

def objective(trial):
    """Return the mean 5-fold ROC AUC of a Random Forest on the training split.

    The integer hyperparameters are searched in a window centred on the values in
    ``best_params_random``; the remaining ones are drawn from fixed choices.
    """
    anchor = best_params_random
    forest_params = {
        'n_estimators': trial.suggest_int(
            'n_estimators',
            max(100, anchor['n_estimators'] - 200),
            anchor['n_estimators'] + 200,
        ),
        'min_samples_split': trial.suggest_int(
            'min_samples_split',
            max(2, anchor['min_samples_split'] - 3),
            anchor['min_samples_split'] + 3,
        ),
        'min_samples_leaf': trial.suggest_int(
            'min_samples_leaf',
            max(1, anchor['min_samples_leaf'] - 2),
            anchor['min_samples_leaf'] + 2,
        ),
        'max_features': trial.suggest_categorical('max_features', ['sqrt', 'log2']),
        'max_depth': trial.suggest_categorical('max_depth', [None, 10, 55, 132, 300]),
        'bootstrap': trial.suggest_categorical('bootstrap', [True, False]),
        'class_weight': trial.suggest_categorical('class_weight', ['balanced', None]),
        'criterion': trial.suggest_categorical('criterion', ['gini', 'entropy']),
        'min_impurity_decrease': trial.suggest_float('min_impurity_decrease', 0.0, 0.01),
    }

    # Build the candidate forest with a fixed seed and score it by cross-validation.
    clf = RandomForestClassifier(random_state=42, **forest_params)
    fold_scores = cross_val_score(clf, X_train, y_train, cv=5, scoring='roc_auc')
    return np.mean(fold_scores)

# Run the Optuna search (maximizing the objective), seeded with the random-search result.
study = optuna.create_study(direction='maximize')
study.enqueue_trial(best_params_random)
study.optimize(objective, n_trials=50)

print("Best Parameters from Optuna:", study.best_params)


# Refit a forest with the best parameters and score it on the held-out rows.
# NOTE(review): this refit uses random_state=50 while the objective tuned with 42 — confirm intended.
best_params_optuna = study.best_params
best_clf = RandomForestClassifier(random_state=50, **best_params_optuna)
best_clf.fit(X_train, y_train)
y_prob = best_clf.predict_proba(X_test)[:, 1]
y_pred = best_clf.predict(X_test)

weighted_avg = {'average': 'weighted'}
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred, **weighted_avg)
recall = recall_score(y_test, y_pred, **weighted_avg)
f1 = f1_score(y_test, y_pred, **weighted_avg)
roc_auc = roc_auc_score(y_test, y_prob)

for metric_label, metric_value in (
    ('Accuracy', accuracy),
    ('Precision', precision),
    ('Recall', recall),
    ('F1 Score', f1),
    ('ROC AUC', roc_auc),
):
    print(f"Test {metric_label}: {metric_value:.4f}")


In [None]:
# Final Random Forest: fixed hyperparameters, 5-fold CV on the training split,
# then evaluation on the held-out test split.
selected_features_rf = X_train.columns.tolist()

# Best parameters from Bayesian Optimization RF
best_params_rf = {
    'n_estimators': 580,
    'min_samples_split': 4,
    'min_samples_leaf': 1,
    'max_features': 'log2',
    'bootstrap': True,
    'criterion': 'entropy',
    'max_depth': 223,
    'min_impurity_decrease': 0.0010229614656723429
}

## Create the Random Forest classifier with the best parameters
final_model_rf = RandomForestClassifier(**best_params_rf, random_state=50)

# Perform cross-validation with 5 folds. The scorer is 'roc_auc', so the scores are
# reported as ROC AUC (they were previously mislabelled as accuracy).
cv_scores = cross_val_score(final_model_rf, X_train[selected_features_rf], y_train, cv=5, scoring='roc_auc')
print("Cross-validation scores:")
print(cv_scores)
print(f"Mean CV ROC AUC: {np.mean(cv_scores):.4f}")

# Train the final model on the entire training data
final_model_rf.fit(X_train[selected_features_rf], y_train)

# Make predictions on the test set (hard labels and positive-class probabilities)
y_pred_rf = final_model_rf.predict(X_test[selected_features_rf])
y_pred_prob_rf = final_model_rf.predict_proba(X_test[selected_features_rf])[:, 1]

# Calculate evaluation metrics on the test set
accuracy = accuracy_score(y_test, y_pred_rf)
precision = precision_score(y_test, y_pred_rf, average='weighted')
recall = recall_score(y_test, y_pred_rf, average='weighted')
f1 = f1_score(y_test, y_pred_rf, average='weighted')
auc_roc = roc_auc_score(y_test, y_pred_prob_rf)

# Specificity = TN / (TN + FP) from the 2x2 confusion matrix
tn, fp, fn, tp = confusion_matrix(y_test, y_pred_rf).ravel()
specificity = tn / (tn + fp)

# Print the results for test data
print(f'Test Accuracy: {accuracy:.2f}')
print(f'Test Precision: {precision:.2f}')
print(f'Test Recall: {recall:.2f}')
print(f'Test F1-Score: {f1:.2f}')
print(f'Test AUC-ROC: {auc_roc:.2f}')
print(f'Test Specificity: {specificity:.2f}')


## Random Forest Validation

In [None]:
# Score the final Random Forest on the separate validation table.

# Validation set
X_val_cc_rf = mtb_cc.drop('Group', axis=1)
y_val_cc_rf = mtb_cc['Group']

# Encode categorical target labels into numerical labels
# NOTE(review): the encoder is re-fit on the validation labels; this matches the
# training encoding only if both tables contain the same set of label values — confirm.
label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(y_val_cc_rf)

# Identify the training features that are missing in the validation set
# (computed once; the list was previously built twice with identical code)
missing_features = [feature for feature in selected_features_rf if feature not in X_val_cc_rf.columns]

# Add the missing features to the validation set with zero values
missing_df_rf = pd.DataFrame(0, index=X_val_cc_rf.index, columns=missing_features)
X_val_cc_rf = pd.concat([X_val_cc_rf, missing_df_rf], axis=1)

# Now, reorder columns in X_val_cc_rf to match selected_features_rf
X_val_cc_rf = X_val_cc_rf[selected_features_rf]

# Make predictions on the validation set (positive-class probabilities and hard labels)
y_pred_prob_rf_cc = final_model_rf.predict_proba(X_val_cc_rf)[:, 1]
y_pred_rf_cc = final_model_rf.predict(X_val_cc_rf)

# Calculate evaluation metrics
accuracy_val = accuracy_score(y_encoded, y_pred_rf_cc)
precision_val = precision_score(y_encoded, y_pred_rf_cc, average='weighted', zero_division=1)
recall_val = recall_score(y_encoded, y_pred_rf_cc, average='weighted')
f1_val = f1_score(y_encoded, y_pred_rf_cc, average='weighted')
roc_auc_val = roc_auc_score(y_encoded, y_pred_prob_rf_cc)

# Calculate specificity = TN / (TN + FP)
tn_val, fp_val, fn_val, tp_val = confusion_matrix(y_encoded, y_pred_rf_cc).ravel()
specificity_val = tn_val / (tn_val + fp_val)

# Print evaluation metrics
print(f'Validation ROC AUC: {roc_auc_val:.2f}')
print(f'Validation Accuracy: {accuracy_val:.2f}')
print(f'Validation Precision: {precision_val:.2f}')
print(f'Validation Recall: {recall_val:.2f}')
print(f'Validation F1-Score: {f1_val:.2f}')
print(f'Validation Specificity: {specificity_val:.2f}')

## 95% CI

In [None]:
# Function to compute metrics
def compute_metrics(y_true, y_pred, y_pred_proba):
    """Compute the validation metrics for one (re)sample.

    Parameters
    ----------
    y_true : array-like
        Encoded ground-truth labels.
    y_pred : array-like
        Predicted labels.
    y_pred_proba : array-like
        Predicted probability of the positive class.

    Returns
    -------
    tuple
        ``(accuracy, precision, recall, f1, roc_auc, specificity)``. Precision,
        recall and F1 are support-weighted. ``roc_auc`` is NaN when ``y_true``
        holds fewer than two classes, and ``specificity`` is NaN when the
        confusion matrix is not 2x2.
    """
    accuracy = accuracy_score(y_true, y_pred)
    precision = precision_score(y_true, y_pred, average='weighted', zero_division=1)
    recall = recall_score(y_true, y_pred, average='weighted')
    f1 = f1_score(y_true, y_pred, average='weighted')

    # ROC AUC is undefined with a single class in y_true (roc_auc_score raises in that
    # case), which can happen in a bootstrap resample; report NaN instead of crashing.
    if np.unique(y_true).size < 2:
        roc_auc = np.nan
    else:
        roc_auc = roc_auc_score(y_true, y_pred_proba)

    # Confusion matrix to compute specificity; unpacking four cells is only valid
    # for a 2x2 matrix, so anything else yields NaN rather than an unpack error.
    conf_matrix = confusion_matrix(y_true, y_pred)
    if conf_matrix.shape == (2, 2):
        tn, fp, fn, tp = conf_matrix.ravel()
        specificity = tn / (tn + fp)
    else:
        specificity = np.nan

    return accuracy, precision, recall, f1, roc_auc, specificity

# Bootstrap the Random Forest validation metrics.

# Number of bootstrap iterations
n_iterations = 1000
n_size = len(X_val_cc_rf)

# Initialize lists to store metric values for each bootstrap sample
accuracy_scores = []
precision_scores = []
recall_scores = []
f1_scores = []
roc_auc_scores = []
specificity_scores = []

# Bootstrap procedure
for i in range(n_iterations):
    # Resample the validation rows with replacement. The model's prediction for a row
    # does not depend on which other rows are present, so the predictions already
    # computed for the validation set are resampled together with the labels instead
    # of re-running the forest on every bootstrap sample. `resample` applies one set
    # of drawn row positions to every array passed in the same call.
    y_resample, y_pred_resample, y_pred_proba_resample = resample(
        y_encoded, y_pred_rf_cc, y_pred_prob_rf_cc, n_samples=n_size, random_state=i
    )

    # ROC AUC and specificity need both classes; skip the rare single-class resample
    if np.unique(y_resample).size < 2:
        continue

    # Calculate metrics for this bootstrap sample
    accuracy, precision, recall, f1, roc_auc, specificity = compute_metrics(y_resample, y_pred_resample, y_pred_proba_resample)

    # Store the metrics
    accuracy_scores.append(accuracy)
    precision_scores.append(precision)
    recall_scores.append(recall)
    f1_scores.append(f1)
    roc_auc_scores.append(roc_auc)
    specificity_scores.append(specificity)

# Calculate 95% confidence intervals for each metric
def calculate_confidence_interval(scores):
    """Return the 95% percentile interval ``(lower, upper)`` of bootstrap scores.

    Parameters
    ----------
    scores : array-like of float
        Bootstrap values of one metric. NaN entries (metrics that were undefined
        for a given resample) are ignored rather than turning the interval into NaN.

    Returns
    -------
    tuple
        The 2.5th and 97.5th percentiles of the non-NaN scores.
    """
    lower_bound, upper_bound = np.nanpercentile(scores, [2.5, 97.5])
    return lower_bound, upper_bound

# 95% percentile interval of each bootstrapped Random Forest metric.
accuracy_ci = calculate_confidence_interval(accuracy_scores)
precision_ci = calculate_confidence_interval(precision_scores)
recall_ci = calculate_confidence_interval(recall_scores)
f1_ci = calculate_confidence_interval(f1_scores)
roc_auc_ci = calculate_confidence_interval(roc_auc_scores)
specificity_ci = calculate_confidence_interval(specificity_scores)

# Report each validation point estimate next to its interval.
for metric_label, point_value, interval in (
    ('ROC AUC', roc_auc_val, roc_auc_ci),
    ('Accuracy', accuracy_val, accuracy_ci),
    ('Precision', precision_val, precision_ci),
    ('Recall', recall_val, recall_ci),
    ('F1-Score', f1_val, f1_ci),
    ('Specificity', specificity_val, specificity_ci),
):
    print(f'Validation {metric_label}: {point_value:.2f}, 95% CI: [{interval[0]:.2f}, {interval[1]:.2f}]')


# LASSO

In [None]:
# Baseline L1-regularized (Lasso) logistic regression, 75/25 train/test split.
X = mtb.drop(['Group'], axis=1)

# Target variable (y)
y = mtb['Group']

# Encode categorical target labels into numerical labels
label_encoder = LabelEncoder()
y = label_encoder.fit_transform(y)

# Split data into training and testing sets (25% test, 75% train)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42)

# Create Logistic Regression classifier with L1 regularization (Lasso)
log_reg = LogisticRegression(penalty='l1', solver='liblinear', random_state=42)

# Train the classifier
log_reg.fit(X_train, y_train)

# Predict on the test set
y_pred = log_reg.predict(X_test)

# Calculate evaluation metrics. Precision/recall/F1 use the support-weighted average,
# like every other model cell in this notebook, so the baseline numbers are directly
# comparable (the default average would report the positive class only).
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred, average='weighted')
recall = recall_score(y_test, y_pred, average='weighted')
f1 = f1_score(y_test, y_pred, average='weighted')

# Print the evaluation metrics
print(f'Accuracy: {accuracy:.4f}')
print(f'Precision: {precision:.4f}')
print(f'Recall: {recall:.4f}')
print(f'F1 Score: {f1:.4f}')

## Random Search

In [None]:
# Randomized hyperparameter search for the L1 logistic regression, scored by
# 5-fold ROC AUC on the training split.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.25, random_state=42
)

# Candidate values: regularization strength on a log grid, iteration cap, tolerance.
# The solver is fixed to liblinear through the search space.
param_dist = {
    'C': np.logspace(-4, 4, 20),
    'solver': ['liblinear'],
    'max_iter': [1000, 5000, 10000, 20000],
    'tol': [1e-4, 1e-3, 1e-2, 1e-1],
}

# L1-penalized base estimator; the search supplies the solver.
log_reg = LogisticRegression(penalty='l1', random_state=42)

# 100 sampled configurations, 5-fold CV, all cores.
random_search = RandomizedSearchCV(
    log_reg,
    param_distributions=param_dist,
    n_iter=100,
    scoring='roc_auc',
    cv=5,
    verbose=1,
    random_state=42,
    n_jobs=-1,
)
random_search.fit(X_train, y_train)

best_params = random_search.best_params_
print("Best Parameters:", best_params)

# Predict the held-out rows with the refitted best estimator.
y_pred = random_search.predict(X_test)

weighted_avg = {'average': 'weighted'}
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred, **weighted_avg)
recall = recall_score(y_test, y_pred, **weighted_avg)
f1 = f1_score(y_test, y_pred, **weighted_avg)

for metric_label, metric_value in (
    ('Accuracy', accuracy),
    ('Precision', precision),
    ('Recall', recall),
    ('F1 Score', f1),
):
    print(f'{metric_label}: {metric_value:.4f}')


## Bayesian Optimization

In [None]:
# Configuration taken from the randomized search; enqueued as the first Optuna trial.
random_search_params = {'tol': 0.01, 'solver': 'liblinear', 'max_iter': 20000, 'C': 29.763514416313132}

def objective(trial):
    """Return the mean 5-fold ROC AUC of an L1 logistic regression on the training split.

    Samples C and tol on log scales, the iteration cap, and the solver
    (liblinear or saga). Convergence warnings are silenced while scoring.
    """
    sampled = {
        'C': trial.suggest_float('C', 1e-4, 2000.0, log=True),
        'max_iter': trial.suggest_int('max_iter', 1000, 500000),
        'tol': trial.suggest_float('tol', 1e-5, 1e3, log=True),
        'solver': trial.suggest_categorical('solver', ['liblinear', 'saga']),
    }

    with warnings.catch_warnings():
        # Non-converged fits are expected for some sampled settings; keep the log clean.
        warnings.filterwarnings("ignore", category=ConvergenceWarning)

        clf = LogisticRegression(penalty='l1', random_state=42, **sampled)
        fold_scores = cross_val_score(clf, X_train, y_train, cv=5, scoring='roc_auc')
        score = fold_scores.mean()

    return score

# Run the Optuna search (maximizing the objective), seeded with the random-search result.
study = optuna.create_study(direction='maximize')
study.enqueue_trial(random_search_params)
study.optimize(objective, n_trials=50)

print("Best Parameters from Optuna:", study.best_params)
print("Best AUC-ROC Score from Optuna:", study.best_value)

# Refit an L1 logistic regression with the best parameters on the training rows.
best_params = study.best_params
clf = LogisticRegression(penalty='l1', random_state=42, **best_params)
clf.fit(X_train, y_train)

# Positive-class probabilities and hard labels for the held-out rows.
y_pred_prob = clf.predict_proba(X_test)[:, 1]
y_pred = clf.predict(X_test)

weighted_avg = {'average': 'weighted'}
roc_auc = roc_auc_score(y_test, y_pred_prob)
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred, **weighted_avg)
recall = recall_score(y_test, y_pred, **weighted_avg)
f1 = f1_score(y_test, y_pred, **weighted_avg)

for metric_label, metric_value in (
    ('AUC-ROC', roc_auc),
    ('Accuracy', accuracy),
    ('Precision', precision),
    ('Recall', recall),
    ('F1 Score', f1),
):
    print(f"Test {metric_label}: {metric_value:.4f}")

In [None]:
# Final L1 logistic regression: fixed hyperparameters, 5-fold CV on the training split,
# then evaluation on the held-out test split.
selected_features_lasso = X_train.columns.tolist()

# Best parameters from Bayesian Optimization with Optuna
best_params_lasso = {
    'C': 669.9618437778672,
    'max_iter': 468788,
    'solver': 'saga',
    'tol': 0.6106464567815167
}

# Train the Logistic Regression model again using only the selected features
final_model_lasso = LogisticRegression(penalty='l1', **best_params_lasso, random_state=42)


# Perform cross-validation with 5 folds. The scorer is 'roc_auc', so the scores are
# reported as ROC AUC (they were previously mislabelled as accuracy).
cv_scores = cross_val_score(final_model_lasso, X_train[selected_features_lasso], y_train, cv=5, scoring='roc_auc')
print("Cross-validation scores:")
print(cv_scores)
print(f"Mean CV ROC AUC: {np.mean(cv_scores):.4f}")

final_model_lasso.fit(X_train[selected_features_lasso], y_train)

# Make predictions on the test set (predicted probabilities of the positive class)
y_pred_proba_lasso = final_model_lasso.predict_proba(X_test[selected_features_lasso])[:, 1]

# Predicted class labels
y_pred_lasso = final_model_lasso.predict(X_test[selected_features_lasso])

# Calculate evaluation metrics
accuracy = accuracy_score(y_test, y_pred_lasso)
precision = precision_score(y_test, y_pred_lasso, average='weighted')
recall = recall_score(y_test, y_pred_lasso, average='weighted')
f1 = f1_score(y_test, y_pred_lasso, average='weighted')
roc_auc = roc_auc_score(y_test, y_pred_proba_lasso)

# Calculate confusion matrix and extract TN, FP, FN, TP
tn, fp, fn, tp = confusion_matrix(y_test, y_pred_lasso).ravel()

# Calculate specificity = TN / (TN + FP)
specificity = tn / (tn + fp)

print(f'Test ROC AUC: {roc_auc:.2f}')
print(f'Test Accuracy: {accuracy:.2f}')
print(f'Test Precision: {precision:.2f}')
print(f'Test Recall: {recall:.2f}')
print(f'Test F1-Score: {f1:.2f}')
print(f'Test Specificity: {specificity:.2f}')



## LASSO Validation

In [None]:
# Score the final L1 logistic regression on the separate validation table.
y_val_cc_lasso = mtb_cc['Group']
X_val_cc_lasso = mtb_cc.drop(columns=['Group'])

# NOTE(review): the encoder is re-fit on the validation labels; this matches the
# training encoding only if both tables contain the same set of label values — confirm.
label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(y_val_cc_lasso)

# Training features absent from the validation table are added as all-zero columns.
validation_columns = set(X_val_cc_lasso.columns)
missing_features = [
    feature for feature in selected_features_lasso if feature not in validation_columns
]
missing_df_lasso = pd.DataFrame(0, index=X_val_cc_lasso.index, columns=missing_features)
X_val_cc_lasso = pd.concat([X_val_cc_lasso, missing_df_lasso], axis=1)

# Keep only the training features, in training order.
X_val_cc_lasso = X_val_cc_lasso.loc[:, selected_features_lasso]

# Fill any remaining NaN with the column mean, keeping a DataFrame with the same columns.
# NOTE(review): the imputer is fit on the validation data itself — confirm intended.
imputer = SimpleImputer(strategy='mean')
imputed_values = imputer.fit_transform(X_val_cc_lasso)
X_val_cc_lasso_imputed = pd.DataFrame(imputed_values, columns=X_val_cc_lasso.columns)

# Positive-class probabilities and hard labels for the validation rows.
y_pred_prob_lasso_cc = final_model_lasso.predict_proba(X_val_cc_lasso_imputed)[:, 1]
y_pred_lasso_cc = final_model_lasso.predict(X_val_cc_lasso_imputed)

weighted_avg = {'average': 'weighted'}
accuracy_val = accuracy_score(y_encoded, y_pred_lasso_cc)
precision_val = precision_score(y_encoded, y_pred_lasso_cc, **weighted_avg)
recall_val = recall_score(y_encoded, y_pred_lasso_cc, **weighted_avg)
f1_val = f1_score(y_encoded, y_pred_lasso_cc, **weighted_avg)
roc_auc_val = roc_auc_score(y_encoded, y_pred_prob_lasso_cc)

# Specificity = TN / (TN + FP), read off the 2x2 confusion matrix.
tn_val, fp_val, fn_val, tp_val = confusion_matrix(y_encoded, y_pred_lasso_cc).ravel()
specificity_val = tn_val / (tn_val + fp_val)

for metric_label, metric_value in (
    ('ROC AUC', roc_auc_val),
    ('Accuracy', accuracy_val),
    ('Precision', precision_val),
    ('Recall', recall_val),
    ('F1-Score', f1_val),
    ('Specificity', specificity_val),
):
    print(f'Validation {metric_label}: {metric_value:.2f}')



## 95% CI

In [None]:
# Bootstrap the LASSO validation metrics.

# Set the number of bootstrap samples
n_bootstraps = 1000
rng = np.random.RandomState(42)

# Ground truth and predictions for the validation rows. The predictions come from the
# LASSO validation cell, where the model was applied to the IMPUTED feature frame.
# They are resampled together with the labels instead of re-running the model, since
# the model's output for a row does not depend on which other rows are present.
# (The names previously used here, `y_encoded_cc_lasso` and `lasso_model`, are never defined.)
y_true_lasso_cc = np.asarray(LabelEncoder().fit_transform(y_val_cc_lasso))
y_pred_all_lasso = np.asarray(y_pred_lasso_cc)
y_prob_all_lasso = np.asarray(y_pred_prob_lasso_cc)
n_val_rows = len(y_true_lasso_cc)

# Initialize lists to store bootstrapped metric values
accuracy_scores = []
precision_scores = []
recall_scores = []
f1_scores = []
roc_auc_scores = []
specificity_scores = []

# Bootstrap loop
for _ in range(n_bootstraps):
    # Resample row positions with replacement
    indices = rng.choice(n_val_rows, n_val_rows, replace=True)
    y_bootstrap = y_true_lasso_cc[indices]

    # ROC AUC needs both classes; skip the rare resample that contains only one
    if np.unique(y_bootstrap).size < 2:
        continue

    y_pred = y_pred_all_lasso[indices]
    y_pred_prob = y_prob_all_lasso[indices]

    # Calculate label-based metrics
    accuracy_scores.append(accuracy_score(y_bootstrap, y_pred))
    precision_scores.append(precision_score(y_bootstrap, y_pred, average="weighted", zero_division=1))
    recall_scores.append(recall_score(y_bootstrap, y_pred, average="weighted"))
    f1_scores.append(f1_score(y_bootstrap, y_pred, average="weighted"))

    # ROC AUC calculation
    roc_auc_scores.append(roc_auc_score(y_bootstrap, y_pred_prob))

    # Specificity calculation (NaN when the confusion matrix is not 2x2)
    conf_matrix = confusion_matrix(y_bootstrap, y_pred)
    if conf_matrix.shape == (2, 2):
        tn, fp, fn, tp = conf_matrix.ravel()
        specificity = tn / (tn + fp)
    else:
        specificity = np.nan

    specificity_scores.append(specificity)

# Helper: 95% percentile interval of a list of bootstrap values
def ci_bootstrap(metric_list):
    """Return ``(lower, upper)``: the 2.5th and 97.5th percentiles of ``metric_list``."""
    bounds = np.percentile(metric_list, [2.5, 97.5])
    return bounds[0], bounds[1]

# Calculate 95% CI for each metric (NaN specificity values are dropped first)
accuracy_ci = ci_bootstrap(accuracy_scores)
precision_ci = ci_bootstrap(precision_scores)
recall_ci = ci_bootstrap(recall_scores)
f1_ci = ci_bootstrap(f1_scores)
roc_auc_ci = ci_bootstrap(roc_auc_scores)
specificity_ci = ci_bootstrap([s for s in specificity_scores if not np.isnan(s)])

# Print metrics with 95% CI. The point estimates are the values computed by the LASSO
# validation cell (`roc_auc_val`, `accuracy_val`, ...); the `*_val_lasso` names that
# were referenced here before are never defined anywhere in the notebook.
print(f'Validation ROC AUC: {roc_auc_val:.2f} (95% CI: {roc_auc_ci[0]:.2f}, {roc_auc_ci[1]:.2f})')
print(f'Validation Specificity: {specificity_val:.2f} (95% CI: {specificity_ci[0]:.2f}, {specificity_ci[1]:.2f})')
print(f'Validation Accuracy: {accuracy_val:.2f} (95% CI: {accuracy_ci[0]:.2f}, {accuracy_ci[1]:.2f})')
print(f'Validation Precision: {precision_val:.2f} (95% CI: {precision_ci[0]:.2f}, {precision_ci[1]:.2f})')
print(f'Validation Recall: {recall_val:.2f} (95% CI: {recall_ci[0]:.2f}, {recall_ci[1]:.2f})')
print(f'Validation F1-Score: {f1_val:.2f} (95% CI: {f1_ci[0]:.2f}, {f1_ci[1]:.2f})')