### Import necessary libraries

In [None]:
! pip install optuna

In [None]:
import sklearn
import xgboost

print(f"Scikit-learn version: {sklearn.__version__}")
print(f"XGBoost version: {xgboost.__version__}")


In [None]:
!pip uninstall scikit-learn -y
!pip install scikit-learn==1.5.2

In [None]:
import os,os.path
import re

import matplotlib.pyplot as plt
%matplotlib inline
import numpy as np
import optuna
import pandas as pd
import seaborn as sns
import xgboost as xgb
from scipy.stats import bootstrap
from sklearn.datasets import make_classification
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import RFECV
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.metrics import roc_auc_score, accuracy_score, precision_score, recall_score, f1_score, classification_report, confusion_matrix
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import MinMaxScaler
from sklearn.utils import resample
from xgboost import XGBClassifier

### Load the datasets

In [None]:
# Load the primary cohort table from Excel; it is the frame later split into
# train/test sets and used to fit both classifiers.
# NOTE(review): "GC" presumably stands for gastric cancer -- confirm.
# NOTE: the name `gc` would shadow the stdlib `gc` module if it were imported.
gc = pd.read_excel('GC biomarkers reduced.xlsx')
gc  # notebook display of the loaded frame

In [None]:
# Load the CC cohort table from Excel; it is used further down as an external
# validation set for the trained models.
# NOTE(review): "CC" presumably denotes the colorectal/colon cancer cohort -- confirm.
cc = pd.read_excel('GC biomarkers in CC.xlsx')
cc  # notebook display of the loaded frame

In [None]:
# Load the IBD cohort table from Excel; it is used further down as an external
# validation set for the trained models.
# NOTE(review): "IBD" presumably denotes inflammatory bowel disease -- confirm.
ibd = pd.read_excel('GC biomarkers in IBD.xlsx')
ibd  # notebook display of the loaded frame

In [None]:
# Min-max scale every column of the GC cohort except the 'Group' label.
# NOTE(review): the scaler is fitted on the whole cohort before the train/test
# split performed later, so test-set minima/maxima influence the training
# features -- consider fitting on the training rows only.
features = gc.drop(columns=['Group'])

scaler = MinMaxScaler()
scaled_features = scaler.fit_transform(features)

# Replace the feature columns wholesale instead of writing through `.loc`:
# an in-place `.loc` write of float values into integer-typed columns goes
# through pandas' incompatible-dtype path (deprecated), whereas column
# replacement simply yields float64 columns.
gc = gc.copy()
gc[features.columns] = scaled_features

# Display the first few rows of the scaled DataFrame
print(gc.head())


In [None]:
# Min-max scale every column of the CC cohort except the 'Group' label.
# NOTE(review): this cohort gets its own scaler, so its [0, 1] range is defined
# by its own minima/maxima rather than by the GC training data the models are
# fitted on -- confirm this is intended for external validation.
features = cc.drop(columns=['Group'])

scaler = MinMaxScaler()
scaled_features = scaler.fit_transform(features)

# Replace the feature columns wholesale instead of writing through `.loc`:
# an in-place `.loc` write of float values into integer-typed columns goes
# through pandas' incompatible-dtype path (deprecated), whereas column
# replacement simply yields float64 columns.
cc = cc.copy()
cc[features.columns] = scaled_features

# Display the first few rows of the scaled DataFrame
print(cc.head())


In [None]:
# Min-max scale every column of the IBD cohort except the 'Group' label.
# NOTE(review): this cohort gets its own scaler, so its [0, 1] range is defined
# by its own minima/maxima rather than by the GC training data the models are
# fitted on -- confirm this is intended for external validation.
features = ibd.drop(columns=['Group'])

scaler = MinMaxScaler()
scaled_features = scaler.fit_transform(features)

# Replace the feature columns wholesale instead of writing through `.loc`:
# an in-place `.loc` write of float values into integer-typed columns goes
# through pandas' incompatible-dtype path (deprecated), whereas column
# replacement simply yields float64 columns.
ibd = ibd.copy()
ibd[features.columns] = scaled_features

# Display the first few rows of the scaled DataFrame
print(ibd.head())


# XGBoost

In [None]:
# Baseline: XGBoost with default hyperparameters, scored on a 75/25 hold-out
# split of the GC cohort.
y = gc['Group']
X = gc.drop(columns=['Group'])

# Map the class names in 'Group' to integer codes.
label_encoder = LabelEncoder()
y = label_encoder.fit_transform(y)

X_train, X_test, y_train, y_test = train_test_split(
    X, y, random_state=42, test_size=0.25
)

# Fit on the training portion, predict the held-out portion.
model = xgb.XGBClassifier()
model.fit(X_train, y_train)
y_pred = model.predict(X_test)

# Per-class metrics are reported as support-weighted averages.
weighted = {'average': 'weighted'}
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred, **weighted)
f1 = f1_score(y_test, y_pred, **weighted)
recall = recall_score(y_test, y_pred, **weighted)

for line in (
    f'Accuracy: {accuracy:.4f}',
    f"Test Precision: {precision:.4f}",
    f"Test F1 Score: {f1:.4f}",
    f"Test Recall: {recall:.4f}",
):
    print(line)

## Random Search (XGBoost)

In [None]:
# Randomized hyperparameter search for XGBoost: 100 sampled configurations,
# 5-fold cross-validation on the training split, ranked by ROC AUC.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42)

# Candidate values for each hyperparameter
param_dist = {
    'n_estimators': np.arange(50, 200, 10),
    'max_depth': np.arange(3, 10),
    'learning_rate': np.linspace(0.01, 0.3, 10),
    'subsample': np.linspace(0.5, 1.0, 10),
    'colsample_bytree': np.linspace(0.5, 1.0, 10),
    'gamma': np.linspace(0, 0.5, 5),
    'min_child_weight': np.arange(1, 6),
    'reg_alpha': np.linspace(0, 1, 20),
    'reg_lambda': np.linspace(0, 1, 20),
}

# Deliberately NOT named `xgb`: that name is the imported xgboost module alias,
# and rebinding it to an estimator instance makes every later
# `xgb.XGBClassifier(...)` call fail with AttributeError.
xgb_clf = XGBClassifier()

random_search = RandomizedSearchCV(
    xgb_clf, param_distributions=param_dist, n_iter=100,
    scoring='roc_auc', cv=5, verbose=1, random_state=42, n_jobs=-1
)

# Run the search on the training split only
random_search.fit(X_train, y_train)

best_params = random_search.best_params_
print("Best Parameters:", best_params)

# `predict` delegates to the best estimator refitted on the whole training split
y_pred = random_search.predict(X_test)

# Positive-class (binary) metrics on the hold-out split
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)

print(f"Accuracy: {accuracy:.2f}")
print(f"Precision: {precision:.2f}")
print(f"Recall: {recall:.2f}")
print(f"F1 Score: {f1:.2f}")


## Bayesian Optimization (XGBoost)

In [None]:
# Define the objective function for Optuna
def objective(trial):
    """Optuna objective: mean 5-fold CV ROC AUC of one XGBoost configuration.

    Args:
        trial: Optuna trial used to sample the hyperparameters.

    Returns:
        Mean ROC AUC over 5 cross-validation folds of the module-level
        ``X_train`` / ``y_train``.
    """
    params = {
        'booster': trial.suggest_categorical('booster', ['gbtree', 'dart']),
        'objective': 'binary:logistic',
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.3),
        'gamma': trial.suggest_float('gamma', 0, 0.5),
        'max_depth': trial.suggest_int('max_depth', 3, 10),
        'min_child_weight': trial.suggest_float('min_child_weight', 0.5, 5),
        'subsample': trial.suggest_float('subsample', 0.5, 1.0),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.5, 1.0),
        'n_estimators': trial.suggest_int('n_estimators', 50, 300)
    }

    # Use the directly imported class rather than `xgb.XGBClassifier`: the
    # random-search cell rebinds `xgb` to an estimator instance, after which
    # attribute access on `xgb` no longer reaches the xgboost module.
    model = XGBClassifier(**params)

    # Score on the training split only, so the hold-out split stays unseen.
    cv_scores = cross_val_score(model, X_train, y_train, cv=5, scoring='roc_auc')
    return cv_scores.mean()

# Create a study and optimize the objective function
# Bayesian optimisation of the XGBoost hyperparameters with Optuna.
# The sampler is seeded so repeated runs explore the same configurations.
study = optuna.create_study(
    direction='maximize',
    sampler=optuna.samplers.TPESampler(seed=42),
)

# Warm start: evaluate the configuration found by RandomizedSearchCV first.
# Only names that `objective` actually suggests are listed; 'objective' is
# hard-coded inside `objective`, so it is not part of the search space.
study.enqueue_trial({
    'booster': 'gbtree',
    'learning_rate': 0.10666666666666666,
    'gamma':  0.375,
    'max_depth': 3,
    'min_child_weight': 2,
    'subsample': 0.5,
    'colsample_bytree': 0.6111111111111112 ,
    'n_estimators': 130
})

# Optimize the study
study.optimize(objective, n_trials=50)

# Print the best parameters and the best cross-validated score
print(f"Best Parameters: {study.best_params}")
print(f"Best CV ROC AUC: {study.best_value:.4f}")

# Train the final model with the best parameters. `XGBClassifier` is used
# directly because the `xgb` alias is rebound to an estimator instance by the
# random-search cell.
best_params = study.best_params
final_model = XGBClassifier(**best_params)
final_model.fit(X_train, y_train)

# Make predictions on the test set
y_pred = final_model.predict(X_test)

# Support-weighted metrics on the hold-out split
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred, average='weighted')
f1 = f1_score(y_test, y_pred, average='weighted')
recall = recall_score(y_test, y_pred, average='weighted')

print(f"Test Accuracy: {accuracy:.4f}")
print(f"Test Precision: {precision:.4f}")
print(f"Test F1 Score: {f1:.4f}")
print(f"Test Recall: {recall:.4f}")


In [None]:
# Final XGBoost model: fixed hyperparameters, 5-fold CV on the training split,
# then positive-class metrics on the hold-out split.
selected_features = X_train.columns.tolist()

# Hard-coded hyperparameters (recorded from an earlier optimisation run)
best_params_xg = {
    'learning_rate':   0.216735435271517,
    'max_depth': 5,
    'n_estimators': 118,
    'gamma': 0.2985469943655731,
    'min_child_weight': 0.5958575235613429,
    'subsample': 0.5600864058115105,
    'colsample_bytree': 0.737265031590481,
    'objective': 'binary:logistic',
    'booster': 'gbtree'
}

# `XGBClassifier` is used directly because the `xgb` alias is rebound to an
# estimator instance by the random-search cell.
final_model_xg = XGBClassifier(**best_params_xg, random_state = 42)

# Cross-validation is scored with ROC AUC, so it is labelled as such.
cv_scores = cross_val_score(final_model_xg, X_train[selected_features], y_train, cv=5, scoring='roc_auc')
print("Cross-validation scores:")
print(cv_scores)
print(f"Mean CV ROC AUC: {np.mean(cv_scores):.4f}")

# Train the model
final_model_xg.fit(X_train[selected_features], y_train)

# Class predictions and positive-class probabilities on the test set
y_pred_xg = final_model_xg.predict(X_test[selected_features])
y_pred_prob_xg = final_model_xg.predict_proba(X_test[selected_features])[:, 1]

# All threshold metrics use the same positive-class (binary) definition;
# precision was previously a weighted average while recall and F1 were not.
accuracy = accuracy_score(y_test, y_pred_xg)
precision = precision_score(y_test, y_pred_xg, zero_division=1)
recall = recall_score(y_test, y_pred_xg)
f1 = f1_score(y_test, y_pred_xg)
roc_auc = roc_auc_score(y_test, y_pred_prob_xg)

# Specificity = TN / (TN + FP); the 4-way unpack assumes a binary target.
tn, fp, fn, tp = confusion_matrix(y_test, y_pred_xg).ravel()
specificity = tn / (tn + fp)

# Print the results
print(f'Test ROC AUC: {roc_auc:.2f}')
print(f'Test Accuracy: {accuracy:.2f}')
print(f'Test Precision: {precision:.2f}')
print(f'Test Recall: {recall:.2f}')
print(f'Test F1-Score: {f1:.2f}')
print(f'Test Specificity: {specificity:.2f}')


## 95% CI (XGBoost, GC hold-out test set)

In [None]:
# Refit the final XGBoost model and recompute the point estimates that the
# bootstrap confidence intervals below are reported against.
# `XGBClassifier` is used directly because the `xgb` alias is rebound to an
# estimator instance by the random-search cell.
final_model_xg = XGBClassifier(**best_params_xg,random_state =42)

# Cross-validation is scored with ROC AUC, so it is labelled as such.
cv_scores = cross_val_score(final_model_xg, X_train[selected_features], y_train, cv=5, scoring='roc_auc')
print("Cross-validation scores:")
print(cv_scores)
print(f"Mean CV ROC AUC: {np.mean(cv_scores):.4f}")

# Train the model
final_model_xg.fit(X_train[selected_features], y_train)

# Class predictions and positive-class probabilities on the test set
y_pred_xg = final_model_xg.predict(X_test[selected_features])
y_pred_prob_xg = final_model_xg.predict_proba(X_test[selected_features])[:, 1]

# Positive-class (binary) definitions throughout: the bootstrap below calls
# precision_score / recall_score / f1_score with their default binary
# averaging, so a weighted precision here would not match its own interval.
accuracy = accuracy_score(y_test, y_pred_xg)
precision = precision_score(y_test, y_pred_xg, zero_division=1)
recall = recall_score(y_test, y_pred_xg)
f1 = f1_score(y_test, y_pred_xg)
roc_auc = roc_auc_score(y_test, y_pred_prob_xg)

# Specificity = TN / (TN + FP); the 4-way unpack assumes a binary target.
tn, fp, fn, tp = confusion_matrix(y_test, y_pred_xg).ravel()
specificity = tn / (tn + fp)

# Bootstrap function to calculate 95% CI
def bootstrap_metric(metric_func, y_true, y_pred, y_pred_prob=None, n_bootstraps=1000, random_state=None):
    """Percentile-bootstrap 95% confidence interval for one metric.

    Args:
        metric_func: Either a callable ``metric(y_true, y_pred)``,
            ``roc_auc_score`` (scored on ``y_pred_prob`` instead of
            ``y_pred``), or the string ``"specificity"``.
        y_true: True labels; assumed to be encoded as 0/1.
        y_pred: Predicted class labels.
        y_pred_prob: Positive-class scores; required for ``roc_auc_score``.
        n_bootstraps: Number of resamples drawn with replacement.
        random_state: Optional integer seed for reproducible resampling.
            ``None`` keeps the previous behaviour (global NumPy RNG).

    Returns:
        ``(lower, upper)``: the 2.5th and 97.5th percentiles of the
        bootstrapped scores.

    Raises:
        ValueError: If ``metric_func`` is neither callable nor
            ``"specificity"``, if probabilities are missing for ROC AUC, or
            if no resample produced a defined score.
    """
    is_auc = metric_func == roc_auc_score
    is_specificity = isinstance(metric_func, str) and metric_func == "specificity"
    if not (is_specificity or callable(metric_func)):
        raise ValueError(f"Unsupported metric: {metric_func!r}")
    if is_auc and y_pred_prob is None:
        raise ValueError("y_pred_prob is required to bootstrap roc_auc_score")

    y_true = np.asarray(y_true)
    y_pred = np.asarray(y_pred)
    if y_pred_prob is not None:
        y_pred_prob = np.asarray(y_pred_prob)

    # One shared generator so successive resamples differ but the whole
    # sequence is reproducible for a given seed.
    rng = np.random.RandomState(random_state) if random_state is not None else None
    n_size = len(y_true)
    bootstrapped_scores = []

    for _ in range(n_bootstraps):
        # Resample row positions with replacement
        indices = resample(np.arange(n_size), replace=True, n_samples=n_size, random_state=rng)
        y_true_resampled = y_true[indices]
        y_pred_resampled = y_pred[indices]

        if is_auc:
            # ROC AUC is undefined when the resample holds a single class.
            if np.unique(y_true_resampled).size < 2:
                continue
            score = metric_func(y_true_resampled, y_pred_prob[indices])
        elif is_specificity:
            # labels=[0, 1] forces a 2x2 matrix even if a class is absent.
            tn, fp, fn, tp = confusion_matrix(y_true_resampled, y_pred_resampled, labels=[0, 1]).ravel()
            if tn + fp == 0:
                continue  # no negatives drawn: specificity undefined
            score = tn / (tn + fp)
        else:
            score = metric_func(y_true_resampled, y_pred_resampled)

        bootstrapped_scores.append(score)

    if not bootstrapped_scores:
        raise ValueError("No bootstrap resample produced a defined score")

    # Percentile 95% CI
    lower = np.percentile(bootstrapped_scores, 2.5)
    upper = np.percentile(bootstrapped_scores, 97.5)

    return lower, upper

# Bootstrap a 95% CI per metric on the hold-out predictions, then report each
# point estimate next to its interval.
accuracy_ci = bootstrap_metric(accuracy_score, y_test, y_pred_xg)
precision_ci = bootstrap_metric(precision_score, y_test, y_pred_xg)
recall_ci = bootstrap_metric(recall_score, y_test, y_pred_xg)
f1_ci = bootstrap_metric(f1_score, y_test, y_pred_xg)
# ROC AUC is the only metric scored on probabilities rather than labels.
roc_auc_ci = bootstrap_metric(
    roc_auc_score, y_test, y_pred_xg, y_pred_prob=y_pred_prob_xg
)
specificity_ci = bootstrap_metric("specificity", y_test, y_pred_xg)

# (label, point estimate, interval) in the order they are printed
report_rows = (
    ('ROC AUC', roc_auc, roc_auc_ci),
    ('Accuracy', accuracy, accuracy_ci),
    ('Precision', precision, precision_ci),
    ('Recall', recall, recall_ci),
    ('F1-Score', f1, f1_ci),
    ('Specificity', specificity, specificity_ci),
)
for label, estimate, interval in report_rows:
    print(f'Test {label}: {estimate:.2f} (95% CI: {interval[0]:.2f}, {interval[1]:.2f})')


## Prediction on IBD (XGBoost)

In [None]:
# External validation of the trained XGBoost model on the IBD cohort.
X_val_ibd = ibd.drop('Group', axis=1)
y_val_ibd = ibd['Group']

# Encode the target variable.
# NOTE(review): a fresh encoder is fitted on this cohort's labels, so the
# integer codes follow this cohort's own label names -- confirm that code 1
# denotes the same positive class the model was trained on.
label_encoder = LabelEncoder()
y_encoded_ibd = label_encoder.fit_transform(y_val_ibd)

# Training features that this cohort does not provide
missing_features = [feature for feature in selected_features if feature not in X_val_ibd.columns]

# Add missing features with zero values
missing_df = pd.DataFrame(0.0, index=X_val_ibd.index, columns=missing_features)
X_val_ibd = pd.concat([X_val_ibd, missing_df], axis=1)

# Ensure the columns are in the same order as the training features
X_val_ibd = X_val_ibd[selected_features]

# Score with the model trained on the GC data. The previous code called
# `xg_model.fit(...)` on this cohort: `xg_model` is never defined, and
# refitting on the validation data before predicting it would report
# training-set performance rather than validation performance.
y_pred_prob_xg_ibd = final_model_xg.predict_proba(X_val_ibd)[:, 1]
y_pred_xg_ibd = final_model_xg.predict(X_val_ibd)

# Calculate ROC AUC and other metrics (support-weighted averages)
roc_auc_val_xg = roc_auc_score(y_encoded_ibd, y_pred_prob_xg_ibd)
accuracy_val_xg = accuracy_score(y_encoded_ibd, y_pred_xg_ibd)
precision_val_xg = precision_score(y_encoded_ibd, y_pred_xg_ibd, average="weighted", zero_division=1)
recall_val_xg = recall_score(y_encoded_ibd, y_pred_xg_ibd, average="weighted")
f1_val_xg = f1_score(y_encoded_ibd, y_pred_xg_ibd, average="weighted")

# Specificity from the same predictions as every other metric
y_pred_raw_xg_ibd = y_pred_xg_ibd
conf_matrix_raw = confusion_matrix(y_encoded_ibd, y_pred_raw_xg_ibd)

if conf_matrix_raw.shape == (2, 2):
    tn_val_xg, fp_val_xg, fn_val_xg, tp_val_xg = conf_matrix_raw.ravel()
    specificity_val_xg = tn_val_xg / (tn_val_xg + fp_val_xg)
else:
    # NaN instead of the string 'N/A', which cannot be formatted with ':.2f'
    specificity_val_xg = float('nan')

# Print evaluation metrics
print(f'Validation ROC AUC: {roc_auc_val_xg:.2f}')
print(f'Validation Accuracy: {accuracy_val_xg:.2f}')
print(f'Validation Precision: {precision_val_xg:.2f}')
print(f'Validation Recall: {recall_val_xg:.2f}')
print(f'Validation F1-Score: {f1_val_xg:.2f}')
print(f'Validation Specificity: {specificity_val_xg:.2f}')


## 95% CI (XGBoost, IBD cohort)

In [None]:
def bootstrap_confidence_interval(y_true, y_pred, y_pred_prob, metric_func, n_bootstraps=1000, alpha=0.95, random_state=None):
    """Percentile-bootstrap confidence interval for one metric.

    Args:
        y_true: True labels.
        y_pred: Predicted class labels.
        y_pred_prob: Positive-class scores; used only when ``metric_func``
            is ``roc_auc_score``.
        metric_func: Callable ``metric(y_true, y_pred)``. ``roc_auc_score``
            is scored on ``y_pred_prob`` instead of ``y_pred``.
        n_bootstraps: Number of resamples drawn with replacement.
        alpha: Confidence level, e.g. 0.95 for a 95% interval.
        random_state: Optional integer seed for reproducible resampling.
            ``None`` keeps the previous behaviour (global NumPy RNG).

    Returns:
        ``(lower_bound, upper_bound)``; ``(nan, nan)`` if no resample
        produced a defined score.
    """
    # Accept lists / Series as well as arrays for positional indexing.
    y_true = np.asarray(y_true)
    y_pred = np.asarray(y_pred)
    y_pred_prob = np.asarray(y_pred_prob)

    rng = np.random.RandomState(random_state) if random_state is not None else None
    use_probabilities = metric_func == roc_auc_score
    n_size = len(y_true)
    bootstrapped_scores = []

    for _ in range(n_bootstraps):
        # Sample row positions with replacement
        indices = resample(np.arange(n_size), replace=True, n_samples=n_size, random_state=rng)
        y_true_boot = y_true[indices]

        if use_probabilities:
            # ROC AUC is undefined when the resample holds a single class.
            if np.unique(y_true_boot).size < 2:
                continue
            score = metric_func(y_true_boot, y_pred_prob[indices])
        else:
            score = metric_func(y_true_boot, y_pred[indices])

        # Metrics may signal "undefined for this resample" with None or NaN;
        # such draws are skipped instead of breaking the percentile step.
        if score is None or np.isnan(score):
            continue
        bootstrapped_scores.append(score)

    if not bootstrapped_scores:
        return float('nan'), float('nan')

    # Calculate the confidence interval
    lower_bound = np.percentile(bootstrapped_scores, (1 - alpha) / 2 * 100)
    upper_bound = np.percentile(bootstrapped_scores, (1 + alpha) / 2 * 100)

    return lower_bound, upper_bound

# Bootstrap 95% confidence intervals for the XGBoost metrics on the IBD cohort.
n_bootstraps = 1000
alpha = 0.95


# The reported precision / recall / F1 point estimates are support-weighted
# averages, so the intervals must bootstrap the same definitions; passing the
# bare sklearn functions would bootstrap the positive-class variants instead.
def _weighted_precision(y_true, y_pred):
    return precision_score(y_true, y_pred, average="weighted", zero_division=1)


def _weighted_recall(y_true, y_pred):
    return recall_score(y_true, y_pred, average="weighted")


def _weighted_f1(y_true, y_pred):
    return f1_score(y_true, y_pred, average="weighted")


def calculate_specificity(y_true, y_pred):
    """Return TN / (TN + FP) for labels encoded as 0/1.

    ``labels=[0, 1]`` forces a 2x2 confusion matrix even when a bootstrap
    resample lacks one class. NaN is returned when there are no true
    negatives to score, rather than None, which cannot be aggregated.
    """
    tn, fp, fn, tp = confusion_matrix(y_true, y_pred, labels=[0, 1]).ravel()
    negatives = tn + fp
    return tn / negatives if negatives else float('nan')


# Calculate 95% confidence intervals for the metrics
roc_auc_ci = bootstrap_confidence_interval(y_encoded_ibd, y_pred_xg_ibd, y_pred_prob_xg_ibd, roc_auc_score, n_bootstraps, alpha)
accuracy_ci = bootstrap_confidence_interval(y_encoded_ibd, y_pred_xg_ibd, y_pred_prob_xg_ibd, accuracy_score, n_bootstraps, alpha)
precision_ci = bootstrap_confidence_interval(y_encoded_ibd, y_pred_xg_ibd, y_pred_prob_xg_ibd, _weighted_precision, n_bootstraps, alpha)
recall_ci = bootstrap_confidence_interval(y_encoded_ibd, y_pred_xg_ibd, y_pred_prob_xg_ibd, _weighted_recall, n_bootstraps, alpha)
f1_ci = bootstrap_confidence_interval(y_encoded_ibd, y_pred_xg_ibd, y_pred_prob_xg_ibd, _weighted_f1, n_bootstraps, alpha)
specificity_ci = bootstrap_confidence_interval(y_encoded_ibd, y_pred_xg_ibd, y_pred_prob_xg_ibd, calculate_specificity, n_bootstraps, alpha)

# Print the metrics with 95% confidence intervals
print(f'Validation ROC AUC: {roc_auc_val_xg:.2f} (95% CI: {roc_auc_ci[0]:.2f} - {roc_auc_ci[1]:.2f})')
print(f'Validation Accuracy: {accuracy_val_xg:.2f} (95% CI: {accuracy_ci[0]:.2f} - {accuracy_ci[1]:.2f})')
print(f'Validation Precision: {precision_val_xg:.2f} (95% CI: {precision_ci[0]:.2f} - {precision_ci[1]:.2f})')
print(f'Validation Recall: {recall_val_xg:.2f} (95% CI: {recall_ci[0]:.2f} - {recall_ci[1]:.2f})')
print(f'Validation F1-Score: {f1_val_xg:.2f} (95% CI: {f1_ci[0]:.2f} - {f1_ci[1]:.2f})')
print(f'Validation Specificity: {specificity_val_xg:.2f} (95% CI: {specificity_ci[0]:.2f} - {specificity_ci[1]:.2f})')


## Prediction on CRC (XGBoost, CC cohort)

In [None]:
# External validation of the trained XGBoost model on the CC cohort.
X_val_cc = cc.drop('Group', axis=1)
y_val_cc = cc['Group']

# Encode the target variable.
# NOTE(review): a fresh encoder is fitted on this cohort's labels, so the
# integer codes follow this cohort's own label names -- confirm that code 1
# denotes the same positive class the model was trained on.
label_encoder = LabelEncoder()
y_encoded_cc = label_encoder.fit_transform(y_val_cc)

# Training features that this cohort does not provide
missing_features = [feature for feature in selected_features if feature not in X_val_cc.columns]

# Add missing features with zero values
missing_df = pd.DataFrame(0.0, index=X_val_cc.index, columns=missing_features)
X_val_cc = pd.concat([X_val_cc, missing_df], axis=1)

# Ensure the columns are in the same order as the training features
X_val_cc = X_val_cc[selected_features]

# Score with the model trained on the GC data. The previous code called
# `xg_model.fit(...)` on this cohort: `xg_model` is never defined, and
# refitting on the validation data before predicting it would report
# training-set performance rather than validation performance.
y_pred_prob_xg_cc = final_model_xg.predict_proba(X_val_cc)[:, 1]
y_pred_xg_cc = final_model_xg.predict(X_val_cc)

# Calculate ROC AUC and other metrics (support-weighted averages)
roc_auc_val_xg = roc_auc_score(y_encoded_cc, y_pred_prob_xg_cc)
accuracy_val_xg = accuracy_score(y_encoded_cc, y_pred_xg_cc)
precision_val_xg = precision_score(y_encoded_cc, y_pred_xg_cc, average="weighted", zero_division=1)
recall_val_xg = recall_score(y_encoded_cc, y_pred_xg_cc, average="weighted")
f1_val_xg = f1_score(y_encoded_cc, y_pred_xg_cc, average="weighted")

# Specificity from the same predictions as every other metric
y_pred_raw_xg_cc = y_pred_xg_cc
conf_matrix_raw = confusion_matrix(y_encoded_cc, y_pred_raw_xg_cc)

if conf_matrix_raw.shape == (2, 2):
    tn_val_xg, fp_val_xg, fn_val_xg, tp_val_xg = conf_matrix_raw.ravel()
    specificity_val_xg = tn_val_xg / (tn_val_xg + fp_val_xg)
else:
    # NaN instead of the string 'N/A', which cannot be formatted with ':.2f'
    specificity_val_xg = float('nan')

# Print evaluation metrics
print(f'Validation ROC AUC: {roc_auc_val_xg:.2f}')
print(f'Validation Accuracy: {accuracy_val_xg:.2f}')
print(f'Validation Precision: {precision_val_xg:.2f}')
print(f'Validation Recall: {recall_val_xg:.2f}')
print(f'Validation F1-Score: {f1_val_xg:.2f}')
print(f'Validation Specificity: {specificity_val_xg:.2f}')

## 95% CI (XGBoost, CC cohort)

In [None]:
# Define a function to calculate bootstrap confidence intervals
def bootstrap_ci(y_true, y_pred, y_prob, metric_func, n_bootstrap=1000, ci=95):
    """Percentile-bootstrap confidence interval for a three-argument metric.

    Each of the ``n_bootstrap`` rounds draws row positions with replacement,
    evaluates ``metric_func(y_true, y_pred, y_prob)`` on the drawn rows, and
    the interval is read off the percentiles of the collected scores. The
    three arrays must support indexing by a list of positions.

    Returns:
        ``(lower_bound, upper_bound)`` for a ``ci`` percent interval.
    """
    # Percentage of the distribution cut off on each side
    tail = (100 - ci) / 2

    draws = []
    for _ in range(n_bootstrap):
        idx = resample(range(len(y_true)), replace=True)
        draws.append(metric_func(y_true[idx], y_pred[idx], y_prob[idx]))

    lower_bound, upper_bound = np.percentile(draws, [tail, 100 - tail])
    return lower_bound, upper_bound

# Define metric functions
# Metric adapters sharing one (y_true, y_pred, y_prob) signature so that
# bootstrap_ci can call any of them the same way; each uses only the
# arguments it needs.
def roc_auc_func(y_true, y_pred, y_prob):
    """ROC AUC computed from the scores ``y_prob``; ``y_pred`` is ignored."""
    return roc_auc_score(y_true, y_prob)

def accuracy_func(y_true, y_pred, y_prob):
    """Accuracy of the class predictions; ``y_prob`` is ignored."""
    return accuracy_score(y_true, y_pred)

def precision_func(y_true, y_pred, y_prob):
    """Support-weighted precision; an undefined per-class precision counts as 1."""
    return precision_score(y_true, y_pred, average="weighted", zero_division=1)

def recall_func(y_true, y_pred, y_prob):
    """Support-weighted recall; ``y_prob`` is ignored."""
    return recall_score(y_true, y_pred, average="weighted")

def f1_func(y_true, y_pred, y_prob):
    """Support-weighted F1 score; ``y_prob`` is ignored."""
    return f1_score(y_true, y_pred, average="weighted")

# Bootstrap 95% confidence intervals for the XGBoost metrics on the CC cohort.

# Convert to numpy arrays for positional indexing inside bootstrap_ci
y_true_array = np.array(y_encoded_cc)
y_pred_array = np.array(y_pred_xg_cc)
y_prob_array = np.array(y_pred_prob_xg_cc)


def specificity_func(y_true, y_pred, y_prob):
    """Specificity TN / (TN + FP) of one resample (labels encoded as 0/1).

    ``labels=[0, 1]`` forces a 2x2 confusion matrix even when a resample
    lacks one class; NaN is returned when the resample has no negatives.
    """
    tn, fp, fn, tp = confusion_matrix(y_true, y_pred, labels=[0, 1]).ravel()
    negatives = tn + fp
    return tn / negatives if negatives else np.nan


# Calculate metrics with 95% CI
roc_auc_ci = bootstrap_ci(y_true_array, y_pred_array, y_prob_array, roc_auc_func)
accuracy_ci = bootstrap_ci(y_true_array, y_pred_array, y_prob_array, accuracy_func)
precision_ci = bootstrap_ci(y_true_array, y_pred_array, y_prob_array, precision_func)
recall_ci = bootstrap_ci(y_true_array, y_pred_array, y_prob_array, recall_func)
f1_ci = bootstrap_ci(y_true_array, y_pred_array, y_prob_array, f1_func)
# Specificity is recomputed on every resample. The previous lambda returned
# the constant point estimate, so the "interval" always collapsed to it.
specificity_ci = bootstrap_ci(y_true_array, y_pred_array, y_prob_array, specificity_func)

# Print results with confidence intervals
print(f'Validation ROC AUC: {roc_auc_val_xg:.2f} (95% CI: {roc_auc_ci[0]:.2f}, {roc_auc_ci[1]:.2f})')
print(f'Validation Accuracy: {accuracy_val_xg:.2f} (95% CI: {accuracy_ci[0]:.2f}, {accuracy_ci[1]:.2f})')
print(f'Validation Precision: {precision_val_xg:.2f} (95% CI: {precision_ci[0]:.2f}, {precision_ci[1]:.2f})')
print(f'Validation Recall: {recall_val_xg:.2f} (95% CI: {recall_ci[0]:.2f}, {recall_ci[1]:.2f})')
print(f'Validation F1-Score: {f1_val_xg:.2f} (95% CI: {f1_ci[0]:.2f}, {f1_ci[1]:.2f})')
print(f'Validation Specificity: {specificity_val_xg:.2f} (95% CI: {specificity_ci[0]:.2f}, {specificity_ci[1]:.2f})')


# Random Forest

In [None]:
# Baseline: 100-tree random forest scored on a 75/25 hold-out split of the
# GC cohort.
y = gc['Group']
X = gc.drop(columns=['Group'])

# Map the class names in 'Group' to integer codes.
label_encoder = LabelEncoder()
y = label_encoder.fit_transform(y)

X_train, X_test, y_train, y_test = train_test_split(
    X, y, random_state=42, test_size=0.25
)

# Fit on the training portion, predict the held-out portion.
clf = RandomForestClassifier(random_state=42, n_estimators=100)
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)

# Accuracy first, then the support-weighted per-class metrics; each value is
# printed as soon as it is computed.
test_accuracy = accuracy_score(y_test, y_pred)
print(f"Test Accuracy: {test_accuracy:.4f}")

weighted = {'average': 'weighted'}

test_f1 = f1_score(y_test, y_pred, **weighted)
print(f"Test F1 Score: {test_f1:.4f}")

test_precision = precision_score(y_test, y_pred, **weighted)
print(f"Test Precision: {test_precision:.4f}")

test_recall = recall_score(y_test, y_pred, **weighted)
print(f"Test Recall: {test_recall:.4f}")


## Random Search (Random Forest)

In [None]:
# Randomized hyperparameter search for the random forest: 100 sampled
# configurations, 5-fold CV on the training split, default estimator scoring.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, random_state=42, test_size=0.25
)

# Candidate values for each hyperparameter
param_dist = {
    'n_estimators': list(range(100, 1001, 100)),
    'max_features': ['sqrt', 'log2'],
    'max_depth': list(range(10, 111, 10)) + [None],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    'bootstrap': [True, False],
}

rf = RandomForestClassifier(random_state=42)
rf_random = RandomizedSearchCV(
    estimator=rf,
    param_distributions=param_dist,
    n_iter=100,
    cv=5,
    verbose=2,
    random_state=42,
    n_jobs=-1,
)
rf_random.fit(X_train, y_train)

print("Best parameters found by RandomizedSearchCV:")
print(rf_random.best_params_)

# Score the refitted best configuration on the hold-out split
y_pred = rf_random.best_estimator_.predict(X_test)

weighted = {'average': 'weighted'}
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred, **weighted)
recall = recall_score(y_test, y_pred, **weighted)
f1 = f1_score(y_test, y_pred, **weighted)

for line in (
    f"Accuracy: {accuracy}",
    f"Precision: {precision}",
    f"Recall: {recall}",
    f"F1 Score: {f1}",
):
    print(line)


## Bayesian Optimization (Random Forest)

In [None]:
# Random-forest configuration recorded from the RandomizedSearchCV run; the
# Optuna search space below is centred on these values.
best_params_random = dict(
    n_estimators=700,
    min_samples_split=5,
    min_samples_leaf=4,
    max_features='log2',
    max_depth=80,
    bootstrap=True,
)

# Same 75/25 hold-out split as the other cells (fixed seed).
X_train, X_test, y_train, y_test = train_test_split(
    X, y, random_state=42, test_size=0.25
)

def objective(trial):
    """Optuna objective: mean 5-fold CV ROC AUC of one random-forest configuration.

    The integer ranges are windows around the values in the module-level
    ``best_params_random``; ``max_features`` and ``bootstrap`` are searched
    over both of their options.

    Args:
        trial: Optuna trial used to sample the hyperparameters.

    Returns:
        Mean ROC AUC over 5 cross-validation folds of the module-level
        ``X_train`` / ``y_train``.
    """
    n_estimators = trial.suggest_int('n_estimators', max(100, best_params_random['n_estimators'] - 200), best_params_random['n_estimators'] + 200)
    min_samples_split = trial.suggest_int('min_samples_split', max(2, best_params_random['min_samples_split'] - 3), best_params_random['min_samples_split'] + 3)
    min_samples_leaf = trial.suggest_int('min_samples_leaf', max(1, best_params_random['min_samples_leaf'] - 2), best_params_random['min_samples_leaf'] + 2)
    max_features = trial.suggest_categorical('max_features', ['sqrt', 'log2'])
    max_depth = trial.suggest_int('max_depth', max(5, best_params_random['max_depth'] - 10), best_params_random['max_depth'] + 10)
    bootstrap = trial.suggest_categorical('bootstrap', [True, False])

    clf = RandomForestClassifier(
        n_estimators=n_estimators,
        min_samples_split=min_samples_split,
        min_samples_leaf=min_samples_leaf,
        max_features=max_features,
        max_depth=max_depth,
        bootstrap=bootstrap,
        random_state=42
    )

    # Cross-validate on the training split only. Scoring on the full X, y
    # would let the hold-out rows steer hyperparameter selection and make
    # the later test metrics optimistic.
    cv_scores = cross_val_score(clf, X_train, y_train, cv=5, scoring='roc_auc')
    return np.mean(cv_scores)

# Bayesian optimisation of the random-forest hyperparameters with Optuna.
# The sampler is seeded so repeated runs explore the same configurations.
study = optuna.create_study(
    direction='maximize',
    sampler=optuna.samplers.TPESampler(seed=42),
)

# Warm start: evaluate the RandomizedSearchCV configuration first
study.enqueue_trial(best_params_random)

study.optimize(objective, n_trials=50)

# The objective returns mean CV ROC AUC, so the best value is labelled as such
print("Best Parameters from Optuna:", study.best_params)
print("Best CV ROC AUC from Optuna:", study.best_value)

# Refit the best configuration with the same seed the objective used during
# the search, so the evaluated model matches what was tuned.
best_params_optuna = study.best_params
best_clf = RandomForestClassifier(**best_params_optuna, random_state=42)
best_clf.fit(X_train, y_train)
y_pred = best_clf.predict(X_test)

# Support-weighted metrics throughout; recall and F1 previously used the
# positive-class definition while precision was weighted.
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred, average='weighted')
recall = recall_score(y_test, y_pred, average='weighted')
f1 = f1_score(y_test, y_pred, average='weighted')

print(f"Test Accuracy: {accuracy:.4f}")
print(f"Test Precision: {precision:.4f}")
print(f"Test Recall: {recall:.4f}")
print(f"Test F1 Score: {f1:.4f}")


In [None]:
# Final random forest: fixed hyperparameters, 5-fold CV on the training split,
# then support-weighted metrics on the hold-out split.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42)

selected_features_rf = X_train.columns.tolist()

# Hard-coded hyperparameters (recorded from the Bayesian optimisation run)
best_params_rf = {
    'n_estimators': 612,
    'min_samples_split': 6,
    'min_samples_leaf': 3,
    'max_features': 'log2',
    'max_depth': 84,
    'bootstrap': False
}

final_model_rf = RandomForestClassifier(**best_params_rf, random_state=42)

# Cross-validation is scored with ROC AUC, so it is labelled as such.
cv_scores = cross_val_score(final_model_rf, X_train[selected_features_rf], y_train, cv=5, scoring='roc_auc')
print("Cross-validation scores:")
print(cv_scores)
print(f"Mean CV ROC AUC: {np.mean(cv_scores):.4f}")

# Train the final model on the entire training data
final_model_rf.fit(X_train[selected_features_rf], y_train)

# Class predictions and positive-class probabilities on the test set
y_pred_rf = final_model_rf.predict(X_test[selected_features_rf])
y_pred_prob_rf = final_model_rf.predict_proba(X_test[selected_features_rf])[:, 1]

# Calculate evaluation metrics on the test set
accuracy = accuracy_score(y_test, y_pred_rf)
precision = precision_score(y_test, y_pred_rf, average='weighted')
recall = recall_score(y_test, y_pred_rf, average='weighted')
f1 = f1_score(y_test, y_pred_rf, average='weighted')
auc_roc = roc_auc_score(y_test, y_pred_prob_rf)

# Specificity = TN / (TN + FP); the 4-way unpack assumes a binary target.
tn, fp, fn, tp = confusion_matrix(y_test, y_pred_rf).ravel()
specificity = tn / (tn + fp)

# Print the results for test data
print(f'Test Accuracy: {accuracy:.2f}')
print(f'Test Precision: {precision:.2f}')
print(f'Test Recall: {recall:.2f}')
print(f'Test F1-Score: {f1:.2f}')
print(f'Test AUC-ROC: {auc_roc:.2f}')
print(f'Test Specificity: {specificity:.2f}')


## 95% CI (Random Forest, GC hold-out test set)

In [None]:
def bootstrap_metric(y_true, y_pred, y_pred_prob, n_iterations=1000, alpha=0.95, random_state=None):
    """Percentile-bootstrap confidence intervals for six binary metrics.

    Note: this definition reuses the name of the metric-first
    ``bootstrap_metric`` defined in the XGBoost section and replaces it once
    this cell has run; the two signatures are not interchangeable.

    Args:
        y_true: True labels; assumed to be encoded as 0/1.
        y_pred: Predicted class labels.
        y_pred_prob: Positive-class scores (used for ROC AUC).
        n_iterations: Number of resamples drawn with replacement.
        alpha: Confidence level, e.g. 0.95 for a 95% interval.
        random_state: Optional integer seed for reproducible resampling.
            ``None`` keeps the previous behaviour (global NumPy RNG).

    Returns:
        Dict mapping 'accuracy', 'precision', 'recall', 'f1', 'roc_auc' and
        'specificity' to ``(lower, upper)``; ``(nan, nan)`` for every metric
        if no resample contained both classes.
    """
    # Accept lists / Series as well as arrays for positional indexing.
    y_true = np.asarray(y_true)
    y_pred = np.asarray(y_pred)
    y_pred_prob = np.asarray(y_pred_prob)

    metrics = {name: [] for name in
               ('accuracy', 'precision', 'recall', 'f1', 'roc_auc', 'specificity')}

    rng = np.random.RandomState(random_state) if random_state is not None else None
    n_size = len(y_true)

    for _ in range(n_iterations):
        # Bootstrap sample of row positions
        indices = resample(np.arange(n_size), n_samples=n_size, replace=True, random_state=rng)
        y_true_boot = y_true[indices]
        y_pred_boot = y_pred[indices]
        y_pred_prob_boot = y_pred_prob[indices]

        # With a single class in the resample, ROC AUC and one of
        # recall / specificity are undefined; skip the draw for all metrics
        # so every interval is built from the same resamples.
        if np.unique(y_true_boot).size < 2:
            continue

        # labels=[0, 1] guarantees the 2x2 shape the 4-way unpack needs.
        tn, fp, fn, tp = confusion_matrix(y_true_boot, y_pred_boot, labels=[0, 1]).ravel()

        metrics['accuracy'].append(accuracy_score(y_true_boot, y_pred_boot))
        metrics['precision'].append(precision_score(y_true_boot, y_pred_boot))
        metrics['recall'].append(recall_score(y_true_boot, y_pred_boot))
        metrics['f1'].append(f1_score(y_true_boot, y_pred_boot))
        metrics['roc_auc'].append(roc_auc_score(y_true_boot, y_pred_prob_boot))
        metrics['specificity'].append(tn / (tn + fp))

    # Calculate confidence intervals
    ci = {}
    for metric, values in metrics.items():
        if values:
            lower = np.percentile(values, (1 - alpha) / 2 * 100)
            upper = np.percentile(values, (1 + alpha) / 2 * 100)
        else:
            lower = upper = float('nan')
        ci[metric] = (lower, upper)

    return ci
# Coerce to ndarrays so the bootstrap can index them by resampled positions.
y_test, y_pred_rf, y_pred_prob_rf = (
    np.array(values) for values in (y_test, y_pred_rf, y_pred_prob_rf)
)

# Positive-class point estimates on the full hold-out split
tn, fp, fn, tp = confusion_matrix(y_test, y_pred_rf).ravel()
specificity = tn / (tn + fp)
roc_auc = roc_auc_score(y_test, y_pred_prob_rf)
f1 = f1_score(y_test, y_pred_rf)
recall = recall_score(y_test, y_pred_rf)
precision = precision_score(y_test, y_pred_rf)
accuracy = accuracy_score(y_test, y_pred_rf)

# 95% bootstrap intervals for all six metrics in one pass
ci = bootstrap_metric(y_test, y_pred_rf, y_pred_prob_rf, n_iterations=1000, alpha=0.95)

# (label, point estimate, key into `ci`) in the order they are printed
report_rows = (
    ('ROC AUC', roc_auc, 'roc_auc'),
    ('Accuracy', accuracy, 'accuracy'),
    ('Precision', precision, 'precision'),
    ('Recall', recall, 'recall'),
    ('F1-Score', f1, 'f1'),
    ('Specificity', specificity, 'specificity'),
)
for label, estimate, key in report_rows:
    low, high = ci[key]
    print(f'Test {label}: {estimate:.2f} (95% CI: {low:.2f}, {high:.2f})')

## Prediction on IBD (Random Forest)

In [None]:
# External validation of the trained random forest on the IBD cohort.
X_val_ibd_rf = ibd.drop('Group', axis=1)
y_val_ibd_rf = ibd['Group']

# Encode categorical target labels into numerical labels.
# NOTE(review): a fresh encoder is fitted on this cohort's labels, so the
# integer codes follow this cohort's own label names -- confirm that code 1
# denotes the same positive class the model was trained on.
label_encoder = LabelEncoder()
y_encoded_ibd_rf = label_encoder.fit_transform(y_val_ibd_rf)

# Training features that this cohort does not provide
missing_features = [feature for feature in selected_features_rf if feature not in X_val_ibd_rf.columns]

# Add the missing features to the validation set with zero values
missing_df_rf = pd.DataFrame(0, index=X_val_ibd_rf.index, columns=missing_features)
X_val_ibd_rf = pd.concat([X_val_ibd_rf, missing_df_rf], axis=1)

# Ensure the columns are in the same order as the training features
X_val_ibd_rf = X_val_ibd_rf[selected_features_rf]

# Score with the forest trained on the GC data. The previous code called
# `rf_model.fit(...)` on this cohort: `rf_model` is never defined, and
# refitting on the validation data before predicting it would report
# training-set performance rather than validation performance.
y_pred_prob_rf_ibd = final_model_rf.predict_proba(X_val_ibd_rf)[:, 1]
y_pred_rf_ibd = final_model_rf.predict(X_val_ibd_rf)

# Calculate evaluation metrics (support-weighted averages)
roc_auc_val_rf = roc_auc_score(y_encoded_ibd_rf, y_pred_prob_rf_ibd)
accuracy_val_rf = accuracy_score(y_encoded_ibd_rf, y_pred_rf_ibd)
precision_val_rf = precision_score(y_encoded_ibd_rf, y_pred_rf_ibd, average="weighted", zero_division=1)
recall_val_rf = recall_score(y_encoded_ibd_rf, y_pred_rf_ibd, average="weighted")
f1_val_rf = f1_score(y_encoded_ibd_rf, y_pred_rf_ibd, average="weighted")

# Calculate confusion matrix
conf_matrix = confusion_matrix(y_encoded_ibd_rf, y_pred_rf_ibd)

# Specificity needs the full 2x2 matrix (TN, FP, FN, TP)
if conf_matrix.shape == (2, 2):
    tn_val_rf, fp_val_rf, fn_val_rf, tp_val_rf = conf_matrix.ravel()
    specificity_val_rf = tn_val_rf / (tn_val_rf + fp_val_rf)
else:
    # NaN instead of the string 'N/A', which cannot be formatted with ':.2f'
    specificity_val_rf = float('nan')

# Print evaluation metrics
print(f'Validation ROC AUC: {roc_auc_val_rf:.2f}')
print(f'Validation Accuracy: {accuracy_val_rf:.2f}')
print(f'Validation Precision: {precision_val_rf:.2f}')
print(f'Validation Recall: {recall_val_rf:.2f}')
print(f'Validation F1-Score: {f1_val_rf:.2f}')
print(f'Validation Specificity: {specificity_val_rf:.2f}')


## 95% CI (Random Forest, IBD cohort)

In [None]:
def bootstrap_confidence_interval(y_true, y_pred, y_pred_prob, metric_func, n_bootstraps=1000, alpha=0.95,
                                  random_state=None, use_prob=None):
    """Percentile-bootstrap confidence interval for a classification metric.

    Parameters
    ----------
    y_true : array-like
        True labels.
    y_pred : array-like
        Predicted class labels, aligned with ``y_true``.
    y_pred_prob : array-like
        Predicted scores/probabilities, aligned with ``y_true``.
    metric_func : callable
        ``metric_func(y_true, y_score) -> float``.
    n_bootstraps : int
        Number of resamples drawn with replacement.
    alpha : float
        Confidence level (0.95 gives the 2.5th and 97.5th percentiles).
    random_state : int or None
        Seed for the resampling generator; ``None`` gives a fresh,
        non-reproducible stream.
    use_prob : bool or None
        Whether the metric is fed ``y_pred_prob`` instead of ``y_pred``.
        ``None`` keeps the historical rule: probabilities only when
        ``metric_func`` is ``roc_auc_score``.

    Returns
    -------
    (lower_bound, upper_bound)
        Both are NaN when no resample produced a usable score.
    """
    y_true = np.asarray(y_true)
    y_pred = np.asarray(y_pred)
    y_pred_prob = np.asarray(y_pred_prob)

    if use_prob is None:
        use_prob = metric_func is roc_auc_score
    y_score = y_pred_prob if use_prob else y_pred

    n_size = len(y_true)
    if n_size == 0:
        return np.nan, np.nan

    rng = np.random.default_rng(random_state)
    bootstrapped_scores = []

    for _ in range(n_bootstraps):
        # Sample row positions with replacement
        indices = rng.integers(0, n_size, size=n_size)
        y_true_sample = y_true[indices]

        # A ranking metric such as ROC AUC is undefined when the resample
        # contains a single class; skip it instead of crashing.
        if use_prob and np.unique(y_true_sample).size < 2:
            continue

        score = metric_func(y_true_sample, y_score[indices])

        # Metrics that signal "undefined" with None or a placeholder string
        # must not reach the percentile computation.
        if score is None or isinstance(score, str):
            continue
        bootstrapped_scores.append(score)

    scores = np.asarray(bootstrapped_scores, dtype=float)
    scores = scores[~np.isnan(scores)]
    if scores.size == 0:
        return np.nan, np.nan

    lower_bound = np.percentile(scores, (1 - alpha) / 2 * 100)
    upper_bound = np.percentile(scores, (1 + alpha) / 2 * 100)

    return lower_bound, upper_bound

# Number of bootstrap samples and alpha for 95% CI
n_bootstraps = 1000
alpha = 0.95

# Calculate 95% confidence intervals for metrics.
# The point estimates for this cohort use average="weighted" (plus
# zero_division=1 for precision). The bootstrap has to resample that same
# statistic, otherwise the interval describes binary-averaged scores and need
# not even contain the reported value.
roc_auc_ci = bootstrap_confidence_interval(y_encoded_ibd_rf, y_pred_rf_ibd, y_pred_prob_rf_ibd, roc_auc_score, n_bootstraps, alpha)
accuracy_ci = bootstrap_confidence_interval(y_encoded_ibd_rf, y_pred_rf_ibd, y_pred_prob_rf_ibd, accuracy_score, n_bootstraps, alpha)
precision_ci = bootstrap_confidence_interval(
    y_encoded_ibd_rf, y_pred_rf_ibd, y_pred_prob_rf_ibd,
    lambda yt, yp: precision_score(yt, yp, average="weighted", zero_division=1),
    n_bootstraps, alpha)
recall_ci = bootstrap_confidence_interval(
    y_encoded_ibd_rf, y_pred_rf_ibd, y_pred_prob_rf_ibd,
    lambda yt, yp: recall_score(yt, yp, average="weighted"),
    n_bootstraps, alpha)
f1_ci = bootstrap_confidence_interval(
    y_encoded_ibd_rf, y_pred_rf_ibd, y_pred_prob_rf_ibd,
    lambda yt, yp: f1_score(yt, yp, average="weighted"),
    n_bootstraps, alpha)

# Specificity calculation
def calculate_specificity(y_true, y_pred):
    """Return specificity, TN / (TN + FP), or NaN when it is undefined.

    The value is undefined when the confusion matrix is not 2x2 (for example a
    bootstrap resample in which only one label occurs) or when there are no
    true negatives or false positives to divide by. NaN is returned instead of
    None so the result can be stored in numeric arrays and passed to NumPy
    routines without raising TypeError.
    """
    conf_matrix = confusion_matrix(y_true, y_pred)
    if conf_matrix.shape != (2, 2):
        return np.nan

    tn, fp, fn, tp = conf_matrix.ravel()
    if tn + fp == 0:
        return np.nan
    return tn / (tn + fp)

specificity_ci = bootstrap_confidence_interval(y_encoded_ibd_rf, y_pred_rf_ibd, y_pred_prob_rf_ibd, calculate_specificity, n_bootstraps, alpha)

# Print metrics with 95% confidence intervals
print(f'Validation ROC AUC: {roc_auc_val_rf:.2f} (95% CI: {roc_auc_ci[0]:.2f} - {roc_auc_ci[1]:.2f})')
print(f'Validation Accuracy: {accuracy_val_rf:.2f} (95% CI: {accuracy_ci[0]:.2f} - {accuracy_ci[1]:.2f})')
print(f'Validation Precision: {precision_val_rf:.2f} (95% CI: {precision_ci[0]:.2f} - {precision_ci[1]:.2f})')
print(f'Validation Recall: {recall_val_rf:.2f} (95% CI: {recall_ci[0]:.2f} - {recall_ci[1]:.2f})')
print(f'Validation F1-Score: {f1_val_rf:.2f} (95% CI: {f1_ci[0]:.2f} - {f1_ci[1]:.2f})')

# specificity_val_rf may be the placeholder string 'N/A' when the confusion
# matrix was not 2x2; applying ':.2f' to a string raises ValueError, so the
# number is formatted only when it really is numeric.
specificity_text = specificity_val_rf if isinstance(specificity_val_rf, str) else f'{specificity_val_rf:.2f}'
print(f'Validation Specificity: {specificity_text} (95% CI: {specificity_ci[0]:.2f} - {specificity_ci[1]:.2f})')


##Predictions on CRC

In [None]:
X_val_cc_rf = cc.drop('Group', axis=1)
y_val_cc_rf = cc['Group']

# Encode categorical target labels into numerical labels
label_encoder = LabelEncoder()
y_encoded_cc_rf = label_encoder.fit_transform(y_val_cc_rf)

# Align the CRC feature matrix with the selected feature list: selected
# features missing from this cohort become all-zero columns, then the columns
# are restricted to and ordered like the feature list.
missing_features = [feature for feature in selected_features_rf if feature not in X_val_cc_rf.columns]
missing_df_rf = pd.DataFrame(0, index=X_val_cc_rf.index, columns=missing_features)
X_val_cc_rf = pd.concat([X_val_cc_rf, missing_df_rf], axis=1)
X_val_cc_rf = X_val_cc_rf[selected_features_rf]


# Fit the model
# NOTE(review): this re-fits rf_model on the CRC cohort, so the metrics below
# are computed on the same samples the model was just trained on. If external
# validation of a previously trained model is intended, this fit should be
# removed -- confirm the intended protocol.
rf_model.fit(X_val_cc_rf, y_encoded_cc_rf)
y_pred_prob_rf_cc = rf_model.predict_proba(X_val_cc_rf)[:, 1]
y_pred_rf_cc = rf_model.predict(X_val_cc_rf)

# Performance metrics
roc_auc_val_rf = roc_auc_score(y_encoded_cc_rf, y_pred_prob_rf_cc)
accuracy_val_rf = accuracy_score(y_encoded_cc_rf, y_pred_rf_cc)
precision_val_rf = precision_score(y_encoded_cc_rf, y_pred_rf_cc, average="weighted", zero_division=1)
recall_val_rf = recall_score(y_encoded_cc_rf, y_pred_rf_cc, average="weighted")
f1_val_rf = f1_score(y_encoded_cc_rf, y_pred_rf_cc, average="weighted")

# Specificity = TN / (TN + FP), only defined for a 2x2 confusion matrix
conf_matrix = confusion_matrix(y_encoded_cc_rf, y_pred_rf_cc)
if conf_matrix.shape == (2, 2):
    tn_val_rf, fp_val_rf, fn_val_rf, tp_val_rf = conf_matrix.ravel()
    specificity_val_rf = tn_val_rf / (tn_val_rf + fp_val_rf)
else:
    # Use NaN rather than the string 'N/A': a string cannot be formatted with
    # ':.2f', so the prints below (and the CI cell that follows) would raise
    # ValueError.
    specificity_val_rf = np.nan

# Print  evaluation metrics
print(f'Validation ROC AUC: {roc_auc_val_rf:.2f}')
print(f'Validation Accuracy: {accuracy_val_rf:.2f}')
print(f'Validation Precision: {precision_val_rf:.2f}')
print(f'Validation Recall: {recall_val_rf:.2f}')
print(f'Validation F1-Score: {f1_val_rf:.2f}')
print(f'Validation Specificity: {specificity_val_rf:.2f}')

##95% CI

In [None]:
def calculate_metrics(y_true, y_prob, y_pred):
    """Score one set of predictions.

    Returns a 5-tuple in this order: ROC AUC (computed from ``y_prob``),
    accuracy, weighted precision (``zero_division=1``), weighted recall and
    weighted F1 (all four computed from ``y_pred``).
    """
    weighted = {"average": "weighted"}
    return (
        roc_auc_score(y_true, y_prob),
        accuracy_score(y_true, y_pred),
        precision_score(y_true, y_pred, zero_division=1, **weighted),
        recall_score(y_true, y_pred, **weighted),
        f1_score(y_true, y_pred, **weighted),
    )

# Bootstrapping for 95% CI
n_bootstraps = 1000
rng = np.random.default_rng(seed=42)

# The model's prediction for a row does not depend on which other rows are in
# the batch, so predict once on the full CRC set and index into the results
# instead of calling the model inside every bootstrap iteration.
y_prob_cc_all = rf_model.predict_proba(X_val_cc_rf)[:, 1]
y_pred_cc_all = rf_model.predict(X_val_cc_rf)
sample_positions = np.arange(len(y_encoded_cc_rf))

metrics_bootstrapped = []
for _ in range(n_bootstraps):
    indices = rng.choice(sample_positions, size=len(sample_positions), replace=True)
    y_true_resampled = y_encoded_cc_rf[indices]

    # ROC AUC is undefined when a resample holds a single class and
    # roc_auc_score would raise; drop such resamples.
    if np.unique(y_true_resampled).size < 2:
        continue

    # Calculate metrics for the bootstrap sample
    metrics_bootstrapped.append(
        calculate_metrics(y_true_resampled, y_prob_cc_all[indices], y_pred_cc_all[indices])
    )

# Convert bootstrapped metrics to NumPy array for easier calculations
# (one row per resample, one column per metric)
metrics_bootstrapped = np.array(metrics_bootstrapped)

# Calculate 2.5th and 97.5th percentiles for each metric
ci_lower = np.percentile(metrics_bootstrapped, 2.5, axis=0)
ci_upper = np.percentile(metrics_bootstrapped, 97.5, axis=0)

# Print metrics with 95% CIs
print(f'Validation ROC AUC: {roc_auc_val_rf:.2f} (95% CI: {ci_lower[0]:.2f}-{ci_upper[0]:.2f})')
print(f'Validation Accuracy: {accuracy_val_rf:.2f} (95% CI: {ci_lower[1]:.2f}-{ci_upper[1]:.2f})')
print(f'Validation Precision: {precision_val_rf:.2f} (95% CI: {ci_lower[2]:.2f}-{ci_upper[2]:.2f})')
print(f'Validation Recall: {recall_val_rf:.2f} (95% CI: {ci_lower[3]:.2f}-{ci_upper[3]:.2f})')
print(f'Validation F1-Score: {f1_val_rf:.2f} (95% CI: {ci_lower[4]:.2f}-{ci_upper[4]:.2f})')

# Function to calculate specificity
def calculate_specificity(y_true, y_pred):
    """Return specificity, TN / (TN + FP).

    Returns 0 when there are no actual negatives to divide by, and NaN when
    the confusion matrix is not 2x2 (for example a resample in which only one
    label occurs). NaN replaces the former 'N/A' string: a single string in
    the list of bootstrap results turns the NumPy array into a string array
    and makes the percentile computation fail.
    """
    conf_matrix = confusion_matrix(y_true, y_pred)
    if conf_matrix.shape != (2, 2):
        return np.nan

    tn, fp, fn, tp = conf_matrix.ravel()
    specificity = tn / (tn + fp) if (tn + fp) > 0 else 0
    return specificity

# Bootstrapping for 95% CI of specificity
# Predict once on the full CRC set; each resample only re-indexes the labels.
y_pred_cc_all = rf_model.predict(X_val_cc_rf)
sample_positions = np.arange(len(y_encoded_cc_rf))

specificity_bootstrapped = []
for _ in range(n_bootstraps):
    # Resample the validation dataset with replacement
    indices = rng.choice(sample_positions, size=len(sample_positions), replace=True)

    # Calculate specificity for the bootstrap sample
    value = calculate_specificity(y_encoded_cc_rf[indices], y_pred_cc_all[indices])

    # calculate_specificity may signal "undefined" with a placeholder
    # (string or None); keeping it would poison the numeric array below.
    if value is None or isinstance(value, str):
        continue
    specificity_bootstrapped.append(value)

# Convert to NumPy array for percentile calculations
specificity_bootstrapped = np.array(specificity_bootstrapped, dtype=float)

# Calculate 2.5th and 97.5th percentiles for specificity, ignoring NaN entries
specificity_ci_lower = np.nanpercentile(specificity_bootstrapped, 2.5)
specificity_ci_upper = np.nanpercentile(specificity_bootstrapped, 97.5)

# Print specificity with 95% CI
# specificity_val_rf may be the placeholder string 'N/A'; ':.2f' on a string
# raises ValueError, so format it only when it is numeric.
specificity_text = specificity_val_rf if isinstance(specificity_val_rf, str) else f'{specificity_val_rf:.2f}'
print(f'Validation Specificity: {specificity_text} (95% CI: {specificity_ci_lower:.2f}-{specificity_ci_upper:.2f})')

#LASSO

In [None]:
# Features are every column except the class label
X = gc.drop(columns=['Group'])

# Turn the categorical class labels into integer codes
label_encoder = LabelEncoder()
y = label_encoder.fit_transform(gc['Group'])

# Hold out 25% of the samples for testing
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42, test_size=0.25)

# L1-penalised (LASSO) logistic regression with default regularisation strength
log_reg = LogisticRegression(solver='liblinear', penalty='l1', random_state=42)

# fit() returns the estimator itself, so training and prediction can be chained
y_pred = log_reg.fit(X_train, y_train).predict(X_test)

# Score the held-out predictions
accuracy, precision, recall, f1 = (
    scorer(y_test, y_pred)
    for scorer in (accuracy_score, precision_score, recall_score, f1_score)
)

# Report the scores
print(f'Accuracy: {accuracy:.4f}')
print(f'Precision: {precision:.4f}')
print(f'Recall: {recall:.4f}')
print(f'F1 Score: {f1:.4f}')


##Random Search

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42, test_size=0.25)

# Candidate values sampled by the randomized search
param_dist = dict(
    C=np.logspace(-4, 4, num=20),
    solver=['liblinear'],
    max_iter=[1000, 1500, 2000],
    tol=[1e-4, 1e-3, 1e-2, 1e-1],
)

# L1-penalised (LASSO) logistic regression; the solver comes from param_dist
log_reg = LogisticRegression(penalty='l1', random_state=42)

# 100 sampled configurations, each scored by 5-fold cross-validated ROC AUC
random_search = RandomizedSearchCV(
    estimator=log_reg,
    param_distributions=param_dist,
    n_iter=100,
    scoring='roc_auc',
    cv=5,
    verbose=1,
    random_state=42,
    n_jobs=-1,
)
random_search.fit(X_train, y_train)

# Winning configuration
best_params = random_search.best_params_
print("Best Parameters:", best_params)

# Predictions on the held-out set come from the search's best estimator
y_pred = random_search.predict(X_test)

# Score the held-out predictions
accuracy, precision, recall, f1 = (
    scorer(y_test, y_pred)
    for scorer in (accuracy_score, precision_score, recall_score, f1_score)
)

# Report the scores
print(f'Accuracy: {accuracy:.4f}')
print(f'Precision: {precision:.4f}')
print(f'Recall: {recall:.4f}')
print(f'F1 Score: {f1:.4f}')



##Bayesian Optimization

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42, test_size=0.25)

# Best LASSO parameters recorded from the previous searches
# (not referenced again in this part of the notebook)
best_params_random = dict(
    C=78.47599703514607,
    max_iter=1500,
    tol=0.0001,
    solver='liblinear',
)

def objective(trial):
    """Optuna objective: mean 5-fold cross-validated ROC AUC of an L1 logistic regression.

    Samples ``C`` (log-uniform, 1e-4..1e2), ``max_iter`` (2000..5000), ``tol``
    (log-uniform, 1e-4..1e-2) and ``solver`` ('liblinear' or 'saga') from
    ``trial`` and evaluates the resulting classifier on the module-level
    ``X_train`` / ``y_train``.
    """
    # Imported here so the objective does not depend on another cell having
    # imported it: cross_val_score is not part of the notebook's main import cell.
    from sklearn.model_selection import cross_val_score

    C = trial.suggest_float('C', 1e-4, 1e2, log=True)
    max_iter = trial.suggest_int('max_iter', 2000, 5000)
    tol = trial.suggest_float('tol', 1e-4, 1e-2, log=True)
    solver = trial.suggest_categorical('solver', ['liblinear', 'saga'])

    # Initialize LogisticRegression with hyperparameters
    clf = LogisticRegression(
        penalty='l1',
        C=C,
        max_iter=max_iter,
        tol=tol,
        solver=solver,
        random_state=42
    )

    # Use cross-validation to evaluate the classifier
    score = cross_val_score(clf, X_train, y_train, cv=5, scoring='roc_auc').mean()
    return score

# Perform optimization with Optuna
# TPE is Optuna's default sampler; creating it explicitly only adds a fixed
# seed so the search is reproducible like the other random_state=42 steps.
study = optuna.create_study(direction='maximize', sampler=optuna.samplers.TPESampler(seed=42))
study.optimize(objective, n_trials=42)

# Print the best parameters and best score from Optuna
# (the objective returns cross-validated ROC AUC, not accuracy)
print("Best Parameters from Optuna:", study.best_params)
print("Best ROC AUC from Optuna:", study.best_value)

# Rebuild the best model, train it on the training split and evaluate on the test set
best_params_optuna = study.best_params
best_clf = LogisticRegression(penalty='l1', **best_params_optuna, random_state=42)
best_clf.fit(X_train, y_train)
y_pred = best_clf.predict(X_test)

# Evaluate on test set (class-frequency weighted averages)
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred, average='weighted')
recall = recall_score(y_test, y_pred, average='weighted')
f1 = f1_score(y_test, y_pred, average='weighted')

print(f"Test Accuracy: {accuracy:.4f}")
print(f"Test Precision: {precision:.4f}")
print(f"Test Recall: {recall:.4f}")
print(f"Test F1 Score: {f1:.4f}")



In [None]:
# cross_val_score is not part of the notebook's main import cell; import it
# here so this cell runs on its own.
from sklearn.model_selection import cross_val_score

# All training columns are kept as the LASSO feature list
selected_features_lasso = X_train.columns.tolist()

# Best parameters from Bayesian Optimization with Optuna
best_params_lasso = {
    'C':  14.007388155394787,
    'max_iter': 3223,
    'solver': 'saga',
    'tol': 0.0008450953267578447
}

# Final L1 logistic regression with the tuned hyperparameters
final_model_lasso = LogisticRegression(penalty='l1', **best_params_lasso, random_state=42)

# Perform cross-validation with 5 folds (scored by ROC AUC)
cv_scores = cross_val_score(final_model_lasso, X_train[selected_features_lasso], y_train, cv=5, scoring='roc_auc')
print("Cross-validation scores:")
print(cv_scores)
# The scores are ROC AUC values, so label the mean accordingly
print(f"Mean CV ROC AUC: {np.mean(cv_scores):.4f}")

final_model_lasso.fit(X_train[selected_features_lasso], y_train)

# Make predictions on the test set (predicted probabilities)
y_pred_proba_lasso = final_model_lasso.predict_proba(X_test[selected_features_lasso])[:, 1]

# Predicted class labels
y_pred_lasso = final_model_lasso.predict(X_test[selected_features_lasso])

# Calculate evaluation metrics
accuracy = accuracy_score(y_test, y_pred_lasso)
precision = precision_score(y_test, y_pred_lasso)
recall = recall_score(y_test, y_pred_lasso)
f1 = f1_score(y_test, y_pred_lasso)
roc_auc = roc_auc_score(y_test, y_pred_proba_lasso)

# Calculate confusion matrix and extract TN, FP, FN, TP
tn, fp, fn, tp = confusion_matrix(y_test, y_pred_lasso).ravel()

# Calculate specificity
specificity = tn / (tn + fp)

print(f'Test ROC AUC: {roc_auc:.2f}')
print(f'Test Accuracy: {accuracy:.2f}')
print(f'Test Precision: {precision:.2f}')
print(f'Test Recall: {recall:.2f}')
print(f'Test F1-Score: {f1:.2f}')
print(f'Test Specificity: {specificity:.2f}')

##95% CI

In [None]:
# Number of bootstrap samples
n_bootstraps = 1000
rng = np.random.RandomState(42)

# Initialize lists to store bootstrap results
accuracy_bootstrap = []
precision_bootstrap = []
recall_bootstrap = []
f1_bootstrap = []
specificity_bootstrap = []
roc_auc_bootstrap = []

# The model's prediction for a row does not depend on the other rows, so
# predict once on the full test set and index into the results per resample.
y_pred_test_all = final_model_lasso.predict(X_test[selected_features_lasso])
y_pred_proba_test_all = final_model_lasso.predict_proba(X_test[selected_features_lasso])[:, 1]
test_positions = np.arange(len(y_test))

# Perform bootstrapping
for _ in range(n_bootstraps):
    # Resample the row positions with replacement
    indices = resample(test_positions, random_state=rng)
    y_true_resampled = y_test[indices]

    # A resample containing a single class makes roc_auc_score raise and can
    # shrink the confusion matrix below 2x2 (breaking the 4-way unpack), so
    # such resamples are skipped.
    if np.unique(y_true_resampled).size < 2:
        continue

    y_pred_resampled = y_pred_test_all[indices]
    y_pred_proba_resampled = y_pred_proba_test_all[indices]

    # Calculate metrics on the resampled data
    accuracy_bootstrap.append(accuracy_score(y_true_resampled, y_pred_resampled))
    precision_bootstrap.append(precision_score(y_true_resampled, y_pred_resampled, zero_division=1))
    recall_bootstrap.append(recall_score(y_true_resampled, y_pred_resampled, zero_division=1))
    f1_bootstrap.append(f1_score(y_true_resampled, y_pred_resampled, zero_division=1))
    roc_auc_bootstrap.append(roc_auc_score(y_true_resampled, y_pred_proba_resampled))

    tn, fp, fn, tp = confusion_matrix(y_true_resampled, y_pred_resampled).ravel()
    specificity_bootstrap.append(tn / (tn + fp))

# Calculate 95% confidence intervals
def ci(metric_values):
    """Return the 2.5th and 97.5th percentiles of ``metric_values`` as a 2-element array."""
    lower_q, upper_q = 2.5, 97.5
    bounds = np.percentile(np.asarray(metric_values), q=[lower_q, upper_q])
    return bounds

# One percentile interval per metric, taken from its bootstrap distribution
accuracy_ci, precision_ci, recall_ci, f1_ci, specificity_ci, roc_auc_ci = (
    ci(samples)
    for samples in (
        accuracy_bootstrap,
        precision_bootstrap,
        recall_bootstrap,
        f1_bootstrap,
        specificity_bootstrap,
        roc_auc_bootstrap,
    )
)

# Report each test-set point estimate next to its 95% interval
print(f'Test ROC AUC: {roc_auc:.2f} (95% CI: {roc_auc_ci[0]:.2f}, {roc_auc_ci[1]:.2f})')
print(f'Test Accuracy: {accuracy:.2f} (95% CI: {accuracy_ci[0]:.2f}, {accuracy_ci[1]:.2f})')
print(f'Test Precision: {precision:.2f} (95% CI: {precision_ci[0]:.2f}, {precision_ci[1]:.2f})')
print(f'Test Recall: {recall:.2f} (95% CI: {recall_ci[0]:.2f}, {recall_ci[1]:.2f})')
print(f'Test F1-Score: {f1:.2f} (95% CI: {f1_ci[0]:.2f}, {f1_ci[1]:.2f})')
print(f'Test Specificity: {specificity:.2f} (95% CI: {specificity_ci[0]:.2f}, {specificity_ci[1]:.2f})')


##Predictions on IBD

In [None]:
X_val_ibd_lasso = ibd.drop('Group', axis=1)
y_val_ibd_lasso = ibd['Group']

# Encode categorical target labels into numerical labels
label_encoder = LabelEncoder()
y_encoded_ibd_lasso = label_encoder.fit_transform(y_val_ibd_lasso)

# Align the IBD feature matrix with the selected feature list: every selected
# feature that this cohort lacks is added as an all-zero column.
missing_features = [feature for feature in selected_features_lasso if feature not in X_val_ibd_lasso.columns]
missing_df_lasso = pd.DataFrame(0, index=X_val_ibd_lasso.index, columns=missing_features)
X_val_ibd_lasso = pd.concat([X_val_ibd_lasso, missing_df_lasso], axis=1)

# Keep only the selected features, in the same order as the feature list
X_val_ibd_lasso = X_val_ibd_lasso[selected_features_lasso]

# NOTE(review): `lasso_model` is not assigned anywhere in the visible LASSO
# section (the tuned model there is `final_model_lasso`) -- confirm it is
# defined before this cell runs.
# NOTE(review): this re-fits the model on the IBD cohort, so every metric
# below is computed on the very samples it was just trained on. If external
# validation of a previously trained model is intended, this fit should be
# removed -- confirm the intended protocol.
lasso_model.fit(X_val_ibd_lasso, y_encoded_ibd_lasso)

# Probability of the second class column (encoded label 1) and hard labels
y_pred_prob_lasso_ibd = lasso_model.predict_proba(X_val_ibd_lasso)[:, 1]
y_pred_lasso_ibd = lasso_model.predict(X_val_ibd_lasso)

# Threshold-free metric from the probabilities
roc_auc_val_lasso = roc_auc_score(y_encoded_ibd_lasso, y_pred_prob_lasso_ibd)

# Label-based metrics (class-frequency weighted averages)
accuracy_val_lasso = accuracy_score(y_encoded_ibd_lasso, y_pred_lasso_ibd)
precision_val_lasso = precision_score(y_encoded_ibd_lasso, y_pred_lasso_ibd, average="weighted", zero_division=1)
recall_val_lasso = recall_score(y_encoded_ibd_lasso, y_pred_lasso_ibd, average="weighted")
f1_val_lasso = f1_score(y_encoded_ibd_lasso, y_pred_lasso_ibd, average="weighted")

# Confusion matrix, needed for specificity = TN / (TN + FP)
conf_matrix = confusion_matrix(y_encoded_ibd_lasso, y_pred_lasso_ibd)

if conf_matrix.shape == (2, 2):
    tn_val_lasso, fp_val_lasso, fn_val_lasso, tp_val_lasso = conf_matrix.ravel()
    specificity_val_lasso = tn_val_lasso / (tn_val_lasso + fp_val_lasso)
else:
    # Use NaN rather than the string 'N/A': a string cannot be formatted with
    # ':.2f', so the prints below (and the CI cell that follows) would raise
    # ValueError.
    specificity_val_lasso = np.nan

# Print evaluation metrics
print(f'Validation ROC AUC: {roc_auc_val_lasso:.2f}')
print(f'Validation Accuracy: {accuracy_val_lasso:.2f}')
print(f'Validation Precision: {precision_val_lasso:.2f}')
print(f'Validation Recall: {recall_val_lasso:.2f}')
print(f'Validation F1-Score: {f1_val_lasso:.2f}')
print(f'Validation Specificity: {specificity_val_lasso:.2f}')


##95% CI

In [None]:

def bootstrap_confidence_interval(y_true, y_pred, y_pred_prob, metric_func, n_bootstraps=1000, alpha=0.95,
                                  random_state=None, use_prob=None):
    """Percentile-bootstrap confidence interval for a classification metric.

    Parameters
    ----------
    y_true : array-like
        True labels.
    y_pred : array-like
        Predicted class labels, aligned with ``y_true``.
    y_pred_prob : array-like
        Predicted scores/probabilities, aligned with ``y_true``.
    metric_func : callable
        ``metric_func(y_true, y_score) -> float``.
    n_bootstraps : int
        Number of resamples drawn with replacement.
    alpha : float
        Confidence level (0.95 gives the 2.5th and 97.5th percentiles).
    random_state : int or None
        Seed for the resampling generator; ``None`` gives a fresh,
        non-reproducible stream.
    use_prob : bool or None
        Whether the metric is fed ``y_pred_prob`` instead of ``y_pred``.
        ``None`` keeps the historical rule: probabilities only when
        ``metric_func`` is ``roc_auc_score``.

    Returns
    -------
    (lower_bound, upper_bound)
        Both are NaN when no resample produced a usable score.
    """
    y_true = np.asarray(y_true)
    y_pred = np.asarray(y_pred)
    y_pred_prob = np.asarray(y_pred_prob)

    if use_prob is None:
        use_prob = metric_func is roc_auc_score
    y_score = y_pred_prob if use_prob else y_pred

    n_size = len(y_true)
    if n_size == 0:
        return np.nan, np.nan

    rng = np.random.default_rng(random_state)
    bootstrapped_scores = []

    for _ in range(n_bootstraps):
        # Sample row positions with replacement
        indices = rng.integers(0, n_size, size=n_size)
        y_true_sample = y_true[indices]

        # A ranking metric such as ROC AUC is undefined when the resample
        # contains a single class; skip it instead of crashing.
        if use_prob and np.unique(y_true_sample).size < 2:
            continue

        score = metric_func(y_true_sample, y_score[indices])

        # Metrics that signal "undefined" with None or a placeholder string
        # must not reach the percentile computation.
        if score is None or isinstance(score, str):
            continue
        bootstrapped_scores.append(score)

    scores = np.asarray(bootstrapped_scores, dtype=float)
    scores = scores[~np.isnan(scores)]
    if scores.size == 0:
        return np.nan, np.nan

    lower_bound = np.percentile(scores, (1 - alpha) / 2 * 100)
    upper_bound = np.percentile(scores, (1 + alpha) / 2 * 100)

    return lower_bound, upper_bound

# Number of bootstrap samples and alpha for 95% CI
n_bootstraps = 1000
alpha = 0.95

# Calculate 95% confidence intervals for metrics.
# The point estimates for this cohort use average="weighted" (plus
# zero_division=1 for precision). The bootstrap has to resample that same
# statistic, otherwise the interval describes binary-averaged scores and need
# not even contain the reported value.
roc_auc_ci = bootstrap_confidence_interval(y_encoded_ibd_lasso, y_pred_lasso_ibd, y_pred_prob_lasso_ibd, roc_auc_score, n_bootstraps, alpha)
accuracy_ci = bootstrap_confidence_interval(y_encoded_ibd_lasso, y_pred_lasso_ibd, y_pred_prob_lasso_ibd, accuracy_score, n_bootstraps, alpha)
precision_ci = bootstrap_confidence_interval(
    y_encoded_ibd_lasso, y_pred_lasso_ibd, y_pred_prob_lasso_ibd,
    lambda yt, yp: precision_score(yt, yp, average="weighted", zero_division=1),
    n_bootstraps, alpha)
recall_ci = bootstrap_confidence_interval(
    y_encoded_ibd_lasso, y_pred_lasso_ibd, y_pred_prob_lasso_ibd,
    lambda yt, yp: recall_score(yt, yp, average="weighted"),
    n_bootstraps, alpha)
f1_ci = bootstrap_confidence_interval(
    y_encoded_ibd_lasso, y_pred_lasso_ibd, y_pred_prob_lasso_ibd,
    lambda yt, yp: f1_score(yt, yp, average="weighted"),
    n_bootstraps, alpha)

# For specificity, handle separately based on the confusion matrix
def calculate_specificity(y_true, y_pred):
    """Return specificity, TN / (TN + FP), or NaN when it is undefined.

    The value is undefined when the confusion matrix is not 2x2 (for example a
    bootstrap resample in which only one label occurs) or when there are no
    true negatives or false positives to divide by. NaN is returned instead of
    None so the result can be stored in numeric arrays and passed to NumPy
    routines without raising TypeError.
    """
    conf_matrix = confusion_matrix(y_true, y_pred)
    if conf_matrix.shape != (2, 2):
        return np.nan

    tn, fp, fn, tp = conf_matrix.ravel()
    if tn + fp == 0:
        return np.nan
    return tn / (tn + fp)

specificity_ci = bootstrap_confidence_interval(y_encoded_ibd_lasso, y_pred_lasso_ibd, y_pred_prob_lasso_ibd, calculate_specificity, n_bootstraps, alpha)

# Print metrics with 95% confidence intervals
print(f'Validation ROC AUC: {roc_auc_val_lasso:.2f} (95% CI: {roc_auc_ci[0]:.2f} - {roc_auc_ci[1]:.2f})')
print(f'Validation Accuracy: {accuracy_val_lasso:.2f} (95% CI: {accuracy_ci[0]:.2f} - {accuracy_ci[1]:.2f})')
print(f'Validation Precision: {precision_val_lasso:.2f} (95% CI: {precision_ci[0]:.2f} - {precision_ci[1]:.2f})')
print(f'Validation Recall: {recall_val_lasso:.2f} (95% CI: {recall_ci[0]:.2f} - {recall_ci[1]:.2f})')
print(f'Validation F1-Score: {f1_val_lasso:.2f} (95% CI: {f1_ci[0]:.2f} - {f1_ci[1]:.2f})')

# specificity_val_lasso may be the placeholder string 'N/A' when the confusion
# matrix was not 2x2; applying ':.2f' to a string raises ValueError, so the
# number is formatted only when it really is numeric.
specificity_text = specificity_val_lasso if isinstance(specificity_val_lasso, str) else f'{specificity_val_lasso:.2f}'
print(f'Validation Specificity: {specificity_text} (95% CI: {specificity_ci[0]:.2f} - {specificity_ci[1]:.2f})')


##Predictions on CRC

In [None]:
X_val_cc_lasso = cc.drop('Group', axis=1)
y_val_cc_lasso = cc['Group']

# Encode categorical target labels into numerical labels
label_encoder = LabelEncoder()
y_encoded_cc_lasso = label_encoder.fit_transform(y_val_cc_lasso)

# Align the CRC feature matrix with the selected feature list: every selected
# feature that this cohort lacks is added as an all-zero column.
missing_features = [feature for feature in selected_features_lasso if feature not in X_val_cc_lasso.columns]
missing_df_lasso = pd.DataFrame(0, index=X_val_cc_lasso.index, columns=missing_features)
X_val_cc_lasso = pd.concat([X_val_cc_lasso, missing_df_lasso], axis=1)

# Keep only the selected features, in the same order as the feature list
X_val_cc_lasso = X_val_cc_lasso[selected_features_lasso]

# NOTE(review): `lasso_model` is not assigned anywhere in the visible LASSO
# section (the tuned model there is `final_model_lasso`) -- confirm it is
# defined before this cell runs.
# NOTE(review): this re-fits the model on the CRC cohort, so every metric
# below is computed on the very samples it was just trained on. If external
# validation of a previously trained model is intended, this fit should be
# removed -- confirm the intended protocol.
lasso_model.fit(X_val_cc_lasso, y_encoded_cc_lasso)

# Probability of the second class column (encoded label 1) and hard labels
y_pred_prob_lasso_cc = lasso_model.predict_proba(X_val_cc_lasso)[:, 1]
y_pred_lasso_cc = lasso_model.predict(X_val_cc_lasso)

# Threshold-free metric from the probabilities
roc_auc_val_lasso = roc_auc_score(y_encoded_cc_lasso, y_pred_prob_lasso_cc)

# Label-based metrics (class-frequency weighted averages)
accuracy_val_lasso = accuracy_score(y_encoded_cc_lasso, y_pred_lasso_cc)
precision_val_lasso = precision_score(y_encoded_cc_lasso, y_pred_lasso_cc, average="weighted", zero_division=1)
recall_val_lasso = recall_score(y_encoded_cc_lasso, y_pred_lasso_cc, average="weighted")
f1_val_lasso = f1_score(y_encoded_cc_lasso, y_pred_lasso_cc, average="weighted")

# Confusion matrix, needed for specificity = TN / (TN + FP)
conf_matrix = confusion_matrix(y_encoded_cc_lasso, y_pred_lasso_cc)

if conf_matrix.shape == (2, 2):
    tn_val_lasso, fp_val_lasso, fn_val_lasso, tp_val_lasso = conf_matrix.ravel()
    specificity_val_lasso = tn_val_lasso / (tn_val_lasso + fp_val_lasso)
else:
    # Use NaN rather than the string 'N/A': a string cannot be formatted with
    # ':.2f', so the prints below (and the CI cell that follows) would raise
    # ValueError.
    specificity_val_lasso = np.nan

# Print evaluation metrics
print(f'Validation ROC AUC: {roc_auc_val_lasso:.2f}')
print(f'Validation Accuracy: {accuracy_val_lasso:.2f}')
print(f'Validation Precision: {precision_val_lasso:.2f}')
print(f'Validation Recall: {recall_val_lasso:.2f}')
print(f'Validation F1-Score: {f1_val_lasso:.2f}')
print(f'Validation Specificity: {specificity_val_lasso:.2f}')

##95% CI

In [None]:
def bootstrap_confidence_interval(y_true, y_pred, y_pred_prob, metric_func, n_bootstraps=1000, alpha=0.95,
                                  random_state=None, use_prob=None):
    """Percentile-bootstrap confidence interval for a classification metric.

    Parameters
    ----------
    y_true : array-like
        True labels.
    y_pred : array-like
        Predicted class labels, aligned with ``y_true``.
    y_pred_prob : array-like
        Predicted scores/probabilities, aligned with ``y_true``.
    metric_func : callable
        ``metric_func(y_true, y_score) -> float``.
    n_bootstraps : int
        Number of resamples drawn with replacement.
    alpha : float
        Confidence level (0.95 gives the 2.5th and 97.5th percentiles).
    random_state : int or None
        Seed for the resampling generator; ``None`` gives a fresh,
        non-reproducible stream.
    use_prob : bool or None
        Whether the metric is fed ``y_pred_prob`` instead of ``y_pred``.
        ``None`` keeps the historical rule: probabilities only when
        ``metric_func`` is ``roc_auc_score``.

    Returns
    -------
    (lower_bound, upper_bound)
        Both are NaN when no resample produced a usable score.
    """
    y_true = np.asarray(y_true)
    y_pred = np.asarray(y_pred)
    y_pred_prob = np.asarray(y_pred_prob)

    if use_prob is None:
        use_prob = metric_func is roc_auc_score
    y_score = y_pred_prob if use_prob else y_pred

    n_size = len(y_true)
    if n_size == 0:
        return np.nan, np.nan

    rng = np.random.default_rng(random_state)
    bootstrapped_scores = []

    for _ in range(n_bootstraps):
        # Sample row positions with replacement
        indices = rng.integers(0, n_size, size=n_size)
        y_true_sample = y_true[indices]

        # A ranking metric such as ROC AUC is undefined when the resample
        # contains a single class; skip it instead of crashing.
        if use_prob and np.unique(y_true_sample).size < 2:
            continue

        score = metric_func(y_true_sample, y_score[indices])

        # Metrics that signal "undefined" with None or a placeholder string
        # must not reach the percentile computation.
        if score is None or isinstance(score, str):
            continue
        bootstrapped_scores.append(score)

    scores = np.asarray(bootstrapped_scores, dtype=float)
    scores = scores[~np.isnan(scores)]
    if scores.size == 0:
        return np.nan, np.nan

    lower_bound = np.percentile(scores, (1 - alpha) / 2 * 100)
    upper_bound = np.percentile(scores, (1 + alpha) / 2 * 100)

    return lower_bound, upper_bound

# Number of bootstrap samples and alpha for 95% CI
n_bootstraps = 1000
alpha = 0.95

# Calculate 95% confidence intervals for metrics.
# The point estimates for this cohort use average="weighted" (plus
# zero_division=1 for precision). The bootstrap has to resample that same
# statistic, otherwise the interval describes binary-averaged scores and need
# not even contain the reported value.
roc_auc_ci = bootstrap_confidence_interval(y_encoded_cc_lasso, y_pred_lasso_cc, y_pred_prob_lasso_cc, roc_auc_score, n_bootstraps, alpha)
accuracy_ci = bootstrap_confidence_interval(y_encoded_cc_lasso, y_pred_lasso_cc, y_pred_prob_lasso_cc, accuracy_score, n_bootstraps, alpha)
precision_ci = bootstrap_confidence_interval(
    y_encoded_cc_lasso, y_pred_lasso_cc, y_pred_prob_lasso_cc,
    lambda yt, yp: precision_score(yt, yp, average="weighted", zero_division=1),
    n_bootstraps, alpha)
recall_ci = bootstrap_confidence_interval(
    y_encoded_cc_lasso, y_pred_lasso_cc, y_pred_prob_lasso_cc,
    lambda yt, yp: recall_score(yt, yp, average="weighted"),
    n_bootstraps, alpha)
f1_ci = bootstrap_confidence_interval(
    y_encoded_cc_lasso, y_pred_lasso_cc, y_pred_prob_lasso_cc,
    lambda yt, yp: f1_score(yt, yp, average="weighted"),
    n_bootstraps, alpha)

# For specificity, handle separately based on the confusion matrix
def calculate_specificity(y_true, y_pred):
    """Return specificity, TN / (TN + FP), or NaN when it is undefined.

    The value is undefined when the confusion matrix is not 2x2 (for example a
    bootstrap resample in which only one label occurs) or when there are no
    true negatives or false positives to divide by. NaN is returned instead of
    None so the result can be stored in numeric arrays and passed to NumPy
    routines without raising TypeError.
    """
    conf_matrix = confusion_matrix(y_true, y_pred)
    if conf_matrix.shape != (2, 2):
        return np.nan

    tn, fp, fn, tp = conf_matrix.ravel()
    if tn + fp == 0:
        return np.nan
    return tn / (tn + fp)

specificity_ci = bootstrap_confidence_interval(y_encoded_cc_lasso, y_pred_lasso_cc, y_pred_prob_lasso_cc, calculate_specificity, n_bootstraps, alpha)

# Print metrics with 95% confidence intervals
print(f'Validation ROC AUC: {roc_auc_val_lasso:.2f} (95% CI: {roc_auc_ci[0]:.2f} - {roc_auc_ci[1]:.2f})')
print(f'Validation Accuracy: {accuracy_val_lasso:.2f} (95% CI: {accuracy_ci[0]:.2f} - {accuracy_ci[1]:.2f})')
print(f'Validation Precision: {precision_val_lasso:.2f} (95% CI: {precision_ci[0]:.2f} - {precision_ci[1]:.2f})')
print(f'Validation Recall: {recall_val_lasso:.2f} (95% CI: {recall_ci[0]:.2f} - {recall_ci[1]:.2f})')
print(f'Validation F1-Score: {f1_val_lasso:.2f} (95% CI: {f1_ci[0]:.2f} - {f1_ci[1]:.2f})')

# specificity_val_lasso may be the placeholder string 'N/A' when the confusion
# matrix was not 2x2; applying ':.2f' to a string raises ValueError, so the
# number is formatted only when it really is numeric.
specificity_text = specificity_val_lasso if isinstance(specificity_val_lasso, str) else f'{specificity_val_lasso:.2f}'
print(f'Validation Specificity: {specificity_text} (95% CI: {specificity_ci[0]:.2f} - {specificity_ci[1]:.2f})')
