### Import necessary libraries

In [None]:
! pip install optuna

In [None]:
import os,os.path
import re

import matplotlib.pyplot as plt
import numpy as np
import optuna
import pandas as pd
import seaborn as sns
import xgboost as xgb
from scipy.stats import bootstrap
from sklearn.datasets import make_classification
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import RFECV
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.metrics import roc_auc_score, accuracy_score, precision_score, recall_score, f1_score, classification_report, confusion_matrix
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.model_selection import cross_val_score
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.preprocessing import LabelEncoder
from sklearn.utils import resample
from xgboost import XGBClassifier

%matplotlib inline

### Load the datasets

In [None]:
# Read the metabolite table from the working directory.
# NOTE(review): the file name suggests the features are already scaled — confirm.
mtb = pd.read_csv(filepath_or_buffer='mtb_scaled_ibd.csv')
# Bare expression so the notebook renders the DataFrame as the cell output.
mtb

# XGBOOST

In [None]:
# Baseline XGBoost model with default hyperparameters.

# 'Study.Group' is the target; every other column is used as a predictor.
y = mtb['Study.Group']
X = mtb.drop(columns=['Study.Group'])

# Map the group labels to integer codes.
label_encoder = LabelEncoder()
y = label_encoder.fit_transform(y)

# 75% of the rows for training, 25% held out for testing.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.25, random_state=42
)

# Fit the default classifier and predict the held-out rows.
model = xgb.XGBClassifier(eval_metric='logloss', random_state=42)
model.fit(X_train, y_train)
y_pred = model.predict(X_test)

# Support-weighted averages for the per-class metrics.
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred, average='weighted')
recall = recall_score(y_test, y_pred, average='weighted')
f1 = f1_score(y_test, y_pred, average='weighted')

# Report in the original order: accuracy, precision, F1, recall.
print(f'Accuracy: {accuracy:.4f}')
print(f'Test Precision: {precision:.4f}')
print(f'Test F1 Score: {f1:.4f}')
print(f'Test Recall: {recall:.4f}')


## Random Search

In [None]:
# Randomized hyperparameter search for XGBoost, followed by a refit with
# early stopping and an evaluation on the held-out test set.

# Hold out 25% of the samples as the final test set.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42)

# Candidate values sampled by RandomizedSearchCV.
param_dist = {
    'n_estimators': np.arange(100, 500, 50),
    'max_depth': np.arange(4, 10),
    'learning_rate': np.linspace(0.01, 0.2, 10),
    'subsample': np.linspace(0.5, 1.0, 10),
    'colsample_bytree': np.linspace(0.5, 1.0, 10),
    'gamma': np.linspace(0, 0.5, 5),
    'min_child_weight': np.arange(1, 6),
    'alpha': np.logspace(-3, 1, 5),
    'lambda': np.logspace(-3, 1, 5)
}

# 200 sampled configurations, each scored by 5-fold cross-validated ROC AUC.
random_search = RandomizedSearchCV(
    estimator=XGBClassifier(random_state=42),
    param_distributions=param_dist, n_iter=200,
    scoring='roc_auc', cv=5, verbose=1, random_state=42, n_jobs=-1
)

# Fit the search on the training split only.
random_search.fit(X_train, y_train)

# Get the best parameters
best_params = random_search.best_params_

# Early stopping chooses the number of boosting rounds from eval_set, so it
# must monitor data that is NOT the test set; otherwise the test metrics
# below are computed on data that already influenced the model.
# A validation subset is carved out of the training split for that purpose.
X_fit, X_val, y_fit, y_val = train_test_split(
    X_train, y_train, test_size=0.2, random_state=42, stratify=y_train
)

# Initialize the XGBoost classifier with early stopping
xgb_best = XGBClassifier(**best_params, random_state=42,
                         early_stopping_rounds=10,
                         eval_metric='logloss')

# Fit on the training subset, monitoring log-loss on the validation subset.
xgb_best.fit(
    X_fit, y_fit,
    eval_set=[(X_val, y_val)],
    verbose=True
)

# Predict on the untouched test set
y_pred = xgb_best.predict(X_test)

# Calculate evaluation metrics (support-weighted averages)
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred, average='weighted')
recall = recall_score(y_test, y_pred, average='weighted')
f1 = f1_score(y_test, y_pred, average='weighted')

# Print the metrics
print(f"Accuracy: {accuracy:.2f}")
print(f"Precision: {precision:.2f}")
print(f"Recall: {recall:.2f}")
print(f"F1 Score: {f1:.2f}")

In [None]:
# Report the winning configuration of the randomized search.
print(f"Best Parameters: {random_search.best_params_}")
# The search was run with scoring='roc_auc', so best_score_ is a mean
# cross-validated ROC AUC, not an accuracy.
print(f"Best Cross-validation AUC-ROC: {random_search.best_score_:.4f}")

## Bayesian Optimization

In [None]:
def objective(trial):
    """Optuna objective: mean 5-fold stratified CV ROC AUC of an XGBoost model.

    Draws one hyperparameter configuration from ``trial``, builds an
    ``XGBClassifier`` from it, and scores it on the module-level
    ``X_train`` / ``y_train``. The returned value is meant to be maximized.
    """
    search_space = {
        'booster': trial.suggest_categorical('booster', ['gbtree', 'dart']),
        'objective': 'binary:logistic',  # fixed, not searched
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.3),
        'gamma': trial.suggest_float('gamma', 0, 0.5),
        'max_depth': trial.suggest_int('max_depth', 3, 15),
        'min_child_weight': trial.suggest_float('min_child_weight', 0.5, 5),
        'subsample': trial.suggest_float('subsample', 0.5, 1.0),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.5, 1.0),
        'lambda': trial.suggest_float('lambda', 1e-8, 1.0, log=True),
        'alpha': trial.suggest_float('alpha', 1e-8, 1.0, log=True),
        # Upper bound of 400 so the random-search optimum stays reachable.
        'n_estimators': trial.suggest_int('n_estimators', 50, 400),
    }

    candidate = xgb.XGBClassifier(**search_space, eval_metric='logloss')

    # Stratified folds keep the class proportions in every split.
    folds = StratifiedKFold(n_splits=5)

    fold_aucs = cross_val_score(
        candidate, X_train, y_train, cv=folds, scoring='roc_auc'
    )
    return fold_aucs.mean()

# Bayesian optimization of the XGBoost hyperparameters with Optuna.

# Create a study to maximize AUC-ROC. The sampler is seeded so that repeated
# runs explore the same configurations, in line with the fixed random_state
# values used elsewhere in this notebook.
study = optuna.create_study(
    direction='maximize',
    sampler=optuna.samplers.TPESampler(seed=42),
)

# Enqueue the parameters obtained from previous RandomizedSearchCV results so
# that they are evaluated as the first trial.
study.enqueue_trial({
    'booster': 'gbtree',
    'objective': 'binary:logistic',
    'learning_rate': 0.2,
    'gamma':  0.0,
    'max_depth': 8,
    'min_child_weight': 1,
    'subsample': 0.6111111111111112,
    'colsample_bytree': 0.7777777777777778,
    'n_estimators': 400,
    'lambda': 0.01,
    'alpha': 0.001
})

# Optimize the study using 50 trials
study.optimize(objective, n_trials=50)

# Print the best parameters and cross-validation AUC-ROC
print(f"Best Parameters: {study.best_params}")
print(f"Best Cross-validation AUC-ROC: {study.best_value:.4f}")

# Train the final model with the best parameters on the training data
best_params = study.best_params
final_model = xgb.XGBClassifier(**best_params, eval_metric='logloss')
final_model.fit(X_train, y_train)

# Make predictions on the original test set; column 1 of predict_proba is
# used as the score for ROC AUC.
y_pred = final_model.predict(X_test)
y_pred_proba = final_model.predict_proba(X_test)[:, 1]

# Evaluate the final model on the test set
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred, average='weighted')
f1 = f1_score(y_test, y_pred, average='weighted')
recall = recall_score(y_test, y_pred, average='weighted')
roc_auc = roc_auc_score(y_test, y_pred_proba)

# Print test set performance metrics
print(f"Test Accuracy: {accuracy:.4f}")
print(f"Test Precision: {precision:.4f}")
print(f"Test F1 Score: {f1:.4f}")
print(f"Test Recall: {recall:.4f}")
print(f"Test AUC-ROC: {roc_auc:.4f}")


In [None]:
# Best parameters from Bayesian Optimization XGBoost, hard-coded so this cell
# can be run without repeating the study.
best_params_xg = {
    'learning_rate': 0.17008256693104395,
    'max_depth': 15,
    'n_estimators': 178,
    'gamma': 0.0030531517878193103,
    'min_child_weight': 0.5024232638699605,
    'subsample': 0.5288178444987158,
    'colsample_bytree': 0.9423605692686023,
    'objective': 'binary:logistic',
    'booster': 'gbtree',
    'lambda': 2.5159211654207813e-06,
    'alpha': 3.018073013444745e-07

}

# Create the XGBoost classifier with the best parameters
final_model_xg = xgb.XGBClassifier(**best_params_xg)

# Perform 5-fold cross-validation on the training data, scored by ROC AUC
cv_scores = cross_val_score(final_model_xg, X_train, y_train, cv=5, scoring='roc_auc')
print("Cross-validation scores:")
print(cv_scores)
# The scores are ROC AUC values (scoring='roc_auc'), so label them as such.
print(f"Mean CV ROC AUC: {np.mean(cv_scores):.4f}")

# Train the model using the training data
final_model_xg.fit(X_train, y_train)

# Make predictions on the original test set
y_pred = final_model_xg.predict(X_test)

# Calculate evaluation metrics on the test set (binary metrics for the
# class encoded as 1)
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)

# Print the results
print(f'Test Accuracy: {accuracy:.4f}')
print(f'Test Precision: {precision:.4f}')
print(f'Test Recall: {recall:.4f}')
print(f'Test F1-Score: {f1:.4f}')


## Feature Selection For XGBOOST (RFECV)

In [None]:
# Recursive feature elimination with cross-validation for the tuned XGBoost model.

# Same 75/25 split as before, stored under XGBoost-specific names.
X_train_xg, X_test_xg, y_train_xg, y_test_xg = train_test_split(
    X, y, test_size=0.25, random_state=42
)
final_model_xg = xgb.XGBClassifier(**best_params_xg)

# step=20 removes 20 features per elimination round; each feature count is
# scored by 10-fold stratified cross-validated ROC AUC.
selector = RFECV(
    estimator=final_model_xg,
    step=20,
    cv=StratifiedKFold(10),
    scoring='roc_auc',
    verbose=2,
)
selector.fit(X_train_xg, y_train_xg)

# Report how many features were kept.
print(f"Optimal number of features: {selector.n_features_}")

# Translate the boolean support mask into column names.
selected_features_mask = selector.support_
selected_features_xg = X.columns[selected_features_mask]
print("Selected features:", selected_features_xg)

# Refit on the retained columns only, then predict the held-out rows.
final_model_xg.fit(X_train_xg[selected_features_xg], y_train_xg)
y_pred = final_model_xg.predict(X_test_xg[selected_features_xg])

# Binary metrics on the test split.
accuracy = accuracy_score(y_test_xg, y_pred)
precision = precision_score(y_test_xg, y_pred)
recall = recall_score(y_test_xg, y_pred)
f1 = f1_score(y_test_xg, y_pred)

print(f'Test Accuracy: {accuracy:.4f}')
print(f'Test Precision: {precision:.4f}')
print(f'Test Recall: {recall:.4f}')
print(f'Test F1-Score: {f1:.4f}')

In [None]:
# Evaluate the tuned XGBoost model on the RFECV-selected features.

# Reduce both splits to the selected features. The test matrix must be the
# one that belongs to y_test_xg (X_test_xg), not whatever X_test currently
# holds from another cell.
X_train_selected_xg = selector.transform(X_train_xg)
X_test_selected_xg = selector.transform(X_test_xg)

# Train the final model with the selected features on the training set
final_model_xg.fit(X_train_selected_xg, y_train_xg)

# Make predictions on the test set; column 1 of predict_proba is the score
# used for ROC AUC.
y_pred_prob_xg = final_model_xg.predict_proba(X_test_selected_xg)[:, 1]
y_pred_xg = final_model_xg.predict(X_test_selected_xg)

# Calculate evaluation metrics
accuracy = accuracy_score(y_test_xg, y_pred_xg)
precision = precision_score(y_test_xg, y_pred_xg)
recall = recall_score(y_test_xg, y_pred_xg)
f1 = f1_score(y_test_xg, y_pred_xg)
roc_auc = roc_auc_score(y_test_xg, y_pred_prob_xg)

# Calculate specificity = TN / (TN + FP) from the confusion matrix
tn, fp, fn, tp = confusion_matrix(y_test_xg, y_pred_xg).ravel()
specificity = tn / (tn + fp)

# Print the results
print(f'Test ROC AUC: {roc_auc:.2f}')
print(f'Test Accuracy: {accuracy:.2f}')
print(f'Test Precision: {precision:.2f}')
print(f'Test Recall: {recall:.2f}')
print(f'Test F1-Score: {f1:.2f}')
print(f'Test Specificity: {specificity:.2f}')

## 95% CI

In [None]:
def calculate_metrics(y_test, y_pred, y_pred_prob):
    """Compute binary classification metrics for one set of predictions.

    Parameters:
        y_test: true labels, encoded as 0 / 1.
        y_pred: predicted labels, encoded as 0 / 1.
        y_pred_prob: scores for the positive class, passed to ``roc_auc_score``.

    Returns:
        Tuple ``(accuracy, precision, recall, f1, roc_auc, specificity)``.
        ``specificity`` is NaN when ``y_test`` contains no negative samples.

    NOTE(review): ROC AUC is undefined when ``y_test`` holds a single class;
    how ``roc_auc_score`` reports that (exception vs. NaN) depends on the
    installed scikit-learn version — confirm before relying on it.
    """
    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred)
    recall = recall_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred)
    roc_auc = roc_auc_score(y_test, y_pred_prob)

    # Pin the label set so the matrix is always 2x2; without it a sample that
    # contains only one class yields a 1x1 matrix and the unpacking fails.
    tn, fp, fn, tp = confusion_matrix(y_test, y_pred, labels=[0, 1]).ravel()
    negatives = tn + fp
    specificity = tn / negatives if negatives else float('nan')

    return accuracy, precision, recall, f1, roc_auc, specificity

def bootstrap_ci(y_test, y_pred, y_pred_prob, n_bootstraps=1000, ci=95):
    """Percentile-bootstrap confidence intervals for the test-set metrics.

    The test set is resampled with replacement ``n_bootstraps`` times (seeded
    with the iteration index, so results are reproducible) and
    ``calculate_metrics`` is evaluated on every resample.

    Parameters:
        y_test: true labels (array-like).
        y_pred: predicted labels (array-like).
        y_pred_prob: positive-class scores (array-like).
        n_bootstraps: number of bootstrap resamples to draw.
        ci: confidence level in percent.

    Returns:
        Dict mapping 'accuracy', 'precision', 'recall', 'f1', 'roc_auc' and
        'specificity' to a ``(lower, upper)`` tuple.

    Raises:
        ValueError: if no resample contained both classes.
    """
    # Accept lists / Series as well as arrays; positional indexing below
    # needs NumPy semantics.
    y_test = np.asarray(y_test)
    y_pred = np.asarray(y_pred)
    y_pred_prob = np.asarray(y_pred_prob)

    # Order matches the tuple returned by calculate_metrics.
    metric_names = ('accuracy', 'precision', 'recall', 'f1', 'roc_auc', 'specificity')
    bootstrapped_scores = {name: [] for name in metric_names}
    sample_positions = np.arange(len(y_test))  # hoisted out of the loop

    for i in range(n_bootstraps):
        # Resample the test data
        indices = resample(sample_positions, random_state=i)
        y_test_resampled = y_test[indices]

        # ROC AUC and specificity are undefined on a resample that contains
        # a single class, so such draws are skipped instead of aborting the
        # whole computation.
        if np.unique(y_test_resampled).size < 2:
            continue

        scores = calculate_metrics(
            y_test_resampled, y_pred[indices], y_pred_prob[indices]
        )
        for name, score in zip(metric_names, scores):
            bootstrapped_scores[name].append(score)

    if not bootstrapped_scores['accuracy']:
        raise ValueError("No bootstrap resample contained both classes")

    # Calculate percentiles for confidence intervals
    ci_lower = (100 - ci) / 2
    ci_upper = 100 - ci_lower

    metrics_ci = {
        metric: (np.percentile(scores, ci_lower), np.percentile(scores, ci_upper))
        for metric, scores in bootstrapped_scores.items()
    }

    return metrics_ci

# Point estimates for the XGBoost model trained on the selected features.
# y_pred_xg / y_pred_prob_xg were produced for the *_xg split, so they must be
# compared against y_test_xg rather than the generic y_test variable.
accuracy, precision, recall, f1, roc_auc, specificity = calculate_metrics(y_test_xg, y_pred_xg, y_pred_prob_xg)

# Compute the 95% confidence intervals
metrics_ci = bootstrap_ci(np.array(y_test_xg), np.array(y_pred_xg), np.array(y_pred_prob_xg))

# Print the metrics and their confidence intervals
print(f'Test ROC AUC: {roc_auc:.2f} (95% CI: {metrics_ci["roc_auc"][0]:.2f}, {metrics_ci["roc_auc"][1]:.2f})')
print(f'Test Accuracy: {accuracy:.2f} (95% CI: {metrics_ci["accuracy"][0]:.2f}, {metrics_ci["accuracy"][1]:.2f})')
print(f'Test Precision: {precision:.2f} (95% CI: {metrics_ci["precision"][0]:.2f}, {metrics_ci["precision"][1]:.2f})')
print(f'Test Recall: {recall:.2f} (95% CI: {metrics_ci["recall"][0]:.2f}, {metrics_ci["recall"][1]:.2f})')
print(f'Test F1-Score: {f1:.2f} (95% CI: {metrics_ci["f1"][0]:.2f}, {metrics_ci["f1"][1]:.2f})')
print(f'Test Specificity: {specificity:.2f} (95% CI: {metrics_ci["specificity"][0]:.2f}, {metrics_ci["specificity"][1]:.2f})')


# RANDOM FOREST

In [None]:
# Baseline Random Forest with 100 trees.

# 'Study.Group' is the target; every other column is used as a predictor.
y = mtb['Study.Group']
X = mtb.drop(columns=['Study.Group'])

# Map the group labels to integer codes.
label_encoder = LabelEncoder()
y = label_encoder.fit_transform(y)

# 75% of the rows for training, 25% held out for testing.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.25, random_state=42
)

# Fit the forest and predict the held-out rows.
clf = RandomForestClassifier(n_estimators=100, random_state=42)
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)

# Each metric is printed right after it is computed; per-class metrics are
# support-weighted averages.
test_accuracy = accuracy_score(y_test, y_pred)
print(f"Test Accuracy: {test_accuracy:.2f}")

test_f1 = f1_score(y_test, y_pred, average='weighted')
print(f"Test F1 Score: {test_f1:.2f}")

test_precision = precision_score(y_test, y_pred, average='weighted')
print(f"Test Precision: {test_precision:.2f}")

test_recall = recall_score(y_test, y_pred, average='weighted')
print(f"Test Recall: {test_recall:.2f}")


## Random Search

In [None]:
# Randomized hyperparameter search for the Random Forest.

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.25, random_state=42
)

# Search space; the 'rf__' prefix routes each parameter to the pipeline step
# named 'rf'.
param_dist = {
    'rf__n_estimators': list(map(int, np.linspace(start=200, stop=2000, num=10))),
    'rf__max_features': ['sqrt', 'log2'],
    'rf__max_depth': list(map(int, np.linspace(10, 300, num=20))) + [None],
    'rf__min_samples_split': [2, 5, 10, 15],
    'rf__min_samples_leaf': [1, 2, 4, 6],
    'rf__bootstrap': [True, False],
}

# Single-step pipeline wrapping the classifier.
pipeline = Pipeline([
    ('rf', RandomForestClassifier(random_state=42)),
])

# 100 sampled configurations, each scored by 5-fold stratified CV ROC AUC.
rf_random = RandomizedSearchCV(
    estimator=pipeline,
    param_distributions=param_dist,
    n_iter=100,
    cv=StratifiedKFold(5),
    verbose=2,
    random_state=42,
    n_jobs=-1,
    scoring='roc_auc',
)
rf_random.fit(X_train, y_train)

print("Best parameters found by RandomizedSearchCV:")
print(rf_random.best_params_)

# Score the held-out rows with the refitted best pipeline; column 1 of
# predict_proba is used as the ROC AUC score.
best_pipeline = rf_random.best_estimator_
y_pred = best_pipeline.predict(X_test)
y_prob = best_pipeline.predict_proba(X_test)[:, 1]

# Metrics at the default decision threshold (weighted averages).
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred, average='weighted')
recall = recall_score(y_test, y_pred, average='weighted')
f1 = f1_score(y_test, y_pred, average='weighted')
roc_auc = roc_auc_score(y_test, y_prob)

print(f"Accuracy: {accuracy:.4f}")
print(f"Precision: {precision:.4f}")
print(f"Recall: {recall:.4f}")
print(f"F1 Score: {f1:.4f}")
print(f"ROC AUC: {roc_auc:.4f}")


## Bayesian Optimization

In [None]:
# Starting point for the Bayesian search: best configuration reported by the
# RandomizedSearchCV run for the Random Forest.
best_params_rf = {
    'n_estimators': 800,
    'min_samples_split': 10,
    'min_samples_leaf': 1,
    'max_features': 'sqrt',
    'max_depth': 193,
    'bootstrap': False,
    'class_weight': 'balanced',
}


def objective(trial):
    """Optuna objective: mean 5-fold CV ROC AUC of a Random Forest.

    The integer ranges are centred on the module-level ``best_params_rf``
    (read at call time) and clipped to valid lower bounds; the remaining
    hyperparameters are drawn from fixed choices. The model is scored on the
    module-level ``X_train`` / ``y_train``.
    """
    start = best_params_rf
    hyperparams = {
        'n_estimators': trial.suggest_int(
            'n_estimators',
            max(100, start['n_estimators'] - 200),
            start['n_estimators'] + 200,
        ),
        'min_samples_split': trial.suggest_int(
            'min_samples_split',
            max(2, start['min_samples_split'] - 3),
            start['min_samples_split'] + 3,
        ),
        'min_samples_leaf': trial.suggest_int(
            'min_samples_leaf',
            max(1, start['min_samples_leaf'] - 2),
            start['min_samples_leaf'] + 2,
        ),
        'max_features': trial.suggest_categorical('max_features', ['sqrt', 'log2']),
        'max_depth': trial.suggest_categorical('max_depth', [None, 10, 50, 86, 100, 193, 300]),
        'bootstrap': trial.suggest_categorical('bootstrap', [True, False]),
        'class_weight': trial.suggest_categorical('class_weight', ['balanced', None]),
        'criterion': trial.suggest_categorical('criterion', ['gini', 'entropy']),
        'min_impurity_decrease': trial.suggest_float('min_impurity_decrease', 0.0, 0.01),
    }

    # Build the candidate forest from the sampled configuration.
    clf = RandomForestClassifier(**hyperparams, random_state=42)

    fold_aucs = cross_val_score(clf, X_train, y_train, cv=5, scoring='roc_auc')
    return np.mean(fold_aucs)


# Bayesian optimization of the Random Forest hyperparameters with Optuna.
# The sampler is seeded so that repeated runs explore the same
# configurations, in line with the fixed random_state values used elsewhere.
study = optuna.create_study(
    direction='maximize',
    sampler=optuna.samplers.TPESampler(seed=42),
)

# Enqueue the trial with the best parameters from RandomizedSearchCV so that
# it is evaluated first.
study.enqueue_trial(best_params_rf)

study.optimize(objective, n_trials=50)

# Print the best parameters and best score from Optuna
print("Best Parameters from Optuna:", study.best_params)
print("Best Cross-validation ROC AUC from Optuna:", study.best_value)


# Rebuild the best model, fit it on the training split and evaluate on the
# test set. Column 1 of predict_proba is used as the ROC AUC score.
best_params_optuna = study.best_params
best_clf = RandomForestClassifier(**best_params_optuna, random_state=50)
best_clf.fit(X_train, y_train)
y_pred = best_clf.predict(X_test)
y_prob = best_clf.predict_proba(X_test)[:, 1]

# Evaluate on test set (support-weighted averages for per-class metrics)
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred, average='weighted')
recall = recall_score(y_test, y_pred, average='weighted')
f1 = f1_score(y_test, y_pred, average='weighted')
roc_auc = roc_auc_score(y_test, y_prob)

print(f"Test Accuracy: {accuracy:.4f}")
print(f"Test Precision: {precision:.4f}")
print(f"Test Recall: {recall:.4f}")
print(f"Test F1 Score: {f1:.4f}")
print(f"Test ROC AUC: {roc_auc:.4f}")


In [None]:
# Best parameters from Bayesian Optimization RF, hard-coded so this cell can
# be run without repeating the study.
best_params_rf = {
    'n_estimators': 661,
    'min_samples_split': 8,
    'min_samples_leaf': 1,
    'max_features': 'sqrt',
    'bootstrap': False,
    'criterion': 'gini',
    'max_depth': 100
}

# Create the Random Forest classifier with the best parameters
final_model_rf = RandomForestClassifier(**best_params_rf, random_state=50)

# Perform cross-validation with 5 folds on the training data
cv_scores = cross_val_score(final_model_rf, X_train, y_train, cv=5, scoring='roc_auc')
print("Cross-validation scores:")
print(cv_scores)
# The scores are ROC AUC values (scoring='roc_auc'), so label them as such.
print(f"Mean CV ROC AUC: {np.mean(cv_scores):.4f}")

# Train the final model on the training data
final_model_rf.fit(X_train, y_train)

# Make predictions on the test set with the model defined in THIS cell.
# (Previously the predictions were overwritten by a refit of best_clf from
# the Optuna cell, so the metrics below did not describe final_model_rf.)
y_pred = final_model_rf.predict(X_test)
y_prob = final_model_rf.predict_proba(X_test)[:, 1]

# Calculate evaluation metrics on the test set
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred, average='weighted')
recall = recall_score(y_test, y_pred, average='weighted')
f1 = f1_score(y_test, y_pred, average='weighted')

# Print the results for test data
print(f'Test Accuracy: {accuracy:.4f}')
print(f'Test Precision: {precision:.4f}')
print(f'Test Recall: {recall:.4f}')
print(f'Test F1-Score: {f1:.4f}')


## Feature Selection RF (RFECV)

In [None]:
# Recursive feature elimination with cross-validation for the tuned Random Forest.

# Use the SAME split seed (42) as the cells that tuned best_params_rf.
# With a different seed, rows that were in the tuning training data end up
# in this test set, so the test metrics are no longer computed on unseen data.
X_train_rf, X_test_rf, y_train_rf, y_test_rf = train_test_split(X, y, test_size=0.25, random_state=42)
final_model_rf = RandomForestClassifier(**best_params_rf, random_state=50)

# Define the RFECV selector: step=9 removes 9 features per elimination round;
# each feature count is scored by 10-fold stratified CV ROC AUC.
selector_rf = RFECV(estimator=final_model_rf, step=9, cv=StratifiedKFold(10), scoring='roc_auc', verbose=2)

# Fit the RFECV selector on the training data
selector_rf.fit(X_train_rf, y_train_rf)

# Print the optimal number of features
print(f"Optimal number of features: {selector_rf.n_features_}")

# Translate the boolean support mask into column names
selected_features_mask_rf = selector_rf.support_
selected_features_rf = X.columns[selected_features_mask_rf]

# Print the selected features
print("Selected features:", selected_features_rf)

# Create a new DataFrame with the selected features (all rows of X)
RandomForest_gene = X[selected_features_rf]


In [None]:
# Evaluate the tuned Random Forest on the RFECV-selected features.

# Restrict the training and testing sets to the selected columns
X_train_selected_rf = X_train_rf[selected_features_rf]
X_test_selected_rf = X_test_rf[selected_features_rf]


# Train the final Random Forest model with the selected features
final_model_rf.fit(X_train_selected_rf, y_train_rf)

# Perform cross-validation on the training set
cv_scores = cross_val_score(final_model_rf, X_train_selected_rf, y_train_rf, cv=5, scoring='roc_auc')
print("Cross-validation scores:")
print(cv_scores)
# The scores are ROC AUC values (scoring='roc_auc'), so label them as such.
print(f"Mean CV ROC AUC: {np.mean(cv_scores):.4f}")

# Make predictions on the test set; column 1 of predict_proba is the score
# used for ROC AUC.
y_pred_prob_rf = final_model_rf.predict_proba(X_test_selected_rf)[:, 1]
y_pred_rf = final_model_rf.predict(X_test_selected_rf)

# Calculate evaluation metrics (support-weighted averages)
accuracy_rf = accuracy_score(y_test_rf, y_pred_rf)
precision_rf = precision_score(y_test_rf, y_pred_rf, average='weighted')
recall_rf = recall_score(y_test_rf, y_pred_rf, average='weighted')
f1_rf = f1_score(y_test_rf, y_pred_rf, average='weighted')
roc_auc_rf = roc_auc_score(y_test_rf, y_pred_prob_rf, average='weighted')

# Calculate confusion matrix
cm = confusion_matrix(y_test_rf, y_pred_rf)
print("Confusion Matrix:\n", cm)

# Specificity = TN / (TN + FP); only computed for a 2x2 (binary) matrix
if cm.shape == (2, 2):
    tn, fp, fn, tp = cm.ravel()
    specificity = tn / (tn + fp)
    print(f'Test Specificity: {specificity:.2f}')

# Print evaluation metrics
print(f'Random Forest Test ROC AUC: {roc_auc_rf:.2f}')
print(f'Random Forest Test Accuracy: {accuracy_rf:.2f}')
print(f'Random Forest Test Precision: {precision_rf:.2f}')
print(f'Random Forest Test Recall: {recall_rf:.2f}')
print(f'Random Forest Test F1-Score: {f1_rf:.2f}')


## 95% CI

In [None]:
def bootstrap_ci(metric_func, y_true, y_pred, y_pred_prob=None, n_bootstraps=1000, alpha=0.05, random_state=42, **kwargs):
    """Percentile-bootstrap confidence interval for a single metric.

    Parameters:
        metric_func: callable ``metric_func(y_true, y_hat, **kwargs)``.
        y_true: true labels (array-like).
        y_pred: predicted labels; used when ``y_pred_prob`` is None.
        y_pred_prob: optional scores; when given they are passed to
            ``metric_func`` instead of ``y_pred``.
        n_bootstraps: number of resamples drawn with replacement.
        alpha: significance level; the interval covers ``1 - alpha``.
        random_state: seed for the resampling generator, so repeated calls
            give the same interval. Pass None for fresh entropy.
        **kwargs: forwarded unchanged to ``metric_func``.

    Returns:
        Tuple ``(lower_bound, upper_bound)``.

    Raises:
        ValueError: if ``y_true`` is empty or no usable resample was drawn.
    """
    # Accept lists / Series as well as arrays; fancy indexing needs arrays.
    y_true = np.asarray(y_true)
    # Score-based metrics are evaluated on y_pred_prob, label-based on y_pred.
    predictions = np.asarray(y_pred if y_pred_prob is None else y_pred_prob)

    n_samples = len(y_true)
    if n_samples == 0:
        raise ValueError("bootstrap_ci requires at least one sample")

    # Score-based metrics such as ROC AUC are undefined on a resample that
    # contains a single class; those draws are skipped rather than crashing.
    needs_both_classes = y_pred_prob is not None and np.unique(y_true).size > 1

    rng = np.random.default_rng(random_state)
    bootstrapped_scores = []
    for _ in range(n_bootstraps):
        indices = rng.integers(0, n_samples, size=n_samples)
        y_true_resampled = y_true[indices]
        if needs_both_classes and np.unique(y_true_resampled).size < 2:
            continue
        bootstrapped_scores.append(
            metric_func(y_true_resampled, predictions[indices], **kwargs)
        )

    if not bootstrapped_scores:
        raise ValueError("no usable bootstrap resamples were drawn")

    # np.percentile does not need pre-sorted input.
    lower_bound = np.percentile(bootstrapped_scores, 100 * (alpha / 2))
    upper_bound = np.percentile(bootstrapped_scores, 100 * (1 - alpha / 2))
    return lower_bound, upper_bound

# Bootstrap 95% confidence intervals for each metric, including 'average' parameter where necessary
accuracy_ci = bootstrap_ci(accuracy_score, y_test_rf, y_pred_rf)
precision_ci = bootstrap_ci(precision_score, y_test_rf, y_pred_rf, average='weighted')
recall_ci = bootstrap_ci(recall_score, y_test_rf, y_pred_rf, average='weighted')
f1_ci = bootstrap_ci(f1_score, y_test_rf, y_pred_rf, average='weighted')
roc_auc_ci = bootstrap_ci(roc_auc_score, y_test_rf, y_pred_rf, y_pred_prob=y_pred_prob_rf)

# Specificity is the recall of the negative class (label 0). Computing it
# here from the Random Forest predictions avoids relying on tn / fp /
# specificity variables that an earlier cell may or may not have set, and
# avoids proportion_confint, which is never imported in this notebook.
# NOTE(review): this is a bootstrap percentile interval (as used for the
# XGBoost specificity), not the normal-approximation interval the original
# call requested — confirm this is acceptable for reporting.
specificity = recall_score(y_test_rf, y_pred_rf, pos_label=0)
specificity_ci = bootstrap_ci(recall_score, y_test_rf, y_pred_rf, pos_label=0)

# Print evaluation metrics with their 95% confidence intervals
print(f'Random Forest Test ROC AUC: {roc_auc_rf:.2f} (95% CI: [{roc_auc_ci[0]:.2f}, {roc_auc_ci[1]:.2f}])')
print(f'Random Forest Test Accuracy: {accuracy_rf:.2f} (95% CI: [{accuracy_ci[0]:.2f}, {accuracy_ci[1]:.2f}])')
print(f'Random Forest Test Precision: {precision_rf:.2f} (95% CI: [{precision_ci[0]:.2f}, {precision_ci[1]:.2f}])')
print(f'Random Forest Test Recall: {recall_rf:.2f} (95% CI: [{recall_ci[0]:.2f}, {recall_ci[1]:.2f}])')
print(f'Random Forest Test F1-Score: {f1_rf:.2f} (95% CI: [{f1_ci[0]:.2f}, {f1_ci[1]:.2f}])')
print(f'Random Forest Test Specificity: {specificity:.2f} (95% CI: [{specificity_ci[0]:.2f}, {specificity_ci[1]:.2f}])')


# LASSO

In [None]:
# Baseline L1-penalized (Lasso) logistic regression with default strength.

# 'Study.Group' is the target; every other column is used as a predictor.
y = mtb['Study.Group']
X = mtb.drop(columns=['Study.Group'])

# Map the group labels to integer codes.
label_encoder = LabelEncoder()
y = label_encoder.fit_transform(y)

# 75% of the rows for training, 25% held out for testing.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.25, random_state=42
)

# liblinear is used as the solver for the L1 penalty.
log_reg = LogisticRegression(penalty='l1', solver='liblinear', random_state=42)
log_reg.fit(X_train, y_train)
y_pred = log_reg.predict(X_test)

# Binary metrics on the held-out rows.
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)

print(f'Accuracy: {accuracy:.4f}')
print(f'Precision: {precision:.4f}')
print(f'Recall: {recall:.4f}')
print(f'F1 Score: {f1:.4f}')


## Random Search

In [None]:
# Randomized hyperparameter search for the L1 logistic regression.

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.25, random_state=42
)

# Search space: regularization strength, iteration cap and tolerance.
# The solver is fixed to liblinear for every candidate.
param_dist = {
    'C': np.logspace(-4, 4, 20),
    'solver': ['liblinear'],
    'max_iter': [1000, 5000, 10000, 20000],
    'tol': [1e-4, 1e-3, 1e-2, 1e-1],
}

# Base estimator; the solver is supplied by the search space above.
log_reg = LogisticRegression(penalty='l1', random_state=42)

# 100 sampled configurations, each scored by 5-fold cross-validated accuracy.
random_search = RandomizedSearchCV(
    log_reg,
    param_distributions=param_dist,
    n_iter=100,
    scoring='accuracy',
    cv=5,
    verbose=1,
    random_state=42,
    n_jobs=-1,
)
random_search.fit(X_train, y_train)

best_params = random_search.best_params_
print("Best Parameters:", best_params)

# predict() on the fitted search delegates to the refitted best estimator.
y_pred = random_search.predict(X_test)

# Support-weighted averages for the per-class metrics.
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred, average='weighted')
recall = recall_score(y_test, y_pred, average='weighted')
f1 = f1_score(y_test, y_pred, average='weighted')

print(f'Accuracy: {accuracy:.4f}')
print(f'Precision: {precision:.4f}')
print(f'Recall: {recall:.4f}')
print(f'F1 Score: {f1:.4f}')


## Bayesian Optimization

In [None]:
# Best configuration reported by the random search; enqueued below as the
# first Optuna trial.
random_search_params = {'tol': 0.0001, 'solver': 'liblinear', 'max_iter': 20000, 'C':1438.44988828766}


def objective(trial):
    """Optuna objective: mean 5-fold CV ROC AUC of an L1 logistic regression.

    Samples ``C``, ``max_iter``, ``tol`` and ``solver`` from ``trial`` and
    scores the resulting model on the module-level ``X_train`` / ``y_train``.
    """
    # C spans 1e-4 .. 1e4 so the value found by the random search is covered.
    hyperparams = {
        'C': trial.suggest_float('C', 1e-4, 1e4, log=True),
        'max_iter': trial.suggest_int('max_iter', 1000, 50000),
        'tol': trial.suggest_float('tol', 1e-4, 1e-2, log=True),
        'solver': trial.suggest_categorical('solver', ['liblinear', 'saga']),
    }

    # Single-step pipeline around the classifier.
    clf = make_pipeline(
        LogisticRegression(penalty='l1', random_state=42, **hyperparams)
    )

    fold_aucs = cross_val_score(clf, X_train, y_train, cv=5, scoring='roc_auc')
    return fold_aucs.mean()


# Perform optimization with Optuna, initializing with Random Search parameters.
# The sampler is seeded so that repeated runs explore the same
# configurations, in line with the fixed random_state values used elsewhere.
study = optuna.create_study(
    direction='maximize',
    sampler=optuna.samplers.TPESampler(seed=42),
)


def random_search_trial(trial):
    """Pin every hyperparameter of ``trial`` to a single fixed value.

    NOTE(review): this helper is not called anywhere in this cell (the
    random-search result is injected via ``study.enqueue_trial`` below), it
    returns None, and its ``C`` of 100.0 differs from
    ``random_search_params['C']``. It is kept only so existing references do
    not break — consider removing it.
    """
    trial.suggest_float('C', 100.0, 100.0)
    trial.suggest_int('max_iter', 20000, 20000)
    trial.suggest_float('tol', 0.0001, 0.0001)
    trial.suggest_categorical('solver', ['liblinear'])


# Evaluate the Random Search best parameters as the first trial
study.enqueue_trial(random_search_params)
study.optimize(objective, n_trials=50)

# Print the best parameters and best AUC-ROC score from Optuna
print("Best Parameters from Optuna:", study.best_params)
print("Best AUC-ROC Score from Optuna:", study.best_value)

# Retrieve the best parameters and train the model on the training split
# (no resampling or class balancing is applied here).
best_params = study.best_params

# Pass the best parameters to LogisticRegression
clf = LogisticRegression(
    penalty='l1',
    C=best_params['C'],
    max_iter=best_params['max_iter'],
    tol=best_params['tol'],
    solver=best_params['solver'],
    random_state=42
)
clf.fit(X_train, y_train)

# Predict on the test set; column 1 of predict_proba is the ROC AUC score
y_pred_prob = clf.predict_proba(X_test)[:, 1]
y_pred = clf.predict(X_test)

# Evaluate the model on the test set
roc_auc = roc_auc_score(y_test, y_pred_prob)
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred, average='weighted')
recall = recall_score(y_test, y_pred, average='weighted')
f1 = f1_score(y_test, y_pred, average='weighted')

# Print evaluation metrics
print(f"Test AUC-ROC: {roc_auc:.4f}")
print(f"Test Accuracy: {accuracy:.4f}")
print(f"Test Precision: {precision:.4f}")
print(f"Test Recall: {recall:.4f}")
print(f"Test F1 Score: {f1:.4f}")



In [None]:
# Cross-validate and test the class-weighted Lasso model with the tuned
# hyperparameters.

# Same 75/25 split, stored under LASSO-specific names.
X_train_lasso, X_test_lasso, y_train_lasso, y_test_lasso = train_test_split(
    X, y, test_size=0.25, random_state=42
)

# Hard-coded result of the Optuna study so the cell runs without repeating it.
best_params_lasso = {
    'C': 512.0959297957417,
    'max_iter': 30362,
    'tol': 0.0005183129895961714,
    'solver': 'liblinear',
}

# class_weight='balanced' is requested in addition to the tuned parameters.
lasso_model = LogisticRegression(
    penalty='l1',
    class_weight='balanced',
    random_state=42,
    **best_params_lasso,
)

# 5-fold cross-validated ROC AUC on the training split.
cv_scores = cross_val_score(
    lasso_model, X_train_lasso, y_train_lasso, cv=5, scoring='roc_auc'
)
print("Cross-validation:", cv_scores)
print(f"Mean CV: {np.mean(cv_scores):.4f}")

# Fit on the whole training split and predict the held-out rows.
lasso_model.fit(X_train_lasso, y_train_lasso)
y_pred_lasso = lasso_model.predict(X_test_lasso)

# Support-weighted averages for the per-class metrics.
accuracy = accuracy_score(y_test_lasso, y_pred_lasso)
precision = precision_score(y_test_lasso, y_pred_lasso, average='weighted')
recall = recall_score(y_test_lasso, y_pred_lasso, average='weighted')
f1 = f1_score(y_test_lasso, y_pred_lasso, average='weighted')

print(f'Test Accuracy: {accuracy:.4f}')
print(f'Test Precision: {precision:.4f}')
print(f'Test Recall: {recall:.4f}')
print(f'Test F1-Score: {f1:.4f}')


## LASSO Feature Selection

In [None]:
# Feature selection: keep the features whose L1 coefficient is non-zero.

# Refit the tuned model (without class weights) on the training split.
lasso_model = LogisticRegression(penalty='l1', random_state=42, **best_params_lasso)
lasso_model.fit(X_train_lasso, y_train_lasso)

# Column names of the original feature matrix.
feature_names = X.columns

# First row of coef_ holds the coefficients used here; a feature counts as
# selected when its coefficient has non-zero magnitude.
coefficients = lasso_model.coef_[0]
nonzero_mask = np.abs(coefficients) > 0
selected_features_lasso = feature_names[nonzero_mask]
selected_coefficients = coefficients[nonzero_mask]

print("Selected features and coefficients:")
for feature, coef in zip(selected_features_lasso, selected_coefficients):
    print(f"{feature}: {coef:.4f}")

# Report how many features survived.
print(f"Number of selected features: {len(selected_features_lasso)}")

In [None]:
# Final L1-penalised logistic regression, fitted on the training split
final_model_lasso = LogisticRegression(
    penalty='l1', random_state=42, **best_params_lasso
).fit(X_train_lasso, y_train_lasso)

# Probability of the positive class (second column) for every test sample
y_pred_proba_lasso = final_model_lasso.predict_proba(X_test_lasso)[:, 1]

# Hard class labels predicted for the same test samples
y_pred_lasso = final_model_lasso.predict(X_test_lasso)

# Threshold-free metric from the probabilities, the rest from the hard labels
roc_auc = roc_auc_score(y_test_lasso, y_pred_proba_lasso)
accuracy = accuracy_score(y_test_lasso, y_pred_lasso)
precision = precision_score(y_test_lasso, y_pred_lasso)
recall = recall_score(y_test_lasso, y_pred_lasso)
f1 = f1_score(y_test_lasso, y_pred_lasso)

# Specificity = TN / (TN + FP), taken from the flattened confusion matrix
lasso_confusion = confusion_matrix(y_test_lasso, y_pred_lasso)
tn, fp, fn, tp = lasso_confusion.ravel()
specificity = tn / (tn + fp)

print(
    f'Test ROC AUC: {roc_auc:.2f}',
    f'Test Accuracy: {accuracy:.2f}',
    f'Test Precision: {precision:.2f}',
    f'Test Recall: {recall:.2f}',
    f'Test F1-Score: {f1:.2f}',
    f'Test Specificity: {specificity:.2f}',
    sep='\n',
)

## 95% CI

In [None]:
# Function to calculate 95% Confidence Intervals using bootstrapping
def bootstrap_ci(metric_func, y_true, y_pred, y_proba=None, n_bootstraps=1000, alpha=0.95):
    """Return a percentile-bootstrap confidence interval for a metric.

    The test set is resampled with replacement ``n_bootstraps`` times (one
    fixed seed per iteration, so results are reproducible), the metric is
    evaluated on every resample, and the interval is read off the empirical
    percentiles of those scores.

    Parameters
    ----------
    metric_func : callable
        ``metric_func(y_true, y_score) -> float``.
    y_true, y_pred : array-like
        True and predicted labels, aligned by position. Lists, NumPy arrays
        and pandas Series are accepted (they are converted to NumPy arrays so
        that indexing is always positional).
    y_proba : array-like, optional
        When given, it is passed to ``metric_func`` instead of ``y_pred``
        (for probability-based metrics such as ROC AUC).
    n_bootstraps : int
        Number of bootstrap resamples.
    alpha : float
        Confidence *level* of the interval (0.95 -> 95% CI), not the
        significance level.

    Returns
    -------
    tuple
        ``(lower_bound, upper_bound)``.

    Raises
    ------
    ValueError
        If the input is empty or no resample produced a score.
    """
    y_true = np.asarray(y_true)
    y_pred = np.asarray(y_pred)
    if y_proba is not None:
        y_proba = np.asarray(y_proba)

    n_size = len(y_true)
    if n_size == 0:
        raise ValueError("bootstrap_ci requires at least one sample")

    # Only skip degenerate resamples when the full data really has >= 2 classes;
    # otherwise every resample would be skipped.
    has_multiple_classes = len(np.unique(y_true)) > 1

    bootstrapped_scores = []
    for seed in range(n_bootstraps):
        # Same draw as sklearn.utils.resample(range(n), replace=True,
        # random_state=seed) is expected to make (RandomState(seed).randint);
        # NOTE(review): confirm if bit-for-bit agreement with earlier runs matters.
        indices = np.random.RandomState(seed).randint(0, n_size, size=n_size)
        y_true_resampled = y_true[indices]

        # A resample with a single class makes ROC AUC undefined and collapses
        # the confusion matrix, so skip it (calc_auc_ci does the same).
        if has_multiple_classes and len(np.unique(y_true_resampled)) < 2:
            continue

        # Probability-based metrics receive the scores, all others the labels
        y_score = y_pred if y_proba is None else y_proba
        bootstrapped_scores.append(metric_func(y_true_resampled, y_score[indices]))

    if not bootstrapped_scores:
        raise ValueError("no bootstrap resample produced a score")

    # Percentile interval, e.g. the 2.5th and 97.5th percentiles for alpha=0.95
    lower_bound = np.percentile(bootstrapped_scores, (1 - alpha) / 2 * 100)
    upper_bound = np.percentile(bootstrapped_scores, (1 + alpha) / 2 * 100)

    return lower_bound, upper_bound

# Specificity has no ready-made scorer here, so derive it from the confusion matrix
def specificity_metric(y_true, y_pred):
    """Return TN / (TN + FP) from the flattened confusion matrix.

    The four-way unpacking requires the confusion matrix to be 2x2.
    """
    true_neg, false_pos, _false_neg, _true_pos = confusion_matrix(y_true, y_pred).ravel()
    negatives = true_neg + false_pos
    return true_neg / negatives


# Bootstrap a 95% CI for every label-based metric
accuracy_ci = bootstrap_ci(accuracy_score, y_test_lasso, y_pred_lasso)
precision_ci = bootstrap_ci(precision_score, y_test_lasso, y_pred_lasso)
recall_ci = bootstrap_ci(recall_score, y_test_lasso, y_pred_lasso)
f1_ci = bootstrap_ci(f1_score, y_test_lasso, y_pred_lasso)
specificity_ci = bootstrap_ci(specificity_metric, y_test_lasso, y_pred_lasso)

# ROC AUC is scored on the predicted probabilities rather than the hard labels
roc_auc_ci = bootstrap_ci(roc_auc_score, y_test_lasso, y_pred_lasso, y_proba=y_pred_proba_lasso)

# Point estimates (computed in an earlier cell) alongside their 95% Confidence Intervals
print(
    f'Test ROC AUC: {roc_auc:.2f} (95% CI: {roc_auc_ci[0]:.2f}, {roc_auc_ci[1]:.2f})',
    f'Test Accuracy: {accuracy:.2f} (95% CI: {accuracy_ci[0]:.2f}, {accuracy_ci[1]:.2f})',
    f'Test Precision: {precision:.2f} (95% CI: {precision_ci[0]:.2f}, {precision_ci[1]:.2f})',
    f'Test Recall: {recall:.2f} (95% CI: {recall_ci[0]:.2f}, {recall_ci[1]:.2f})',
    f'Test F1-Score: {f1:.2f} (95% CI: {f1_ci[0]:.2f}, {f1_ci[1]:.2f})',
    f'Test Specificity: {specificity:.2f} (95% CI: {specificity_ci[0]:.2f}, {specificity_ci[1]:.2f})',
    sep='\n',
)


### COMMON FEATURES BETWEEN XGBOOST, RANDOM FOREST AND LASSO

In [None]:
# Number of entries (first dimension) in each model's selected-feature collection
count_XGBoost, count_rf, count_df = (
    selected.shape[0]
    for selected in (selected_features_xg, selected_features_rf, selected_features_lasso)
)

# Report the three counts
print(
    f"Number of rows in selected_features_XGBoost: {count_XGBoost}",
    f"Number of rows in selected_features_rf: {count_rf}",
    f"Number of rows in selected_features_lasso: {count_df}",
    sep="\n",
)

In [None]:
# Turn each model's selected features into a set
features_XGBoost, features_rf, features_df = map(
    set, (selected_features_xg, selected_features_rf, selected_features_lasso)
)

# Features chosen by all three models (XGBoost, Random Forest and LASSO)
common_features = features_XGBoost.intersection(features_rf, features_df)
num_common_features = len(common_features)

# List the shared features alphabetically, followed by how many there are
print("Common Features:")
for feature in sorted(common_features):
    print(feature)
print(f"\nNumber of common features: {num_common_features}")

In [None]:
 #Extract feature lists from DataFrames
features_XGBoost, features_rf = set(selected_features_xg), set(selected_features_rf)

# Features chosen by both XGBoost and Random Forest
common_features = features_XGBoost.intersection(features_rf)
num_common_features = len(common_features)

# List the shared features alphabetically, followed by how many there are
print("Common Features:")
for feature in sorted(common_features):
    print(feature)
print(f"\nNumber of common features: {num_common_features}")

In [None]:
# Turn the XGBoost and LASSO selections into sets
features_XGBoost, features_df = set(selected_features_xg), set(selected_features_lasso)

# Features chosen by both XGBoost and LASSO
common_features = features_XGBoost.intersection(features_df)
num_common_features = len(common_features)

# List the shared features alphabetically, followed by how many there are
print("Common Features:")
for feature in sorted(common_features):
    print(feature)
print(f"\nNumber of common features: {num_common_features}")

In [None]:
# Turn the Random Forest and LASSO selections into sets
features_rf, features_df = set(selected_features_rf), set(selected_features_lasso)

# Features chosen by both Random Forest and LASSO
common_features = features_rf.intersection(features_df)
num_common_features = len(common_features)

# List the shared features alphabetically, followed by how many there are
print("Common Features:")
for feature in sorted(common_features):
    print(feature)
print(f"\nNumber of common features: {num_common_features}")

### AUC-ROC CURVE


In [None]:
# Reduce both splits to the features kept by the fitted selector.
# NOTE(review): the test features come from X_test while the labels used below
# are y_test_xg - confirm both refer to the same split.
X_train_selected_xg = selector.transform(X_train_xg)
X_test_selected_xg = selector.transform(X_test)

# Fit the final XGBoost model on the reduced training set
final_model_xg.fit(X_train_selected_xg, y_train_xg)

# Hard labels and positive-class probabilities for the test set
y_pred_xg = final_model_xg.predict(X_test_selected_xg)
y_pred_prob_xg = final_model_xg.predict_proba(X_test_selected_xg)[:, 1]

# ROC AUC is scored on the probabilities, the other metrics on the hard labels
roc_auc = roc_auc_score(y_test_xg, y_pred_prob_xg)
accuracy = accuracy_score(y_test_xg, y_pred_xg)
precision = precision_score(y_test_xg, y_pred_xg)
recall = recall_score(y_test_xg, y_pred_xg)
f1 = f1_score(y_test_xg, y_pred_xg)

# Specificity = TN / (TN + FP), taken from the flattened confusion matrix
xgb_confusion = confusion_matrix(y_test_xg, y_pred_xg)
tn, fp, fn, tp = xgb_confusion.ravel()
specificity = tn / (tn + fp)

# Report the results
print(
    f'Test ROC AUC: {roc_auc:.2f}',
    f'Test Accuracy: {accuracy:.2f}',
    f'Test Precision: {precision:.2f}',
    f'Test Recall: {recall:.2f}',
    f'Test F1-Score: {f1:.2f}',
    f'Test Specificity: {specificity:.2f}',
    sep='\n',
)

In [None]:
#Random Forest
# Restrict both splits to the features selected for the Random Forest
X_train_selected_rf = X_train_rf[selected_features_rf]
X_test_selected_rf = X_test_rf[selected_features_rf]

# Train the final Random Forest model with the selected features
final_model_rf.fit(X_train_selected_rf, y_train_rf)

# Perform 5-fold cross-validation on the training set, scored by ROC AUC
cv_scores = cross_val_score(final_model_rf, X_train_selected_rf, y_train_rf, cv=5, scoring='roc_auc')
print("Cross-validation scores:")
print(cv_scores)
# The scores are ROC AUC values (scoring='roc_auc'), so label the mean accordingly
print(f"Mean CV ROC AUC: {np.mean(cv_scores):.4f}")

# Make predictions on the test set using the original selected features
y_pred_prob_rf = final_model_rf.predict_proba(X_test_selected_rf)[:, 1]
y_pred_rf = final_model_rf.predict(X_test_selected_rf)

# Calculate evaluation metrics.
# NOTE(review): precision/recall/F1 are support-weighted here, whereas the
# XGBoost and LASSO cells use the default (positive-class) averaging, so
# those numbers are not directly comparable across models.
accuracy_rf = accuracy_score(y_test_rf, y_pred_rf)
precision_rf = precision_score(y_test_rf, y_pred_rf, average='weighted')
recall_rf = recall_score(y_test_rf, y_pred_rf, average='weighted')
f1_rf = f1_score(y_test_rf, y_pred_rf, average='weighted')
roc_auc_rf = roc_auc_score(y_test_rf, y_pred_prob_rf, average='weighted')

# Calculate confusion matrix
cm = confusion_matrix(y_test_rf, y_pred_rf)
print("Confusion Matrix:\n", cm)

# Calculate specificity (only defined here for the binary, 2x2 case)
if cm.shape == (2, 2):
    tn, fp, fn, tp = cm.ravel()
    specificity = tn / (tn + fp)
    print(f'Test Specificity: {specificity:.2f}')

# Print evaluation metrics
print(f'Random Forest Test ROC AUC: {roc_auc_rf:.2f}')
print(f'Random Forest Test Accuracy: {accuracy_rf:.2f}')
print(f'Random Forest Test Precision: {precision_rf:.2f}')
print(f'Random Forest Test Recall: {recall_rf:.2f}')
print(f'Random Forest Test F1-Score: {f1_rf:.2f}')

In [None]:
#LASSO
# Refit the L1-penalised logistic regression on the training split. The model is
# given every feature; the L1 penalty is what drives unused coefficients to zero.
final_model_lasso = LogisticRegression(
    penalty='l1', random_state=42, **best_params_lasso
).fit(X_train_lasso, y_train_lasso)

# Probability of the positive class (second column) for every test sample
y_pred_proba_lasso = final_model_lasso.predict_proba(X_test_lasso)[:, 1]

# Hard class labels predicted for the same test samples
y_pred_lasso = final_model_lasso.predict(X_test_lasso)

# ROC AUC is scored on the probabilities, the other metrics on the hard labels
roc_auc = roc_auc_score(y_test_lasso, y_pred_proba_lasso)
accuracy = accuracy_score(y_test_lasso, y_pred_lasso)
precision = precision_score(y_test_lasso, y_pred_lasso)
recall = recall_score(y_test_lasso, y_pred_lasso)
f1 = f1_score(y_test_lasso, y_pred_lasso)

# Specificity = TN / (TN + FP), taken from the flattened confusion matrix
lasso_confusion = confusion_matrix(y_test_lasso, y_pred_lasso)
tn, fp, fn, tp = lasso_confusion.ravel()
specificity = tn / (tn + fp)

print(
    f'Test ROC AUC: {roc_auc:.2f}',
    f'Test Accuracy: {accuracy:.2f}',
    f'Test Precision: {precision:.2f}',
    f'Test Recall: {recall:.2f}',
    f'Test F1-Score: {f1:.2f}',
    f'Test Specificity: {specificity:.2f}',
    sep='\n',
)

In [None]:
# Function to calculate ROC-AUC with 95% confidence intervals
def calc_auc_ci(y_true, y_pred_prob, n_bootstraps=1000, ci_level=0.95, random_state=50):
    """Return a bootstrap confidence interval for the ROC AUC.

    Samples are redrawn with replacement ``n_bootstraps`` times; resamples
    that contain a single class are skipped because ROC AUC is undefined for
    them. The interval bounds are read from the sorted bootstrap scores.

    Parameters
    ----------
    y_true : array-like
        True binary labels. Converted to a NumPy array so indexing is
        positional even for a pandas Series.
    y_pred_prob : array-like
        Predicted scores/probabilities, aligned with ``y_true``.
    n_bootstraps : int
        Number of bootstrap resamples to draw.
    ci_level : float
        Confidence level of the interval (0.95 -> 95% CI).
    random_state : int
        Seed of the resampling generator (default 50, the previously
        hard-coded value).

    Returns
    -------
    tuple
        ``(ci_lower, ci_upper)``.

    Raises
    ------
    ValueError
        If no resample contained both classes, so no AUC could be computed.
    """
    y_true = np.asarray(y_true)
    y_pred_prob = np.asarray(y_pred_prob)
    n_samples = len(y_pred_prob)

    rng = np.random.RandomState(random_state)
    bootstrapped_scores = []
    for _ in range(n_bootstraps):
        indices = rng.randint(0, n_samples, n_samples)
        # ROC AUC needs both classes present in the resample
        if len(np.unique(y_true[indices])) < 2:
            continue
        bootstrapped_scores.append(roc_auc_score(y_true[indices], y_pred_prob[indices]))

    if not bootstrapped_scores:
        raise ValueError("no bootstrap resample contained both classes; cannot compute an AUC interval")

    sorted_scores = np.sort(bootstrapped_scores)
    n_scores = len(sorted_scores)
    # Clamp so that ci_level=1.0 selects the last score instead of indexing past the end
    last = n_scores - 1
    ci_lower = sorted_scores[min(int((1.0 - ci_level) / 2.0 * n_scores), last)]
    ci_upper = sorted_scores[min(int((1.0 + ci_level) / 2.0 * n_scores), last)]
    return ci_lower, ci_upper

# XGBoost: fresh model with the tuned parameters, fitted on the selected features
final_model_xg = xgb.XGBClassifier(**best_params_xg)
final_model_xg.fit(X_train_selected_xg, y_train_xg)
y_pred_prob_xg = final_model_xg.predict_proba(X_test_selected_xg)[:, 1]
ci_lower_xg, ci_upper_xg = calc_auc_ci(y_test_xg, y_pred_prob_xg)
fpr_xg, tpr_xg, _ = roc_curve(y_test_xg, y_pred_prob_xg)
roc_auc_xg = auc(fpr_xg, tpr_xg)

# Random Forest: refit the existing model on the selected features
final_model_rf.fit(X_train_selected_rf, y_train_rf)
y_pred_prob_rf = final_model_rf.predict_proba(X_test_selected_rf)[:, 1]
ci_lower_rf, ci_upper_rf = calc_auc_ci(y_test_rf, y_pred_prob_rf)
fpr_rf, tpr_rf, _ = roc_curve(y_test_rf, y_pred_prob_rf)
roc_auc_rf = auc(fpr_rf, tpr_rf)

# Lasso (logistic regression with L1 penalty), fitted on the full feature set
final_model_lasso = LogisticRegression(
    penalty='l1', random_state=42, **best_params_lasso
).fit(X_train_lasso, y_train_lasso)
y_pred_prob_lasso = final_model_lasso.predict_proba(X_test_lasso)[:, 1]
ci_lower_lasso, ci_upper_lasso = calc_auc_ci(y_test_lasso, y_pred_prob_lasso)
fpr_lasso, tpr_lasso, _ = roc_curve(y_test_lasso, y_pred_prob_lasso)
roc_auc_lasso = auc(fpr_lasso, tpr_lasso)
# One figure holding the ROC curves of all three models
plt.figure(figsize=(10, 8))


def _plot_roc_with_band(fpr, tpr, auc_value, ci_lower, ci_upper, color, label):
    """Draw one ROC curve and a shaded band around it on the current figure.

    The band is the TPR curve shifted down by (AUC - lower CI bound) and up by
    (upper CI bound - AUC); it visualises the AUC interval width and is not a
    pointwise confidence band of the curve itself.
    """
    band_bottom = tpr - (auc_value - ci_lower)
    band_top = tpr + (ci_upper - auc_value)
    plt.plot(fpr, tpr, color=color, lw=2, label=label)
    plt.fill_between(fpr, band_bottom, band_top, color=color, alpha=0.2)


# XGBoost
_plot_roc_with_band(
    fpr_xg, tpr_xg, roc_auc_xg, ci_lower_xg, ci_upper_xg, 'blue',
    f'XGBoost (AUC = {roc_auc_xg:.2f} [{ci_lower_xg:.2f}-{ci_upper_xg:.2f}])',
)

# Random Forest
_plot_roc_with_band(
    fpr_rf, tpr_rf, roc_auc_rf, ci_lower_rf, ci_upper_rf, 'green',
    f'Random Forest (AUC = {roc_auc_rf:.2f} [{ci_lower_rf:.2f}-{ci_upper_rf:.2f}])',
)

# Lasso
_plot_roc_with_band(
    fpr_lasso, tpr_lasso, roc_auc_lasso, ci_lower_lasso, ci_upper_lasso, 'red',
    f'LASSO (AUC = {roc_auc_lasso:.2f} [{ci_lower_lasso:.2f}-{ci_upper_lasso:.2f}])',
)

# Diagonal reference line of a classifier with no skill
plt.plot([0, 1], [0, 1], color='grey', lw=2, linestyle='--', label='No Skill')

# Axes limits, labels, title, legend and grid
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate', fontsize=16)
plt.ylabel('True Positive Rate', fontsize=16)
plt.title('ROC-AUC Curve for Metabolites (IBD)', fontsize=18)
plt.legend(loc="lower right", fontsize=12)
plt.grid(alpha=0.3)
plt.show()


In [None]:

# Start from a clean slate: discard any figures that are still open
plt.close('all')

# New figure; all drawing below goes through its Axes object
roc_fig = plt.figure(figsize=(10, 8))
roc_ax = roc_fig.gca()

# XGBoost: ROC curve plus a band offset by the AUC confidence-interval widths
roc_ax.plot(fpr_xg, tpr_xg, color='blue', lw=2, label=f'XGBoost (AUC = {roc_auc_xg:.2f} [{ci_lower_xg:.2f}-{ci_upper_xg:.2f}])')
roc_ax.fill_between(fpr_xg, tpr_xg - (roc_auc_xg - ci_lower_xg), tpr_xg + (ci_upper_xg - roc_auc_xg), color='blue', alpha=0.2)

# Random Forest
roc_ax.plot(fpr_rf, tpr_rf, color='green', lw=2, label=f'Random Forest (AUC = {roc_auc_rf:.2f} [{ci_lower_rf:.2f}-{ci_upper_rf:.2f}])')
roc_ax.fill_between(fpr_rf, tpr_rf - (roc_auc_rf - ci_lower_rf), tpr_rf + (ci_upper_rf - roc_auc_rf), color='green', alpha=0.2)

# Lasso (Logistic Regression)
roc_ax.plot(fpr_lasso, tpr_lasso, color='red', lw=2, label=f'LASSO (AUC = {roc_auc_lasso:.2f} [{ci_lower_lasso:.2f}-{ci_upper_lasso:.2f}])')
roc_ax.fill_between(fpr_lasso, tpr_lasso - (roc_auc_lasso - ci_lower_lasso), tpr_lasso + (ci_upper_lasso - roc_auc_lasso), color='red', alpha=0.2)

# Diagonal reference line of a classifier with no skill
roc_ax.plot([0, 1], [0, 1], color='grey', lw=2, linestyle='--', label='No Skill')

# Axes limits, labels, title, legend and grid
roc_ax.set_xlim([0.0, 1.0])
roc_ax.set_ylim([0.0, 1.05])
roc_ax.set_xlabel('False Positive Rate', fontsize=16)
roc_ax.set_ylabel('True Positive Rate', fontsize=16)
roc_ax.set_title('ROC-AUC Curve for Metabolites (IBD)', fontsize=18)
roc_ax.legend(loc="lower right", fontsize=12)
roc_ax.grid(alpha=0.3)

# Write the image to disk before the figure is shown
plt.savefig('ROC-AUC_Curve_For_IBD_Metabolites_final.png', dpi=600, bbox_inches='tight')

# Finally display the figure
plt.show()


In [None]:


# Convert the selected feature collections to sets
features_XGBoost = set(selected_features_xg)
features_rf = set(selected_features_rf)
features_df = set(selected_features_lasso)

# Find common features using set intersection
common_features = features_XGBoost & features_rf & features_df

# Sort once so the printout, the DataFrame columns and the exported sheet all
# share one deterministic order (set iteration order can differ between runs).
common_feature_names = sorted(common_features)

# Print common features
print("Common Features:")
for feature in common_feature_names:
    print(feature)

# Count the number of common features
num_common_features = len(common_features)
print(f"\nNumber of common features: {num_common_features}")

# Extract the columns of the common features from the 'mtb' DataFrame
common_features_values = mtb[common_feature_names]

# Print the DataFrame containing the common features and their values
print("\nValues of Common Features:")
print(common_features_values)

# Save the DataFrame to an Excel file (row index omitted)
common_features_values.to_excel('IBD Metabolites Final.xlsx', index=False)