### Import necessary libraries

In [None]:
! pip install optuna

In [None]:
import sklearn
import xgboost

# Report the installed library versions (one line per library)
print(
    f"Scikit-learn version: {sklearn.__version__}\n"
    f"XGBoost version: {xgboost.__version__}"
)


In [None]:
!pip uninstall scikit-learn -y
!pip install scikit-learn==1.5.2

In [None]:
import os
import os.path
import re

import matplotlib.pyplot as plt
import numpy as np
import optuna
import pandas as pd
import seaborn as sns
import xgboost as xgb
from scipy.stats import bootstrap
from sklearn.datasets import make_classification
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import RFECV
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import (
    accuracy_score,
    classification_report,
    confusion_matrix,
    f1_score,
    precision_score,
    recall_score,
    roc_auc_score,
)
from sklearn.model_selection import (
    RandomizedSearchCV,
    StratifiedKFold,
    cross_val_score,
    train_test_split,
)
from sklearn.preprocessing import LabelEncoder
from sklearn.utils import resample
from xgboost import XGBClassifier

%matplotlib inline

### Load the datasets

In [None]:
# Load the table used for model training/testing; later cells read its
# 'Group' column as the class label and every other column as a feature.
# NOTE(review): the file name spells "micorbes" — confirm it matches the
# file on disk before correcting it.
gene = pd.read_excel('Top 15 GC micorbes.xlsx')
gene

In [None]:
# Load the external validation table; like `gene`, it is expected to carry
# a 'Group' label column (dropped/selected by the cells below).
gene_val = pd.read_csv('microbiome_validation_final.csv')
gene_val

In [None]:
# Separate the 'Group' label column so that only feature columns are scaled
data_to_normalize = gene_val.drop(columns='Group')

# Per-column Min-Max scaling to [0, 1] using this data set's own min/max.
# NOTE(review): the training table `gene` is not scaled anywhere in the
# visible notebook, while this scaled copy is what the XGBoost validation
# cell consumes — confirm the mismatch is intended.
col_min = data_to_normalize.min()
col_range = data_to_normalize.max() - col_min
# A constant column has zero range; dividing by it would turn the whole
# column into NaN (0/0). Divide by 1 instead so such columns become 0.0.
col_range = col_range.replace(0, 1)
normalized_data = (data_to_normalize - col_min) / col_range

# Re-attach the 'Group' label as the first column
normalized_gene = pd.concat([gene_val[['Group']], normalized_data], axis=1)

# Display the first few rows of the normalized data
print(normalized_gene.head())


# XGBoost (GC)

In [None]:
# Feature matrix: every column except the 'Group' label
X = gene.drop(columns='Group')

# Map the class labels in 'Group' to integer codes
label_encoder = LabelEncoder()
y = label_encoder.fit_transform(gene['Group'])

# Hold out 25% of the samples for testing (fixed seed)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.25, random_state=42
)

# Baseline: XGBoost with the library's default hyperparameters
model = xgb.XGBClassifier()
model.fit(X_train, y_train)
y_pred = model.predict(X_test)

# Score the held-out predictions; precision/F1/recall are class-weighted
accuracy = accuracy_score(y_test, y_pred)
precision, f1, recall = (
    scorer(y_test, y_pred, average='weighted')
    for scorer in (precision_score, f1_score, recall_score)
)
print(f'Accuracy: {accuracy:.4f}')
print(f"Test Precision: {precision:.4f}")
print(f"Test F1 Score: {f1:.4f}")
print(f"Test Recall: {recall:.4f}")

## Random Search

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42)

# Discrete candidate values that RandomizedSearchCV samples from
param_dist = {
    'n_estimators': np.arange(50, 200, 10),
    'max_depth': np.arange(3, 10),
    'learning_rate': np.linspace(0.01, 0.3, 10),
    'subsample': np.linspace(0.5, 1.0, 10),
    'colsample_bytree': np.linspace(0.5, 1.0, 10),
    'gamma': np.linspace(0, 0.5, 5),
    'min_child_weight': np.arange(1, 6),
    'reg_alpha': np.linspace(0, 1, 20),
    'reg_lambda': np.linspace(0, 1, 20),
}

# Estimator to tune. It is deliberately not named `xgb`: that name is the
# xgboost module alias (`import xgboost as xgb`), and rebinding it to an
# estimator instance makes every later `xgb.XGBClassifier(...)` call fail.
xgb_clf = XGBClassifier()

# 100 random candidates, each scored by 5-fold cross-validated ROC AUC
random_search = RandomizedSearchCV(
    xgb_clf, param_distributions=param_dist, n_iter=100,
    scoring='roc_auc', cv=5, verbose=1, random_state=42, n_jobs=-1
)

# Fit the search on the training split only
random_search.fit(X_train, y_train)

# Get the best parameters
best_params = random_search.best_params_
print("Best Parameters:", best_params)

# Predict on the test set with the refitted best estimator
y_pred = random_search.predict(X_test)

# Test metrics (precision/recall/F1 use scikit-learn's default averaging)
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)

print(f"Accuracy: {accuracy:.2f}")
print(f"Precision: {precision:.2f}")
print(f"Recall: {recall:.2f}")
print(f"F1 Score: {f1:.2f}")

## Bayesian Optimization

In [None]:
# Define the objective function for Optuna
def objective(trial):
    """Return the mean 5-fold cross-validated ROC AUC for one Optuna trial.

    Samples XGBoost hyperparameters from ``trial``, builds a classifier and
    scores it on the module-level ``X_train`` / ``y_train``.
    """
    params = {
        'booster': trial.suggest_categorical('booster', ['gbtree', 'dart']),
        'objective': 'binary:logistic',
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.3),
        'gamma': trial.suggest_float('gamma', 0, 0.5),
        'max_depth': trial.suggest_int('max_depth', 3, 10),
        'min_child_weight': trial.suggest_float('min_child_weight', 0.5, 5),
        'subsample': trial.suggest_float('subsample', 0.5, 1.0),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.5, 1.0),
        'n_estimators': trial.suggest_int('n_estimators', 50, 300)
    }

    # Use the imported class directly so this works regardless of what the
    # name `xgb` is currently bound to in the notebook session.
    model = XGBClassifier(**params)

    # Use cross-validation to evaluate the model
    cv_scores = cross_val_score(model, X_train, y_train, cv=5, scoring='roc_auc')
    return cv_scores.mean()

# Create a study that maximises the value returned by `objective`
study = optuna.create_study(direction='maximize')

# Evaluate the RandomizedSearchCV optimum as the first trial.
# NOTE(review): 'objective', 'reg_alpha' and 'reg_lambda' are not sampled by
# `objective`, so they presumably have no effect here — confirm.
study.enqueue_trial({
    'booster': 'gbtree',
    'objective': 'binary:logistic',
    'learning_rate':  0.1388888888888889,
    'gamma': 0.125,
    'max_depth': 6,
    'min_child_weight': 1,
    'subsample': 0.7222222222222222,
    'colsample_bytree': 0.5,
    'reg_alpha':   0.3684210526315789,
    'reg_lambda': 0.8421052631578947,
    'n_estimators': 170
})

# Optimize the study
study.optimize(objective, n_trials=50)

# Print the best parameters and the best score
print(f"Best Parameters: {study.best_params}")
print(f"Best CV ROC AUC: {study.best_value:.4f}")


# Train the final model with the best parameters. The imported class is used
# directly so this does not depend on what `xgb` is bound to in the session.
best_params = study.best_params
final_model = XGBClassifier(**best_params)
final_model.fit(X_train, y_train)

# Make predictions on the test set
y_pred = final_model.predict(X_test)

# Evaluate the final model (class-weighted precision/F1/recall)
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred, average='weighted')
f1 = f1_score(y_test, y_pred, average='weighted')
recall = recall_score(y_test, y_pred, average='weighted')

print(f"Test Accuracy: {accuracy:.4f}")
print(f"Test Precision: {precision:.4f}")
print(f"Test F1 Score: {f1:.4f}")
print(f"Test Recall: {recall:.4f}")


In [None]:
# All training columns are used as features
selected_features = X_train.columns.tolist()

# Hard-coded hyperparameters for the final XGBoost model
# NOTE(review): these are pasted values, not read from `study.best_params`;
# confirm they match the tuning run they are meant to represent.
best_params_xg = {
    'learning_rate': 0.1388888888888889,
    'max_depth': 6,
    'n_estimators': 170,
    'gamma':  0.125,
    'min_child_weight': 1.0,
    'subsample': 0.7222222222222222,
    'colsample_bytree': 0.5,
    'objective': 'binary:logistic',
    'booster': 'gbtree',
    'reg_alpha' : 0.894736842105263,
    'reg_lambda' : 0.5789473684210527
}

# Create the XGBoost classifier with the best parameters. The imported class
# is used directly so this does not depend on what `xgb` is bound to.
final_model_xg = XGBClassifier(**best_params_xg, random_state = 42)

# 5-fold cross-validation on the training split, scored by ROC AUC
cv_scores = cross_val_score(final_model_xg, X_train[selected_features], y_train, cv=5, scoring='roc_auc')
print("Cross-validation scores:")
print(cv_scores)
# The scores are ROC AUC values (scoring='roc_auc'), so label them as such
print(f"Mean CV ROC AUC: {np.mean(cv_scores):.4f}")

# Train the model
final_model_xg.fit(X_train[selected_features], y_train)

# Make predictions on the test set (labels and positive-class probability)
y_pred_xg = final_model_xg.predict(X_test[selected_features])
y_pred_prob_xg = final_model_xg.predict_proba(X_test[selected_features])[:, 1]

# Evaluation metrics. Precision, recall and F1 all use the same 'weighted'
# averaging so the reported numbers are comparable with each other and with
# the validation cell.
accuracy = accuracy_score(y_test, y_pred_xg)
precision = precision_score(y_test, y_pred_xg, average="weighted", zero_division=1)
recall = recall_score(y_test, y_pred_xg, average="weighted")
f1 = f1_score(y_test, y_pred_xg, average="weighted")
roc_auc = roc_auc_score(y_test, y_pred_prob_xg)

# Specificity = TN / (TN + FP), from the binary confusion matrix
tn, fp, fn, tp = confusion_matrix(y_test, y_pred_xg).ravel()
specificity = tn / (tn + fp)

# Print the results
print(f'Test ROC AUC: {roc_auc:.2f}')
print(f'Test Accuracy: {accuracy:.2f}')
print(f'Test Precision: {precision:.2f}')
print(f'Test Recall: {recall:.2f}')
print(f'Test F1-Score: {f1:.2f}')
print(f'Test Specificity: {specificity:.2f}')


## XGBoost Validation

In [None]:
X_val_gene = normalized_gene.drop('Group', axis=1)
y_val_gene = normalized_gene['Group']

# Encode the target variable
# NOTE(review): a fresh encoder is fitted on the validation labels; its codes
# match the training codes only if both tables use the same label values.
label_encoder = LabelEncoder()
y_encoded_gene = label_encoder.fit_transform(y_val_gene)

# Check for missing features and add them with zero values if necessary
missing_features = [feature for feature in selected_features if feature not in X_val_gene.columns]
missing_df = pd.DataFrame(0.0, index=X_val_gene.index, columns=missing_features)
X_val_gene = pd.concat([X_val_gene, missing_df], axis=1)

# Ensure the columns are in the same order as the training features
X_val_gene = X_val_gene[selected_features]

# Validate the model that was trained on the training split. It must not be
# re-fitted on the validation data: predicting on the rows a model was fitted
# on reports training performance, not validation performance.
# `xg_model` is kept only as an alias for any later code that refers to it.
xg_model = final_model_xg

# Every metric below comes from the same model and the same predictions
y_pred_prob_xg_gene = final_model_xg.predict_proba(X_val_gene)[:, 1]
y_pred_xg_gene = final_model_xg.predict(X_val_gene)
roc_auc_val_xg = roc_auc_score(y_encoded_gene, y_pred_prob_xg_gene)

# Specificity = TN / (TN + FP), computed from the validation predictions
conf_matrix = confusion_matrix(y_encoded_gene, y_pred_xg_gene)
if conf_matrix.shape == (2, 2):
    tn_val_xg, fp_val_xg, fn_val_xg, tp_val_xg = conf_matrix.ravel()
    specificity_val_xg = tn_val_xg / (tn_val_xg + fp_val_xg)
else:
    # NaN (not a string) so the ':.2f' format below still works
    specificity_val_xg = float('nan')


accuracy_val_xg = accuracy_score(y_encoded_gene, y_pred_xg_gene)
precision_val_xg = precision_score(y_encoded_gene, y_pred_xg_gene, average="weighted", zero_division=1)
recall_val_xg = recall_score(y_encoded_gene, y_pred_xg_gene, average="weighted")
f1_val_xg = f1_score(y_encoded_gene, y_pred_xg_gene, average="weighted")

# Print evaluation metrics
print(f'Validation ROC AUC: {roc_auc_val_xg:.2f}')
print(f'Validation Accuracy: {accuracy_val_xg:.2f}')
print(f'Validation Precision: {precision_val_xg:.2f}')
print(f'Validation Recall: {recall_val_xg:.2f}')
print(f'Validation F1-Score: {f1_val_xg:.2f}')
print(f'Validation Specificity: {specificity_val_xg:.2f}')


## 95% CI

In [None]:
# Function to calculate metrics on a bootstrap sample
def calculate_metrics(X, y, model, xg_model=None):
    """Compute classification metrics for one (bootstrap) sample.

    Args:
        X: Feature rows to score.
        y: True encoded labels for ``X``.
        model: Fitted classifier used for ROC AUC and specificity.
        xg_model: Fitted classifier used for accuracy, precision, recall and
            F1. Defaults to ``model`` so that every metric comes from the
            same estimator; passing a separate model keeps the old behaviour.

    Returns:
        Tuple ``(roc_auc, accuracy, precision, recall, f1, specificity)``.
        Precision, recall and F1 are class-weighted. Specificity is NaN when
        the confusion matrix is not 2x2 and 0 when there are no negatives.
    """
    if xg_model is None:
        xg_model = model

    y_pred_prob = model.predict_proba(X)[:, 1]
    y_pred = model.predict(X)

    # Calculate ROC AUC from the positive-class probabilities
    roc_auc = roc_auc_score(y, y_pred_prob)

    # Calculate confusion matrix for specificity = TN / (TN + FP)
    conf_matrix = confusion_matrix(y, y_pred)
    if conf_matrix.shape == (2, 2):
        tn, fp, fn, tp = conf_matrix.ravel()
        specificity = tn / (tn + fp) if (tn + fp) > 0 else 0
    else:
        specificity = np.nan

    # Get predictions for the threshold-based metrics
    y_pred = xg_model.predict(X)
    accuracy = accuracy_score(y, y_pred)
    precision = precision_score(y, y_pred, average="weighted", zero_division=1)
    recall = recall_score(y, y_pred, average="weighted")
    f1 = f1_score(y, y_pred, average="weighted")

    return roc_auc, accuracy, precision, recall, f1, specificity

# Number of bootstrap samples
n_bootstraps = 1000
metrics = {"roc_auc": [], "accuracy": [], "precision": [], "recall": [], "f1": [], "specificity": []}

# Seeded generator so the reported intervals are reproducible
rng = np.random.default_rng(42)
n_samples = len(y_encoded_gene)

# Perform bootstrapping
for _ in range(n_bootstraps):
    # Resample row positions with replacement
    indices = rng.integers(0, n_samples, size=n_samples)
    X_resampled = X_val_gene.iloc[indices]
    y_resampled = y_encoded_gene[indices]

    # ROC AUC is undefined when a resample contains a single class; skip it
    if np.unique(y_resampled).size < 2:
        continue

    # Score the trained model on the bootstrap sample. The same model is
    # passed for both model arguments so all metrics share one estimator.
    roc_auc, accuracy, precision, recall, f1, specificity = calculate_metrics(
        X_resampled, y_resampled, final_model_xg, final_model_xg
    )

    # Store results
    metrics["roc_auc"].append(roc_auc)
    metrics["accuracy"].append(accuracy)
    metrics["precision"].append(precision)
    metrics["recall"].append(recall)
    metrics["f1"].append(f1)
    metrics["specificity"].append(specificity)

# 95% percentile intervals; NaN entries (undefined specificity) are ignored
ci_results = {
    metric: (np.nanpercentile(scores, 2.5), np.nanpercentile(scores, 97.5))
    for metric, scores in metrics.items()
}

# Print the results with 95% CI
print(f'Validation ROC AUC: (95% CI: {ci_results["roc_auc"][0]:.2f} - {ci_results["roc_auc"][1]:.2f})')
print(f'Validation Accuracy:  (95% CI: {ci_results["accuracy"][0]:.2f} - {ci_results["accuracy"][1]:.2f})')
print(f'Validation Precision: (95% CI: {ci_results["precision"][0]:.2f} - {ci_results["precision"][1]:.2f})')
print(f'Validation Recall:  (95% CI: {ci_results["recall"][0]:.2f} - {ci_results["recall"][1]:.2f})')
print(f'Validation F1-Score: (95% CI: {ci_results["f1"][0]:.2f} - {ci_results["f1"][1]:.2f})')
print(f'Validation Specificity: (95% CI: {ci_results["specificity"][0]:.2f} - {ci_results["specificity"][1]:.2f})')


## Random Forest

In [None]:
# Feature matrix: every column except the 'Group' label
X = gene.drop(columns='Group')

# Map the class labels in 'Group' to integer codes
label_encoder = LabelEncoder()
y = label_encoder.fit_transform(gene['Group'])

# Hold out 25% of the samples for testing (fixed seed)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.25, random_state=42
)

# Baseline Random Forest: 100 trees, fixed seed
clf = RandomForestClassifier(n_estimators=100, random_state=42)
clf.fit(X_train, y_train)

# Predict the held-out samples
y_pred = clf.predict(X_test)

# Accuracy on the test set
test_accuracy = accuracy_score(y_test, y_pred)
print(f"Test Accuracy: {test_accuracy:.4f}")

# Class-weighted F1, precision and recall on the test set
weighted = {'average': 'weighted'}

test_f1 = f1_score(y_test, y_pred, **weighted)
print(f"Test F1 Score: {test_f1:.4f}")

test_precision = precision_score(y_test, y_pred, **weighted)
print(f"Test Precision: {test_precision:.4f}")

test_recall = recall_score(y_test, y_pred, **weighted)
print(f"Test Recall: {test_recall:.4f}")

## Random Search

In [None]:
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.25, random_state=42
)

# Candidate values sampled by RandomizedSearchCV
param_dist = {
    'n_estimators': np.linspace(start=100, stop=1000, num=10).astype(int).tolist(),
    'max_features': ['sqrt', 'log2'],
    'max_depth': np.linspace(10, 110, num=11).astype(int).tolist() + [None],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    'bootstrap': [True, False]
}

# Estimator to tune
rf = RandomForestClassifier(random_state=42)

# 100 random candidates with 5-fold CV; no `scoring` is given, so the
# estimator's default score is used
rf_random = RandomizedSearchCV(
    estimator=rf,
    param_distributions=param_dist,
    n_iter=100,
    cv=5,
    verbose=2,
    random_state=42,
    n_jobs=-1,
)
rf_random.fit(X_train, y_train)

# Report the winning parameter combination
print("Best parameters found by RandomizedSearchCV:")
print(rf_random.best_params_)

# Predict the test split with the refitted best estimator
y_pred = rf_random.best_estimator_.predict(X_test)

# Class-weighted test metrics
accuracy = accuracy_score(y_test, y_pred)
precision, recall, f1 = (
    scorer(y_test, y_pred, average='weighted')
    for scorer in (precision_score, recall_score, f1_score)
)

# Print evaluation metrics
print(f"Accuracy: {accuracy}")
print(f"Precision: {precision}")
print(f"Recall: {recall}")
print(f"F1 Score: {f1}")


## Bayesian Optimization

In [None]:
# Define the best parameters found from RandomizedSearchCV RF
# These are hard-coded values (not read from `rf_random.best_params_`).
# `objective` below centres its Optuna search ranges on them, and the same
# dict is enqueued as the study's first trial.
best_params_random = {
    'n_estimators': 100,
    'min_samples_split': 5,
    'min_samples_leaf': 2,
    'max_features': 'sqrt',
    'max_depth': 90,
    'bootstrap': False
}

def objective(trial):
    """Return the mean 5-fold cross-validated ROC AUC for one Optuna trial.

    The integer search ranges are windows around the module-level
    ``best_params_random`` values, clipped at a lower bound. The classifier
    is scored on the module-level ``X_train`` / ``y_train``.
    """
    # Define the search space based on the best parameters from RandomizedSearchCV
    n_estimators = trial.suggest_int('n_estimators', max(100, best_params_random['n_estimators'] - 200), best_params_random['n_estimators'] + 200)
    min_samples_split = trial.suggest_int('min_samples_split', max(2, best_params_random['min_samples_split'] - 3), best_params_random['min_samples_split'] + 3)
    min_samples_leaf = trial.suggest_int('min_samples_leaf', max(1, best_params_random['min_samples_leaf'] - 2), best_params_random['min_samples_leaf'] + 2)
    max_features = trial.suggest_categorical('max_features', ['sqrt', 'log2'])
    max_depth = trial.suggest_int('max_depth', max(5, best_params_random['max_depth'] - 10), best_params_random['max_depth'] + 10)
    bootstrap = trial.suggest_categorical('bootstrap', [True, False])

    # Initialize RandomForestClassifier with suggested hyperparameters
    clf = RandomForestClassifier(
        n_estimators=n_estimators,
        min_samples_split=min_samples_split,
        min_samples_leaf=min_samples_leaf,
        max_features=max_features,
        max_depth=max_depth,
        bootstrap=bootstrap,
        random_state=42
    )

    # Cross-validate on the training split only. Scoring on the full X / y
    # would let the held-out test rows influence hyperparameter selection
    # and bias the test metrics reported afterwards.
    cv_scores = cross_val_score(clf, X_train, y_train, cv=5, scoring='roc_auc')
    return np.mean(cv_scores)

# Perform optimization with Optuna, maximising the value from `objective`
study = optuna.create_study(direction='maximize')

# Enqueue the trial with the best parameters from RandomizedSearchCV
study.enqueue_trial(best_params_random)
study.optimize(objective, n_trials=50)

# Print the best parameters and best score from Optuna. The objective
# returns a mean cross-validated ROC AUC, so it is labelled as such.
print("Best Parameters from Optuna:", study.best_params)
print("Best CV ROC AUC from Optuna:", study.best_value)

# Refit the best configuration on the training split; the seed matches the
# one used inside `objective` so the refitted model is the one that was scored
best_params_optuna = study.best_params
best_clf = RandomForestClassifier(**best_params_optuna, random_state=42)
best_clf.fit(X_train, y_train)
y_pred = best_clf.predict(X_test)

# Evaluate on test set; precision, recall and F1 all use 'weighted'
# averaging, as in the other Random Forest cells
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred, average='weighted')
recall = recall_score(y_test, y_pred, average='weighted')
f1 = f1_score(y_test, y_pred, average='weighted')

print(f"Test Accuracy: {accuracy:.4f}")
print(f"Test Precision: {precision:.4f}")
print(f"Test Recall: {recall:.4f}")
print(f"Test F1 Score: {f1:.4f}")


In [None]:
# All training columns are used as features
selected_features_rf = X_train.columns.tolist()

# Hard-coded hyperparameters for the final Random Forest
# NOTE(review): pasted values, not read from `study.best_params`; confirm
# they match the tuning run they are meant to represent.
best_params_rf = {
    'n_estimators': 286,
    'min_samples_split': 8,
    'min_samples_leaf': 2,
    'max_features': 'sqrt',
    'max_depth': 95,
    'bootstrap': True
}


# Create the Random Forest classifier with the best parameters
final_model_rf = RandomForestClassifier(**best_params_rf, random_state=42)


# Perform cross-validation with 5 folds, scored by ROC AUC
cv_scores = cross_val_score(final_model_rf, X_train[selected_features_rf], y_train, cv=5, scoring='roc_auc')
print("Cross-validation scores:")
print(cv_scores)
# The scores are ROC AUC values (scoring='roc_auc'), so label them as such
print(f"Mean CV ROC AUC: {np.mean(cv_scores):.4f}")

# Train the final model on the entire training data
final_model_rf.fit(X_train[selected_features_rf], y_train)

# Make predictions on the test set (labels and positive-class probability)
y_pred_rf = final_model_rf.predict(X_test[selected_features_rf])
y_pred_prob_rf = final_model_rf.predict_proba(X_test[selected_features_rf])[:, 1]

# Calculate evaluation metrics on the test set (class-weighted)
accuracy = accuracy_score(y_test, y_pred_rf)
precision = precision_score(y_test, y_pred_rf, average='weighted')
recall = recall_score(y_test, y_pred_rf, average='weighted')
f1 = f1_score(y_test, y_pred_rf, average='weighted')
auc_roc = roc_auc_score(y_test, y_pred_prob_rf)

# Specificity = TN / (TN + FP), from the binary confusion matrix
tn, fp, fn, tp = confusion_matrix(y_test, y_pred_rf).ravel()
specificity = tn / (tn + fp)

# Print the results for test data
print(f'Test Accuracy: {accuracy:.2f}')
print(f'Test Precision: {precision:.2f}')
print(f'Test Recall: {recall:.2f}')
print(f'Test F1-Score: {f1:.2f}')
print(f'Test AUC-ROC: {auc_roc:.2f}')
print(f'Test Specificity: {specificity:.2f}')


## Random Forest Validation

In [None]:
# Validation set (raw `gene_val`, i.e. without the Min-Max scaling that the
# XGBoost validation cell applies)
X_val_gene_rf = gene_val.drop('Group', axis=1)
y_val_gene_rf = gene_val['Group']

# Encode categorical target labels into numerical labels
label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(y_val_gene_rf)

# Identify the features that are missing in the validation set
missing_features = [feature for feature in selected_features_rf if feature not in X_val_gene_rf.columns]

# Add the missing features to the validation set with zero values using pd.concat
missing_df_rf = pd.DataFrame(0, index=X_val_gene_rf.index, columns=missing_features)
X_val_gene_rf = pd.concat([X_val_gene_rf, missing_df_rf], axis=1)

# Ensure the columns are in the same order as the training features
X_val_gene_rf = X_val_gene_rf[selected_features_rf]
# Make predictions on the validation set without converting to NumPy array
y_pred_prob_rf_gene = final_model_rf.predict_proba(X_val_gene_rf)[:, 1]
y_pred_rf_gene = final_model_rf.predict(X_val_gene_rf)

# Calculate evaluation metrics (class-weighted precision/recall/F1)
accuracy_val = accuracy_score(y_encoded, y_pred_rf_gene)
precision_val = precision_score(y_encoded, y_pred_rf_gene, average='weighted', zero_division=1)
recall_val = recall_score(y_encoded, y_pred_rf_gene, average='weighted')
f1_val = f1_score(y_encoded, y_pred_rf_gene, average='weighted')
roc_auc_val = roc_auc_score(y_encoded, y_pred_prob_rf_gene)

# Calculate specificity = TN / (TN + FP)
tn_val, fp_val, fn_val, tp_val = confusion_matrix(y_encoded, y_pred_rf_gene).ravel()
specificity_val = tn_val / (tn_val + fp_val)

# Print evaluation metrics (each metric is reported exactly once)
print(f'Validation ROC AUC: {roc_auc_val:.2f}')
print(f'Validation Accuracy: {accuracy_val:.2f}')
print(f'Validation Precision: {precision_val:.2f}')
print(f'Validation Recall: {recall_val:.2f}')
print(f'Validation F1-Score: {f1_val:.2f}')
print(f'Validation Specificity: {specificity_val:.2f}')

## 95% CI

In [None]:
# Function to compute metrics
def compute_metrics(y_true, y_pred, y_pred_proba):
    """Score one set of predictions against the true labels.

    Returns ``(accuracy, precision, recall, f1, roc_auc, specificity)``.
    Precision, recall and F1 are class-weighted; ROC AUC is computed from
    ``y_pred_proba``; specificity is TN / (TN + FP).
    """
    weighted = {'average': 'weighted'}

    accuracy = accuracy_score(y_true, y_pred)
    precision = precision_score(y_true, y_pred, zero_division=1, **weighted)
    recall = recall_score(y_true, y_pred, **weighted)
    f1 = f1_score(y_true, y_pred, **weighted)
    roc_auc = roc_auc_score(y_true, y_pred_proba)

    # Flattened confusion matrix yields TN, FP, FN, TP in that order
    true_neg, false_pos, _false_neg, _true_pos = confusion_matrix(y_true, y_pred).ravel()

    return accuracy, precision, recall, f1, roc_auc, true_neg / (true_neg + false_pos)

# Number of bootstrap iterations
n_iterations = 1000
# Each resample has as many rows as the Random Forest validation set.
# (`X_val_gene_rf` / `y_encoded` are the names defined by the Random Forest
# validation cell; the `*_mtb_rf` names used before are not defined here.)
n_size = len(X_val_gene_rf)

# Initialize lists to store metric values for each bootstrap sample
accuracy_scores = []
precision_scores = []
recall_scores = []
f1_scores = []
roc_auc_scores = []
specificity_scores = []

# Bootstrap procedure
for i in range(n_iterations):
    # Resample the validation set with replacement; seeding with the
    # iteration index makes every resample reproducible
    X_resample, y_resample = resample(X_val_gene_rf, y_encoded, n_samples=n_size, random_state=i)

    # Make predictions on the resampled data
    y_pred_resample = final_model_rf.predict(X_resample)
    y_pred_proba_resample = final_model_rf.predict_proba(X_resample)[:, 1]

    # Calculate metrics for this bootstrap sample
    accuracy, precision, recall, f1, roc_auc, specificity = compute_metrics(y_resample, y_pred_resample, y_pred_proba_resample)

    # Store the metrics
    accuracy_scores.append(accuracy)
    precision_scores.append(precision)
    recall_scores.append(recall)
    f1_scores.append(f1)
    roc_auc_scores.append(roc_auc)
    specificity_scores.append(specificity)

# Calculate 95% confidence intervals for each metric
def calculate_confidence_interval(scores):
    """Return the (2.5th, 97.5th) percentile pair of *scores* as a tuple."""
    lower_bound, upper_bound = np.percentile(scores, [2.5, 97.5])
    return lower_bound, upper_bound

# Percentile interval for every bootstrapped metric, in one pass
(
    accuracy_ci,
    precision_ci,
    recall_ci,
    f1_ci,
    roc_auc_ci,
    specificity_ci,
) = map(
    calculate_confidence_interval,
    (
        accuracy_scores,
        precision_scores,
        recall_scores,
        f1_scores,
        roc_auc_scores,
        specificity_scores,
    ),
)

# Point estimate from the validation cell, followed by its 95% interval
print(f'Validation ROC AUC: {roc_auc_val:.2f}, 95% CI: [{roc_auc_ci[0]:.2f}, {roc_auc_ci[1]:.2f}]')
print(f'Validation Accuracy: {accuracy_val:.2f}, 95% CI: [{accuracy_ci[0]:.2f}, {accuracy_ci[1]:.2f}]')
print(f'Validation Precision: {precision_val:.2f}, 95% CI: [{precision_ci[0]:.2f}, {precision_ci[1]:.2f}]')
print(f'Validation Recall: {recall_val:.2f}, 95% CI: [{recall_ci[0]:.2f}, {recall_ci[1]:.2f}]')
print(f'Validation F1-Score: {f1_val:.2f}, 95% CI: [{f1_ci[0]:.2f}, {f1_ci[1]:.2f}]')
print(f'Validation Specificity: {specificity_val:.2f}, 95% CI: [{specificity_ci[0]:.2f}, {specificity_ci[1]:.2f}]')


# LASSO

In [None]:
# Feature matrix: every column except the 'Group' label
X = gene.drop(columns='Group')

# Map the class labels in 'Group' to integer codes
label_encoder = LabelEncoder()
y = label_encoder.fit_transform(gene['Group'])

# Hold out 25% of the samples for testing (fixed seed)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.25, random_state=42
)

# L1-penalised (Lasso) logistic regression with the liblinear solver
log_reg = LogisticRegression(penalty='l1', solver='liblinear', random_state=42)
log_reg.fit(X_train, y_train)

# Predict the held-out samples
y_pred = log_reg.predict(X_test)

# Test metrics; precision/recall/F1 use scikit-learn's default averaging
accuracy = accuracy_score(y_test, y_pred)
precision, recall, f1 = (
    scorer(y_test, y_pred)
    for scorer in (precision_score, recall_score, f1_score)
)

# Print the evaluation metrics
print(f'Accuracy: {accuracy:.4f}')
print(f'Precision: {precision:.4f}')
print(f'Recall: {recall:.4f}')
print(f'F1 Score: {f1:.4f}')


## Random Search

In [None]:
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.25, random_state=42
)

# Candidate values sampled by RandomizedSearchCV
param_dist = {
    'C': np.logspace(-4, 4, 20),
    'solver': ['liblinear'],
    'max_iter': [1000, 1500, 2000],
    'tol': [1e-4, 1e-3, 1e-2, 1e-1]
}

# L1-penalised (Lasso) logistic regression; the solver comes from param_dist
log_reg = LogisticRegression(penalty='l1', random_state=42)

# 100 random candidates, each scored by 5-fold cross-validated ROC AUC
random_search = RandomizedSearchCV(
    log_reg,
    param_distributions=param_dist,
    n_iter=100,
    scoring='roc_auc',
    cv=5,
    verbose=1,
    random_state=42,
    n_jobs=-1,
)
random_search.fit(X_train, y_train)

# Report the winning parameter combination
best_params = random_search.best_params_
print("Best Parameters:", best_params)

# Predict the test split with the refitted best estimator
y_pred = random_search.predict(X_test)

# Test metrics; precision/recall/F1 use scikit-learn's default averaging
accuracy = accuracy_score(y_test, y_pred)
precision, recall, f1 = (
    scorer(y_test, y_pred)
    for scorer in (precision_score, recall_score, f1_score)
)

# Print the evaluation metrics
print(f'Accuracy: {accuracy:.4f}')
print(f'Precision: {precision:.4f}')
print(f'Recall: {recall:.4f}')
print(f'F1 Score: {f1:.4f}')



## Bayesian Optimization

In [None]:
# Recreate the same 75/25 train/test split (fixed seed) used in earlier cells
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42)

# Define the best parameters found from previous searches lasso
# NOTE(review): this dict is not referenced by the objective or the study in
# the visible code below, and its C / max_iter values lie outside the ranges
# the objective samples from — confirm whether it was meant to be enqueued.
best_params_random = {
    'C': 10000.0,
    'max_iter': 1000,
    'tol': 0.01,
    'solver': 'liblinear'
}

def objective(trial):
    """Return the mean 5-fold cross-validated ROC AUC for one Optuna trial.

    Samples ``C``, ``max_iter``, ``tol`` and ``solver`` from ``trial`` and
    scores an L1-penalised logistic regression on the module-level
    ``X_train`` / ``y_train``.
    """
    # Sampled in the same order: C, max_iter, tol, solver
    hyperparams = {
        'C': trial.suggest_float('C', 1e-4, 1e2, log=True),
        'max_iter': trial.suggest_int('max_iter', 2000, 5000),
        'tol': trial.suggest_float('tol', 1e-4, 1e-2, log=True),
        'solver': trial.suggest_categorical('solver', ['liblinear', 'saga']),
    }

    clf = LogisticRegression(penalty='l1', random_state=42, **hyperparams)

    fold_scores = cross_val_score(clf, X_train, y_train, cv=5, scoring='roc_auc')
    return fold_scores.mean()

# Perform optimization with Optuna, maximising the value from `objective`
study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=200)

# Print the best parameters and best score from Optuna. The objective
# returns a mean cross-validated ROC AUC, so it is labelled as such.
print("Best Parameters from Optuna:", study.best_params)
print("Best CV ROC AUC from Optuna:", study.best_value)

# Refit the best configuration on the training split and predict the test set
best_params_optuna = study.best_params
best_clf = LogisticRegression(penalty='l1', **best_params_optuna, random_state=42)
best_clf.fit(X_train, y_train)
y_pred = best_clf.predict(X_test)

# Evaluate on test set (class-weighted precision/recall/F1)
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred, average='weighted')
recall = recall_score(y_test, y_pred, average='weighted')
f1 = f1_score(y_test, y_pred, average='weighted')

print(f"Test Accuracy: {accuracy:.4f}")
print(f"Test Precision: {precision:.4f}")
print(f"Test Recall: {recall:.4f}")
print(f"Test F1 Score: {f1:.4f}")



In [None]:
# All training columns are used as features
selected_features_lasso = X_train.columns.tolist()

# Hard-coded hyperparameters for the final Lasso model
# NOTE(review): pasted values, not read from `study.best_params`; confirm
# they match the tuning run they are meant to represent.
best_params_lasso = {
    'C': 94.3983122760219,
    'max_iter': 3560,
    'solver': 'liblinear',
    'tol':  0.0011598126302544468
}
# L1-penalised logistic regression with the parameters above
final_model_lasso = LogisticRegression(penalty='l1', **best_params_lasso, random_state=42)


# Perform cross-validation with 10 folds, scored by ROC AUC
# NOTE(review): the XGBoost and Random Forest cells use cv=5 — confirm that
# 10 folds is intended here.
cv_scores = cross_val_score(final_model_lasso, X_train[selected_features_lasso], y_train, cv=10, scoring='roc_auc')
print("Cross-validation scores:")
print(cv_scores)
# The scores are ROC AUC values (scoring='roc_auc'), so label them as such
print(f"Mean CV ROC AUC: {np.mean(cv_scores):.4f}")

final_model_lasso.fit(X_train[selected_features_lasso], y_train)

# Make predictions on the test set (predicted probabilities)
y_pred_proba_lasso = final_model_lasso.predict_proba(X_test[selected_features_lasso])[:, 1]

# Predicted class labels
y_pred_lasso = final_model_lasso.predict(X_test[selected_features_lasso])


# Calculate evaluation metrics (scikit-learn's default averaging)
accuracy = accuracy_score(y_test, y_pred_lasso)
precision = precision_score(y_test, y_pred_lasso)
recall = recall_score(y_test, y_pred_lasso)
f1 = f1_score(y_test, y_pred_lasso)
roc_auc = roc_auc_score(y_test, y_pred_proba_lasso)

# Calculate confusion matrix and extract TN, FP, FN, TP
tn, fp, fn, tp = confusion_matrix(y_test, y_pred_lasso).ravel()

# Calculate specificity = TN / (TN + FP)
specificity = tn / (tn + fp)
print(f'Test ROC AUC: {roc_auc:.2f}')
print(f'Test Accuracy: {accuracy:.2f}')
print(f'Test Precision: {precision:.2f}')
print(f'Test Recall: {recall:.2f}')
print(f'Test F1-Score: {f1:.2f}')
print(f'Test Specificity: {specificity:.2f}')



## LASSO Validation

In [None]:
# Split the validation table into features and the 'Group' label
X_val_gene_lasso = gene_val.drop(columns='Group')
y_val_gene_lasso = gene_val['Group']

# Integer-encode the validation labels
label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(y_val_gene_lasso)

# Training features that the validation table does not contain
missing_features = [
    name for name in selected_features_lasso
    if name not in X_val_gene_lasso.columns
]

# Append those features as all-zero columns
missing_df_lasso = pd.DataFrame(0, index=X_val_gene_lasso.index, columns=missing_features)
X_val_gene_lasso = pd.concat([X_val_gene_lasso, missing_df_lasso], axis=1)

# Keep only the training features, in training order
X_val_gene_lasso = X_val_gene_lasso[selected_features_lasso]

# Positive-class probabilities and predicted labels from the final model
y_pred_prob_lasso_gene = final_model_lasso.predict_proba(X_val_gene_lasso)[:, 1]
y_pred_lasso_gene = final_model_lasso.predict(X_val_gene_lasso)

# Class-weighted precision/recall/F1, plus accuracy and ROC AUC
weighted = {'average': 'weighted'}
accuracy_val = accuracy_score(y_encoded, y_pred_lasso_gene)
precision_val = precision_score(y_encoded, y_pred_lasso_gene, zero_division=1, **weighted)
recall_val = recall_score(y_encoded, y_pred_lasso_gene, **weighted)
f1_val = f1_score(y_encoded, y_pred_lasso_gene, **weighted)
roc_auc_val = roc_auc_score(y_encoded, y_pred_prob_lasso_gene)

# Specificity = TN / (TN + FP)
tn_val, fp_val, fn_val, tp_val = confusion_matrix(y_encoded, y_pred_lasso_gene).ravel()
specificity_val = tn_val / (tn_val + fp_val)

# Print evaluation metrics
print(f'Validation ROC AUC: {roc_auc_val:.2f}')
print(f'Validation Accuracy: {accuracy_val:.2f}')
print(f'Validation Precision: {precision_val:.2f}')
print(f'Validation Recall: {recall_val:.2f}')
print(f'Validation F1-Score: {f1_val:.2f}')
print(f'Validation Specificity: {specificity_val:.2f}')

## 95% CI

In [None]:
# Number of bootstrap resamples and per-metric result lists
n_iterations = 1000
metrics = {
    'roc_auc': [],
    'accuracy': [],
    'precision': [],
    'recall': [],
    'f1': [],
    'specificity': []
}

# Bootstrapping
for i in range(n_iterations):
    # Stratified resample with replacement of the Lasso validation set.
    # (`X_val_gene_lasso` is the name defined by the Lasso validation cell;
    # the `X_val_mtb_lasso` name used before is not defined in this file.)
    # Seeding with the iteration index makes every resample reproducible.
    X_resampled, y_resampled = resample(
        X_val_gene_lasso, y_encoded, stratify=y_encoded, random_state=i
    )

    # Make predictions
    y_pred_prob_resampled = final_model_lasso.predict_proba(X_resampled)[:, 1]
    y_pred_resampled = final_model_lasso.predict(X_resampled)

    # Compute metrics for the resampled data
    metrics['roc_auc'].append(roc_auc_score(y_resampled, y_pred_prob_resampled))
    metrics['accuracy'].append(accuracy_score(y_resampled, y_pred_resampled))
    metrics['precision'].append(precision_score(y_resampled, y_pred_resampled, average='weighted', zero_division=1))
    metrics['recall'].append(recall_score(y_resampled, y_pred_resampled, average='weighted'))
    metrics['f1'].append(f1_score(y_resampled, y_pred_resampled, average='weighted'))

    # Calculate specificity = TN / (TN + FP)
    tn, fp, fn, tp = confusion_matrix(y_resampled, y_pred_resampled).ravel()
    specificity = tn / (tn + fp) if (tn + fp) > 0 else 0
    metrics['specificity'].append(specificity)

# Calculate 95% CIs for each metric (2.5th and 97.5th percentiles)
confidence_intervals = {metric: (np.percentile(values, 2.5), np.percentile(values, 97.5))
                        for metric, values in metrics.items()}

# Print metrics and their confidence intervals, with the bounds rounded to
# two decimals like the Random Forest report instead of a raw tuple repr
print(f'Validation ROC AUC: {roc_auc_val:.2f}, 95% CI: [{confidence_intervals["roc_auc"][0]:.2f}, {confidence_intervals["roc_auc"][1]:.2f}]')
print(f'Validation Accuracy: {accuracy_val:.2f}, 95% CI: [{confidence_intervals["accuracy"][0]:.2f}, {confidence_intervals["accuracy"][1]:.2f}]')
print(f'Validation Precision: {precision_val:.2f}, 95% CI: [{confidence_intervals["precision"][0]:.2f}, {confidence_intervals["precision"][1]:.2f}]')
print(f'Validation Recall: {recall_val:.2f}, 95% CI: [{confidence_intervals["recall"][0]:.2f}, {confidence_intervals["recall"][1]:.2f}]')
print(f'Validation F1-Score: {f1_val:.2f}, 95% CI: [{confidence_intervals["f1"][0]:.2f}, {confidence_intervals["f1"][1]:.2f}]')
print(f'Validation Specificity: {specificity_val:.2f}, 95% CI: [{confidence_intervals["specificity"][0]:.2f}, {confidence_intervals["specificity"][1]:.2f}]')

