###Import necessary libraries

In [None]:
! pip install optuna

In [None]:
import os,os.path
import re

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
import optuna
import xgboost as xgb
from xgboost import XGBClassifier
from scipy.stats import bootstrap
from sklearn.datasets import make_classification
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import RFECV
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.metrics import roc_auc_score, accuracy_score, precision_score, recall_score, f1_score, classification_report, confusion_matrix
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.model_selection import cross_val_score
from sklearn.preprocessing import LabelEncoder
from sklearn.utils import resample

###Load the datasets

In [None]:
# Read the tab-separated table into a DataFrame; the bare name on the last
# line makes the notebook render it as the cell output.
mtb = pd.read_csv("mtb_un.tsv", delimiter="\t")
mtb

#XGBOOST(GC)

In [None]:
# Baseline XGBoost model: every column except 'Group' is used as a feature
# and 'Group' is the prediction target.
X = mtb.drop('Group', axis=1)
y = mtb['Group']

# Encode categorical target labels into numerical labels
label_encoder = LabelEncoder()
y = label_encoder.fit_transform(y)


# Hold out 25% of the rows for testing; random_state=42 makes the split repeatable
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42)

# Initialize XGBoost classifier with default parameters
# (`xgb` must still refer to the xgboost module when this line runs)
model = xgb.XGBClassifier()

# Train the model
model.fit(X_train, y_train)

# Make predictions
y_pred = model.predict(X_test)

# Score the held-out predictions; precision, F1 and recall are
# support-weighted averages over the classes (average='weighted')
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred, average='weighted')
f1 = f1_score(y_test, y_pred, average='weighted')
recall = recall_score(y_test, y_pred, average='weighted')
print(f'Accuracy: {accuracy:.4f}')
print(f"Test Precision: {precision:.4f}")
print(f"Test F1 Score: {f1:.4f}")
print(f"Test Recall: {recall:.4f}")

##Random Search

In [None]:
# Randomized hyperparameter search for XGBoost, scored by 5-fold CV ROC AUC.
# Same split arguments as the baseline cell (test_size=0.25, random_state=42).
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42)

# Candidate values the search samples from
param_dist = {
    'n_estimators': np.arange(50, 200, 10),
    'max_depth': np.arange(3, 10),
    'learning_rate': np.linspace(0.01, 0.3, 10),
    'subsample': np.linspace(0.5, 1.0, 10),
    'colsample_bytree': np.linspace(0.5, 1.0, 10),
    'gamma': np.linspace(0, 0.5, 5),
    'min_child_weight': np.arange(1, 6)
}

# Base estimator. It is deliberately NOT named `xgb`: that name is the alias of
# the imported xgboost module, and rebinding it to a classifier instance makes
# every later `xgb.XGBClassifier(...)` call fail with AttributeError.
xgb_clf = XGBClassifier()

# 100 sampled configurations, each evaluated with 5-fold CV on ROC AUC
random_search = RandomizedSearchCV(
    xgb_clf, param_distributions=param_dist, n_iter=100,
    scoring='roc_auc', cv=5, verbose=1, random_state=42, n_jobs=-1
)

# Fit the search on the training split only
random_search.fit(X_train, y_train)

# Get the best parameters
best_params = random_search.best_params_
print("Best Parameters:", best_params)

# Predict on the test set (uses the estimator refit with the best parameters)
y_pred = random_search.predict(X_test)

# Calculate metrics (default binary averaging for precision/recall/F1)
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)

print(f"Accuracy: {accuracy:.2f}")
print(f"Precision: {precision:.2f}")
print(f"Recall: {recall:.2f}")
print(f"F1 Score: {f1:.2f}")

##Bayesian Optimization

In [None]:
# Optuna objective for the XGBoost search
def objective(trial):
    """Return the mean 5-fold CV ROC AUC of an XGBoost model for one trial.

    The hyperparameters are drawn from ``trial``; the model is evaluated on
    the module-level ``X_train`` / ``y_train``. ``xgb`` must refer to the
    xgboost module when this runs.
    """
    # Draw the hyperparameters one by one, in a fixed order.
    booster = trial.suggest_categorical('booster', ['gbtree', 'dart'])
    learning_rate = trial.suggest_float('learning_rate', 0.01, 0.3)
    gamma = trial.suggest_float('gamma', 0, 0.5)
    max_depth = trial.suggest_int('max_depth', 3, 10)
    min_child_weight = trial.suggest_float('min_child_weight', 0.5, 5)
    subsample = trial.suggest_float('subsample', 0.5, 1.0)
    colsample_bytree = trial.suggest_float('colsample_bytree', 0.5, 1.0)
    n_estimators = trial.suggest_int('n_estimators', 50, 300)

    candidate = xgb.XGBClassifier(
        booster=booster,
        objective='binary:logistic',
        learning_rate=learning_rate,
        gamma=gamma,
        max_depth=max_depth,
        min_child_weight=min_child_weight,
        subsample=subsample,
        colsample_bytree=colsample_bytree,
        n_estimators=n_estimators,
    )

    # Score the candidate by cross-validation on the training split
    fold_auc = cross_val_score(candidate, X_train, y_train, cv=5, scoring='roc_auc')
    return fold_auc.mean()

# Create a study that maximizes the objective (mean CV ROC AUC)
study = optuna.create_study(direction='maximize')


# Seed the study with the parameters obtained from RandomizedSearchCV XGBOOST.
# NOTE(review): 'objective' is not one of the parameters suggested inside
# objective(); presumably Optuna ignores it -- confirm.
study.enqueue_trial({
    'booster': 'gbtree',
    'objective': 'binary:logistic',
    'learning_rate': 0.20333333333333334,
    'gamma': 0.375,
    'max_depth': 4,
    'min_child_weight': 2,
    'subsample': 0.5555555555555556,
    'colsample_bytree': 0.7222222222222222,
    'n_estimators': 60
})

# Optimize the study
study.optimize(objective, n_trials=50)

# Print the best parameters and the best score. The objective returns the mean
# cross-validated ROC AUC, so the value is labelled as such (not as accuracy).
print(f"Best Parameters: {study.best_params}")
print(f"Best Cross-validation ROC AUC: {study.best_value:.4f}")

# Train the final model with the best parameters
best_params = study.best_params
final_model = xgb.XGBClassifier(**best_params)
final_model.fit(X_train, y_train)

# Make predictions on the test set
y_pred = final_model.predict(X_test)

# Evaluate the final model (support-weighted precision/F1/recall)
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred, average='weighted')
f1 = f1_score(y_test, y_pred, average='weighted')
recall = recall_score(y_test, y_pred, average='weighted')

print(f"Test Accuracy: {accuracy:.4f}")
print(f"Test Precision: {precision:.4f}")
print(f"Test F1 Score: {f1:.4f}")
print(f"Test Recall: {recall:.4f}")


In [None]:
# Best parameters from Bayesian Optimization XG (hard-coded from a previous run)
best_params = {
    'learning_rate': 0.09725914359015418,
    'max_depth': 3,
    'n_estimators': 254,
    'gamma': 0.478569495214895,
    'min_child_weight': 1.2576856857820942,
    'subsample': 0.9686654550695184,
    'colsample_bytree': 0.6498442366934263,
    'objective': 'binary:logistic',
    'booster': 'gbtree',
}

# Create the XGBoost classifier with the best parameters
final_model_xg = xgb.XGBClassifier(**best_params)

# 5-fold cross-validation on the training split. The scorer is ROC AUC, so the
# summary line is labelled ROC AUC rather than accuracy.
cv_scores = cross_val_score(final_model_xg, X_train, y_train, cv=5, scoring='roc_auc')
print("Cross-validation scores:")
print(cv_scores)
print(f"Mean CV ROC AUC: {np.mean(cv_scores):.4f}")

# Train the model
final_model_xg.fit(X_train, y_train)

# Make predictions on the test set
y_pred = final_model_xg.predict(X_test)

# Calculate evaluation metrics (default binary averaging)
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)

# Print the results
print(f'Test Accuracy: {accuracy:.4f}')
print(f'Test Precision: {precision:.4f}')
print(f'Test Recall: {recall:.4f}')
print(f'Test F1-Score: {f1:.4f}')


##Feature Selection XGBoost (RFECV)

In [None]:
# Recursive feature elimination with cross-validation (RFECV) around an
# XGBoost model built from `best_params`: one feature is dropped per step
# (step=1) and each subset is scored by 10-fold stratified CV ROC AUC.
final_model_xg = xgb.XGBClassifier(**best_params)
selector = RFECV(estimator=final_model_xg, step=1, cv=StratifiedKFold(10), scoring='roc_auc', verbose=2)

# Fit the RFECV selector on the training data
selector.fit(X_train, y_train)

# Print the optimal number of features
print(f"Optimal number of features: {selector.n_features_}")

# Boolean mask over the columns of X -> names of the retained features
selected_features_mask = selector.support_
selected_features = X.columns[selected_features_mask]

print("Selected features:")
for feature in selected_features:
    print(feature)

# Store the selected features in a one-column DataFrame ("Selected Features");
# the feature-comparison cells read this column
selected_features_XGBoost = pd.DataFrame(selected_features, columns=["Selected Features"])

# Training rows restricted to the selected feature columns
XGBoost_gene = X_train[selected_features]

In [None]:
# Keep only the features retained by the fitted RFECV selector
X_train_selected = selector.transform(X_train)
X_test_selected = selector.transform(X_test)

# Train the final model with the selected features
final_model_xg.fit(X_train_selected, y_train)

# Make predictions on the test set: column 1 of predict_proba is used as the
# score for ROC AUC, predict() gives the hard labels
y_pred_prob_xg = final_model_xg.predict_proba(X_test_selected)[:, 1]
y_pred_xg = final_model_xg.predict(X_test_selected)

# Calculate evaluation metrics (default binary averaging)
accuracy = accuracy_score(y_test, y_pred_xg)
precision = precision_score(y_test, y_pred_xg)
recall = recall_score(y_test, y_pred_xg)
f1 = f1_score(y_test, y_pred_xg)
roc_auc = roc_auc_score(y_test, y_pred_prob_xg)

# Calculate confusion matrix and extract TN, FP, FN, TP
# (unpacking four values requires a 2x2 matrix, i.e. a binary problem)
tn, fp, fn, tp = confusion_matrix(y_test, y_pred_xg).ravel()

# Calculate specificity: TN / (TN + FP)
specificity = tn / (tn + fp)

print(f'Test ROC AUC: {roc_auc:.4f}')
print(f'Test Accuracy: {accuracy:.4f}')
print(f'Test Precision: {precision:.4f}')
print(f'Test Recall: {recall:.4f}')
print(f'Test F1-Score: {f1:.4f}')
print(f'Test Specificity: {specificity:.4f}')


##95% CI

In [None]:
def bootstrap_metric(y_true, y_pred, y_pred_prob, n_iterations=1000, alpha=0.95, random_state=None):
    """Percentile-bootstrap confidence intervals for binary classification metrics.

    The rows are resampled with replacement ``n_iterations`` times. On every
    resample accuracy, precision, recall, F1, ROC AUC and specificity are
    recomputed, and each interval is read off the empirical percentiles of
    those values.

    Args:
        y_true: True labels (array-like). Must be binary: specificity is
            derived by unpacking a 2x2 confusion matrix.
        y_pred: Predicted labels, aligned with ``y_true``.
        y_pred_prob: Scores passed to ``roc_auc_score``, aligned with ``y_true``.
        n_iterations: Number of bootstrap resamples to draw.
        alpha: Confidence level; 0.95 gives the 2.5th-97.5th percentile range.
        random_state: Optional integer seed for reproducible intervals. ``None``
            (the default) draws from NumPy's global random state as before.

    Returns:
        dict mapping each metric name to a ``(lower, upper)`` tuple.

    Raises:
        ValueError: If no resample contained both classes, so no metric
            could be computed.
    """
    # Accept lists / pandas objects as well as arrays for positional indexing.
    y_true = np.asarray(y_true)
    y_pred = np.asarray(y_pred)
    y_pred_prob = np.asarray(y_pred_prob)

    # One generator shared by all iterations, so a fixed seed still yields
    # different resamples per iteration.
    rng = None if random_state is None else np.random.RandomState(random_state)

    metrics = {'accuracy': [],
               'precision': [],
               'recall': [],
               'f1': [],
               'roc_auc': [],
               'specificity': []}

    n_size = len(y_true)

    for _ in range(n_iterations):
        # Bootstrap sample
        indices = resample(np.arange(n_size), n_samples=n_size, replace=True, random_state=rng)
        y_true_boot = y_true[indices]
        y_pred_boot = y_pred[indices]
        y_pred_prob_boot = y_pred_prob[indices]

        # roc_auc_score raises on a single-class sample and specificity can
        # become 0/0 there, so such resamples are skipped instead of crashing.
        if np.unique(y_true_boot).size < 2:
            continue

        # Calculate metrics; zero_division=0 keeps the 0 score sklearn would
        # return for an empty denominator, without the warning.
        accuracy = accuracy_score(y_true_boot, y_pred_boot)
        precision = precision_score(y_true_boot, y_pred_boot, zero_division=0)
        recall = recall_score(y_true_boot, y_pred_boot, zero_division=0)
        f1 = f1_score(y_true_boot, y_pred_boot, zero_division=0)
        roc_auc = roc_auc_score(y_true_boot, y_pred_prob_boot)

        tn, fp, fn, tp = confusion_matrix(y_true_boot, y_pred_boot).ravel()
        specificity = tn / (tn + fp)

        # Store results
        metrics['accuracy'].append(accuracy)
        metrics['precision'].append(precision)
        metrics['recall'].append(recall)
        metrics['f1'].append(f1)
        metrics['roc_auc'].append(roc_auc)
        metrics['specificity'].append(specificity)

    if not metrics['accuracy']:
        raise ValueError("No bootstrap resample contained both classes; cannot compute confidence intervals.")

    # Calculate confidence intervals from the tails of each distribution
    ci = {}
    for metric, values in metrics.items():
        lower = np.percentile(values, (1 - alpha) / 2 * 100)
        upper = np.percentile(values, (1 + alpha) / 2 * 100)
        ci[metric] = (lower, upper)

    return ci


# Convert to NumPy arrays so bootstrap_metric can index them with integer
# index arrays
y_test = np.array(y_test)
y_pred_xg = np.array(y_pred_xg)
y_pred_prob_xg = np.array(y_pred_prob_xg)

# Point estimates on the full test set (default binary averaging, the same
# averaging bootstrap_metric uses)
accuracy = accuracy_score(y_test, y_pred_xg)
precision = precision_score(y_test, y_pred_xg)
recall = recall_score(y_test, y_pred_xg)
f1 = f1_score(y_test, y_pred_xg)
roc_auc = roc_auc_score(y_test, y_pred_prob_xg)
tn, fp, fn, tp = confusion_matrix(y_test, y_pred_xg).ravel()
specificity = tn / (tn + fp)

# Calculate confidence intervals (1000 resamples, 95% level)
ci = bootstrap_metric(y_test, y_pred_xg, y_pred_prob_xg, n_iterations=1000, alpha=0.95)

# Print scores and confidence intervals
print(f'Test ROC AUC: {roc_auc:.4f} (95% CI: {ci["roc_auc"][0]:.4f}, {ci["roc_auc"][1]:.4f})')
print(f'Test Accuracy: {accuracy:.4f} (95% CI: {ci["accuracy"][0]:.4f}, {ci["accuracy"][1]:.4f})')
print(f'Test Precision: {precision:.4f} (95% CI: {ci["precision"][0]:.4f}, {ci["precision"][1]:.4f})')
print(f'Test Recall: {recall:.4f} (95% CI: {ci["recall"][0]:.4f}, {ci["recall"][1]:.4f})')
print(f'Test F1-Score: {f1:.4f} (95% CI: {ci["f1"][0]:.4f}, {ci["f1"][1]:.4f})')
print(f'Test Specificity: {specificity:.4f} (95% CI: {ci["specificity"][0]:.4f}, {ci["specificity"][1]:.4f})')


#Random Forest

In [None]:
# Baseline Random Forest: features are all columns except 'Group', which is
# the target
X = mtb.drop('Group', axis=1)
y = mtb['Group']

# Encode categorical target labels into numerical labels
label_encoder = LabelEncoder()
y = label_encoder.fit_transform(y)

# Split the dataset into training (75%) and testing (25%) sets with a fixed seed
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42)

# Create the Random Forest classifier (100 trees, fixed seed)
clf = RandomForestClassifier(n_estimators=100, random_state=42)

# Train the classifier on the training data
clf.fit(X_train, y_train)

# Make predictions on the test set
y_pred = clf.predict(X_test)

# Calculate accuracy on the test set
test_accuracy = accuracy_score(y_test, y_pred)
print(f"Test Accuracy: {test_accuracy:.4f}")

# Calculate F1 score on the test set (support-weighted average)
test_f1 = f1_score(y_test, y_pred, average='weighted')
print(f"Test F1 Score: {test_f1:.4f}")

# Calculate precision on the test set (support-weighted average)
test_precision = precision_score(y_test, y_pred, average='weighted')
print(f"Test Precision: {test_precision:.4f}")

# Calculate recall on the test set (support-weighted average)
test_recall = recall_score(y_test, y_pred, average='weighted')
print(f"Test Recall: {test_recall:.4f}")




##Random Search

In [None]:
# Randomized hyperparameter search for the Random Forest.
# Same split arguments as the baseline cell (test_size=0.25, random_state=42).
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42)

# Define the parameter grid for RandomizedSearchCV.
# 'auto' is intentionally absent from max_features: it was removed from
# RandomForestClassifier in scikit-learn 1.3 (where it had meant 'sqrt'), so
# candidates using it fail to fit on current releases.
param_dist = {
    'n_estimators': [int(x) for x in np.linspace(start=100, stop=1000, num=10)],
    'max_features': ['sqrt', 'log2'],
    'max_depth': [int(x) for x in np.linspace(10, 110, num=11)] + [None],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    'bootstrap': [True, False]
}

# Initialize the Random Forest model
rf = RandomForestClassifier(random_state=42)

# Initialize RandomizedSearchCV: 100 sampled configurations, 5-fold CV.
# No `scoring` is passed, so the estimator's default score is used.
rf_random = RandomizedSearchCV(estimator=rf, param_distributions=param_dist,
                               n_iter=100, cv=5, verbose=2, random_state=42, n_jobs=-1)

# Fit RandomizedSearchCV to the training data
rf_random.fit(X_train, y_train)

# Print the best parameters found by RandomizedSearchCV
print("Best parameters found by RandomizedSearchCV:")
print(rf_random.best_params_)

# Predict on the test data with the refit best estimator
y_pred = rf_random.best_estimator_.predict(X_test)

# Evaluate the model (support-weighted precision/recall/F1)
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred, average='weighted')
recall = recall_score(y_test, y_pred, average='weighted')
f1 = f1_score(y_test, y_pred, average='weighted')

# Print evaluation metrics
print(f"Accuracy: {accuracy}")
print(f"Precision: {precision}")
print(f"Recall: {recall}")
print(f"F1 Score: {f1}")



##Bayesian Optimization

In [None]:
# Best parameters found from RandomizedSearchCV RF (hard-coded). The Optuna
# objective below centres its search ranges on these values, and the same
# dict is enqueued as the study's first trial.
best_params_random = {
    'n_estimators': 400,
    'min_samples_split': 2,
    'min_samples_leaf': 2,
    'max_features': 'sqrt',
    'max_depth': 30,
    'bootstrap': False
}

def objective(trial):
    """Return the mean 5-fold CV ROC AUC of a Random Forest for one trial.

    The integer search ranges are windows around the values stored in the
    module-level ``best_params_random``; the model is evaluated on the
    module-level ``X_train`` / ``y_train``.
    """
    ref = best_params_random

    # Draw the hyperparameters in a fixed order, each inside its window.
    rf_kwargs = {}
    rf_kwargs['n_estimators'] = trial.suggest_int(
        'n_estimators', max(100, ref['n_estimators'] - 200), ref['n_estimators'] + 200)
    rf_kwargs['min_samples_split'] = trial.suggest_int(
        'min_samples_split', max(2, ref['min_samples_split'] - 3), ref['min_samples_split'] + 3)
    rf_kwargs['min_samples_leaf'] = trial.suggest_int(
        'min_samples_leaf', max(1, ref['min_samples_leaf'] - 2), ref['min_samples_leaf'] + 2)
    rf_kwargs['max_features'] = trial.suggest_categorical('max_features', ['sqrt', 'log2'])
    rf_kwargs['max_depth'] = trial.suggest_int(
        'max_depth', max(5, ref['max_depth'] - 10), ref['max_depth'] + 10)
    rf_kwargs['bootstrap'] = trial.suggest_categorical('bootstrap', [True, False])

    # Build the forest from the suggested values with a fixed seed
    forest = RandomForestClassifier(random_state=42, **rf_kwargs)

    # Score the forest by cross-validation on the training split
    fold_auc = cross_val_score(forest, X_train, y_train, cv=5, scoring='roc_auc')
    return np.mean(fold_auc)

# Perform optimization with Optuna (maximizing the mean CV ROC AUC returned
# by objective)
study = optuna.create_study(direction='maximize')

# Enqueue the trial with the best parameters from RandomizedSearchCV so the
# study starts from that configuration
study.enqueue_trial(best_params_random)
study.optimize(objective, n_trials=50)

# Print the best parameters and best score from Optuna. The objective is
# scored with ROC AUC, so the value is labelled as such (not as accuracy).
print("Best Parameters from Optuna:", study.best_params)
print("Best CV ROC AUC from Optuna:", study.best_value)

# Rebuild the best model and evaluate on the test set
best_params_optuna = study.best_params
best_clf = RandomForestClassifier(**best_params_optuna, random_state=42)
best_clf.fit(X_train, y_train)
y_pred = best_clf.predict(X_test)

# Evaluate on test set (support-weighted precision/recall/F1)
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred, average='weighted')
recall = recall_score(y_test, y_pred, average='weighted')
f1 = f1_score(y_test, y_pred, average='weighted')

print(f"Test Accuracy: {accuracy:.4f}")
print(f"Test Precision: {precision:.4f}")
print(f"Test Recall: {recall:.4f}")
print(f"Test F1 Score: {f1:.4f}")


In [None]:
# Best parameters from Bayesian Optimization RF (hard-coded from a previous run)
best_params = {
    'n_estimators': 400,
    'min_samples_split': 2,
    'min_samples_leaf': 2,
    'max_features': 'sqrt',
    'max_depth': 30,
    'bootstrap': False
}

# Create the Random Forest classifier with the best parameters
final_model = RandomForestClassifier(**best_params, random_state=42)


# Perform cross-validation with 5 folds. The scorer is ROC AUC, so the summary
# line is labelled ROC AUC rather than accuracy.
cv_scores = cross_val_score(final_model, X_train, y_train, cv=5, scoring='roc_auc')
print("Cross-validation scores:")
print(cv_scores)
print(f"Mean CV ROC AUC: {np.mean(cv_scores):.4f}")

# Train the final model on the entire training data
final_model.fit(X_train, y_train)

# Make predictions on the test set
y_pred = final_model.predict(X_test)

# Calculate evaluation metrics on the test set (support-weighted averages)
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred, average='weighted')
recall = recall_score(y_test, y_pred, average='weighted')
f1 = f1_score(y_test, y_pred, average='weighted')

# Print the results for test data
print(f'Test Accuracy: {accuracy:.4f}')
print(f'Test Precision: {precision:.4f}')
print(f'Test Recall: {recall:.4f}')
print(f'Test F1-Score: {f1:.4f}')



##Feature Selection For RF (RFECV)

In [None]:
# Define the Random Forest model with the best parameters
final_model_rf= RandomForestClassifier(**best_params, random_state=42)

# Define the RFECV selector: six features are removed per elimination step
# (step=6) and each subset is scored by 10-fold stratified CV ROC AUC
selector_rf = RFECV(estimator=final_model_rf, step=6, cv=StratifiedKFold(10), scoring='roc_auc', verbose=2)

# Fit the RFECV selector on the training data
selector_rf.fit(X_train, y_train)

# Print the optimal number of features
print(f"Optimal number of features: {selector_rf.n_features_}")

# Boolean mask over the columns of X -> names of the retained features
selected_features_mask_rf = selector_rf.support_
selected_features_rf = X.columns[selected_features_mask_rf]

# Print the selected features
print("Selected features:", selected_features_rf)
# All rows of X (train and test) restricted to the selected columns
RandomForest_gene = X[selected_features_rf]

In [None]:
# Keep only the features retained by the fitted RFECV selector
X_train_selected_rf = selector_rf.transform(X_train)
X_test_selected_rf = selector_rf.transform(X_test)

# Train the final Random Forest model with the selected features
final_model_rf.fit(X_train_selected_rf, y_train)

# Make predictions on the test set: column 1 of predict_proba is used as the
# score for ROC AUC, predict() gives the hard labels
y_pred_prob_rf = final_model_rf.predict_proba(X_test_selected_rf)[:, 1]
y_pred_rf = final_model_rf.predict(X_test_selected_rf)

# Calculate evaluation metrics (support-weighted precision/recall/F1)
accuracy_rf = accuracy_score(y_test, y_pred_rf)
precision_rf = precision_score(y_test, y_pred_rf, average='weighted')
recall_rf = recall_score(y_test, y_pred_rf, average='weighted')
f1_rf = f1_score(y_test, y_pred_rf, average='weighted')
roc_auc_rf = roc_auc_score(y_test, y_pred_prob_rf)

# Calculate confusion matrix and extract TN, FP, FN, TP
tn_rf, fp_rf, fn_rf, tp_rf = confusion_matrix(y_test, y_pred_rf).ravel()

# Calculate specificity: TN / (TN + FP)
specificity_rf = tn_rf / (tn_rf + fp_rf)

print(f'Random Forest Test ROC AUC: {roc_auc_rf:.4f}')
print(f'Random Forest Test Accuracy: {accuracy_rf:.4f}')
print(f'Random Forest Test Precision: {precision_rf:.4f}')
print(f'Random Forest Test Recall: {recall_rf:.4f}')
print(f'Random Forest Test F1-Score: {f1_rf:.4f}')
# Report this model's own specificity_rf; the bare `specificity` global is a
# leftover from the XGBoost evaluation.
print(f'Random Forest Test Specificity: {specificity_rf:.4f}')

##95% CI

In [None]:
def bootstrap_metric(y_true, y_pred, y_pred_prob, n_iterations=1000, alpha=0.95, random_state=None):
    """Percentile-bootstrap confidence intervals for binary classification metrics.

    The rows are resampled with replacement ``n_iterations`` times. On every
    resample accuracy, precision, recall, F1, ROC AUC and specificity are
    recomputed, and each interval is read off the empirical percentiles of
    those values.

    Args:
        y_true: True labels (array-like). Must be binary: specificity is
            derived by unpacking a 2x2 confusion matrix.
        y_pred: Predicted labels, aligned with ``y_true``.
        y_pred_prob: Scores passed to ``roc_auc_score``, aligned with ``y_true``.
        n_iterations: Number of bootstrap resamples to draw.
        alpha: Confidence level; 0.95 gives the 2.5th-97.5th percentile range.
        random_state: Optional integer seed for reproducible intervals. ``None``
            (the default) draws from NumPy's global random state as before.

    Returns:
        dict mapping each metric name to a ``(lower, upper)`` tuple.

    Raises:
        ValueError: If no resample contained both classes, so no metric
            could be computed.
    """
    # Accept lists / pandas objects as well as arrays for positional indexing.
    y_true = np.asarray(y_true)
    y_pred = np.asarray(y_pred)
    y_pred_prob = np.asarray(y_pred_prob)

    # One generator shared by all iterations, so a fixed seed still yields
    # different resamples per iteration.
    rng = None if random_state is None else np.random.RandomState(random_state)

    metrics = {'accuracy': [],
               'precision': [],
               'recall': [],
               'f1': [],
               'roc_auc': [],
               'specificity': []}

    n_size = len(y_true)

    for _ in range(n_iterations):
        # Bootstrap sample
        indices = resample(np.arange(n_size), n_samples=n_size, replace=True, random_state=rng)
        y_true_boot = y_true[indices]
        y_pred_boot = y_pred[indices]
        y_pred_prob_boot = y_pred_prob[indices]

        # roc_auc_score raises on a single-class sample and specificity can
        # become 0/0 there, so such resamples are skipped instead of crashing.
        if np.unique(y_true_boot).size < 2:
            continue

        # Calculate metrics; zero_division=0 keeps the 0 score sklearn would
        # return for an empty denominator, without the warning.
        accuracy = accuracy_score(y_true_boot, y_pred_boot)
        precision = precision_score(y_true_boot, y_pred_boot, zero_division=0)
        recall = recall_score(y_true_boot, y_pred_boot, zero_division=0)
        f1 = f1_score(y_true_boot, y_pred_boot, zero_division=0)
        roc_auc = roc_auc_score(y_true_boot, y_pred_prob_boot)

        tn, fp, fn, tp = confusion_matrix(y_true_boot, y_pred_boot).ravel()
        specificity = tn / (tn + fp)

        # Store results
        metrics['accuracy'].append(accuracy)
        metrics['precision'].append(precision)
        metrics['recall'].append(recall)
        metrics['f1'].append(f1)
        metrics['roc_auc'].append(roc_auc)
        metrics['specificity'].append(specificity)

    if not metrics['accuracy']:
        raise ValueError("No bootstrap resample contained both classes; cannot compute confidence intervals.")

    # Calculate confidence intervals from the tails of each distribution
    ci = {}
    for metric, values in metrics.items():
        lower = np.percentile(values, (1 - alpha) / 2 * 100)
        upper = np.percentile(values, (1 + alpha) / 2 * 100)
        ci[metric] = (lower, upper)

    return ci


# Convert to NumPy arrays so bootstrap_metric can index them with integer
# index arrays
y_test = np.array(y_test)
y_pred_rf = np.array(y_pred_rf)
y_pred_prob_rf = np.array(y_pred_prob_rf)

# Point estimates on the full test set. These use the default binary
# averaging (the same averaging bootstrap_metric uses), not average='weighted'.
accuracy = accuracy_score(y_test, y_pred_rf)
precision = precision_score(y_test, y_pred_rf)
recall = recall_score(y_test, y_pred_rf)
f1 = f1_score(y_test, y_pred_rf)
roc_auc = roc_auc_score(y_test, y_pred_prob_rf)
tn, fp, fn, tp = confusion_matrix(y_test, y_pred_rf).ravel()
specificity = tn / (tn + fp)

# Calculate confidence intervals (1000 resamples, 95% level)
ci = bootstrap_metric(y_test, y_pred_rf, y_pred_prob_rf, n_iterations=1000, alpha=0.95)

# Print scores and confidence intervals
print(f'Test ROC AUC: {roc_auc:.4f} (95% CI: {ci["roc_auc"][0]:.4f}, {ci["roc_auc"][1]:.4f})')
print(f'Test Accuracy: {accuracy:.4f} (95% CI: {ci["accuracy"][0]:.4f}, {ci["accuracy"][1]:.4f})')
print(f'Test Precision: {precision:.4f} (95% CI: {ci["precision"][0]:.4f}, {ci["precision"][1]:.4f})')
print(f'Test Recall: {recall:.4f} (95% CI: {ci["recall"][0]:.4f}, {ci["recall"][1]:.4f})')
print(f'Test F1-Score: {f1:.4f} (95% CI: {ci["f1"][0]:.4f}, {ci["f1"][1]:.4f})')
print(f'Test Specificity: {specificity:.4f} (95% CI: {ci["specificity"][0]:.4f}, {ci["specificity"][1]:.4f})')

#LASSO

In [None]:
# Baseline L1-regularised (LASSO) logistic regression: features are all
# columns except 'Group', which is the target
X = mtb.drop('Group', axis=1)
y = mtb['Group']

# Encode categorical target labels into numerical labels
label_encoder = LabelEncoder()
y = label_encoder.fit_transform(y)

# Split data into training (75%) and testing (25%) sets with a fixed seed
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42)

# Create Logistic Regression classifier with L1 regularization
# (liblinear solver, fixed seed)
log_reg = LogisticRegression(penalty='l1', solver='liblinear', random_state=42)

# Train the classifier
log_reg.fit(X_train, y_train)

# Predict on the test set
y_pred = log_reg.predict(X_test)

# Calculate evaluation metrics (default binary averaging)
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)

# Print the evaluation metrics
print(f'Accuracy: {accuracy:.4f}')
print(f'Precision: {precision:.4f}')
print(f'Recall: {recall:.4f}')
print(f'F1 Score: {f1:.4f}')


In [None]:
# Split data into training and testing sets (same arguments as the baseline cell)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42)

# Define the parameter grid for Randomized Search.
# 'solver' has a single option, so every sampled candidate uses liblinear.
param_dist = {
    'C': np.logspace(-4, 4, 20),
    'solver': ['liblinear'],
    'max_iter': [100, 200, 300, 500, 1000],
    'tol': [1e-4, 1e-3, 1e-2, 1e-1]
}

# Create Logistic Regression classifier with L1 regularization.
# No solver is set here; it is supplied per candidate through param_dist.
log_reg = LogisticRegression(penalty='l1', random_state=42)

# Set up the Randomized Search with cross-validation
# (100 sampled configurations, 5-fold CV, scored by ROC AUC)
random_search = RandomizedSearchCV(
    log_reg, param_distributions=param_dist, n_iter=100,
    scoring='roc_auc', cv=5, verbose=1, random_state=42, n_jobs=-1
)

# Fit the Randomized Search model
random_search.fit(X_train, y_train)

# Get the best parameters
best_params = random_search.best_params_
print("Best Parameters:", best_params)

# Predict on the test set with the best model
y_pred = random_search.predict(X_test)

# Calculate evaluation metrics (default binary averaging)
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)

# Print the evaluation metrics
print(f'Accuracy: {accuracy:.4f}')
print(f'Precision: {precision:.4f}')
print(f'Recall: {recall:.4f}')
print(f'F1 Score: {f1:.4f}')


##Bayesian Optimization

In [None]:
# Best parameters found from previous searches (hard-coded).
# NOTE(review): nothing in the visible Optuna cell reads this dict, and its
# C value (10000.0) lies above the objective's C upper bound of 1e2 -- confirm
# whether the search range or this record is the intended one.
best_params_random = {
    'C': 10000.0,
    'max_iter': 1000,
    'tol': 0.0001,
    'solver': 'liblinear'
}

def objective(trial):
    """Return the mean 5-fold CV ROC AUC of an L1 logistic regression for one trial.

    ``C`` and ``tol`` are sampled on a log scale, ``max_iter`` as an integer
    and ``solver`` from liblinear/saga. The model is evaluated on the
    module-level ``X_train`` / ``y_train``.
    """
    # suggest_float(..., log=True) is the supported replacement for the
    # deprecated trial.suggest_loguniform and samples the same log-uniform range.
    C = trial.suggest_float('C', 1e-4, 1e2, log=True)
    max_iter = trial.suggest_int('max_iter', 1000, 50000)
    tol = trial.suggest_float('tol', 1e-4, 1e-2, log=True)
    solver = trial.suggest_categorical('solver', ['liblinear', 'saga'])

    # Initialize LogisticRegression with suggested hyperparameters
    clf = LogisticRegression(
        penalty='l1',
        C=C,
        max_iter=max_iter,
        tol=tol,
        solver=solver,
        random_state=42
    )

    # Use cross-validation to evaluate the classifier
    score = cross_val_score(clf, X_train, y_train, cv=5, scoring='roc_auc').mean()

    return score

# Perform optimization with Optuna (maximizing the mean CV ROC AUC returned
# by objective)
study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=50)

# Print the best parameters and best score from Optuna. The objective is
# scored with ROC AUC, so the value is labelled as such (not as accuracy).
print("Best Parameters from Optuna:", study.best_params)
print("Best CV ROC AUC from Optuna:", study.best_value)

# Rebuild the best model and evaluate on the test set
best_params_optuna = study.best_params
best_clf = LogisticRegression(penalty='l1', **best_params_optuna, random_state=42)
best_clf.fit(X_train, y_train)
y_pred = best_clf.predict(X_test)

# Evaluate on test set (support-weighted precision/recall/F1)
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred, average='weighted')
recall = recall_score(y_test, y_pred, average='weighted')
f1 = f1_score(y_test, y_pred, average='weighted')

print(f"Test Accuracy: {accuracy:.4f}")
print(f"Test Precision: {precision:.4f}")
print(f"Test Recall: {recall:.4f}")
print(f"Test F1 Score: {f1:.4f}")

In [None]:
# Best parameters from Bayesian Optimization with Optuna (hard-coded from a
# previous run)
best_params_lasso = {
    'C': 31.8,
    'max_iter': 42551,
    'solver': 'saga',
    'tol': 0.0006
}

# Create the Logistic Regression classifier with Lasso (L1) regularization and the best parameters
lasso_model = LogisticRegression(penalty='l1', **best_params_lasso, random_state=42)

# Perform cross-validation with 5 folds. The scorer is ROC AUC, so the summary
# line is labelled ROC AUC rather than accuracy.
cv_scores = cross_val_score(lasso_model, X_train, y_train, cv=5, scoring='roc_auc')
print("Cross-validation scores:")
print(cv_scores)
print(f"Mean CV ROC AUC: {np.mean(cv_scores):.4f}")


# Train the Lasso model on the entire training data
lasso_model.fit(X_train, y_train)

# Make predictions on the test set
y_pred = lasso_model.predict(X_test)

# Calculate evaluation metrics on the test set (support-weighted averages)
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred, average='weighted')
recall = recall_score(y_test, y_pred, average='weighted')
f1 = f1_score(y_test, y_pred, average='weighted')

# Print the results for test data
print(f'Test Accuracy: {accuracy:.4f}')
print(f'Test Precision: {precision:.4f}')
print(f'Test Recall: {recall:.4f}')
print(f'Test F1-Score: {f1:.4f}')


In [None]:
# Fit the L1 model and read the selected features off its non-zero coefficients
lasso_model = LogisticRegression(penalty='l1', **best_params_lasso, random_state=42)

# Fit the final model on the entire training data
lasso_model.fit(X_train, y_train)

# Extracting feature names from original DataFrame
feature_names = X.columns

# A feature counts as selected when its L1-penalised coefficient is non-zero.
# The mask is computed once and reused for both names and coefficient values.
nonzero_mask = np.abs(lasso_model.coef_[0]) > 0
selected_features = feature_names[nonzero_mask]
selected_coefficients = lasso_model.coef_[0][nonzero_mask]

# Table of the LASSO selection. The feature-comparison cells read
# `selected_features_df['Feature']`, and no other visible cell creates this
# DataFrame, so it is built here.
selected_features_df = pd.DataFrame({
    'Feature': selected_features,
    'Coefficient': selected_coefficients,
})

print("Selected features and coefficients:")
for feature, coef in zip(selected_features, selected_coefficients):
    print(f"{feature}: {coef:.4f}")

# Optionally, print the number of selected features
print(f"Number of selected features: {len(selected_features)}")

In [None]:
# Split the data into training and testing sets (same arguments as before)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42)

# Create the Logistic Regression classifier with L1 penalty and best parameters
lasso_model = LogisticRegression(penalty='l1', **best_params_lasso, random_state=42)

# Fit the final model on the entire training data
lasso_model.fit(X_train, y_train)

# Extracting feature names from original DataFrame
feature_names = X.columns

# Selected features are those with a non-zero coefficient in the fitted model
selected_features = feature_names[np.abs(lasso_model.coef_[0]) > 0]

# Restrict the training and test sets to the selected feature columns
X_train_selected = X_train[selected_features]
X_test_selected = X_test[selected_features]

# Train the Logistic Regression model again using only the selected features
final_model_lasso = LogisticRegression(penalty='l1', **best_params_lasso, random_state=42)
final_model_lasso.fit(X_train_selected, y_train)

# Make predictions on the test set: hard labels, plus column 1 of
# predict_proba as the score for ROC AUC
y_pred_lasso = final_model_lasso.predict(X_test_selected)
y_pred_prob_lasso = final_model_lasso.predict_proba(X_test_selected)[:, 1]

# Calculate evaluation metrics (support-weighted precision/recall/F1)
accuracy = accuracy_score(y_test, y_pred_lasso)
precision = precision_score(y_test, y_pred_lasso, average ='weighted')
recall = recall_score(y_test, y_pred_lasso, average='weighted')
f1 = f1_score(y_test, y_pred_lasso, average='weighted')
roc_auc = roc_auc_score(y_test, y_pred_prob_lasso)


# Calculate confusion matrix and extract TN, FP, FN, TP
tn, fp, fn, tp = confusion_matrix(y_test, y_pred_lasso).ravel()

# Calculate specificity: TN / (TN + FP)
specificity = tn / (tn + fp)

print(f'Test ROC AUC: {roc_auc:.4f}')
print(f'Test Accuracy: {accuracy:.4f}')
print(f'Test Precision: {precision:.4f}')
print(f'Test Recall: {recall:.4f}')
print(f'Test F1-Score: {f1:.4f}')
print(f'Test Specificity: {specificity:.4f}')



##95% CI

In [None]:
def specificity_score(y_true, y_pred):
    """Return specificity, TN / (TN + FP), treating label 0 as the negative class.

    Returns 0.0 when ``y_true`` contains no negatives, so a bootstrap resample
    made only of positives does not divide by zero.
    """
    y_true = np.asarray(y_true)
    y_pred = np.asarray(y_pred)
    negatives = y_true == 0
    n_negatives = int(negatives.sum())
    if n_negatives == 0:
        return 0.0
    true_negatives = int((negatives & (y_pred == 0)).sum())
    return true_negatives / n_negatives


# Bootstrapping function to calculate confidence intervals
def bootstrap_metric(y_true, y_pred, y_pred_prob, metric_func, n_iterations=1000, alpha=0.95, random_state=None):
    """Percentile-bootstrap confidence interval for a single metric.

    Args:
        y_true: True labels (array-like).
        y_pred: Predicted labels, aligned with ``y_true``.
        y_pred_prob: Scores aligned with ``y_true``; only used when
            ``metric_func`` is ``roc_auc_score``.
        metric_func: Metric callable taking ``(y_true, y_hat)``.
            ``precision_score``, ``recall_score`` and ``f1_score`` are called
            with ``zero_division=0``.
        n_iterations: Number of bootstrap resamples to draw.
        alpha: Confidence level; 0.95 gives the 2.5th-97.5th percentile range.
        random_state: Optional integer seed for reproducible intervals. ``None``
            (the default) draws from NumPy's global random state as before.

    Returns:
        ``(lower, upper)`` percentile bounds of the bootstrapped metric.

    Raises:
        ValueError: If no resample produced a score (only possible for ROC
            AUC, when every resample contained a single class).
    """
    # Accept lists / pandas objects as well as arrays for positional indexing.
    y_true = np.asarray(y_true)
    y_pred = np.asarray(y_pred)
    y_pred_prob = np.asarray(y_pred_prob)

    # Seeded generator when requested, otherwise NumPy's global random state.
    rng = np.random if random_state is None else np.random.RandomState(random_state)

    metrics = []
    n_size = len(y_true)

    for _ in range(n_iterations):
        indices = rng.randint(0, n_size, n_size)
        y_true_boot = y_true[indices]

        if metric_func is specificity_score:
            score = metric_func(y_true_boot, y_pred[indices])
        elif metric_func is roc_auc_score:
            # ROC AUC is undefined (sklearn raises) when only one class was drawn.
            if np.unique(y_true_boot).size < 2:
                continue
            score = metric_func(y_true_boot, y_pred_prob[indices])
        elif metric_func in (precision_score, recall_score, f1_score):
            score = metric_func(y_true_boot, y_pred[indices], zero_division=0)
        else:
            score = metric_func(y_true_boot, y_pred[indices])
        metrics.append(score)

    if not metrics:
        raise ValueError("No bootstrap resample produced a score; cannot compute a confidence interval.")

    lower = np.percentile(metrics, ((1.0 - alpha) / 2.0) * 100)
    upper = np.percentile(metrics, (alpha + ((1.0 - alpha) / 2.0)) * 100)
    return lower, upper

# Convert to NumPy arrays so bootstrap_metric can index them with integer
# index arrays
y_test = np.array(y_test)
y_pred_lasso = np.array(y_pred_lasso)
y_pred_prob_lasso = np.array(y_pred_prob_lasso)

# Point estimates on the full test set. bootstrap_metric calls
# precision/recall/F1 with their default (binary) averaging, so the point
# estimates are recomputed the same way here; reporting the earlier
# average='weighted' values next to binary-averaged intervals would pair a
# score with a CI for a different statistic.
accuracy = accuracy_score(y_test, y_pred_lasso)
precision = precision_score(y_test, y_pred_lasso)
recall = recall_score(y_test, y_pred_lasso)
f1 = f1_score(y_test, y_pred_lasso)
roc_auc = roc_auc_score(y_test, y_pred_prob_lasso)
tn, fp, fn, tp = confusion_matrix(y_test, y_pred_lasso).ravel()
specificity = tn / (tn + fp)

# Calculate confidence intervals for each metric
accuracy_ci = bootstrap_metric(y_test, y_pred_lasso, y_pred_prob_lasso, accuracy_score)
precision_ci = bootstrap_metric(y_test, y_pred_lasso, y_pred_prob_lasso, precision_score)
recall_ci = bootstrap_metric(y_test, y_pred_lasso, y_pred_prob_lasso, recall_score)
f1_ci = bootstrap_metric(y_test, y_pred_lasso, y_pred_prob_lasso, f1_score)
roc_auc_ci = bootstrap_metric(y_test, y_pred_lasso, y_pred_prob_lasso, roc_auc_score)
specificity_ci = bootstrap_metric(y_test, y_pred_lasso, y_pred_prob_lasso, specificity_score)

# Print evaluation metrics with confidence intervals
print(f'Test ROC AUC: {roc_auc:.4f} (95% CI: {roc_auc_ci[0]:.4f} - {roc_auc_ci[1]:.4f})')
print(f'Test Accuracy: {accuracy:.4f} (95% CI: {accuracy_ci[0]:.4f} - {accuracy_ci[1]:.4f})')
print(f'Test Precision: {precision:.4f} (95% CI: {precision_ci[0]:.4f} - {precision_ci[1]:.4f})')
print(f'Test Recall: {recall:.4f} (95% CI: {recall_ci[0]:.4f} - {recall_ci[1]:.4f})')
print(f'Test F1-Score: {f1:.4f} (95% CI: {f1_ci[0]:.4f} - {f1_ci[1]:.4f})')
print(f'Test Specificity: {specificity:.4f} (95% CI: {specificity_ci[0]:.4f} - {specificity_ci[1]:.4f})')


###COMMON FEATURES BETWEEN XGBOOST, RANDOM FOREST AND LASSO

In [None]:
# Count the entries in each model's selected-feature collection.
# selected_features_XGBoost is a one-column DataFrame; selected_features_rf is
# the column Index taken from X.
# NOTE(review): selected_features_df is not created by any visible cell --
# it must exist (with a 'Feature' column) before this cell runs.
count_XGBoost = selected_features_XGBoost.shape[0]
count_rf = selected_features_rf.shape[0]
count_df = selected_features_df.shape[0]

# Print counts
print(f"Number of rows in selected_features_XGBoost: {count_XGBoost}")
print(f"Number of rows in selected_features_rf: {count_rf}")
print(f"Number of rows in selected_features_df: {count_df}")


In [None]:
# Build one set of feature names per selection method.
# NOTE(review): selected_features_rf is iterated directly; that yields its
# elements for a list/Index/Series but the column labels for a DataFrame —
# confirm its type.
features_XGBoost = {*selected_features_XGBoost['Selected Features']}
features_rf = {*selected_features_rf}
features_df = {*selected_features_df['Feature']}

# Features picked by all three methods
common_features = features_XGBoost.intersection(features_rf, features_df)

# List the shared features in sorted order
print("Common Features:")
for feature in sorted(common_features):
    print(feature)

# Report how many features are shared
num_common_features = len(common_features)
print(f"\nNumber of common features: {num_common_features}")

In [None]:
# Build the feature-name sets for XGBoost and Random Forest
features_XGBoost = {*selected_features_XGBoost['Selected Features']}
features_rf = {*selected_features_rf}


# Features picked by both XGBoost and Random Forest
common_features = features_XGBoost.intersection(features_rf)

# List the shared features in sorted order
print("Common Features:")
for feature in sorted(common_features):
    print(feature)

# Report how many features are shared
num_common_features = len(common_features)
print(f"\nNumber of common features: {num_common_features}")


In [None]:
# Build the feature-name sets for XGBoost and for selected_features_df
features_XGBoost = {*selected_features_XGBoost['Selected Features']}

features_df = {*selected_features_df['Feature']}

# Features present in both sets
common_features = features_XGBoost.intersection(features_df)

# List the shared features in sorted order
print("Common Features:")
for feature in sorted(common_features):
    print(feature)

# Report how many features are shared
num_common_features = len(common_features)
print(f"\nNumber of common features: {num_common_features}")

In [None]:
# Build the feature-name sets for Random Forest and for selected_features_df

features_rf = {*selected_features_rf}
features_df = {*selected_features_df['Feature']}

# Features present in both sets
common_features = features_rf.intersection(features_df)

# List the shared features in sorted order
print("Common Features:")
for feature in sorted(common_features):
    print(feature)

# Report how many features are shared
num_common_features = len(common_features)
print(f"\nNumber of common features: {num_common_features}")

## ROC-AUC Curve

In [None]:
#XGBOOST
from sklearn.metrics import (
    roc_auc_score,
    accuracy_score,
    precision_score,
    recall_score,
    f1_score,
    confusion_matrix,
)
import pandas as pd

# Reduce the training and test splits to the columns kept by the fitted selector
X_train_selected, X_test_selected = [
    selector.transform(split) for split in (X_train, X_test)
]

# Fit the final XGBoost model on the selected training features
final_model_xg.fit(X_train_selected, y_train)

# Score the held-out split: column 1 of predict_proba plus the predicted labels
y_pred_prob = final_model_xg.predict_proba(X_test_selected)[:, 1]
y_pred = final_model_xg.predict(X_test_selected)

# Label-based metrics use y_pred; ROC AUC uses the probabilities
accuracy, precision, recall, f1 = [
    scorer(y_test, y_pred)
    for scorer in (accuracy_score, precision_score, recall_score, f1_score)
]
roc_auc = roc_auc_score(y_test, y_pred_prob)

print(
    f'Test ROC AUC: {roc_auc:.4f}',
    f'Test Accuracy: {accuracy:.4f}',
    f'Test Precision: {precision:.4f}',
    f'Test Recall: {recall:.4f}',
    f'Test F1-Score: {f1:.4f}',
    sep='\n',
)



In [None]:
#RANDOM FOREST
# Reduce both splits to the columns kept by the Random Forest selector
X_train_selected_rf = selector_rf.transform(X_train)
X_test_selected_rf = selector_rf.transform(X_test)

# Train the final Random Forest model with the selected features
final_model_rf.fit(X_train_selected_rf, y_train)

# Make predictions on the test set (column 1 of predict_proba plus predicted labels)
y_pred_prob_rf = final_model_rf.predict_proba(X_test_selected_rf)[:, 1]
y_pred_rf = final_model_rf.predict(X_test_selected_rf)

# Calculate evaluation metrics.
# Precision, recall and F1 use scikit-learn's default averaging, exactly as in
# the XGBoost and Lasso cells of this section, so the three models' scores are
# directly comparable.
accuracy_rf = accuracy_score(y_test, y_pred_rf)
precision_rf = precision_score(y_test, y_pred_rf)
recall_rf = recall_score(y_test, y_pred_rf)
f1_rf = f1_score(y_test, y_pred_rf)
roc_auc_rf = roc_auc_score(y_test, y_pred_prob_rf)

print(f'Random Forest Test ROC AUC: {roc_auc_rf:.4f}')
print(f'Random Forest Test Accuracy: {accuracy_rf:.4f}')
print(f'Random Forest Test Precision: {precision_rf:.4f}')
print(f'Random Forest Test Recall: {recall_rf:.4f}')
print(f'Random Forest Test F1-Score: {f1_rf:.4f}')


In [None]:
#LASSO
# Select the Lasso feature columns explicitly. X_train_selected / X_test_selected
# are assigned by the XGBoost cell from its own selector, so they must not be
# reused for this model.
# NOTE(review): selected_features is presumed to hold the Lasso-selected column
# names, as in the ROC plotting cell — confirm.
X_train_selected_lasso = X_train[selected_features]
X_test_selected_lasso = X_test[selected_features]

# NOTE(review): assumes best_params_lasso supplies a solver that supports the
# L1 penalty (e.g. liblinear or saga) and does not itself contain 'penalty' — confirm.
final_model_lasso = LogisticRegression(penalty='l1', **best_params_lasso, random_state=42)
final_model_lasso.fit(X_train_selected_lasso, y_train)

# Make predictions on the test set (predicted probabilities)
y_pred_proba = final_model_lasso.predict_proba(X_test_selected_lasso)[:, 1]  # Probability of the positive class

# Predicted class labels
y_pred = final_model_lasso.predict(X_test_selected_lasso)

# Calculate evaluation metrics
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)
roc_auc = roc_auc_score(y_test, y_pred_proba)

# Print evaluation metrics
print(f'Test ROC AUC: {roc_auc:.4f}')
print(f'Test Accuracy: {accuracy:.4f}')
print(f'Test Precision: {precision:.4f}')
print(f'Test Recall: {recall:.4f}')
print(f'Test F1-Score: {f1:.4f}')




In [None]:
# Function to calculate ROC-AUC with 95% confidence intervals
def calc_auc_ci(y_true, y_pred_prob, n_bootstraps=1000, ci_level=0.95):
    """Estimate a bootstrap confidence interval for the ROC AUC.

    The samples are resampled with replacement ``n_bootstraps`` times using a
    fixed seed (50), the ROC AUC of every resample that contains both classes
    is computed, and the interval bounds are read from the sorted scores.

    Args:
        y_true: True labels; any array-like (list, ndarray, pandas Series).
        y_pred_prob: Predicted scores/probabilities, same length as ``y_true``.
        n_bootstraps: Number of bootstrap resamples to draw.
        ci_level: Confidence level of the interval, e.g. 0.95.

    Returns:
        Tuple ``(ci_lower, ci_upper)`` of ROC AUC values.

    Raises:
        ValueError: If the inputs are empty or differ in length, or if no
            resample contained both classes.
    """
    # Convert to arrays so the fancy indexing below is positional; a pandas
    # Series would otherwise be indexed by label, and a list not at all.
    y_true = np.asarray(y_true)
    y_pred_prob = np.asarray(y_pred_prob)
    n_samples = len(y_pred_prob)
    if len(y_true) != n_samples:
        raise ValueError("y_true and y_pred_prob must have the same length.")
    if n_samples == 0:
        raise ValueError("Cannot bootstrap ROC AUC from empty inputs.")

    bootstrapped_scores = []
    rng = np.random.RandomState(50)
    for _ in range(n_bootstraps):
        indices = rng.randint(0, n_samples, n_samples)
        if len(np.unique(y_true[indices])) < 2:
            # We need at least one positive and one negative sample for ROC AUC
            continue
        bootstrapped_scores.append(roc_auc_score(y_true[indices], y_pred_prob[indices]))

    if not bootstrapped_scores:
        raise ValueError(
            "No bootstrap resample contained both classes; "
            "cannot estimate an ROC AUC confidence interval."
        )

    sorted_scores = np.sort(np.asarray(bootstrapped_scores))
    n_scores = len(sorted_scores)
    # Calculate the confidence interval; the upper index is clamped so that
    # ci_level == 1.0 selects the last score instead of running off the end.
    lower_idx = int((1.0 - ci_level) / 2.0 * n_scores)
    upper_idx = min(int((1.0 + ci_level) / 2.0 * n_scores), n_scores - 1)
    return sorted_scores[lower_idx], sorted_scores[upper_idx]

# Imported locally so this cell is self-contained: roc_curve and auc are used below
from sklearn.metrics import auc, roc_curve

# Apply the same feature selector to both training and test sets
X_train_selected_xg = selector.transform(X_train)
X_test_selected_xg = selector.transform(X_test)
X_train_selected_rf = selector_rf.transform(X_train)
X_test_selected_rf = selector_rf.transform(X_test)
X_train_selected_lasso = X_train[selected_features]
X_test_selected_lasso = X_test[selected_features]

# Train and predict with XGBoost
final_model_xg.fit(X_train_selected_xg, y_train)
y_pred_prob_xg = final_model_xg.predict_proba(X_test_selected_xg)[:, 1]
fpr_xg, tpr_xg, _ = roc_curve(y_test, y_pred_prob_xg)
roc_auc_xg = auc(fpr_xg, tpr_xg)
ci_lower_xg, ci_upper_xg = calc_auc_ci(y_test, y_pred_prob_xg)

# Train and predict with Random Forest
final_model_rf.fit(X_train_selected_rf, y_train)
y_pred_prob_rf = final_model_rf.predict_proba(X_test_selected_rf)[:, 1]
fpr_rf, tpr_rf, _ = roc_curve(y_test, y_pred_prob_rf)
roc_auc_rf = auc(fpr_rf, tpr_rf)
ci_lower_rf, ci_upper_rf = calc_auc_ci(y_test, y_pred_prob_rf)

# Train and predict with Lasso (Logistic Regression with L1 penalty)
final_model_lasso = LogisticRegression(penalty='l1', **best_params_lasso, random_state=42)
final_model_lasso.fit(X_train_selected_lasso, y_train)
y_pred_prob_lasso = final_model_lasso.predict_proba(X_test_selected_lasso)[:, 1]
fpr_lasso, tpr_lasso, _ = roc_curve(y_test, y_pred_prob_lasso)
roc_auc_lasso = auc(fpr_lasso, tpr_lasso)
ci_lower_lasso, ci_upper_lasso = calc_auc_ci(y_test, y_pred_prob_lasso)

# Plot ROC curve with confidence intervals
# NOTE(review): each shaded band is the TPR curve shifted down/up by the AUC
# confidence-interval offsets; it is not a pointwise confidence band for the
# ROC curve itself — confirm this is the intended visual.
plt.figure(figsize=(10, 8))

# XGBoost
plt.plot(fpr_xg, tpr_xg, color='blue', lw=2, label=f'XGBoost (AUC = {roc_auc_xg:.4f} [{ci_lower_xg:.4f}-{ci_upper_xg:.4f}])')
plt.fill_between(fpr_xg, tpr_xg - (roc_auc_xg - ci_lower_xg), tpr_xg + (ci_upper_xg - roc_auc_xg), color='blue', alpha=0.2)

# Random Forest
plt.plot(fpr_rf, tpr_rf, color='green', lw=2, label=f'Random Forest (AUC = {roc_auc_rf:.4f} [{ci_lower_rf:.4f}-{ci_upper_rf:.4f}])')
plt.fill_between(fpr_rf, tpr_rf - (roc_auc_rf - ci_lower_rf), tpr_rf + (ci_upper_rf - roc_auc_rf), color='green', alpha=0.2)

# Lasso (Logistic Regression)
plt.plot(fpr_lasso, tpr_lasso, color='red', lw=2, label=f'Lasso (AUC = {roc_auc_lasso:.4f} [{ci_lower_lasso:.4f}-{ci_upper_lasso:.4f}])')
plt.fill_between(fpr_lasso, tpr_lasso - (roc_auc_lasso - ci_lower_lasso), tpr_lasso + (ci_upper_lasso - roc_auc_lasso), color='red', alpha=0.2)

# Plot the no skill line
plt.plot([0, 1], [0, 1], color='grey', lw=2, linestyle='--', label='No Skill')

plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('ROC-AUC Curve for GC Metabolome')
plt.grid(alpha=0.3)

# The AUC values and their confidence intervals are carried by the curve labels
plt.legend(loc="lower right")

# Save the plot before displaying it
plt.savefig('ROC-AUC_Curve_For_GC_Metabolome_final.png', dpi=600, bbox_inches='tight')

plt.show()

### KEEPING THE IMPORTANT FEATURES

In [None]:

# Assuming 'Group' is in the dataset, add it to the common features.
# NOTE(review): common_features holds the result of whichever intersection
# cell ran last — confirm it is the intended feature set before exporting.
# Sorting gives a deterministic column order (set iteration order is arbitrary).
common_features_with_group = sorted(common_features) + ['Group']

# Filter the dataset to keep only the common features and the 'Group' column
mtb_filtered = mtb[common_features_with_group]

# Save or display the filtered dataset
print(mtb_filtered.head())

# Optionally, save the filtered dataset to a new file.
# DataFrame.to_excel has no `sep` parameter (that belongs to to_csv), so
# passing it raises TypeError.
mtb_filtered.to_excel("Gastric Cancer Metabolites.xlsx", index=False)
