In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, StratifiedKFold, cross_val_predict
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, VotingClassifier
from sklearn.metrics import (
    accuracy_score, precision_score, recall_score, confusion_matrix,
    matthews_corrcoef, f1_score, roc_curve, auc
)
from sklearn.preprocessing import label_binarize
import matplotlib.pyplot as plt
import joblib
from collections import Counter

# Load Dataset
# Relative path: resolved against the working directory the code is run from.
file_path = 'Mtb_positive_negative_data.csv'
# Load the whole table; the code further down reads its 'sequence' and 'class' columns.
data = pd.read_csv(file_path)

# K-mer Feature Extraction
def get_kmers(seq, size=6):
    """Return all overlapping, lower-cased windows of length ``size`` in ``seq``.

    The window advances one position at a time, so ``len(seq) - size + 1``
    k-mers are produced. A sequence shorter than ``size`` yields an empty list.
    """
    stop = len(seq) - size + 1
    windows = (seq[start:start + size] for start in range(stop))
    return [window.lower() for window in windows]

# Tokenise each sequence into k-mers, then glue the tokens back together with
# spaces so every sequence reads as one whitespace-delimited document.
data['words'] = data['sequence'].apply(get_kmers)
data['joined'] = data['words'].apply(' '.join)

y = data['class'].values
X = data['joined'].values

# Vectorize Sequences
# Bag-of-n-grams counts; with ngram_range=(6, 6) every feature is a run of six
# consecutive tokens.
vectorizer = CountVectorizer(ngram_range=(6, 6))
X_vect = vectorizer.fit_transform(X)

# Train-Test Split (80/20)
# stratify=y keeps the class proportions the same in both partitions.
X_train, X_test, y_train, y_test = train_test_split(
    X_vect,
    y,
    stratify=y,
    random_state=42,
    test_size=0.2,
)

# Classifier Setup 
# Two tree ensembles sharing one seed; only the random forest re-weights classes.
rf = RandomForestClassifier(
    random_state=42,
    class_weight='balanced',
    max_depth=10,
    n_estimators=500,
)
gb = GradientBoostingClassifier(
    random_state=42,
    max_depth=5,
    learning_rate=0.1,
    n_estimators=500,
)

# Soft voting combines the members through their predicted class probabilities.
voting_clf = VotingClassifier(voting='soft', estimators=[('rf', rf), ('gb', gb)])

# Cross-Validation (dynamic folds)
# Every stratified fold needs at least one sample of each class, so the number
# of folds is capped by the size of the rarest class in the training split.
min_class_count = min(Counter(y_train).values())
cv_folds = min(10, min_class_count)
if cv_folds < 2:
    # StratifiedKFold rejects n_splits < 2; fail here with the actual cause
    # instead of letting it surface as a generic n_splits error.
    raise ValueError(
        f"Cannot cross-validate: the rarest class has only {min_class_count} "
        "training sample(s), but stratified k-fold needs at least 2 folds."
    )

cv = StratifiedKFold(n_splits=cv_folds, shuffle=True, random_state=42)
# Out-of-fold probabilities; column 1 is taken as the positive-class score.
y_cv_pred = cross_val_predict(voting_clf, X_train, y_train, cv=cv, method='predict_proba')[:, 1]
# Threshold at 0.5 to get hard labels (assumes the classes are encoded as 0/1).
y_cv_labels = (y_cv_pred >= 0.5).astype(int)

# Performance Metrics
def evaluate_model(y_true, y_pred, y_proba=None, label=''):
    """Print classification metrics and, when scores are given, plot a ROC curve.

    Args:
        y_true: Ground-truth class labels.
        y_pred: Predicted class labels, aligned with ``y_true``.
        y_proba: Optional positive-class scores. When provided, the ROC AUC is
            printed and the curve is drawn with matplotlib.
        label: Name of the evaluated split, shown in the report header.

    Precision and F1 are support-weighted averages over the classes. When the
    confusion matrix is 2x2, sensitivity and specificity are read from its
    cells as TP / (TP + FN) and TN / (TN + FP). Otherwise sensitivity falls
    back to support-weighted recall and specificity is reported as 0.
    """
    acc = accuracy_score(y_true, y_pred)
    prec = precision_score(y_true, y_pred, average='weighted')
    f1 = f1_score(y_true, y_pred, average='weighted')
    mcc = matthews_corrcoef(y_true, y_pred)
    cm = confusion_matrix(y_true, y_pred)

    if cm.shape == (2, 2):
        # Row-major order of a binary confusion matrix: TN, FP, FN, TP.
        TN, FP, FN, TP = cm.ravel()
        # Support-weighted recall is algebraically equal to accuracy, so the
        # positive-class recall is what must be reported as sensitivity.
        sensitivity = TP / (TP + FN) if (TP + FN) > 0 else 0
        specificity = TN / (TN + FP) if (TN + FP) > 0 else 0
    else:
        # No single positive class to read off the matrix.
        sensitivity = recall_score(y_true, y_pred, average='weighted')
        specificity = 0

    print(f"\n=== {label} Metrics ===")
    print(f"Accuracy: {acc:.3f}")
    print(f"Precision: {prec:.3f}")
    print(f"Sensitivity: {sensitivity:.3f}")
    print(f"F1 Score: {f1:.3f}")
    print(f"Specificity: {specificity:.3f}")
    print(f"Matthews Corr. Coefficient: {mcc:.3f}")
    print("Confusion Matrix:")
    print(cm)

    if y_proba is not None:
        fpr, tpr, _ = roc_curve(y_true, y_proba)
        roc_auc = auc(fpr, tpr)
        print(f"AUC-ROC: {roc_auc:.3f}")

        plt.figure(figsize=(7, 6))
        plt.plot(fpr, tpr, color='blue', lw=2, label=f'Hybrid Algorithm ROC Curve (AUC = {roc_auc:.3f})')
        # Diagonal reference line: the expected curve of a random classifier.
        plt.plot([0, 1], [0, 1], color='red', linestyle='--', label='Random Guess')
        plt.xlabel('False Positive Rate (1 - Specificity)')
        plt.ylabel('True Positive Rate (Sensitivity)')
        plt.title('ROC Curve (Hybrid Algorithm)')
        plt.legend(loc="lower right")
        plt.grid(True, linestyle='--', alpha=0.6)
        plt.tight_layout()
        plt.show()


# Cross-Validation Results 
evaluate_model(
    y_true=y_train,
    y_pred=y_cv_labels,
    y_proba=y_cv_pred,
    label='Cross-Validation',
)

# Refit the ensemble on the whole training split, then score the held-out test split.
voting_clf.fit(X_train, y_train)
y_test_proba = voting_clf.predict_proba(X_test)[:, 1]
y_test_pred = voting_clf.predict(X_test)

evaluate_model(
    y_true=y_test,
    y_pred=y_test_pred,
    y_proba=y_test_proba,
    label='Test Set',
)
