In [5]:
import os
import numpy as np
import pandas as pd
from sklearn.model_selection import StratifiedKFold, cross_validate
from sklearn.preprocessing import label_binarize
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import (confusion_matrix, roc_curve, auc,
                             accuracy_score, precision_score, recall_score,
                             f1_score, matthews_corrcoef, make_scorer, ConfusionMatrixDisplay)
import matplotlib.pyplot as plt

# Ensure the figures directory exists (exist_ok avoids the check-then-create race
# and makes the cell safe to re-run)
os.makedirs('figures', exist_ok=True)

# Function to plot confusion matrix
def plot_cm(y_true, y_pred, model_name):
    """Save a confusion-matrix figure to ``figures/{model_name}_confusion_matrix.png``.

    Parameters
    ----------
    y_true : array-like
        Ground-truth labels.
    y_pred : array-like
        Predicted labels, aligned with ``y_true``.
    model_name : str
        Used in the figure title and in the output file name.

    Notes
    -----
    ``ConfusionMatrixDisplay.from_predictions`` labels the axes with the class
    labels actually present in ``y_true``/``y_pred`` rather than with
    positional indices 0..n-1.
    """
    disp = ConfusionMatrixDisplay.from_predictions(y_true, y_pred, cmap=plt.cm.Blues)
    # Work on the display's own axes/figure instead of pyplot's implicit
    # "current" figure, so other open figures cannot be affected.
    disp.ax_.set_title(f'{model_name} Confusion Matrix')
    disp.figure_.savefig(f'figures/{model_name}_confusion_matrix.png')
    plt.close(disp.figure_)

# Function to plot ROC curve for multiclass classification
def plot_roc_multiclass(y_true, y_pred_proba, model_name, classes):
    """Save one-vs-rest ROC curves to ``figures/{model_name}_roc_curve.png``.

    Parameters
    ----------
    y_true : array-like
        Ground-truth labels.
    y_pred_proba : array-like
        Per-class scores; column ``i`` is scored against ``classes[i]``.
        (The commented-out caller passes ``np.unique(y)``, which matches the
        sorted column order of ``predict_proba`` - TODO confirm if other
        callers are added.)
    model_name : str
        Used in the figure title and in the output file name.
    classes : array-like
        Class labels, in the same order as the columns of ``y_pred_proba``.
    """
    classes = np.asarray(classes)
    y_pred_proba = np.asarray(y_pred_proba)
    y_true_bin = label_binarize(y_true, classes=classes)

    # For a two-class problem label_binarize returns a single column that marks
    # classes[1]. Expand it to [is classes[0], is classes[1]] so that column i
    # lines up with y_pred_proba[:, i]; otherwise class-0 probabilities would be
    # scored against class-1 indicators and the curve would be inverted.
    if y_true_bin.shape[1] == 1 and y_pred_proba.shape[1] == 2:
        y_true_bin = np.hstack([1 - y_true_bin, y_true_bin])

    fig, ax = plt.subplots()
    ax.plot([0, 1], [0, 1], 'k--')  # chance diagonal
    for i in range(y_true_bin.shape[1]):
        fpr, tpr, _ = roc_curve(y_true_bin[:, i], y_pred_proba[:, i])
        # Label each curve with the real class label, not the column index.
        ax.plot(fpr, tpr, label=f'Class {classes[i]} (area = {auc(fpr, tpr):.2f})')
    ax.set_xlabel('False Positive Rate')
    ax.set_ylabel('True Positive Rate')
    ax.set_title(f'{model_name} ROC Curve')
    ax.legend(loc="lower right")
    fig.savefig(f'figures/{model_name}_roc_curve.png')
    plt.close(fig)

# Load the encoded dataset (comma-separated) into a pandas dataframe
data = pd.read_csv('leaveoneout_encoded.csv')

# Split the data into features and target
X = data.drop('target', axis=1)
y = data['target']

# Set up cross-validation
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# Define metrics and model
# NOTE: `scoring` is not consumed below (the manual loop computes the same
# metrics per fold); it is kept in case other code relies on it.
scoring = {
    'accuracy': 'accuracy',
    'precision': 'precision_weighted',
    'recall': 'recall_weighted',
    'f1': 'f1_weighted',
    'mcc': make_scorer(matthews_corrcoef)
}
model_name = 'RandomForestClassifier_5-fold'
model = RandomForestClassifier(random_state=42)

# Collect per-fold outputs in lists and concatenate once after the loop.
# Appending to a float64 empty array with np.append would silently cast
# integer labels to float and re-copy all accumulated data on every fold.
fold_y_true, fold_y_pred, fold_y_pred_proba = [], [], []

# Perform cross-validation
results = []
for train_idx, test_idx in cv.split(X, y):
    X_train, X_test = X.iloc[train_idx], X.iloc[test_idx]
    y_train, y_test = y.iloc[train_idx], y.iloc[test_idx]

    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    y_pred_proba = model.predict_proba(X_test)

    # Store results for the aggregate confusion matrix and ROC
    fold_y_true.append(y_test.to_numpy())
    fold_y_pred.append(y_pred)
    fold_y_pred_proba.append(y_pred_proba)

    results.append({
        'Accuracy': accuracy_score(y_test, y_pred),
        'Precision': precision_score(y_test, y_pred, average='weighted'),
        'Recall': recall_score(y_test, y_pred, average='weighted'),
        'F1 Score': f1_score(y_test, y_pred, average='weighted'),
        'MCC': matthews_corrcoef(y_test, y_pred)
    })

# Out-of-fold labels/predictions for every sample, original label dtype preserved
all_y_true = np.concatenate(fold_y_true)
all_y_pred = np.concatenate(fold_y_pred)
all_y_pred_proba = np.vstack(fold_y_pred_proba)

# Plot the confusion matrix aggregated over all folds (ROC plot currently disabled)
plot_cm(all_y_true, all_y_pred, model_name)
# plot_roc_multiclass(all_y_true, all_y_pred_proba, model_name, classes=np.unique(y))

# Calculate average metrics across all folds
average_results = pd.DataFrame(results).mean()
average_results['Model'] = model_name
print(average_results)




Accuracy                          0.975524
Precision                         0.979714
Recall                            0.975524
F1 Score                          0.974955
MCC                               0.962886
Model        RandomForestClassifier_5-fold
dtype: object


In [6]:
import os
import numpy as np
import pandas as pd
from sklearn.model_selection import StratifiedKFold, cross_validate
from sklearn.preprocessing import label_binarize
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import (confusion_matrix, roc_curve, auc,
                             accuracy_score, precision_score, recall_score,
                             f1_score, matthews_corrcoef, make_scorer, ConfusionMatrixDisplay)
import matplotlib.pyplot as plt

# Ensure the figures directory exists (exist_ok avoids the check-then-create race
# and makes the cell safe to re-run)
os.makedirs('figures', exist_ok=True)

# Function to plot confusion matrix with a different color map for DecisionTreeClassifier
def plot_cm(y_true, y_pred, model_name):
    """Save a confusion-matrix figure to ``figures/{model_name}_confusion_matrix.png``.

    The colormap is orange when ``model_name`` contains
    ``'DecisionTreeClassifier'`` and blue otherwise.

    Parameters
    ----------
    y_true : array-like
        Ground-truth labels.
    y_pred : array-like
        Predicted labels, aligned with ``y_true``.
    model_name : str
        Used to pick the colormap, in the figure title, and in the file name.

    Notes
    -----
    ``ConfusionMatrixDisplay.from_predictions`` labels the axes with the class
    labels actually present in ``y_true``/``y_pred`` rather than with
    positional indices 0..n-1.
    """
    # Use a different color map for DecisionTreeClassifier
    cmap = plt.cm.Oranges if 'DecisionTreeClassifier' in model_name else plt.cm.Blues

    disp = ConfusionMatrixDisplay.from_predictions(y_true, y_pred, cmap=cmap)
    # Work on the display's own axes/figure instead of pyplot's implicit
    # "current" figure, so other open figures cannot be affected.
    disp.ax_.set_title(f'{model_name} Confusion Matrix')
    disp.figure_.savefig(f'figures/{model_name}_confusion_matrix.png')
    plt.close(disp.figure_)

# Function to plot ROC curve for multiclass classification
def plot_roc_multiclass(y_true, y_pred_proba, model_name, classes):
    """Save one-vs-rest ROC curves to ``figures/{model_name}_roc_curve.png``.

    Parameters
    ----------
    y_true : array-like
        Ground-truth labels.
    y_pred_proba : array-like
        Per-class scores; column ``i`` is scored against ``classes[i]``.
        (The commented-out caller passes ``np.unique(y)``, which matches the
        sorted column order of ``predict_proba`` - TODO confirm if other
        callers are added.)
    model_name : str
        Used in the figure title and in the output file name.
    classes : array-like
        Class labels, in the same order as the columns of ``y_pred_proba``.
    """
    classes = np.asarray(classes)
    y_pred_proba = np.asarray(y_pred_proba)
    y_true_bin = label_binarize(y_true, classes=classes)

    # For a two-class problem label_binarize returns a single column that marks
    # classes[1]. Expand it to [is classes[0], is classes[1]] so that column i
    # lines up with y_pred_proba[:, i]; otherwise class-0 probabilities would be
    # scored against class-1 indicators and the curve would be inverted.
    if y_true_bin.shape[1] == 1 and y_pred_proba.shape[1] == 2:
        y_true_bin = np.hstack([1 - y_true_bin, y_true_bin])

    fig, ax = plt.subplots()
    ax.plot([0, 1], [0, 1], 'k--')  # chance diagonal
    for i in range(y_true_bin.shape[1]):
        fpr, tpr, _ = roc_curve(y_true_bin[:, i], y_pred_proba[:, i])
        # Label each curve with the real class label, not the column index.
        ax.plot(fpr, tpr, label=f'Class {classes[i]} (area = {auc(fpr, tpr):.2f})')
    ax.set_xlabel('False Positive Rate')
    ax.set_ylabel('True Positive Rate')
    ax.set_title(f'{model_name} ROC Curve')
    ax.legend(loc="lower right")
    fig.savefig(f'figures/{model_name}_roc_curve.png')
    plt.close(fig)

# Load the encoded dataset (comma-separated) into a pandas dataframe
data = pd.read_csv('leaveoneout_encoded.csv')

# Split the data into features and target
X = data.drop('target', axis=1)
y = data['target']

# Set up cross-validation
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# Define metrics and model
# NOTE: `scoring` is not consumed below (the manual loop computes the same
# metrics per fold); it is kept in case other code relies on it.
scoring = {
    'accuracy': 'accuracy',
    'precision': 'precision_weighted',
    'recall': 'recall_weighted',
    'f1': 'f1_weighted',
    'mcc': make_scorer(matthews_corrcoef)
}
model_name = 'DecisionTreeClassifier_5-fold'
model = DecisionTreeClassifier(random_state=42)

# Collect per-fold outputs in lists and concatenate once after the loop.
# Appending to a float64 empty array with np.append would silently cast
# integer labels to float and re-copy all accumulated data on every fold.
fold_y_true, fold_y_pred, fold_y_pred_proba = [], [], []

# Perform cross-validation
results = []
for train_idx, test_idx in cv.split(X, y):
    X_train, X_test = X.iloc[train_idx], X.iloc[test_idx]
    y_train, y_test = y.iloc[train_idx], y.iloc[test_idx]

    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    y_pred_proba = model.predict_proba(X_test)

    # Store results for the aggregate confusion matrix and ROC
    fold_y_true.append(y_test.to_numpy())
    fold_y_pred.append(y_pred)
    fold_y_pred_proba.append(y_pred_proba)

    results.append({
        'Accuracy': accuracy_score(y_test, y_pred),
        'Precision': precision_score(y_test, y_pred, average='weighted'),
        'Recall': recall_score(y_test, y_pred, average='weighted'),
        'F1 Score': f1_score(y_test, y_pred, average='weighted'),
        'MCC': matthews_corrcoef(y_test, y_pred)
    })

# Out-of-fold labels/predictions for every sample, original label dtype preserved
all_y_true = np.concatenate(fold_y_true)
all_y_pred = np.concatenate(fold_y_pred)
all_y_pred_proba = np.vstack(fold_y_pred_proba)

# Plot the confusion matrix aggregated over all folds (ROC plot currently disabled)
plot_cm(all_y_true, all_y_pred, model_name)
# plot_roc_multiclass(all_y_true, all_y_pred_proba, model_name, classes=np.unique(y))

# Calculate average metrics across all folds
average_results = pd.DataFrame(results).mean()
average_results['Model'] = model_name
print(average_results)




Accuracy                          0.972354
Precision                         0.974588
Recall                            0.972354
F1 Score                          0.971818
MCC                               0.958108
Model        DecisionTreeClassifier_5-fold
dtype: object


In [7]:
import os
import numpy as np
import pandas as pd
from sklearn.model_selection import StratifiedKFold, cross_validate
from sklearn.preprocessing import label_binarize
from sklearn.ensemble import AdaBoostClassifier
from sklearn.metrics import (confusion_matrix, roc_curve, auc,
                             accuracy_score, precision_score, recall_score,
                             f1_score, matthews_corrcoef, make_scorer, ConfusionMatrixDisplay)
import matplotlib.pyplot as plt

# Ensure the figures directory exists (exist_ok avoids the check-then-create race
# and makes the cell safe to re-run)
os.makedirs('figures', exist_ok=True)

# Function to plot confusion matrix with specific color maps for different classifiers
def plot_cm(y_true, y_pred, model_name):
    """Save a confusion-matrix figure to ``figures/{model_name}_confusion_matrix.png``.

    The colormap depends on which classifier name appears in ``model_name``
    (AdaBoost: green, DecisionTree: orange, RandomForest and anything else:
    blue).

    Parameters
    ----------
    y_true : array-like
        Ground-truth labels.
    y_pred : array-like
        Predicted labels, aligned with ``y_true``.
    model_name : str
        Used to pick the colormap, in the figure title, and in the file name.

    Notes
    -----
    ``ConfusionMatrixDisplay.from_predictions`` labels the axes with the class
    labels actually present in ``y_true``/``y_pred`` rather than with
    positional indices 0..n-1.
    """
    # Color maps per classifier; checked in insertion order, first match wins.
    colormaps = {
        'AdaBoostClassifier': plt.cm.Greens,
        'DecisionTreeClassifier': plt.cm.Oranges,
        'RandomForestClassifier': plt.cm.Blues,
    }
    cmap = next(
        (colormap for key, colormap in colormaps.items() if key in model_name),
        plt.cm.Blues,  # Default cmap
    )

    disp = ConfusionMatrixDisplay.from_predictions(y_true, y_pred, cmap=cmap)
    # Work on the display's own axes/figure instead of pyplot's implicit
    # "current" figure, so other open figures cannot be affected.
    disp.ax_.set_title(f'{model_name} Confusion Matrix')
    disp.figure_.savefig(f'figures/{model_name}_confusion_matrix.png')
    plt.close(disp.figure_)

# Function to plot ROC curve for multiclass classification
def plot_roc_multiclass(y_true, y_pred_proba, model_name, classes):
    """Save one-vs-rest ROC curves to ``figures/{model_name}_roc_curve.png``.

    Parameters
    ----------
    y_true : array-like
        Ground-truth labels.
    y_pred_proba : array-like
        Per-class scores; column ``i`` is scored against ``classes[i]``.
        (The commented-out caller passes ``np.unique(y)``, which matches the
        sorted column order of ``predict_proba`` - TODO confirm if other
        callers are added.)
    model_name : str
        Used in the figure title and in the output file name.
    classes : array-like
        Class labels, in the same order as the columns of ``y_pred_proba``.
    """
    classes = np.asarray(classes)
    y_pred_proba = np.asarray(y_pred_proba)
    y_true_bin = label_binarize(y_true, classes=classes)

    # For a two-class problem label_binarize returns a single column that marks
    # classes[1]. Expand it to [is classes[0], is classes[1]] so that column i
    # lines up with y_pred_proba[:, i]; otherwise class-0 probabilities would be
    # scored against class-1 indicators and the curve would be inverted.
    if y_true_bin.shape[1] == 1 and y_pred_proba.shape[1] == 2:
        y_true_bin = np.hstack([1 - y_true_bin, y_true_bin])

    fig, ax = plt.subplots()
    ax.plot([0, 1], [0, 1], 'k--')  # chance diagonal
    for i in range(y_true_bin.shape[1]):
        fpr, tpr, _ = roc_curve(y_true_bin[:, i], y_pred_proba[:, i])
        # Label each curve with the real class label, not the column index.
        ax.plot(fpr, tpr, label=f'Class {classes[i]} (area = {auc(fpr, tpr):.2f})')
    ax.set_xlabel('False Positive Rate')
    ax.set_ylabel('True Positive Rate')
    ax.set_title(f'{model_name} ROC Curve')
    ax.legend(loc="lower right")
    fig.savefig(f'figures/{model_name}_roc_curve.png')
    plt.close(fig)

# Load the encoded dataset (comma-separated) into a pandas dataframe
data = pd.read_csv('leaveoneout_encoded.csv')

# Split the data into features and target
X = data.drop('target', axis=1)
y = data['target']

# Set up cross-validation
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# Define metrics and model
# NOTE: `scoring` is not consumed below (the manual loop computes the same
# metrics per fold); it is kept in case other code relies on it.
scoring = {
    'accuracy': 'accuracy',
    'precision': 'precision_weighted',
    'recall': 'recall_weighted',
    'f1': 'f1_weighted',
    'mcc': make_scorer(matthews_corrcoef)
}
model_name = 'AdaBoostClassifier_5-fold'
model = AdaBoostClassifier(algorithm="SAMME", random_state=42)

# Collect per-fold outputs in lists and concatenate once after the loop.
# Appending to a float64 empty array with np.append would silently cast
# integer labels to float and re-copy all accumulated data on every fold.
fold_y_true, fold_y_pred, fold_y_pred_proba = [], [], []

# Perform cross-validation
results = []
for train_idx, test_idx in cv.split(X, y):
    X_train, X_test = X.iloc[train_idx], X.iloc[test_idx]
    y_train, y_test = y.iloc[train_idx], y.iloc[test_idx]

    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    y_pred_proba = model.predict_proba(X_test)

    # Store results for the aggregate confusion matrix and ROC
    fold_y_true.append(y_test.to_numpy())
    fold_y_pred.append(y_pred)
    fold_y_pred_proba.append(y_pred_proba)

    results.append({
        'Accuracy': accuracy_score(y_test, y_pred),
        'Precision': precision_score(y_test, y_pred, average='weighted'),
        'Recall': recall_score(y_test, y_pred, average='weighted'),
        'F1 Score': f1_score(y_test, y_pred, average='weighted'),
        'MCC': matthews_corrcoef(y_test, y_pred)
    })

# Out-of-fold labels/predictions for every sample, original label dtype preserved
all_y_true = np.concatenate(fold_y_true)
all_y_pred = np.concatenate(fold_y_pred)
all_y_pred_proba = np.vstack(fold_y_pred_proba)

# Plot the confusion matrix aggregated over all folds (ROC plot currently disabled)
plot_cm(all_y_true, all_y_pred, model_name)
# plot_roc_multiclass(all_y_true, all_y_pred_proba, model_name, classes=np.unique(y))

# Calculate average metrics across all folds
average_results = pd.DataFrame(results).mean()
average_results['Model'] = model_name
print(average_results)




Accuracy                      0.954033
Precision                     0.958871
Recall                        0.954033
F1 Score                      0.953634
MCC                           0.930072
Model        AdaBoostClassifier_5-fold
dtype: object
