# Classification of arrhythmias

This notebook explores the use of interpretable algorithms in healthcare, with the goal of detecting the features that explain why an arrhythmia is present.

In [None]:
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import accuracy_score, f1_score, confusion_matrix, roc_curve, auc
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.svm import SVC
from sklearn.inspection import permutation_importance
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.naive_bayes import GaussianNB

In [None]:
# Load data
# Features: the first CSV row is read as the column names, because the
# preprocessing step selects the 'Gender' column by name (with header=None the
# columns would be labelled 0..n-1 and that lookup would raise KeyError).
X = pd.read_csv('Train_Test_Data/X.csv')
# Labels: must be bound to `y`, not `X` (otherwise the features are overwritten
# and `y` is never defined).
# NOTE(review): header=None is kept from the original; if y.csv also starts
# with a header row, drop header=None so X and y keep the same number of rows.
y = pd.read_csv('Train_Test_Data/y.csv', header=None)

In [None]:
# Start with XGBoost, then SVM with a couple of kernels, then Naive Bayes; compare metrics, paying particular attention to false negatives and F1

# Drop the first column (selected by position; presumably an identifier/index
# column -- confirm against X.csv). The .copy() makes X an independent frame so
# the column assignment below does not trigger pandas' SettingWithCopyWarning.
X = X.iloc[:, 1:].copy()

# Convert 'Gender' to binary: 'MALE' -> 1, 'FEMALE' -> 0.
# Case and surrounding whitespace are normalised first so that 'male' or
# ' Male ' are encoded too instead of silently becoming NaN. Any other value
# still maps to NaN.
X['Gender'] = (
    X['Gender']
    .astype(str)
    .str.strip()
    .str.upper()
    .map({'MALE': 1, 'FEMALE': 0})
)

# Keep a copy of the column names (X becomes a plain ndarray after scaling)
feature_names = X.columns

# Scale data
# NOTE(review): the scaler is fitted on all rows before cross-validation, so
# the held-out rows of every fold contribute to the mean/variance used for
# scaling. Fit the scaler per training fold if strict separation is required.
scaler = StandardScaler()
X = scaler.fit_transform(X)

# Encode labels
# np.ravel flattens a single-column frame/array into the 1-D shape that
# LabelEncoder expects.
le = LabelEncoder()
y = le.fit_transform(np.ravel(y))

# Fixed seed so that model fitting gives the same result on every run.
RANDOM_STATE = 42

# Initialize classifiers
# NOTE: the "XGBoost" entry is scikit-learn's GradientBoostingClassifier, not
# the xgboost library. The key is left unchanged because it is the label used
# in the printed results and in the comparison plots.
classifiers = {
    # probability=True enables predict_proba; random_state seeds the internal
    # shuffling SVC uses for its probability estimates.
    "SVM": SVC(probability=True, random_state=RANDOM_STATE),
    "XGBoost": GradientBoostingClassifier(random_state=RANDOM_STATE),
    "Naive Bayes": GaussianNB(),
}


# Initialize StratifiedKFold for 10-fold cross-validation
skf = StratifiedKFold(n_splits=10)

# Maps classifier name -> dict of averaged metrics, filled by the evaluation loop
results = {}

# Cross-validated evaluation of every classifier in `classifiers`.
# For each fold: drop NaN rows, undersample the training split to balanced
# classes, fit, and collect accuracy, weighted F1, macro TPR/TNR and per-class
# ROC AUC. Fold averages are printed and stored in `results[name]`.

SEED = 42
# Seeded generator so the random undersampling below -- and therefore every
# reported metric -- is reproducible between runs.
rng = np.random.default_rng(SEED)

# Full label set, so each fold's confusion matrix has one row/column per class
# even when a class does not occur in that fold.
all_labels = np.unique(y)

for name, clf in classifiers.items():
    accuracies = []
    f1_scores = []
    tps = []
    tns = []
    rocs = []
    # Test split of the most recent fold that was actually evaluated; reused
    # for permutation importance once the fold loop has finished.
    X_eval, y_eval = None, None

    for train_index, test_index in skf.split(X, y):
        X_train, X_test = X[train_index], X[test_index]
        y_train, y_test = y[train_index], y[test_index]

        # Drop rows containing NaN from BOTH splits. The original filtered only
        # the test rows, which leaves NaN in the data passed to fit().
        train_ok = ~np.isnan(X_train).any(axis=1)
        test_ok = ~np.isnan(X_test).any(axis=1)
        X_train, y_train = X_train[train_ok], y_train[train_ok]
        X_test, y_test = X_test[test_ok], y_test[test_ok]
        if len(y_train) == 0 or len(y_test) == 0:
            # Nothing usable is left in this fold.
            continue

        # Balance classes by undersampling every class in the training set
        # down to the size of the rarest class that is present. Counting via
        # np.unique (rather than np.bincount) ignores label values that do not
        # occur, which would otherwise force the sample size to 0.
        train_labels, train_counts = np.unique(y_train, return_counts=True)
        num_samples = train_counts.min()
        balanced_idx = np.concatenate([
            rng.permutation(np.flatnonzero(y_train == label))[:num_samples]
            for label in train_labels
        ])

        # Train classifier
        clf.fit(X_train[balanced_idx], y_train[balanced_idx])

        # Make predictions. Column j of predict_proba belongs to
        # clf.classes_[j], so translate the argmax back into a label instead
        # of assuming that column index == label.
        y_pred_proba = clf.predict_proba(X_test)
        y_pred = clf.classes_[np.argmax(y_pred_proba, axis=1)]

        # Calculate metrics
        accuracies.append(accuracy_score(y_test, y_pred))
        f1_scores.append(f1_score(y_test, y_pred, average='weighted'))

        # Rows of cm are true labels, columns are predicted labels.
        cm = confusion_matrix(y_test, y_pred, labels=all_labels)
        total = cm.sum()
        actual = cm.sum(axis=1)      # samples that truly belong to each class
        predicted = cm.sum(axis=0)   # samples predicted as each class
        tp = np.diag(cm)
        tn = total - actual - predicted + tp

        # TPR = TP / actual positives; classes absent from this fold are
        # skipped rather than producing 0/0.
        has_pos = actual > 0
        tp_rate = tp[has_pos] / actual[has_pos]

        # TNR = TN / actual negatives = TN / (total - row sum). Dividing by
        # (total - column sum) would give TN / (TN + FN) instead.
        negatives = total - actual
        has_neg = negatives > 0
        tn_rate = tn[has_neg] / negatives[has_neg]

        tps.append(np.mean(tp_rate))
        tns.append(np.mean(tn_rate))

        # Compute one-vs-rest ROC AUC for each class the model knows about
        roc_auc = {}
        for col, label in enumerate(clf.classes_):
            is_pos = y_test == label
            if not is_pos.any() or is_pos.all():
                # ROC is undefined without both positives and negatives.
                continue
            fpr, tpr, _ = roc_curve(y_test, y_pred_proba[:, col], pos_label=label)
            roc_auc[label] = auc(fpr, tpr)

        rocs.append(roc_auc)
        X_eval, y_eval = X_test, y_test

    if not accuracies:
        print(f"{name}: no fold contained usable (NaN-free) data; skipped.")
        continue

    # Average ROC AUC over ALL folds (macro over classes within a fold, then
    # mean over folds) instead of reporting only the last fold.
    fold_aucs = [np.mean(list(fold.values())) for fold in rocs if fold]
    mean_roc_auc = np.mean(fold_aucs) if fold_aucs else float('nan')

    print(f"{name} Results:")
    print(f"Accuracy: {np.mean(accuracies)}")
    print(f"F1 Score: {np.mean(f1_scores)}")
    print(f"True Positives Rate: {np.mean(tps)}")
    print(f"True Negatives Rate: {np.mean(tns)}")

    # Feature importance via permutation importance (model-agnostic, so it is
    # applied to every classifier). `clf` here is the model fitted on the last
    # evaluated fold, scored on that fold's test rows only.
    perm_importance = permutation_importance(clf, X_eval, y_eval, random_state=SEED)
    importance = pd.DataFrame({'feature': feature_names, 'importance': perm_importance.importances_mean})
    print(importance.sort_values('importance', ascending=False))

    # Store results for plotting
    results[name] = {
        'Accuracy': np.mean(accuracies),
        'F1 Score': np.mean(f1_scores),
        'True Positives Rate': np.mean(tps),
        'True Negatives Rate': np.mean(tns),
        'ROC AUC': mean_roc_auc
    }

# Plotting metrics to compare models: one bar chart per metric, one bar per
# classifier stored in `results`.
metrics = ['Accuracy', 'F1 Score', 'True Positives Rate', 'True Negatives Rate', 'ROC AUC']
fig, axs = plt.subplots(3, 2, figsize=(10, 15))

# Flatten the 3x2 grid row by row so panels can be paired with metrics directly.
panels = axs.ravel()
model_names = list(results)

for ax, metric in zip(panels, metrics):
    ax.bar(model_names, [results[model][metric] for model in model_names])
    ax.set_title(metric)

# The grid has more panels than there are metrics; hide the spare one(s) so
# no empty axes frame is drawn.
for ax in panels[len(metrics):]:
    ax.set_visible(False)

plt.tight_layout()
plt.show()