# Fold assignments

In [None]:
import json
import os

import numpy as np
import pandas as pd
from sklearn.model_selection import StratifiedKFold

def generate_nested_cv_folds(ids, labels, outer_n=5, inner_n=4, random_state=42, shuffle=True, verbose=True):
    """
    Generate nested, stratified cross-validation folds as ID -> label mappings.

    Parameters
    ----------
    ids : array-like
        Sample identifiers, one per sample.
    labels : array-like
        Class labels aligned with ``ids``. Each label is stored as ``int(label)``;
        the verbose summary counts labels equal to 0 as HC and equal to 1 as PD.
    outer_n : int
        Number of outer (train/test) splits.
    inner_n : int
        Number of inner (train/validation) splits within each outer training set.
    random_state : int
        Seed of the outer split. Ignored when ``shuffle`` is False.
    shuffle : bool
        Whether samples are shuffled before splitting.
    verbose : bool
        Print a short summary of the configuration and of every outer fold.

    Returns
    -------
    dict
        ``{outer_idx: {"test": {id: label},
                       "inner": {inner_idx: {"train": {id: label},
                                             "val": {id: label}}}}}``
        with 1-based fold indices.

    Raises
    ------
    ValueError
        If ``ids`` and ``labels`` differ in length (or if scikit-learn rejects
        the requested split, e.g. too few samples per class).
    """
    ids = np.array(ids)
    labels = np.array(labels)

    if len(ids) != len(labels):
        raise ValueError(
            f"ids and labels must have the same length (got {len(ids)} and {len(labels)})"
        )

    if verbose:
        print("Generating fold assignments...")
        print(f"=== Nested CV ({outer_n}-outer, {inner_n}-inner) ===")
        print(f"Random state: {random_state} | Shuffle: {shuffle}\n")
        print(f"HC: {(labels == 0).sum()}, PD: {(labels == 1).sum()}")

    def as_mapping(id_array, label_array):
        # .tolist() converts NumPy scalars to native Python objects so the
        # mapping keys are always accepted by json.dump (np.int64 keys are not).
        return {id_: int(label) for id_, label in zip(id_array.tolist(), label_array)}

    folds = {}
    # StratifiedKFold raises ValueError when random_state is set while
    # shuffle=False, so a seed is only passed when shuffling is requested.
    outer_cv = StratifiedKFold(
        n_splits=outer_n,
        shuffle=shuffle,
        random_state=random_state if shuffle else None,
    )

    for outer_idx, (train_idx, test_idx) in enumerate(outer_cv.split(ids, labels), 1):
        train_ids, test_ids = ids[train_idx], ids[test_idx]
        train_labels, test_labels = labels[train_idx], labels[test_idx]

        if verbose:
            print(f"Outer Fold {outer_idx}/{outer_n}")
            print(f"Train: {len(train_ids)} | Test: {len(test_ids)}\n")

        outer_fold = {
            "test": as_mapping(test_ids, test_labels),
            "inner": {}
        }

        # NOTE: the inner split is seeded with the outer fold index, not with
        # `random_state`; this is kept so previously generated folds stay
        # reproducible.
        inner_cv = StratifiedKFold(
            n_splits=inner_n,
            shuffle=shuffle,
            random_state=outer_idx if shuffle else None,
        )
        for inner_idx, (inner_train_idx, val_idx) in enumerate(inner_cv.split(train_ids, train_labels), 1):
            outer_fold["inner"][inner_idx] = {
                "train": as_mapping(train_ids[inner_train_idx], train_labels[inner_train_idx]),
                "val": as_mapping(train_ids[val_idx], train_labels[val_idx])
            }

        folds[outer_idx] = outer_fold

    return folds

def generate_folds_from_classes(metadata_file, class_pair, output_file, exclude_id="C037_BFL",
                                outer_n=5, inner_n=4, random_state=42, shuffle=True):
    """
    Build nested CV folds for a pair of ``class2`` values and save them as JSON.

    Parameters
    ----------
    metadata_file : str
        CSV file with at least the columns ``ID`` and ``class2``.
    class_pair : sequence of two values
        ``class2`` values to keep; the first is mapped to label 0, the second to 1.
    output_file : str
        Destination JSON path. Its parent directory is created if missing.
    exclude_id : str
        ``ID`` value removed from the metadata before building the folds.
    outer_n, inner_n, random_state, shuffle
        Forwarded to ``generate_nested_cv_folds``.

    Raises
    ------
    ValueError
        If ``class_pair`` does not contain exactly two values.
    """
    if len(class_pair) != 2:
        raise ValueError(f"class_pair must contain exactly two classes, got {class_pair!r}")

    # Load metadata, drop the excluded ID and keep only the two requested classes.
    # .copy() makes the filtered frame independent so the column assignments
    # below do not trigger pandas' SettingWithCopyWarning.
    metadata = pd.read_csv(metadata_file)
    keep = (metadata['ID'] != exclude_id) & metadata['class2'].isin(class_pair)
    metadata = metadata.loc[keep].copy()

    # Map the classes to 0 and 1
    label_map = {class_pair[0]: 0, class_pair[1]: 1}
    metadata['class2'] = metadata['class2'].map(label_map)

    # Drop rows with a missing value in ANY column (not only ID/class2).
    metadata = metadata.dropna()
    metadata['class2'] = metadata['class2'].astype(int)

    # Get ids and labels
    ids = metadata['ID'].tolist()
    labels = metadata['class2'].tolist()

    # Generate nested folds
    fold_dict = generate_nested_cv_folds(
        ids, labels,
        outer_n=outer_n, inner_n=inner_n,
        random_state=random_state, shuffle=shuffle,
    )

    # Save to JSON (integer fold indices become string keys in the file)
    output_dir = os.path.dirname(output_file)
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)
    with open(output_file, "w") as f:
        json.dump(fold_dict, f, indent=4)

    print(f"Saved folds to {output_file}")

import json

# Settings shared by the PD-vs-HC fold generation below
outer_n = 5
inner_n = 4
random_state = 42
shuffle = True

# Load the metadata and drop the excluded recording
metadata = pd.read_csv('metadata.csv')
metadata = metadata[metadata['ID'] != 'C037_BFL']
ids = metadata['ID'].to_list()
labels = metadata['class'].to_list() # 0 = HC, 1 = PD

# Use the settings above instead of repeating the literals, so changing a
# setting actually changes the generated folds.
fold_dict = generate_nested_cv_folds(
    ids, labels,
    outer_n=outer_n, inner_n=inner_n,
    random_state=random_state, shuffle=shuffle,
)

# Make sure the output directory exists before writing the fold file
os.makedirs("folds", exist_ok=True)
with open("folds/PD_vs_HC_folds.json", "w") as f:
    json.dump(fold_dict, f, indent=4)

In [None]:
# Remaining pairwise comparisons: (class2 pair, destination fold file).
# The first class of each pair is labelled 0 and the second 1.
fold_jobs = [
    (['CTR_DCL', 'Park_DCL'], 'folds/CTR_DCL_vs_Park_DCL_folds.json'),
    (['CTR_noDCL', 'Park_noDCL'], 'folds/CTR_noDCL_vs_Park_noDCL_folds.json'),
    (['Park_noDCL', 'Park_DCL'], 'folds/Park_noDCL_vs_Park_DCL_folds.json'),
]

for pair, destination in fold_jobs:
    generate_folds_from_classes(
        metadata_file='metadata.csv',
        class_pair=pair,
        output_file=destination
    )

# Running nested-CV

In [None]:
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.svm import SVC
import matplotlib.pyplot as plt
from matplotlib.patches import Patch
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score, f1_score, confusion_matrix, roc_curve, auc, recall_score, precision_score

def specificity_score(y_true, y_pred):
    """
    Calculate specificity (true negative rate) for binary 0/1 labels.

    Label 0 is the negative class and label 1 the positive class.

    Parameters
    ----------
    y_true : array-like
        Ground-truth labels.
    y_pred : array-like
        Predicted labels, aligned with ``y_true``.

    Returns
    -------
    float
        ``TN / (TN + FP)``, or ``nan`` when ``y_true`` contains no negative
        sample (specificity is undefined in that case).
    """
    y_true = np.asarray(y_true)
    y_pred = np.asarray(y_pred)

    # Counting directly (instead of unpacking a confusion matrix) keeps this
    # working when a fold contains a single class, where the matrix would be
    # 1x1 and the four-way unpack would raise ValueError.
    negatives = y_true == 0
    tn = int(np.sum(negatives & (y_pred == 0)))
    fp = int(np.sum(negatives & (y_pred == 1)))

    if tn + fp == 0:
        return float("nan")
    return tn / (tn + fp)

def train_inner_model(X_train, y_train, inner_folds, random_state):
    """
    Tune a standardised RBF-kernel SVM by grid search and return the best pipeline.

    ``inner_folds`` is passed to ``GridSearchCV`` as its ``cv`` argument, the
    candidates are ranked by F1 score, and the returned estimator is the
    search's ``best_estimator_``.
    """
    # The same value grid is searched for both C and gamma.
    grid_values = [0.0001, 0.001, 0.01, 0.1, 1, 10, 100]
    search_space = {
        'svm__C': list(grid_values),
        'svm__gamma': list(grid_values),
        'svm__kernel': ['rbf'],
    }

    classifier = SVC(probability=True, random_state=random_state)
    steps = [('scaler', StandardScaler()), ('svm', classifier)]

    searcher = GridSearchCV(
        estimator=Pipeline(steps),
        param_grid=search_space,
        scoring='f1',
        cv=inner_folds,
        n_jobs=-1,
        verbose=1,
    )
    # fit() returns the search object itself, so the lookup can be chained.
    return searcher.fit(X_train, y_train).best_estimator_

def evaluate_model(model, X_test, y_test):
    """
    Score a fitted classifier on a test set.

    Returns a dict with the scalar metrics (accuracy, sensitivity, specificity,
    f1, precision), the confusion matrix, and the raw ``y_true`` / ``y_pred`` /
    ``y_proba`` values, where ``y_proba`` is column 1 of ``predict_proba``.
    """
    predicted = model.predict(X_test)
    positive_proba = model.predict_proba(X_test)[:, 1]

    # Every scorer takes (y_true, y_pred); the tuple order fixes the key order.
    scorers = (
        ('accuracy', accuracy_score),
        ('sensitivity', recall_score),
        ('specificity', specificity_score),
        ('f1', f1_score),
        ('precision', precision_score),
        ('confusion_matrix', confusion_matrix),
    )
    report = {name: scorer(y_test, predicted) for name, scorer in scorers}
    report.update(y_true=y_test, y_pred=predicted, y_proba=positive_proba)
    return report

def plot_ROC_curve(all_y_true, all_y_proba, filename='roc_curve.png'):
    """
    Draw the ROC curve for the given labels and scores and save it to ``filename``.

    The AUC is shown in the legend and a dashed diagonal marks the chance level.
    """
    fpr, tpr, _thresholds = roc_curve(all_y_true, all_y_proba)
    roc_auc = auc(fpr, tpr)

    figure, axis = plt.subplots()
    axis.plot(fpr, tpr, color='darkorange', lw=2, label=f'ROC (AUC = {roc_auc:.2f})')
    # Chance-level reference line
    axis.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')

    axis.set_xlim([0.0, 1.0])
    axis.set_ylim([0.0, 1.05])
    axis.set_xlabel('False Positive Rate')
    axis.set_ylabel('True Positive Rate')
    axis.set_title('ROC Curve')
    axis.legend(loc="lower right")

    figure.savefig(filename)
    plt.close(figure)

def plot_confusion_matrix(all_y_true, all_y_pred, filename='confusion_matrix.png', class_names=('HC', 'PD')):
    """
    Generate and save a confusion-matrix heatmap.

    Parameters
    ----------
    all_y_true : array-like
        True labels, encoded as 0 .. len(class_names) - 1.
    all_y_pred : array-like
        Predicted labels, same encoding.
    filename : str
        Path of the image file to write.
    class_names : sequence of str
        Tick labels, in label order (default: 0 = 'HC', 1 = 'PD').
    """
    class_names = list(class_names)

    # Fixing the label set keeps the matrix shape equal to the number of tick
    # labels even when one class is missing from both y_true and y_pred.
    matrix = confusion_matrix(all_y_true, all_y_pred, labels=list(range(len(class_names))))

    fig, ax = plt.subplots(figsize=(6, 6))
    sns.heatmap(matrix,
                annot=True, fmt='d', cmap='Blues',
                xticklabels=class_names,
                yticklabels=class_names,
                ax=ax)
    ax.set_title('Confusion Matrix')
    ax.set_ylabel('True Label')
    ax.set_xlabel('Predicted Label')
    fig.savefig(filename)
    plt.close(fig)

def plot_probability_density(y_true, y_proba, filename='probability_density.png'):
    """
    Plot the distribution of predicted probabilities for each class.

    The figure has a box plot on top and a histogram with KDE below, sharing
    the x axis.

    Parameters:
    -----------
    y_true : array-like
        True labels (0=HC, anything else is shown as PD)
    y_proba : array-like
        Predicted probabilities for class 1 (PD)
    filename : str
        Path to save the plot
    """
    # Create DataFrame for plotting
    df = pd.DataFrame({
        'Probability': y_proba,
        'Group': ['HC' if label == 0 else 'PD' for label in y_true]
    })

    # Define colors (Soft blue shades)
    blues = sns.color_palette("Blues", n_colors=6)
    colors = [blues[1], blues[4]]  # light blue for HC, dark blue for PD

    # Bind each colour to its group explicitly. With a plain list, seaborn
    # assigns colours in order of first appearance in the data, so the plot
    # colours could be swapped relative to the hand-built legend below
    # whenever the first sample is a PD subject.
    group_order = ['HC', 'PD']
    palette = dict(zip(group_order, colors))

    # Create figure with two subplots: Boxplot (top) and Density plot (bottom)
    fig, axes = plt.subplots(nrows=2, ncols=1, gridspec_kw={'height_ratios': [1, 4]}, figsize=(6, 6), sharex=True)

    # Boxplot (Top)
    sns.boxplot(x="Probability", y="Group", data=df, order=group_order,
                hue="Group", hue_order=group_order, dodge=False,
                palette=palette, ax=axes[0], legend=False)
    axes[0].set_xlabel("")  # Remove xlabel to avoid redundancy
    axes[0].set_ylabel("")
    axes[0].set_yticks([])
    axes[0].set_title("Predicted Probability Distribution")

    # Histogram + KDE (Bottom)
    sns.histplot(data=df, x="Probability", hue="Group", hue_order=group_order,
                 kde=True, palette=palette, stat="density", bins=20,
                 ax=axes[1], legend=True)

    axes[1].set_xlabel("Predicted Probability")
    # Replace seaborn's legend with fixed handles matching the palette above
    legend_handles = [
        Patch(color=palette['HC'], label='HC'),
        Patch(color=palette['PD'], label='PD')
    ]
    axes[1].legend(handles=legend_handles)

    fig.tight_layout()
    fig.savefig(filename, dpi=300)  # Save the plot
    plt.close(fig)

def run_nested_svm_cv(fold_dict, features_df, random_state=42, features='articulation',
                      file_suffix='_MOTOR_LECTURA.wav'):
    """
        Perform nested CV SVM classification with predefined folds

        Parameters:
        -----------
        fold_dict : dict
            Output from generate_nested_cv_folds() (or the same structure
            loaded back from JSON)
        features_df : pd.DataFrame
            DataFrame with features (index=filename, columns=features)
        random_state : int
            Random seed for reproducibility
        features : str
            Name of the feature set; only used in the plot file names
        file_suffix : str
            Suffix appended to every fold ID to obtain the row label in
            ``features_df``

        Returns:
        --------
        tuple: (results, all_y_true, all_y_pred, all_y_proba)
            ``results`` is a list with one dict per outer fold (metrics,
            predictions, fitted SVC and file lists); the three arrays hold
            the test labels, predictions and class-1 probabilities
            concatenated over all outer folds.

        Raises:
        -------
        KeyError
            If a fold ID (plus suffix) is missing from ``features_df``.
    """
    import os

    def to_files(id_map):
        # Fold dictionaries are keyed by subject ID; features are indexed by file name.
        return [id_ + file_suffix for id_ in id_map]

    results = []
    all_y_true = []
    all_y_pred = []
    all_y_proba = []

    for outer_fold, fold_info in fold_dict.items():
        test_ids = to_files(fold_info["test"])
        y_test = list(fold_info["test"].values())

        # The outer training set is the union of every inner train/val split.
        train_labels_dict = {}
        for inner in fold_info["inner"].values():
            train_labels_dict.update(inner["train"])
            train_labels_dict.update(inner["val"])
        train_ids = to_files(train_labels_dict)
        y_train = list(train_labels_dict.values())

        X_train = features_df.loc[train_ids].values
        X_test = features_df.loc[test_ids].values

        # Translate each inner split into positional indices into X_train.
        # Sets give O(1) membership tests instead of scanning a list per ID.
        inner_folds = []
        for inner in fold_info["inner"].values():
            inner_train_ids = set(to_files(inner["train"]))
            val_ids = set(to_files(inner["val"]))
            train_idx = [i for i, id_ in enumerate(train_ids) if id_ in inner_train_ids]
            val_idx = [i for i, id_ in enumerate(train_ids) if id_ in val_ids]
            inner_folds.append((train_idx, val_idx))

        model = train_inner_model(X_train, y_train, inner_folds, random_state)
        metrics = evaluate_model(model, X_test, y_test)

        results.append({
            'outer_fold': outer_fold,
            **metrics,
            # This is the fitted SVC step of the best pipeline, not a dict of parameters.
            'best_params': model.get_params()['svm'],
            'train_files': train_ids,
            'test_files': test_ids
        })

        all_y_true.extend(metrics['y_true'])
        all_y_pred.extend(metrics['y_pred'])
        all_y_proba.extend(metrics['y_proba'])

        print(f"\nFold {outer_fold} Results:")
        print(f"Train set size: {len(y_train)}")
        print(f"Test set size: {len(y_test)}")
        print(f"Accuracy: {metrics['accuracy']:.3f}")
        print(f"Sensitivity: {metrics['sensitivity']:.3f}")
        print(f"Specificity: {metrics['specificity']:.3f}")

    # The plots are built from the predictions pooled over all outer folds.
    os.makedirs('results/plots', exist_ok=True)
    plot_ROC_curve(all_y_true, all_y_proba, f'results/plots/roc_curve_{features}.png')
    plot_confusion_matrix(all_y_true, all_y_pred, f'results/plots/confusion_matrix_{features}.png')
    plot_probability_density(all_y_true, all_y_proba, f'results/plots/probability_density_{features}.png')

    return results, np.array(all_y_true), np.array(all_y_pred), np.array(all_y_proba)

# All features

In [None]:
# Feature files per experiment; 'all' joins the four groups column-wise.
feature_sets = dict(
    articulation=['articulationfeatures.csv'],
    phonation=['phonationfeatures.csv'],
    phonological=['phonologicalfeatures.csv'],
    prosody=['prosodyfeatures.csv'],
    all=[
        'articulationfeatures.csv',
        'phonationfeatures.csv',
        'phonologicalfeatures.csv',
        'prosodyfeatures.csv',
    ],
)

# Predefined PD-vs-HC folds
with open("folds/PD_vs_HC_folds.json", "r") as fold_file:
    fold_dict = json.load(fold_file)

# One nested-CV run per feature set
for feature_name, file_list in feature_sets.items():
    print(f"\n=== Running Nested CV for: {feature_name} ===")

    # Read every CSV of the set, index it by 'id' and join the columns
    frames = [pd.read_csv(f'features/{f}').set_index('id') for f in file_list]
    features_df = pd.concat(frames, axis=1)

    print(f"Feature shape: {features_df.shape}")

    # Only the per-fold results are needed here; the pooled arrays are dropped
    results = run_nested_svm_cv(fold_dict, features_df, features=feature_name)[0]

    # Persist the per-fold results
    results_df = pd.DataFrame(results)
    results_df.to_csv(f'results/{feature_name}_results.csv', index=False)

    # Mean and standard deviation of each metric across the outer folds
    final_metrics = {}
    for metric in ('accuracy', 'sensitivity', 'specificity', 'f1'):
        per_fold = [fold[metric] for fold in results]
        final_metrics[f'mean_{metric}'] = np.mean(per_fold)
        final_metrics[f'std_{metric}'] = np.std(per_fold)

    # Report as percentages
    print(f"\n=== Final Metrics {feature_name.title()} ===")
    print(f"Mean Accuracy: {100*final_metrics['mean_accuracy']:.1f} ± {100*final_metrics['std_accuracy']:.1f}")
    print(f"Mean Sensitivity: {100*final_metrics['mean_sensitivity']:.1f} ± {100*final_metrics['std_sensitivity']:.1f}")
    print(f"Mean Specificity: {100*final_metrics['mean_specificity']:.1f} ± {100*final_metrics['std_specificity']:.1f}")
    print(f"Mean f1: {100*final_metrics['mean_f1']:.1f} ± {100*final_metrics['std_f1']:.1f}")

In [20]:
feature_sets = {
    'articulation': ['articulationfeatures.csv'],
    'phonation': ['phonationfeatures.csv'],
    'phonological': ['phonologicalfeatures.csv'],
    'prosody': ['prosodyfeatures.csv'],
    'all': [
        'articulationfeatures.csv',
        'phonationfeatures.csv',
        'phonologicalfeatures.csv',
        'prosodyfeatures.csv'
    ]
}

# Name of this comparison. It is used for the fold file and as a prefix for
# every output file, so the result CSVs and plots written here do not
# overwrite the ones written by the PD-vs-HC run, which uses the bare
# feature-set name.
task = "CTR_noDCL_vs_Park_noDCL"

with open(f"folds/{task}_folds.json", "r") as f:
    fold_dict = json.load(f)

# Loop over each feature set
for feature_name, file_list in feature_sets.items():
    print(f"\n=== Running Nested CV for: {feature_name} ===")

    # Load and combine features
    features_df = pd.concat([
        pd.read_csv(f'features/{f}').set_index('id') for f in file_list
    ], axis=1)

    # Run nested CV; `features` only determines the plot file names
    results, _, _, _ = run_nested_svm_cv(fold_dict, features_df, features=f'{task}_{feature_name}')

    # Save results
    results_df = pd.DataFrame(results)
    results_df.to_csv(f'results/{task}_{feature_name}_results.csv', index=False)

    # Aggregate metrics (mean and population standard deviation over the outer folds)
    final_metrics = {
        'mean_accuracy': np.mean([x['accuracy'] for x in results]),
        'std_accuracy': np.std([x['accuracy'] for x in results]),
        'mean_sensitivity': np.mean([x['sensitivity'] for x in results]),
        'std_sensitivity': np.std([x['sensitivity'] for x in results]),
        'mean_specificity': np.mean([x['specificity'] for x in results]),
        'std_specificity': np.std([x['specificity'] for x in results]),
        'mean_f1': np.mean([x['f1'] for x in results]),
        'std_f1': np.std([x['f1'] for x in results]),
    }

    # Print metrics
    print(f"\n=== Final Metrics {feature_name.title()} ===")
    print(f"Mean Accuracy: {100*final_metrics['mean_accuracy']:.1f} ± {100*final_metrics['std_accuracy']:.1f}")
    print(f"Mean Sensitivity: {100*final_metrics['mean_sensitivity']:.1f} ± {100*final_metrics['std_sensitivity']:.1f}")
    print(f"Mean Specificity: {100*final_metrics['mean_specificity']:.1f} ± {100*final_metrics['std_specificity']:.1f}")
    print(f"Mean f1: {100*final_metrics['mean_f1']:.1f} ± {100*final_metrics['std_f1']:.1f}")


=== Running Nested CV for: articulation ===
Fitting 4 folds for each of 49 candidates, totalling 196 fits

Fold 1 Results:
Train set size: 37
Test set size: 10
Accuracy: 0.500
Sensitivity: 1.000
Specificity: 0.000
Fitting 4 folds for each of 49 candidates, totalling 196 fits

Fold 2 Results:
Train set size: 37
Test set size: 10
Accuracy: 0.500
Sensitivity: 1.000
Specificity: 0.000
Fitting 4 folds for each of 49 candidates, totalling 196 fits

Fold 3 Results:
Train set size: 38
Test set size: 9
Accuracy: 0.444
Sensitivity: 1.000
Specificity: 0.000
Fitting 4 folds for each of 49 candidates, totalling 196 fits

Fold 4 Results:
Train set size: 38
Test set size: 9
Accuracy: 0.444
Sensitivity: 0.600
Specificity: 0.250
Fitting 4 folds for each of 49 candidates, totalling 196 fits

Fold 5 Results:
Train set size: 38
Test set size: 9
Accuracy: 0.667
Sensitivity: 0.600
Specificity: 0.750

=== Final Metrics Articulation ===
Mean Accuracy: 51.1 ± 8.2
Mean Sensitivity: 84.0 ± 19.6
Mean Specificity