In [2]:
import pandas as pd
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, AdaBoostClassifier
from sklearn.metrics import accuracy_score, confusion_matrix
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
from scipy.stats import loguniform
from sklearn.ensemble import HistGradientBoostingRegressor
from sklearn.model_selection import RandomizedSearchCV

In [3]:
# Balanced 250 Hz recordings, one parquet file per ID, in a fixed order.
files = [
    f'balanced_data/{pid}_balanced_250hz.parquet'
    for pid in (
        'BK7610', 'BU4707', 'CC6740', 'DC6359', 'DK3500',
        'HV0618', 'JB3156', 'JR8022', 'MC7070', 'MJ8002',
        'PC6771', 'SA0297', 'SF3079',
    )
]

In [4]:
# Extracted-feature tables, one parquet file per ID, in a fixed order.
feat_files = [
    'features/{}_features.parquet'.format(pid)
    for pid in [
        'BK7610', 'BU4707', 'CC6740', 'DC6359', 'DK3500',
        'HV0618', 'JB3156', 'JR8022', 'MC7070', 'MJ8002',
        'PC6771', 'SA0297', 'SF3079',
    ]
]

In [5]:
# IDs in the same order as the file lists defined in this notebook.
pids = (
    'BK7610 BU4707 CC6740 DC6359 DK3500 HV0618 JB3156 '
    'JR8022 MC7070 MJ8002 PC6771 SA0297 SF3079'
).split()

In [16]:
def thirteen_fold_cross_validation(parquet_files):
    """Run a randomized hyper-parameter search once per left-out file.

    For every file in ``parquet_files`` the remaining files are concatenated
    into a training set and a ``RandomizedSearchCV`` over
    ``GradientBoostingClassifier`` is fitted on it. The left-out file is
    only excluded from training; it is NOT scored here.

    Parameters
    ----------
    parquet_files : list of str
        Paths to parquet tables. Each table must contain the columns
        ``'time'``, ``'pid'`` and ``'TAC_class'``; ``'TAC_class'`` is the
        label and all remaining columns are used as features.

    Returns
    -------
    list of pandas.DataFrame
        One ``cv_results_`` table per left-out file, in input order. Each
        table carries two extra columns, ``mean_test_error`` (negated
        ``mean_test_score``) and ``std_test_error``, and is sorted by
        ``mean_test_error`` ascending (best candidate first).

    Raises
    ------
    ValueError
        From ``pd.concat`` when exactly one file is given, because the
        training set would then be empty.
    """
    non_feature_columns = ['time', 'pid', 'TAC_class']

    # Loop-invariant search space, built once instead of once per fold.
    param_distributions = {
        "min_weight_fraction_leaf": [0.0, 0.5, 0.1, 0.2, 0.33],
        "max_leaf_nodes": [2, 5, 10, 20, 50, 100],
        "learning_rate": [1, 0.5, 0.25, 0.1, 0.01],
    }

    # Read every file exactly once; the folds below only re-combine them.
    frames = [pd.read_parquet(path) for path in parquet_files]

    results = []
    for i in range(len(frames)):
        # Training set = every file except the i-th one.
        train_data = pd.concat(frames[:i] + frames[i + 1:])
        train_labels = train_data['TAC_class']
        train_data = train_data.drop(columns=non_feature_columns)

        # NOTE(review): "neg_mean_absolute_error" is a regression metric; it
        # only behaves like an error rate if TAC_class is numeric 0/1 --
        # confirm, or switch to a classification scorer.
        search_cv = RandomizedSearchCV(
            GradientBoostingClassifier(),
            param_distributions=param_distributions,
            scoring="neg_mean_absolute_error",
            random_state=42,
            n_jobs=2,
        )
        search_cv.fit(train_data, train_labels)

        cv_results = pd.DataFrame(search_cv.cv_results_)
        cv_results["mean_test_error"] = -cv_results["mean_test_score"]
        cv_results["std_test_error"] = cv_results["std_test_score"]
        # sort_values returns a new frame; keep it so the best candidate is
        # actually first in what we hand back.
        results.append(cv_results.sort_values(by="mean_test_error"))

    # TODO: evaluate the selected model on the left-out file (accuracy and
    # a confusion matrix per fold); currently only search results are returned.
    return results
    

In [17]:
# The search is expensive, so keep its output in a named variable rather than
# relying on Out[N]; show only the number of folds instead of every table.
fold_search_results = thirteen_fold_cross_validation(feat_files)
len(fold_search_results)

KeyboardInterrupt: 