In [1]:
import csv
import os
from itertools import combinations

# The wildcard import stays ahead of the explicit third-party imports so that
# explicitly imported names always take precedence over anything it exports.
from pycaret.classification import *
import pandas as pd
from pandas import read_csv as read

In [2]:
%%capture
# Load the feature table from CSV.
df = read('../data/initial_features_classification.csv')

# How many subjects are held out as the test set in each split.
# Keep this at 3 to match the data used in the results.
COMBINATIONS = 3

# Every possible group of COMBINATIONS distinct subjects, as tuples.
subject_ids = df['subject'].unique()
combinations_list = [*combinations(subject_ids, COMBINATIONS)]

In [5]:
def _feature_importance(model, feature_names):
    """Return a per-feature importance table for a fitted model.

    Tries ``model.feature_importances_`` first and ``model.coef_`` second,
    taking absolute values. A 2-D array (linear models expose ``coef_`` as
    ``(1, n_features)`` for binary targets and ``(n_classes, n_features)``
    for multiclass targets) is collapsed to one value per feature by averaging
    the absolute values over the first axis.

    Args:
        model: Fitted estimator to inspect.
        feature_names: Column labels, one per feature.

    Returns:
        DataFrame with columns ``Feature`` and ``Value`` sorted by ``Value``
        in descending order, or a single ``('error', 0)`` row when neither
        attribute yields a usable table.
    """
    for attr in ('feature_importances_', 'coef_'):
        try:
            values = abs(getattr(model, attr))
            if getattr(values, 'ndim', 1) > 1:
                values = values.mean(axis=0)
            return (
                pd.DataFrame({'Feature': feature_names, 'Value': values})
                .sort_values(by='Value', ascending=False)
                .reset_index(drop=True)
            )
        except Exception:
            # Best effort: the attribute is missing or unusable for this model,
            # so move on to the next candidate. KeyboardInterrupt/SystemExit
            # are deliberately not caught here.
            continue
    return pd.DataFrame({'Feature': ['error'], 'Value': [0]})


def _concat_frames(frames):
    """Concatenate a list of DataFrames; an empty list gives an empty DataFrame."""
    return pd.concat(frames, ignore_index=True) if frames else pd.DataFrame()


# Create the output directory up front so the CSV export cannot fail on a
# missing folder after all models have been trained.
os.makedirs('outputs', exist_ok=True)

# Per-split results are collected in lists and concatenated once at the end.
train_frames = []
test_frames = []
importance_frames = []

for sub in combinations_list:  # EX: (2, 3, 6)
    sub_label = str(sub)

    # Split data into training and testing based on subject
    held_out = df['subject'].isin(sub)
    train = df[~held_out]
    test = df[held_out]

    # NOTE(review): no session_id is passed to setup(), so runs are presumably
    # not reproducible -- consider pinning one.
    setup(data=train, target='rpe', ignore_features=['experimental_condition', 'subject', 'wrist_acc_time'], verbose=False)
    best = compare_models(sort='Accuracy', n_select=18)
    # pull() returns the results table of the compare_models call above
    cv_results = pull()
    cv_results['test_set'] = sub_label
    train_frames.append(cv_results)

    # compare_models may hand back a bare estimator instead of a list
    if not isinstance(best, list):
        best = [best]

    feature_names = get_config('X_train').columns

    for model in best:
        # The class name is stable across splits, unlike str(model) for
        # estimators whose repr contains a memory address.
        model_name = type(model).__name__

        # Run the model on the held-out subjects; the metrics table is
        # fetched with pull() rather than from the returned predictions.
        predict_model(model, data=test, verbose=False)
        test_metrics = pull()
        test_metrics['Model'] = model_name
        test_metrics['test_set'] = sub_label
        test_frames.append(test_metrics)

        # Add feature importance of model
        importance = _feature_importance(model, feature_names)
        importance['Model'] = model_name
        # NOTE(review): this column is 'test_subjects' while the other two
        # tables use 'test_set'; kept as-is to preserve the CSV schema.
        importance['test_subjects'] = sub_label
        importance_frames.append(importance)

train_general_df = _concat_frames(train_frames)
test_general_df = _concat_frames(test_frames)
feature_importance_general_df = _concat_frames(importance_frames)

# Save dataframes to csv
train_general_df.to_csv('outputs/train_general.csv', index=False)
test_general_df.to_csv('outputs/test_general.csv', index=False)
feature_importance_general_df.to_csv('outputs/feature_importance_general.csv', index=False)

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,TT (Sec)
et,Extra Trees Classifier,0.7527,0.8088,0.7667,0.827,0.7581,0.5176,0.5651,0.114
catboost,CatBoost Classifier,0.7109,0.7833,0.7933,0.7275,0.7454,0.4124,0.4424,0.929
rf,Random Forest Classifier,0.6727,0.7882,0.78,0.6858,0.716,0.3304,0.3653,0.122
gbc,Gradient Boosting Classifier,0.6727,0.7325,0.7733,0.6876,0.7084,0.3291,0.3669,0.112
lda,Linear Discriminant Analysis,0.6645,0.7763,0.7233,0.7012,0.6931,0.3181,0.3445,0.081
ada,Ada Boost Classifier,0.6436,0.694,0.6633,0.6869,0.6649,0.2753,0.2861,0.093
dt,Decision Tree Classifier,0.5973,0.6008,0.5767,0.6864,0.6131,0.1969,0.2129,0.076
qda,Quadratic Discriminant Analysis,0.5936,0.5608,0.9167,0.5984,0.715,0.1209,0.1459,0.079
lr,Logistic Regression,0.5827,0.5835,0.6967,0.6088,0.6419,0.1494,0.1564,0.083
lightgbm,Light Gradient Boosting Machine,0.5736,0.5643,0.6433,0.5806,0.5985,0.1411,0.161,0.132


Processing:   0%|          | 0/84 [00:00<?, ?it/s]