In [1]:
from pandas import read_csv as read
from pycaret.classification import *
from itertools import combinations
import csv
import pandas as pd

In [10]:
%%capture
df = read('../data/initial_features_classification.csv')

# The number of subjects used in testing data. Should be set to 3 to match the data used in results.
COMBINATIONS = 3

combinations_list = list(combinations(df['subject'].unique(), COMBINATIONS))

In [9]:
# (Re)create the three result files with a header row only; the training loop
# appends its rows to them afterwards.
#
# The metric names must match the columns produced by PyCaret's classification
# score grid (Accuracy, AUC, Recall, Prec., F1, Kappa, MCC). The previous
# headers listed regression metrics and were one column short, so every data
# row was shifted relative to its header.
#
# NOTE(review): the last column is filled with str(sub), i.e. the *held-out*
# subjects, even though the label says "Training Subjects". The label is kept
# so existing consumers of these files keep working -- confirm and rename.
classification_metrics = ["Accuracy", "AUC", "Recall", "Prec.", "F1", "Kappa", "MCC"]

output_headers = {
    # compare_models() grid: includes the training-time column.
    'outputs/train_general.csv': ["Model", *classification_metrics, "TT (Sec)", "Training Subjects"],
    # predict_model() grid: same metrics, no training-time column.
    'outputs/test_general.csv': ["Model", *classification_metrics, "Training Subjects"],
    'outputs/feature_importance_general.csv': ["Feature", "Value", "Model", "Training Subjects"],
}

for output_path, header in output_headers.items():
    with open(output_path, 'w', newline='') as file:
        csv.writer(file).writerow(header)

def _feature_importance(model):
    """Return a (Feature, Value) frame ranking the features of a fitted model.

    Tries ``model.feature_importances_`` first and ``model.coef_`` second.
    Classifier coefficients are 2-D (one row per class or decision function),
    so they are reduced to one value per feature by averaging the absolute
    coefficients over the first axis.

    This is best effort: if neither attribute yields a usable ranking (for
    example the model exposes neither, or the number of values does not match
    the number of columns in ``get_config('X_train')``), a single placeholder
    row ``('error', 0)`` is returned instead of raising.
    """
    for attribute in ('feature_importances_', 'coef_'):
        try:
            values = abs(getattr(model, attribute))
            if values.ndim > 1:
                values = values.mean(axis=0)
            ranking = pd.DataFrame({'Feature': get_config('X_train').columns, 'Value': values})
            return ranking.sort_values(by='Value', ascending=False).reset_index(drop=True)
        except Exception:
            # Attribute missing or unusable for this model; try the next one.
            continue
    return pd.DataFrame({'Feature': ['error'], 'Value': [0]})


for sub in combinations_list:  # e.g. (2, 3, 6)
    # Split data into training and testing based on subject: the subjects in
    # `sub` are held out, every other subject is used for training.
    held_out = df['subject'].isin(sub)
    train = df[~held_out]
    test = df[held_out]

    # IMPORTANT: CURRENTLY EXCLUDING wrist_acc_time. REMOVE IF NECESSARY.
    clf = setup(data=train, target='rpe', ignore_features=['experimental_condition', 'subject', 'wrist_acc_time'], verbose=False)
    best = compare_models(sort='Accuracy', n_select=18)
    if not isinstance(best, list):
        # Guard in case a single estimator is returned instead of a list.
        best = [best]

    # Output trained model results (the compare_models score grid) to csv.
    train_results = pull()
    train_results['test_set'] = str(sub)
    train_results.to_csv('outputs/train_general.csv', mode='a', header=False, index=False)

    test_frames = []
    for model in best:
        # Run the model on the held-out subjects. Without `data=test`,
        # predict_model scores PyCaret's internal hold-out split of `train`,
        # so the held-out subjects would never be evaluated.
        # NOTE(review): relies on predict_model producing a score grid when
        # the target column is present in `data` (PyCaret 3.x behaviour) --
        # confirm for the installed version.
        predict_model(model, data=test, verbose=False)
        test_frames.append(pull())

        # Export the feature importance of this model to csv.
        importance = _feature_importance(model)
        importance['Model'] = str(model)
        importance['test_set'] = str(sub)
        importance.to_csv('outputs/feature_importance_general.csv', mode='a', header=False, index=False)

    # Save test model results to csv. Frames are concatenated once, since
    # DataFrame.append was removed in pandas 2.0.
    test_results = pd.concat(test_frames) if test_frames else pd.DataFrame()
    test_results['test_set'] = str(sub)
    test_results.to_csv('outputs/test_general.csv', mode='a', header=False, index=False)

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,TT (Sec)
lr,Logistic Regression,0.6936,0.7067,0.75,0.7414,0.731,0.37,0.3919,0.358
lda,Linear Discriminant Analysis,0.6818,0.7478,0.68,0.7725,0.6928,0.3562,0.3875,0.019
catboost,CatBoost Classifier,0.6745,0.757,0.7367,0.6873,0.6921,0.3058,0.3285,0.863
rf,Random Forest Classifier,0.6645,0.7547,0.7367,0.6802,0.6902,0.2903,0.3342,0.059
et,Extra Trees Classifier,0.6636,0.757,0.7433,0.6939,0.6995,0.2852,0.3073,0.067
ridge,Ridge Classifier,0.6536,0.0,0.73,0.7056,0.6898,0.2838,0.3333,0.009
xgboost,Extreme Gradient Boosting,0.6445,0.6593,0.6967,0.6439,0.6493,0.2558,0.2739,0.062
gbc,Gradient Boosting Classifier,0.6436,0.6835,0.71,0.6896,0.6801,0.254,0.2788,0.06
nb,Naive Bayes,0.6155,0.6098,0.6467,0.6358,0.6304,0.2043,0.2185,0.011
ada,Ada Boost Classifier,0.6055,0.603,0.6333,0.5987,0.6131,0.182,0.1748,0.028


Processing:   0%|          | 0/84 [00:00<?, ?it/s]