In [19]:
import matplotlib.pyplot as plt
from pycaret.datasets import get_data
from pycaret.regression import *
from pandas import DataFrame
from itertools import combinations
from math import comb

## An issue with time to run
- Holding out one subject at a time (${14 \choose 1} = 14$ runs) takes ~35 minutes, even though each run compares only 3 models instead of all available models
  - ${35 \over 14} = 2.5$ minutes per run
- So holding out three subjects at a time (${14 \choose 3} = 364$ runs) would take about $364 \times 2.5 = 910$ minutes
- Comparing all models generally takes about 15 minutes per run, so covering every case with all models could take up to $364 \times 15 = 5460 \approx 5500$ minutes

In [21]:
# Load the initial feature table through PyCaret's get_data helper.
df = get_data('data/initial_features')

# 'experimental_condition' is split on '-' into two parts: the first becomes
# `weight` (after stripping the literal 'Condition ' prefix), the second `pace`.
# NOTE(review): this assumes exactly one '-' per value — confirm against the data.
df[['weight', 'pace']] = df['experimental_condition'].str.split('-', expand=True)

# regex=False makes the literal replacement explicit, independent of the
# pandas-version-dependent default for `regex`.
df['weight'] = df['weight'].str.replace('Condition ', '', regex=False).astype(float)
df['pace'] = df['pace'].astype(int)

# Forward-fill missing RPE values, then cast to int.
# Series.ffill() replaces the deprecated fillna(method='ffill').
# NOTE(review): a missing value in the very first row would survive the
# forward fill and make astype(int) raise — confirm the data never starts with NaN.
df['rpe'] = df['rpe'].ffill().astype(int)

# Number of subjects held out together in each train/test split.
COMBINATIONS = 1
combinations_list = list(combinations(df['subject'].unique(), COMBINATIONS))

# Number of splits grows combinatorially with COMBINATIONS,
# e.g. comb(14, 3) == 364 for 14 subjects held out three at a time.

In [25]:
# Empty results table: one column per leaderboard metric, plus the subject
# combination associated with each run.
models_df = DataFrame({
    column: []
    for column in (
        "Model", "MAE", "MSE", "RMSE", "R2", "RMSLE", "MAPE", "TT (Sec)",
        "Training Subjects",
    )
})

In [26]:
# Metric columns copied from each leaderboard row into the combined results.
LEADERBOARD_COLUMNS = ["Model", "MAE", "MSE", "RMSE", "R2", "RMSLE", "MAPE", "TT (Sec)"]

# Collect plain dict records and build the frame once after the loop:
# DataFrame.append was removed in pandas 2.0 and copied the whole frame on
# every call. Rebuilding from scratch also makes this cell idempotent
# (re-running it no longer stacks duplicate rows onto an existing models_df).
records = []

for sub in combinations_list:
    # `sub` is the tuple of subjects excluded from training in this run.
    held_out = df['subject'].isin(sub)
    train = df[~held_out]
    test = df[held_out]

    reg = setup(data=train, target='rpe')
    best = compare_models(sort='MAE', include=['rf', 'gbr', 'dt'])
    # Grab the score grid produced by compare_models. Named `leaderboard`
    # rather than `all` so the builtin all() is not shadowed.
    leaderboard = pull()

    # Stored in separate df?
    # pred = predict_model(best, data=test)

    for _, row in leaderboard.iterrows():
        record = {column: row[column] for column in LEADERBOARD_COLUMNS}
        # NOTE(review): despite the column name, `sub` holds the subjects that
        # were held OUT of `train`. The key is left unchanged so the CSV
        # schema stays the same for existing consumers.
        record["Training Subjects"] = sub
        records.append(record)

models_df = DataFrame(records, columns=LEADERBOARD_COLUMNS + ["Training Subjects"])
models_df.to_csv('outputs/models.csv', index=False)

# Last expression of the cell, so the combined table is actually displayed.
models_df

# -= FOR MY COMPUTER =-
# Takes ~35 minutes to run 14 times (14 choose 1), getting only 3 models instead of all
#   --> 2.5 minutes per run
# So, to run 364 times (14 choose 3) would take 364*2.5=~910 minutes
# if want to include all cases, could take up to 5500 minutes (running for all models once generally takes about 15 minutes)

Unnamed: 0,Description,Value
0,Session id,1291
1,Target,rpe
2,Target type,Regression
3,Original data shape,"(2585, 61)"
4,Transformed data shape,"(2585, 64)"
5,Transformed train set shape,"(1809, 64)"
6,Transformed test set shape,"(776, 64)"
7,Numeric features,59
8,Categorical features,1
9,Preprocess,True


Processing:   0%|          | 0/17 [00:00<?, ?it/s]

Unnamed: 0,Description,Value
0,Session id,7289
1,Target,rpe
2,Target type,Regression
3,Original data shape,"(2699, 61)"
4,Transformed data shape,"(2699, 64)"
5,Transformed train set shape,"(1889, 64)"
6,Transformed test set shape,"(810, 64)"
7,Numeric features,59
8,Categorical features,1
9,Preprocess,True


Processing:   0%|          | 0/17 [00:00<?, ?it/s]

Unnamed: 0,Description,Value
0,Session id,7289
1,Target,rpe
2,Target type,Regression
3,Original data shape,"(2699, 61)"
4,Transformed data shape,"(2699, 64)"
5,Transformed train set shape,"(1889, 64)"
6,Transformed test set shape,"(810, 64)"
7,Numeric features,59
8,Categorical features,1
9,Preprocess,True


Unnamed: 0,Description,Value
0,Session id,7210
1,Target,rpe
2,Target type,Regression
3,Original data shape,"(2707, 61)"
4,Transformed data shape,"(2707, 64)"
5,Transformed train set shape,"(1894, 64)"
6,Transformed test set shape,"(813, 64)"
7,Numeric features,59
8,Categorical features,1
9,Preprocess,True


Processing:   0%|          | 0/17 [00:00<?, ?it/s]

Unnamed: 0,Description,Value
0,Session id,7210
1,Target,rpe
2,Target type,Regression
3,Original data shape,"(2707, 61)"
4,Transformed data shape,"(2707, 64)"
5,Transformed train set shape,"(1894, 64)"
6,Transformed test set shape,"(813, 64)"
7,Numeric features,59
8,Categorical features,1
9,Preprocess,True


Unnamed: 0,Description,Value
0,Session id,8899
1,Target,rpe
2,Target type,Regression
3,Original data shape,"(2696, 61)"
4,Transformed data shape,"(2696, 64)"
5,Transformed train set shape,"(1887, 64)"
6,Transformed test set shape,"(809, 64)"
7,Numeric features,59
8,Categorical features,1
9,Preprocess,True


Processing:   0%|          | 0/17 [00:00<?, ?it/s]

Unnamed: 0,Description,Value
0,Session id,8899
1,Target,rpe
2,Target type,Regression
3,Original data shape,"(2696, 61)"
4,Transformed data shape,"(2696, 64)"
5,Transformed train set shape,"(1887, 64)"
6,Transformed test set shape,"(809, 64)"
7,Numeric features,59
8,Categorical features,1
9,Preprocess,True


Unnamed: 0,Description,Value
0,Session id,1133
1,Target,rpe
2,Target type,Regression
3,Original data shape,"(2815, 61)"
4,Transformed data shape,"(2815, 64)"
5,Transformed train set shape,"(1970, 64)"
6,Transformed test set shape,"(845, 64)"
7,Numeric features,59
8,Categorical features,1
9,Preprocess,True


Unnamed: 0,Description,Value
0,Session id,4395
1,Target,rpe
2,Target type,Regression
3,Original data shape,"(2797, 61)"
4,Transformed data shape,"(2797, 64)"
5,Transformed train set shape,"(1957, 64)"
6,Transformed test set shape,"(840, 64)"
7,Numeric features,59
8,Categorical features,1
9,Preprocess,True


Processing:   0%|          | 0/17 [00:00<?, ?it/s]

Unnamed: 0,Description,Value
0,Session id,2489
1,Target,rpe
2,Target type,Regression
3,Original data shape,"(2613, 61)"
4,Transformed data shape,"(2613, 64)"
5,Transformed train set shape,"(1829, 64)"
6,Transformed test set shape,"(784, 64)"
7,Numeric features,59
8,Categorical features,1
9,Preprocess,True


Unnamed: 0,Description,Value
0,Session id,2160
1,Target,rpe
2,Target type,Regression
3,Original data shape,"(2634, 61)"
4,Transformed data shape,"(2634, 64)"
5,Transformed train set shape,"(1843, 64)"
6,Transformed test set shape,"(791, 64)"
7,Numeric features,59
8,Categorical features,1
9,Preprocess,True


Unnamed: 0,Description,Value
0,Session id,8880
1,Target,rpe
2,Target type,Regression
3,Original data shape,"(2667, 61)"
4,Transformed data shape,"(2667, 64)"
5,Transformed train set shape,"(1866, 64)"
6,Transformed test set shape,"(801, 64)"
7,Numeric features,59
8,Categorical features,1
9,Preprocess,True


Processing:   0%|          | 0/17 [00:00<?, ?it/s]

Unnamed: 0,Description,Value
0,Session id,3832
1,Target,rpe
2,Target type,Regression
3,Original data shape,"(2738, 61)"
4,Transformed data shape,"(2738, 64)"
5,Transformed train set shape,"(1916, 64)"
6,Transformed test set shape,"(822, 64)"
7,Numeric features,59
8,Categorical features,1
9,Preprocess,True


Processing:   0%|          | 0/17 [00:00<?, ?it/s]

Unnamed: 0,Description,Value
0,Session id,3670
1,Target,rpe
2,Target type,Regression
3,Original data shape,"(2613, 61)"
4,Transformed data shape,"(2613, 64)"
5,Transformed train set shape,"(1829, 64)"
6,Transformed test set shape,"(784, 64)"
7,Numeric features,59
8,Categorical features,1
9,Preprocess,True


Unnamed: 0,Description,Value
0,Session id,7260
1,Target,rpe
2,Target type,Regression
3,Original data shape,"(2774, 61)"
4,Transformed data shape,"(2774, 64)"
5,Transformed train set shape,"(1941, 64)"
6,Transformed test set shape,"(833, 64)"
7,Numeric features,59
8,Categorical features,1
9,Preprocess,True


Processing:   0%|          | 0/17 [00:00<?, ?it/s]

Unnamed: 0,Description,Value
0,Session id,7260
1,Target,rpe
2,Target type,Regression
3,Original data shape,"(2774, 61)"
4,Transformed data shape,"(2774, 64)"
5,Transformed train set shape,"(1941, 64)"
6,Transformed test set shape,"(833, 64)"
7,Numeric features,59
8,Categorical features,1
9,Preprocess,True


Unnamed: 0,Description,Value
0,Session id,6971
1,Target,rpe
2,Target type,Regression
3,Original data shape,"(2682, 61)"
4,Transformed data shape,"(2682, 64)"
5,Transformed train set shape,"(1877, 64)"
6,Transformed test set shape,"(805, 64)"
7,Numeric features,59
8,Categorical features,1
9,Preprocess,True


Processing:   0%|          | 0/17 [00:00<?, ?it/s]

Processing:   0%|          | 0/17 [00:00<?, ?it/s]

In [6]:
# One-off trial: subject 1 is held out as the test set and every other
# subject is used for training.
train = df.loc[df['subject'].ne(1)]
test = df.loc[df['subject'].eq(1)]

# Configure the PyCaret experiment on the training rows, rank the three
# candidate regressors by MAE, and show the resulting score grid.
reg = setup(data=train, target='rpe')
best = compare_models(include=['rf', 'gbr', 'dt'], sort='MAE')
pull()

Unnamed: 0,Model,MAE,MSE,RMSE,R2,RMSLE,MAPE,TT (Sec)
dt,Decision Tree Regressor,0.8934,2.9732,1.709,0.5204,0.5184,0.3101,3.367
rf,Random Forest Regressor,0.9102,1.7,1.3006,0.7275,0.4388,0.2648,4.383
gbr,Gradient Boosting Regressor,1.2977,2.8578,1.6898,0.5389,0.5478,0.3999,3.775


In [17]:
# Score the top-ranked model from compare_models on the held-out rows in `test`.
pred = predict_model(estimator=best, data=test)

Unnamed: 0,Model,MAE,MSE,RMSE,R2,RMSLE,MAPE
0,Random Forest Regressor,1.6702,5.2561,2.2926,-1.4014,0.7014,1.0052


In [15]:
# Reload the saved leaderboard results for inspection.
# The result is bound only to `models_df_new`: the previous chained assignment
# (`models_df_new = df = ...`) also rebound `df`, replacing the feature table
# that the train/test split cells rely on.
models_df_new = get_data('outputs/models')
models_df_new

Unnamed: 0,Model,MAE,MSE,RMSE,R2,RMSLE,MAPE,TT (Sec),Training Subjects,Training Subjects.1
0,Random Forest Regressor,0.9646,1.8523,1.3551,0.7079,0.4587,0.2888,4.29,,"(1,)"
1,Decision Tree Regressor,0.9749,3.3344,1.8158,0.4682,0.5559,0.3363,3.633,,"(1,)"
2,Gradient Boosting Regressor,1.3166,2.9408,1.7077,0.5359,0.5521,0.4099,4.082,,"(1,)"
3,Random Forest Regressor,0.9839,1.9464,1.3923,0.6857,0.4701,0.3058,4.655,,"(2,)"
4,Decision Tree Regressor,1.1281,3.8628,1.9559,0.3698,0.6124,0.4164,3.821,,"(2,)"
5,Gradient Boosting Regressor,1.2233,2.6646,1.6298,0.5693,0.5383,0.3956,4.337,,"(2,)"
6,Random Forest Regressor,0.894,1.6578,1.2863,0.6904,0.4468,0.2712,4.641,,"(3,)"
7,Decision Tree Regressor,0.9434,3.073,1.741,0.4183,0.5606,0.3583,3.839,,"(3,)"
8,Gradient Boosting Regressor,1.2154,2.5923,1.6087,0.5164,0.5455,0.3869,4.115,,"(3,)"
9,Random Forest Regressor,0.9631,1.8916,1.37,0.6985,0.4624,0.291,4.683,,"(4,)"
