In [None]:
import numpy as np
import pandas as pd
import joblib

In [6]:
df_reduced = pd.read_csv('../data/final_data.csv', index_col=0)

In [8]:
df_reduced.head()

Unnamed: 0,subject_id,window_start,window_end,label,back_x_mean,back_x_std,back_x_skew,back_x_kurt,back_x_max,back_x_zero_crossings,...,thigh_y_skew,thigh_y_kurt,thigh_y_rms,thigh_y_zero_crossings,thigh_z_mean,thigh_z_skew,thigh_z_kurt,thigh_z_max,thigh_z_rms,thigh_z_zero_crossings
0,1,0,99,walking,-0.975977,0.137209,0.553341,16.817742,-0.355071,0,...,-4.686721,32.951507,0.130952,19,-0.272417,4.569072,30.871566,0.709439,0.304457,1
1,1,50,149,walking,-0.989042,0.006359,-0.09984,-0.245041,-0.976182,0,...,0.987363,0.877986,0.057976,0,-0.296721,-0.098355,-0.948189,-0.278594,0.296813,0
2,1,100,199,walking,-0.989415,0.008734,-0.688959,1.025675,-0.968303,0,...,2.36433,5.971838,0.069254,0,-0.300218,-1.503124,3.112805,-0.277551,0.300466,0
3,1,150,249,walking,-1.001666,0.089329,-1.872917,6.356036,-0.802925,0,...,1.249045,5.896466,0.162145,4,-0.296553,0.976161,6.404589,0.553839,0.346667,4
4,1,200,299,inactive,-0.994725,0.09355,-1.814825,5.336248,-0.802925,0,...,0.589514,2.207475,0.18316,14,-0.235579,0.067203,1.885965,0.553839,0.315772,10


## Pipeline

In [9]:
df_reduced.columns[4:]

Index(['back_x_mean', 'back_x_std', 'back_x_skew', 'back_x_kurt', 'back_x_max',
       'back_x_zero_crossings', 'back_y_mean', 'back_y_std', 'back_y_skew',
       'back_y_kurt', 'back_y_max', 'back_y_min', 'back_y_rms',
       'back_y_zero_crossings', 'back_z_mean', 'back_z_skew', 'back_z_kurt',
       'back_z_max', 'back_z_min', 'back_z_rms', 'back_z_zero_crossings',
       'thigh_x_mean', 'thigh_x_skew', 'thigh_x_kurt', 'thigh_x_max',
       'thigh_x_zero_crossings', 'thigh_y_mean', 'thigh_y_skew',
       'thigh_y_kurt', 'thigh_y_rms', 'thigh_y_zero_crossings', 'thigh_z_mean',
       'thigh_z_skew', 'thigh_z_kurt', 'thigh_z_max', 'thigh_z_rms',
       'thigh_z_zero_crossings'],
      dtype='object')

In [10]:
FEATURES = df_reduced.columns[4:]
X = df_reduced[FEATURES]
y = df_reduced['label']
groups = np.array(df_reduced['subject_id'])

In [11]:
from sklearn.model_selection import GroupShuffleSplit, StratifiedGroupKFold, GridSearchCV
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
import numpy as np

def group_ml_pipe(X, y, groups, preprocessor, model, param_grid, scoring_metric, num_trials=10, base_random_state=999):
    
    """A function to assemble performance of multiple model runs and optimize
    hyperparameters through GridSearchCV with group structure considerations.
    
    Parameters
    ----------
    X : pd.DataFrame
        Feature matrix
    y : pd.Series
        Target variable
    groups : pd.Series or np.ndarray
        Group labels for the samples
    preprocessor : Column transformer object or None
        Defines preprocessing on each feature; pass None if not required
    model : Initialized model
        Machine learning model to be optimized
    param_grid : dict
        Hyperparameters for the model to be optimized through GridSearchCV
    scoring_metric : Scikit-learn scoring function or string
        Scoring strategy for GridSearchCV
    num_trials : int
        Number of iterations to run through to pick optimized parameters
    base_random_state : int
        Base random seed for reproducibility

    Returns
    -------
    trial_results : list of dicts
        Salient information from each GridSearchCV run
    """
    
    trial_results = []
    
    for i in range(1, num_trials + 1):
        print(f'Running trial {i}')
        random_state = base_random_state * i
        
        current_trial_info = {
            'trial_num': i,
            'random_state': random_state
        }
        
        # Separate out the test set
        gss = GroupShuffleSplit(n_splits=1, test_size=0.2, random_state=random_state)
        other_index, test_index = next(gss.split(X, y, groups=groups))

        X_other, y_other, groups_other = X.iloc[other_index], y.iloc[other_index], groups[other_index]
        X_test, y_test, groups_test = X.iloc[test_index], y.iloc[test_index], groups[test_index]

        sgkf = StratifiedGroupKFold(n_splits=5)

        if preprocessor is not None:
            pipe = make_pipeline(preprocessor, model)
        else:
            pipe = make_pipeline(StandardScaler(), model)  # Default scaler if no preprocessor
        
        # GridSearchCV for hyperparameter optimization
        grid = GridSearchCV(pipe, param_grid=param_grid, scoring=scoring_metric,
                            cv=sgkf, return_train_score=True, verbose=True, n_jobs=-1)
        grid.fit(X_other, y_other, groups=groups_other)
        
        if grid.scorer_.__dict__['_sign'] < 0:
            best_test_score = abs(grid.score(X_test, y_test))
            maximized = False
        else:
            best_test_score = grid.score(X_test, y_test)
            maximized = True
        
        current_trial_info['grid'] = grid
        current_trial_info['best_test_score'] = {'score': best_test_score, 'maximized': maximized}
        current_trial_info['best_params'] = grid.best_params_
        current_trial_info['X_test'] = X_test
        current_trial_info['y_test'] = y_test
        current_trial_info['y_test_pred'] = grid.predict(X_test)
        current_trial_info['cv_results'] = grid.cv_results_
        
        trial_results.append(current_trial_info)
        print(f'Completed trial {i}')
        
    return trial_results

#### Random Forest

In [None]:
from sklearn.ensemble import RandomForestClassifier

hyperparameters = {
    'randomforestclassifier__max_depth': [1,3,10,30,100],
    # 'randomforestclassifier__max_samples': [0.25,0.5,0.75],
    'randomforestclassifier__max_features': [0.25,0.5,0.75]
} 

results_rfc = group_ml_pipe(X, y, groups, preprocessor=None, model=RandomForestClassifier(class_weight='balanced'), 
                        param_grid=hyperparameters, scoring_metric='f1_macro', 
                        num_trials=5)

Running trial 1
Fitting 5 folds for each of 45 candidates, totalling 225 fits
Completed trial 1
Running trial 2
Fitting 5 folds for each of 45 candidates, totalling 225 fits
Completed trial 2
Running trial 3
Fitting 5 folds for each of 45 candidates, totalling 225 fits
Completed trial 3
Running trial 4
Fitting 5 folds for each of 45 candidates, totalling 225 fits
Completed trial 4
Running trial 5
Fitting 5 folds for each of 45 candidates, totalling 225 fits
Completed trial 5


In [None]:
joblib.dump(results_rfc, '../results/rfc_results.pkl')

['results/rfc_results_classweighted.pkl']

#### XGBoost

In [None]:
# your code here 
import xgboost
from sklearn.model_selection import ParameterGrid
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import f1_score, accuracy_score
from sklearn.model_selection import GroupShuffleSplit, StratifiedGroupKFold
from sklearn.utils import class_weight

def xgb_classifier(X, y_cat, groups, preprocessor, param_grid, n_random_states=5):
    output = {}
    output['X_test'] = []
    output['X_test_transformed'] = []
    output['y_test_true'] = []
    output['y_test_pred'] = []
    output['test_scores_acc'] = []
    output['test_scores_macro_f1'] = []
    output['best_models'] = []

    # Explicitly encode y
    label_encoder = LabelEncoder()
    y = pd.Series(label_encoder.fit_transform(y_cat))

    output['y_cat'] = y_cat
    output['y'] = y
    output['label_encoder'] = label_encoder

    pg = ParameterGrid(param_grid)

    trial_results = []

    for state in range(n_random_states):
        print(f"\n===== Random State: {state + 1} =====")

        gss = GroupShuffleSplit(n_splits=1, test_size=0.2, random_state=state)
        for train_val_idx, test_idx in gss.split(X, y, groups):
            # print('TRAIN_VAL:', len(train_val_idx), 'TEST:', len(test_idx))
            
            X_train_val = X.iloc[train_val_idx]
            y_train_val = y.iloc[train_val_idx]
            groups_train_val = groups[train_val_idx]

            X_test = X.iloc[test_idx]
            y_test = y.iloc[test_idx]

        test_scores = []
        scores = np.zeros((len(pg),5))

        for i, params in enumerate(pg):
            print(f"------- hyperparameter combination {i} -------")
            sgkf = StratifiedGroupKFold(n_splits=5)
            for j, (train_idx, val_idx) in enumerate(sgkf.split(X_train_val, y_train_val, groups_train_val)):
                print(f"Fold {j + 1}")

                X_train, y_train = X_train_val.iloc[train_idx], y_train_val.iloc[train_idx]
                X_CV, y_CV = X_train_val.iloc[val_idx], y_train_val.iloc[val_idx]

                # Preprocess the data
                X_train_transformed = preprocessor.fit_transform(X_train)
                X_CV_transformed = preprocessor.transform(X_CV)
                X_test_transformed = preprocessor.transform(X_test)

                # classes_weights = class_weight.compute_sample_weight(
                #     class_weight='balanced',
                #     y=y_train
                # )

                model = xgboost.XGBClassifier(**params, n_jobs=-1, objective='multi:softprob', early_stopping_rounds=50)
                model.fit(
                    X_train_transformed, y_train,
                    eval_set=[(X_CV_transformed, y_CV)],
                    verbose=False,
                    # sample_weight=classes_weights
                )
                y_CV_pred = model.predict(X_CV_transformed)
                scores[i][j] = f1_score(y_CV, y_CV_pred, average='macro')

        scores = np.mean(scores, axis=1)
        best_params = np.array(pg)[scores == np.max(scores)]

        print('Val set max score and best parameters are:')
        print(np.max(scores))
        print(best_params)

        # Train the model with best parameters and test on the test set
        best_model = xgboost.XGBClassifier(**best_params[0], n_jobs=-1, early_stopping_rounds=50)
        best_model.fit(
            X_train_transformed, y_train,
            eval_set=[(X_CV_transformed, y_CV)],
            verbose=False
        )
        output['best_models'].append(best_model)
        y_test_pred_best = best_model.predict(X_test_transformed)
        output['X_test'].append(X_test)
        output['X_test_transformed'].append(X_test_transformed)
        output['y_test_true'].append(pd.Series(y_test))
        output['y_test_pred'].append(pd.Series(y_test_pred_best))
        test_score = f1_score(y_test, y_test_pred_best, average='macro')
        output['test_scores_macro_f1'].append(test_score)
        output['test_scores_acc'].append(accuracy_score(y_test, y_test_pred_best))

    print('===============================================================================================')
    print(f"mean test macro f1: {np.mean(output['test_scores_macro_f1'])*100:.2f}% +/- {np.std(output['test_scores_macro_f1'])*100:.2f}%")
    print('===============================================================================================')

    return output

In [92]:
import warnings
warnings.filterwarnings("ignore", category=UserWarning)  # Replace `UserWarning` with the specific warning type

hyperparameters = {"learning_rate": [0.03],
                "n_estimators": [1,30,100,300,1000],
                "seed": [0],
                # "clf_alpha": [0e0, 1e-2, 1e-1, 1e0],
                # "clf_lambda": [0e0, 1e-2, 1e-1, 1e0],
                "missing": [np.nan], 
                "max_depth": [1,3,10,30,100],
                "colsample_bytree": [0.9],              
                "subsample": [0.66]}

# Update pipeline call with the encoded target
xgb_results = xgb_classifier(X=X, y_cat=y, groups=groups, preprocessor=StandardScaler(), 
                        param_grid=hyperparameters, n_random_states=5)


===== Random State: 1 =====
------- hyperparameter combination 0 -------
Fold 1
Fold 2
Fold 3
Fold 4
Fold 5
------- hyperparameter combination 1 -------
Fold 1
Fold 2
Fold 3
Fold 4
Fold 5
------- hyperparameter combination 2 -------
Fold 1
Fold 2
Fold 3
Fold 4
Fold 5
------- hyperparameter combination 3 -------
Fold 1
Fold 2
Fold 3
Fold 4
Fold 5
------- hyperparameter combination 4 -------
Fold 1
Fold 2
Fold 3
Fold 4
Fold 5
------- hyperparameter combination 5 -------
Fold 1
Fold 2
Fold 3
Fold 4
Fold 5
------- hyperparameter combination 6 -------
Fold 1
Fold 2
Fold 3
Fold 4
Fold 5
------- hyperparameter combination 7 -------
Fold 1
Fold 2
Fold 3
Fold 4
Fold 5
------- hyperparameter combination 8 -------
Fold 1
Fold 2
Fold 3
Fold 4
Fold 5
------- hyperparameter combination 9 -------
Fold 1
Fold 2
Fold 3
Fold 4
Fold 5
------- hyperparameter combination 10 -------
Fold 1
Fold 2
Fold 3
Fold 4
Fold 5
------- hyperparameter combination 11 -------
Fold 1
Fold 2
Fold 3
Fold 4
Fold 5
------- h

In [53]:
xgb_results['test_scores']

[0.8472201889468266,
 0.8506257347560007,
 0.8533683680782541,
 0.8297176307535233,
 0.8654741748746682]

In [93]:
# should be the same as previous cell - CHECK

xgb_results['test_scores_macro_f1']

[0.8472201889468266,
 0.8506257347560007,
 0.8533683680782541,
 0.8297176307535233,
 0.8654741748746682]

In [None]:
joblib.dump(xgb_results, '../results/final/xgb_results.pkl')

#### Logistic Regression

In [45]:
from sklearn.linear_model import LogisticRegression

alpha_arr = np.logspace(-5,5,11,base=10)

hyperparameters = {
    'logisticregression__C': 1/alpha_arr,
    'logisticregression__penalty':['l2'],
    'logisticregression__max_iter':[10000],
    'logisticregression__multi_class':['multinomial'],
    'logisticregression__solver':['lbfgs']
}

results_lr = group_ml_pipe(X, y, groups, preprocessor=None, model=LogisticRegression(class_weight='balanced'), 
                        param_grid=hyperparameters, scoring_metric='f1_macro', 
                        num_trials=5)

Running trial 1
Fitting 5 folds for each of 11 candidates, totalling 55 fits
Completed trial 1
Running trial 2
Fitting 5 folds for each of 11 candidates, totalling 55 fits
Completed trial 2
Running trial 3
Fitting 5 folds for each of 11 candidates, totalling 55 fits
Completed trial 3
Running trial 4
Fitting 5 folds for each of 11 candidates, totalling 55 fits
Completed trial 4
Running trial 5
Fitting 5 folds for each of 11 candidates, totalling 55 fits
Completed trial 5


In [None]:
joblib.dump(results_lr, '../results/logreg_results.pkl')

['results/logreg_results_classweighted.pkl']

#### KNN

In [10]:
from sklearn.neighbors import KNeighborsClassifier

hyperparameters = { 'kneighborsclassifier__n_neighbors' : [1,30,100,300,1000],
               'kneighborsclassifier__weights' : ['uniform','distance'],
               'kneighborsclassifier__metric' : ['minkowski','euclidean','manhattan']}

results_knn = group_ml_pipe(X, y, groups, preprocessor=None, model=KNeighborsClassifier(), 
                        param_grid=hyperparameters, scoring_metric='f1_macro', 
                        num_trials=5)

Running trial 1
Fitting 5 folds for each of 30 candidates, totalling 150 fits




Completed trial 1
Running trial 2
Fitting 5 folds for each of 30 candidates, totalling 150 fits




Completed trial 2
Running trial 3
Fitting 5 folds for each of 30 candidates, totalling 150 fits




Completed trial 3
Running trial 4
Fitting 5 folds for each of 30 candidates, totalling 150 fits




Completed trial 4
Running trial 5
Fitting 5 folds for each of 30 candidates, totalling 150 fits




Completed trial 5


In [None]:
joblib.dump(results_knn, '../results/knn_results.pkl')

['results/knn_results.pkl']