### Load Config

In [30]:
from yaml import load, Loader
from bunch import Bunch

# Read the experiment configuration and expose its top-level keys as
# attributes (e.g. ``config.window_sizes``).  The context manager closes the
# file handle as soon as parsing is finished instead of leaking it.
# NOTE(review): ``Loader`` is PyYAML's full loader and can construct arbitrary
# Python objects; that is only acceptable while config.yaml is a trusted local
# file -- consider SafeLoader if that ever changes.
with open("config.yaml", 'r') as stream:
    config = Bunch(load(stream, Loader=Loader))

### Read Data, Train and Evaluate (Leave-One-Subject-Out)

In [32]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import RobustScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import get_scorer

# Names of the scikit-learn scorers computed for each held-out subject.
SCORING = [
    'roc_auc',
    'accuracy',
    'balanced_accuracy',
    'f1_micro',
    'average_precision',
    'recall',
    'precision',
]

# Random-forest hyper-parameter grid (not referenced by the code visible in
# this cell).
PARAMETERS = {
    'n_estimators': [50, 100, 150],
    'max_features': ['sqrt', 'log2'],
}

# Signal groups; each name is substituted into the feature-file path and the
# result-file path.
SIGNALS = [
    'driver_behavior',
    'vehicle_behavior',
    'radar',
    'navi',
]

def collect_scores(scoring, y_true, y_pred):
    """Evaluate several scikit-learn metrics on one set of predictions.

    Every name in ``scoring`` is resolved with ``get_scorer`` and the scorer's
    underlying metric function is called directly on ``(y_true, y_pred)``,
    together with the keyword arguments the scorer was registered with
    (e.g. ``average='micro'`` for ``'f1_micro'``).  Forwarding those keyword
    arguments matters: without them an averaged variant silently falls back
    to the metric's default averaging.

    The raw metric value is returned; the scorer's sign is not applied.

    NOTE(review): ``y_pred`` is passed to every metric as given.  Ranking
    metrics such as ``'roc_auc'`` and ``'average_precision'`` presumably
    expect scores or probabilities rather than hard class labels -- confirm
    what callers pass in.

    Args:
        scoring: Iterable of scorer names understood by ``get_scorer``.
        y_true: Ground-truth labels.
        y_pred: Predictions to evaluate against ``y_true``.

    Returns:
        dict mapping each scorer name to the computed metric value.
    """
    scores_dict = {}
    for name in scoring:
        scorer = get_scorer(name)
        # _kwargs holds the arguments the scorer was built with via make_scorer.
        scores_dict[name] = scorer._score_func(y_true, y_pred, **scorer._kwargs)
    return scores_dict

# Leave-one-subject-out evaluation: for every signal group and window size,
# train a random forest on all subjects but one, predict on the held-out
# subject and write that subject's scores to its own CSV file.
for signal in SIGNALS:
    for window_size in config.window_sizes:
        can_data_features = pd.read_parquet('out/can_data_features_{}_windowsize_{}s.parquet'.format(signal, window_size))
        # Binary target: 1 for rows whose second index level is 'above', else 0.
        can_data_features['label'] = 0
        can_data_features.loc[(slice(None), 'above', slice(None)), 'label'] = 1
        print('signal type: {}, window size: {}s'.format(signal, window_size))

        # drop below BAC level for binary classification
        can_data_features_bin = can_data_features.drop('below', level=1)

        subject_ids = np.unique(can_data_features_bin.index.get_level_values('subject_id'))

        # take one subject out to use as test set
        for subject_id in subject_ids:
            print('prediction on subject {}'.format(subject_id))
            can_data_features_bin_test = can_data_features_bin.loc[subject_id, :]
            can_data_features_bin_train = can_data_features_bin.drop(subject_id).droplevel(['subject_state', 'subject_scenario', 'datetime'])

            # Fit the scaler on the training subjects only and keep it, so the
            # held-out subject can be mapped into the same feature space.
            scaler = RobustScaler()
            X = scaler.fit_transform(can_data_features_bin_train.drop(columns='label'))

            # NOTE(review): n_jobs is tied to the number of subjects; with a
            # single subject this would be 0 -- confirm that is intended.
            rf = RandomForestClassifier(class_weight='balanced', n_jobs=len(subject_ids)-1)
            rf.fit(X, y=can_data_features_bin_train['label'])

            y_true = can_data_features_bin_test['label']
            # Apply the *training* scaling to the test features; predicting on
            # unscaled features would not match what the forest was fitted on.
            X_test = scaler.transform(can_data_features_bin_test.drop(columns='label'))
            y_pred = rf.predict(X_test)

            pd.DataFrame([collect_scores(SCORING, y_true, y_pred)]).to_csv(
                'out/pred_results_{}_{}_windowsize_{}.csv'.format(subject_id, signal, window_size), index=True, header=True)

signal type: driver_behavior, window size: 150s
prediction on subject 001
