### Load Config

In [5]:
from yaml import load, Loader
from bunch import Bunch

# Parse the run configuration once. The context manager closes the file handle
# as soon as parsing is done instead of leaving it open for the whole session.
with open("config.yaml", 'r') as stream:
    # NOTE(review): `Loader` is PyYAML's full loader, which can construct
    # arbitrary Python objects. That is fine for a trusted local config file;
    # switch to `SafeLoader` if the file can ever come from an untrusted source.
    # Bunch wraps the parsed mapping so entries can be read as attributes
    # (the rest of the file accesses `config.window_sizes`).
    config = Bunch(load(stream, Loader=Loader))

### Read Data, Cross-Validate, and Export SHAP Values

In [None]:
import pandas as pd
import numpy as np
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import get_scorer
from sklearn.model_selection import cross_validate
from sklearn.model_selection import LeaveOneGroupOut
import shap
import random

# Metric names passed as the `scoring` argument of cross_validate.
SCORING = [
    'roc_auc',
    'accuracy',
    'balanced_accuracy',
    'f1_micro',
    'average_precision',
    'recall',
    'precision',
]

# Hyper-parameter grid keyed by estimator argument name.
# NOTE(review): not referenced anywhere in the visible part of this file.
PARAMETERS = {
    'n_estimators': [50, 100, 150],
    "max_features": ['sqrt', 'log2'],
}

# Signal groups in the order they are added to the feature set.
_SIGNAL_ORDER = ['driver_behavior', 'vehicle_behavior', 'navi', 'radar']

# Cumulative prefixes of the list above: the first group alone, then the first
# two, and so on up to all four. Each entry is an independent list.
SIGNAL_COMBOS = [_SIGNAL_ORDER[:count] for count in range(1, len(_SIGNAL_ORDER) + 1)]

def collect_scores(scoring, y_true, y_pred):
    """Compute one score per requested scorer name.

    Args:
        scoring: Iterable of scorer names accepted by
            ``sklearn.metrics.get_scorer``.
        y_true: Ground-truth targets, forwarded unchanged to each metric.
        y_pred: Predictions, forwarded unchanged to each metric.

    Returns:
        dict mapping each name in ``scoring`` to the value returned by that
        scorer's underlying metric function. Empty when ``scoring`` is empty.
    """
    # NOTE(review): `_score_func` is a private attribute of the scorer object;
    # it is called directly with (y_true, y_pred), so whatever the public
    # scorer call would add around it is not applied here -- confirm this is
    # what is wanted for metrics such as 'roc_auc'.
    return {
        name: get_scorer(name)._score_func(y_true, y_pred)
        for name in scoring
    }

# Model template evaluated in every run below: standardise the features,
# rotate them with PCA, then fit an L1-penalised, class-balanced logistic
# regression.
pipeline = make_pipeline(
    StandardScaler(),
    PCA(),
    LogisticRegression(
        penalty='l1',
        solver='liblinear',
        max_iter=1000,
        class_weight='balanced',
    ),
)

# One run per (window size, signal combination, scenario). Each run
# cross-validates the pipeline leaving one subject out at a time, then writes
# the per-fold scores and a SHAP table for one randomly chosen fold to `out/`.
for window_size in config.window_sizes:
    for combo in SIGNAL_COMBOS:
        # Suffix used in log lines and output file names, e.g. '_a_b'.
        signal_string = ''.join('_' + signal for signal in combo)

        # Load one feature table per signal group and place them side by side.
        can_data_features = pd.concat(
            [
                pd.read_parquet(
                    'out/can_data_features_{}_windowsize_{}s.parquet'.format(signal, window_size)
                )
                for signal in combo
            ],
            axis=1,
        )

        # Binary target: rows whose second index level is 'above' get 1,
        # every other row gets 0.
        can_data_features.loc[:, 'label'] = 0
        can_data_features.loc[(slice(None), 'above', slice(None)), 'label'] = 1

        # Missing values become a large sentinel (10e6 == 1e7), then the
        # 'below' rows are removed:
        # drop below BAC level for binary classification
        can_data_features = can_data_features.replace(np.nan, 10e6).drop('below', level=1)

        for scenario in ('highway', 'rural', 'town'):
            print('signals: {}, window size: {}s, scenario: {}'.format(signal_string, window_size, scenario))

            # Keep only the rows of the current scenario.
            # NOTE(review): this assumes the scenario is the third index level -- confirm.
            can_data_features_bin = can_data_features.loc[:, :, scenario, :]

            # The subject id of every row is the group label for the CV split.
            groups = list(can_data_features_bin.index.get_level_values('subject_id'))
            subject_ids = np.unique(groups)

            y = can_data_features_bin['label'].to_numpy()
            X = can_data_features_bin.drop(columns='label').to_numpy()

            # Leave-one-group-out: each fold holds out the rows of one subject.
            cv = cross_validate(
                estimator=pipeline,
                X=X,
                y=y,
                groups=groups,
                cv=LeaveOneGroupOut(),
                scoring=SCORING,
                return_train_score=True,
                return_estimator=True,
                n_jobs=None,
            )

            # Explain the classifier of one randomly picked fold (unseeded, so
            # the chosen fold differs between runs).
            ind = random.choice(range(len(subject_ids)))
            # 'label' was appended last, so dropping the final column name
            # leaves the feature names.
            feature_names = can_data_features_bin.columns.to_list()[:-1]
            # NOTE(review): the logistic-regression step was fitted on the
            # output of the StandardScaler and PCA steps, but the explainer is
            # given the untransformed X and the original column names --
            # confirm this is intended.
            fitted_classifier = cv['estimator'][ind]['logisticregression']
            explainer = shap.LinearExplainer(fitted_classifier, X, feature_names=feature_names)

            shap_values = pd.DataFrame(explainer.shap_values(X))
            shap_values.columns = feature_names
            shap_path = 'out/shap_values_windowsize_{}{}_{}.parquet'.format(window_size, signal_string, scenario)
            shap_values.set_index(can_data_features_bin.index).to_parquet(shap_path, index=True)

            # Fitted estimators are not tabular, so they are left out of the
            # score table. Rows are labelled with the sorted subject ids.
            # NOTE(review): this presumes fold order matches np.unique order -- verify.
            exclude = ['estimator']
            fold_scores = {key: value for key, value in cv.items() if key not in exclude}
            results_path = 'out/pred_results_windowsize_{}{}_{}.csv'.format(window_size, signal_string, scenario)
            pd.DataFrame(fold_scores).set_index(subject_ids).to_csv(results_path, index=True, header=True)