In [None]:
## Library Imports
import seaborn as sns
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
#import imblearn as imbl
import xgboost as xgb
from hyperopt import hp, tpe, fmin
from sklearn.compose import ColumnTransformer

from sklearn.model_selection import GroupKFold
from sklearn.metrics import plot_roc_curve, roc_curve, auc
from sklearn.linear_model import LogisticRegression

#from sklearn.preprocessing import StandardScaler, MultiLabelBinarizer
##from sklearn.pipeline import Pipeline
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV
from sklearn.model_selection import cross_val_score

#from sklearn_pandas import DataFrameMapper

## Custom libraries
import index_helpers as ih
import data_transformations as dtrans

In [None]:
## Load the segmented data, index it per person, and build the train/test split
# NOTE(review): `exlude_expert` is spelled exactly as in the original call; confirm it
# matches the keyword declared in index_helpers before correcting the spelling.
df, cat_feat, num_feat, all_feat = ih.read_and_merge_segmented_data(
    exlude_expert=False,
    exclude_meta_data=False,
)
# 0.1 is presumably the variance threshold used by the exclusion -- TODO confirm in data_transformations.
df = dtrans.low_var_exclusion(ih.index_df_by_person(df), 0.1)

### EXTR MODIFICATION ###
# Keep only the rows whose "Expert" value is not 2.
df = df.loc[df["Expert"] != 2]

# One-hot encode the two categorical columns.
df = pd.get_dummies(df, columns=['Resp_Condition', 'Gender'])
## Removed feature - Symptoms
##########################

# Everything except "Label" is a feature; "Label" is the target.
X_train, X_test, y_train, y_test = ih.train_test_split_on_index(
    features=df.drop(columns="Label"),
    label=df["Label"],
)

# Alternative kept for reference: train on the full frame without a split.
#X_train = df.drop("Label", axis=1)
#y_train = df["Label"]

## Train naive regression model
#logit_naive = LogisticRegression(max_iter = 1000).fit(X_train, y_train)
#logit_naive.score(X_test, y_test)

## Oversampling

In [None]:
# Turn the index levels of the training split into ordinary columns so they are
# carried through resampling (later cells read 'File_Name_split' and
# 'File_n_recording' as columns).
X_t = X_train.reset_index()
y_t = y_train.reset_index()

In [None]:
from imblearn.over_sampling import RandomOverSampler

# NOTE: despite the variable name `RUS`, this is a random *over*-sampler.
RUS = RandomOverSampler(random_state=42)
X_res, y_res = RUS.fit_resample(X_t, y_t["Label"])

# Re-attach the resampled labels to their feature rows through the shared row index.
df_res = pd.merge(X_res, y_res, left_index=True, right_index=True)

# Display only: the result is not assigned, so df_res keeps both identifier
# columns, which the next cell relies on.
df_res.set_index(['File_Name_split', 'File_n_recording'])

In [None]:
# Inputs for the grouped cross-validation: fold groups, target, and the feature
# matrix (identifier columns and the target are removed from the features).
groups = df_res["File_Name_split"]
y = df_res['Label']
X = df_res.drop(['File_Name_split', 'File_n_recording', 'Label'], axis=1)

## AUC Naive LogReg

In [None]:
#plot_roc_curve(logit_naive, X_test, y_test)

In [None]:
## Modify data for GroupKFold
#df_reset = df.reset_index()
#X = X_train.reset_index(drop=True)
#y = y_train.reset_index(drop=True)
#groups = y_train.reset_index()['File_Name_split']

### Bayesian Optimization

In [None]:
# Search space handed to hyperopt's fmin; each key is an XGBClassifier keyword
# and each hyperopt label matches its key.
param_hyperopt = dict(
    learning_rate=hp.loguniform('learning_rate', np.log(0.0001), np.log(1)),
    max_depth=hp.quniform('max_depth', 20, 100, 5),
    max_delta_step=hp.quniform('max_delta_step', 0, 20, 1),
    gamma=hp.uniform('gamma', 1, 9),
    reg_alpha=hp.quniform('reg_alpha', 40, 180, 1),
    reg_lambda=hp.uniform('reg_lambda', 0, 1),
    colsample_bytree=hp.uniform('colsample_bytree', 0.5, 1),
    min_child_weight=hp.quniform('min_child_weight', 0, 20, 1),
    n_estimators=hp.quniform('n_estimators', 50, 300, 10),
)

def objective(params):
    
    ### Casting variables
    params = {'learning_rate': float(params['learning_rate']),
              'max_depth': int(params['max_depth']),
              'max_delta_step': int(params['max_delta_step']),
              'gamma': int(params['gamma']),
              'reg_alpha': int(params['reg_alpha']),
              'reg_lambda': float(params['reg_lambda']),
              'colsample_bytree': float(params['colsample_bytree']),
              'min_child_weight': int(params['min_child_weight']),
              'n_estimators': int(params['n_estimators'])}
    
    xgb_clf = xgb.XGBClassifier(objective='binary:logistic',**params)
    
    gkf=GroupKFold(n_splits=5)
    best_score = cross_val_score(xgb_clf, X, y, cv=gkf, groups=groups, 
                                 scoring='roc_auc', n_jobs=-1).mean()
    
    return -best_score
    
best_result = fmin(fn=objective, space=param_hyperopt, max_evals=35, algo=tpe.suggest, rstate=np.random.RandomState(42))
best_result

In [None]:
best_result_cast = {'learning_rate': float(best_result['learning_rate']),
                  'max_depth': int(best_result['max_depth']),
                    'max_delta_step': int(best_result['max_delta_step']),
                  'gamma': int(best_result['gamma']),
                  'reg_alpha': int(best_result['reg_alpha']),
                  'reg_lambda': float(best_result['reg_lambda']),
                  'colsample_bytree': float(best_result['colsample_bytree']),
                  'min_child_weight': int(best_result['min_child_weight']),
                  'n_estimators': int(best_result['n_estimators'])}

## lets test
best_clf = xgb.XGBClassifier(objective='binary:logistic', **best_result_cast)

best_clf.fit(X, y)

## Plot ROC

In [None]:
# Discard the index of the held-out split so both pieces are positionally indexed.
X_v, y_v = (split.reset_index(drop=True) for split in (X_test, y_test))

# Class probabilities of the tuned model on the held-out split.
preds = best_clf.predict_proba(X_v)

# ROC curve of the tuned model on the held-out split.
plot_roc_curve(best_clf, X_v, y_v)

## XGBoost performance

Removing one expert at a time:
- Expert 3: AUC=0.72
- Expert 2: AUC=0.77
- Expert 1: AUC=0.73

Using ONLY ONE expert at a time:
- Expert 1: AUC=0.57
- Expert 2: AUC≈0.50 (identical to random guessing)
- Expert 3: AUC≈0.53

## TEST FROM LARAS DATA

In [None]:
# Show the column labels of the training feature matrix X.
X.columns

In [None]:
from pathlib import Path

# Feature file for the external test recordings.
PATH_test = Path('../data/features_test_fine_segmentation.csv')

# The file is read without a header row; column names are attached afterwards
# from `all_feat`.
X_attempt = pd.read_csv(PATH_test, header=None)
X_attempt.columns = all_feat
#X_attempt = pd.get_dummies(X_attempt, columns=['Resp_Condition', 'Gender'])

# Restrict and order the columns exactly like the training matrix.
# NOTE(review): X was built after get_dummies, so if X.columns contains dummy
# columns that are absent here this selection raises KeyError -- verify.
X_attempt = X_attempt.loc[:, X.columns]

In [None]:
# Display the test feature frame (already restricted to X's columns).
X_attempt

In [None]:
# Class probabilities of the fitted model on the external test features.
# This overwrites the `preds` computed earlier for the held-out split.
preds = best_clf.predict_proba(X_attempt)

In [None]:
# Plot the predicted probabilities against row position; every column of
# `preds` becomes its own line.
# NOTE(review): for this binary model predict_proba presumably yields two
# complementary columns, so plotting preds[:, 1] alone may be what is wanted -- confirm.
plt.plot(preds)

In [None]:
import csv

# Export the second predict_proba column as plain text, one value per line.
with open('features_test_fine_segmentation_predictions.csv', 'w') as out_file:
    out_file.writelines(str(prob) + "\n" for prob in preds[:, 1])