In [1]:
import pandas as pd
import xgboost as xgb

from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import RandomizedSearchCV, GroupKFold

## Custom libraries
import index_helpers as ih

  import pandas.util.testing as tm


### Import and prep data

In [4]:
## Import, index, and split
df, cat_feat, num_feat, all_feat = ih.read_and_merge_segmented_data(exlude_expert=False, exclude_meta_data=False)
df = ih.index_df_by_person(df)
#df = ih.categorical_float_to_int(df)
#df = ih.categorical_to_dummy(df)
df = pd.get_dummies(df, columns=['Resp_Condition', 'Gender', 'Symptoms'])

X_train, X_val, y_train, y_val = ih.train_test_split_on_index(features = df.drop("Label", axis=1),
                                                             label = df["Label"])

## Modify data for GroupKFold
groups = y_train.reset_index()['File_Name_split']
X_train = X_train.reset_index(drop=True)
y_train = y_train.reset_index(drop=True)

### Create pipeline and fit classifier

In [9]:
from sklearn.ensemble import RandomForestClassifier

## Make pipeline - name classifier "clf"
clf_pipeline = Pipeline([("st_scaler", StandardScaler()),
                        ("clf", RandomForestClassifier())])

## Use "clf__" in order to correctly assign parameters to the clf object
clf_param_grid = {'clf__bootstrap': [True, False],
                  'clf__max_depth': [10, 20, 30, 40, 50, 60, 70, 80, 90, 100, None],
                  'clf__max_features': ['auto', 'sqrt'],
                  'clf__min_samples_leaf': [1, 2, 4, 8],
                  'clf__min_samples_split': [2, 5, 10],
                  'clf__n_estimators': [200, 400, 600, 800, 1000, 1200, 1400, 1600, 1800, 2000]}

## Instantiate GroupKFold to avoid data leakage - to be passed to cv
gkf=GroupKFold(n_splits=10)

## Set up Randomized search CV
clf_rand_auc = RandomizedSearchCV(estimator=clf_pipeline,
                                  param_distributions=clf_param_grid,
                                  cv=gkf, scoring='roc_auc', verbose=1, n_jobs=-1, n_iter=30)

## Perform Group K-Cross-validation
clf_rand_auc.fit(X_train, y_train, groups=groups)

## Print results
print("Best score: ",  clf_rand_auc.best_score_)
print("Best estimator: ", clf_rand_auc.best_estimator_)

Fitting 10 folds for each of 30 candidates, totalling 300 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:  9.6min
[Parallel(n_jobs=-1)]: Done 192 tasks      | elapsed: 55.9min
[Parallel(n_jobs=-1)]: Done 300 out of 300 | elapsed: 83.2min finished


Best score:  0.7057765893634381
Best estimator:  Pipeline(steps=[('st_scaler', StandardScaler()),
                ('clf',
                 RandomForestClassifier(max_depth=60, min_samples_leaf=8,
                                        min_samples_split=5,
                                        n_estimators=1000))])
