In [10]:
import pandas as pd
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import RandomizedSearchCV, GroupKFold
## Custom libraries
import index_helpers as ih
from data_transformations import transformation_call
from sklearn.metrics import roc_auc_score
from data_transformations import low_var_exclusion

In [11]:
#choice of segmentation
segmentation = True
fine_segmentation=True

#Try Expert, Metadata, Normalization, Power
exclude_expert=False
exclude_meta_data=False
normalization=False
power=False
treshold=0.1

## Import, index, and split
df = ih.read_and_merge_data(segmentation, fine_segmentation, exclude_expert, exclude_meta_data)
df = transformation_call(df, normalization, power)
df = ih.index_df_by_person(df)
df = low_var_exclusion(df, treshold)
df = ih.categorical_float_to_int(df)
df = ih.categorical_to_dummy(df)


X_train, X_val, y_train, y_val = ih.train_test_split_on_index(df.drop('Label', axis=1),
                                                             df['Label'])

## Modify data for GroupKFold
groups = y_train.reset_index()['File_Name_split']
X_train = X_train.reset_index(drop=True)
y_train = y_train.reset_index(drop=True)

In [13]:
### Logistic regression
from sklearn.linear_model import LogisticRegression

## Make pipeline - name classifier "clf"
clf_pipeline = Pipeline([("clf", LogisticRegression())])

## Use "clf__" in order to correctly assign parameters to the clf object
clf_param_grid = {'clf__penalty':['l1', 'l2'],
                  'clf__tol':[ 0.0001],
                  'clf__C':[2.0, 1.5, 1.0], 
               'clf__intercept_scaling':[1], 
                  'clf__max_iter':[500],
                 'clf__solver':[ 'lbfgs', 'liblinear', 'sag', 'saga']}

## Instantiate GroupKFold to avoid data leakage - to be passed to cv
gkf=GroupKFold(n_splits=10)

## Set up Randomized search CV
clf_rand_auc = RandomizedSearchCV(estimator=clf_pipeline,
                                  param_distributions=clf_param_grid,
                                  cv=gkf, scoring='roc_auc', verbose=1, n_jobs=2, n_iter=50)

## Perform Group K-Cross-validation
clf_rand_auc.fit(X_train, y_train, groups=groups)

## Print results
print("Best score: ",  clf_rand_auc.best_score_)
print("Best estimator: ", clf_rand_auc.best_estimator_)

[Parallel(n_jobs=2)]: Using backend LokyBackend with 2 concurrent workers.


Fitting 10 folds for each of 24 candidates, totalling 240 fits


[Parallel(n_jobs=2)]: Done  66 tasks      | elapsed:  5.8min
[Parallel(n_jobs=2)]: Done 216 tasks      | elapsed: 22.6min
[Parallel(n_jobs=2)]: Done 240 out of 240 | elapsed: 27.1min finished


Best score:  0.691348949243093
Best estimator:  Pipeline(steps=[('clf',
                 LogisticRegression(C=2.0, max_iter=500, solver='liblinear'))])
