In [2]:
from google.colab import drive
drive.mount('/content/drive')
%cd drive/MyDrive/ML_Project2/ML_Project2/scripts/

Mounted at /content/drive
/content/drive/MyDrive/ML_Project2/ML_Project2/scripts


In [3]:
import pandas as pd

from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import RandomizedSearchCV, GroupKFold

## Custom libraries
import index_helpers as ih

### Import and prep data

In [4]:
## Load the merged data, index/encode it, and split into train / validation

segmentation = True
fine_segmentation = True

# NOTE: both flags above are forwarded to read_and_merge_data; changing them
# changes what gets read and merged (exact semantics live in index_helpers -
# TODO confirm there before editing).
df = ih.read_and_merge_data(segmentation, fine_segmentation)

# Apply the remaining preparation helpers one after the other, in this order
for prep_step in (ih.index_df_by_person,
                  ih.categorical_float_to_int,
                  ih.categorical_to_dummy):
    df = prep_step(df)

# Separate the predictors from the target column before splitting
feature_frame = df.drop("Label", axis=1)
label_column = df["Label"]
X_train, X_val, y_train, y_val = ih.train_test_split_on_index(features=feature_frame,
                                                             label=label_column)

## Prepare inputs for GroupKFold
# The group key must be read *before* the index is discarded below.
# Presumably 'File_Name_split' is an index level created by the helpers above
# - verify against index_helpers.
groups = y_train.reset_index()['File_Name_split']

# Drop the original index so features, labels and groups are all positional
X_train, y_train = X_train.reset_index(drop=True), y_train.reset_index(drop=True)

### Create pipeline and fit classifier

In [9]:
from sklearn.neighbors import KNeighborsClassifier

## Make pipeline - name classifier "clf"
## Features are standardised first; the scaler sits inside the pipeline so it is
## re-fitted on the training part of every CV fold.
clf_pipeline = Pipeline([("st_scaler", StandardScaler()),
                        ("clf", KNeighborsClassifier())])

## Use "clf__" in order to correctly assign parameters to the clf object
## NOTE(review): per the scikit-learn docs, leaf_size only influences the speed and
## memory of the tree-based neighbour search, not the predictions - consider
## dropping it from the search space so the 50 draws are spent on n_neighbors / p.
clf_param_grid = {'clf__n_neighbors': list(range(1,30)),
                  'clf__leaf_size': list(range(1,50)),
                  'clf__p': [1, 2]}

## Instantiate GroupKFold to avoid data leakage - to be passed to cv
gkf=GroupKFold(n_splits=10)

## Set up Randomized search CV
## random_state is fixed so that the sampled candidates - and therefore the
## reported best score / estimator - are the same on every run of this cell.
clf_rand_auc = RandomizedSearchCV(estimator=clf_pipeline,
                                  param_distributions=clf_param_grid,
                                  cv=gkf, scoring='roc_auc', verbose=1, n_jobs=2, n_iter=50,
                                  random_state=42)

## Perform Group K-Cross-validation
## `groups` is forwarded to GroupKFold.split for every candidate
clf_rand_auc.fit(X_train, y_train, groups=groups)

## Print results
print("Best score: ",  clf_rand_auc.best_score_)
print("Best estimator: ", clf_rand_auc.best_estimator_)

Fitting 10 folds for each of 50 candidates, totalling 500 fits


[Parallel(n_jobs=2)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=2)]: Done  46 tasks      | elapsed:  1.6min
[Parallel(n_jobs=2)]: Done 196 tasks      | elapsed:  6.2min
[Parallel(n_jobs=2)]: Done 446 tasks      | elapsed: 15.5min
[Parallel(n_jobs=2)]: Done 500 out of 500 | elapsed: 16.8min finished


Best score:  0.5422848963156615
Best estimator:  Pipeline(memory=None,
         steps=[('st_scaler',
                 StandardScaler(copy=True, with_mean=True, with_std=True)),
                ('clf',
                 KNeighborsClassifier(algorithm='auto', leaf_size=25,
                                      metric='minkowski', metric_params=None,
                                      n_jobs=None, n_neighbors=24, p=2,
                                      weights='uniform'))],
         verbose=False)
