In [2]:
import pandas as pd

from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import RandomizedSearchCV, GroupKFold

## Custom libraries
import index_helpers as ih

from importlib import reload
import data_transformations as dt

### Import and prep data

In [7]:
# Re-import the project helper modules so that edits made to them on disk are
# picked up without restarting the kernel.
reload(dt)
reload(ih)
## Import, index, and split

# Configuration flags; they are forwarded to the project helpers below and
# `segmentation` is also read by the model-selection cell.
segmentation = False
fine_segmentation = False
exclude_expert = False
exclude_meta_data = False

normalization = False
power = False


# Forward the `exclude_meta_data` flag itself (it used to be a hard-coded False
# here and in `categorical_to_dummy`), so changing the flag above takes effect.
df = ih.read_and_merge_data(segmentation, fine_segmentation, exclude_expert,
                            exclude_meta_data=exclude_meta_data)
df = dt.transformation_call(df, normalization, power)

# Index the rows: per person when working on segments, otherwise by file name.
if segmentation:
    df = ih.index_df_by_person(df)
else:
    df = df.set_index(['File_Name'])
df = ih.categorical_float_to_int(df)  # author's note: this call reportedly has no useful effect
df = ih.categorical_to_dummy(df, include_expert_as_dummies=False, exclude_meta_data=exclude_meta_data)
# Three sub-frames returned by the helper (presumably one per expert - confirm in index_helpers).
df1, df2, df3 = ih.separate_expert(df)

In [27]:
from sklearn.svm import SVC
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import train_test_split
import numpy as np

# Fixed seed so the hold-out split and the randomized search are reproducible
# across "Restart & Run All".
SEED = 42

# One entry per data frame, in the order (df1, df2, df3, df).
aucs = []          # ROC AUC on the held-out validation split
bestscores = []    # best mean cross-validated ROC AUC found by the search
bestparams = []    # hyper-parameters that achieved that best score

for df_ in df1, df2, df3, df:
    features = df_.drop("Label", axis=1)
    labels = df_["Label"]

    if segmentation:
        X_train, X_val, y_train, y_val = ih.train_test_split_on_index(features=features,
                                                                      label=labels)
        groups = y_train.reset_index()['File_Name_split']
    else:
        X_train, X_val, y_train, y_val = train_test_split(features, labels,
                                                          test_size=0.2, random_state=SEED)
        groups = y_train.reset_index()['File_Name']

    # Drop the index so rows of X_train, y_train and groups line up positionally.
    X_train = X_train.reset_index(drop=True)
    y_train = y_train.reset_index(drop=True)

    ## Make pipeline - name classifier "clf"
    # NOTE: probability=True is not needed for the ROC AUC computed below (it uses
    # decision_function); it is kept only in case later cells call predict_proba.
    # TODO: switch to plain SVC() if nothing relies on predict_proba.
    clf_pipeline = Pipeline([("st_scaler", StandardScaler()),
                             ("clf", SVC(probability=True))])

    ## Use "clf__" in order to correctly assign parameters to the clf object
    clf_param_grid = {'clf__C': [0.1, 1, 10, 100, 1000, 10000],
                      'clf__tol': [0.0001],  # single value: not worth tuning
                      'clf__gamma': np.linspace(0.00001, 10),  # 50 evenly spaced values (linspace default)
                      # NOTE: `degree` only affects the 'poly' kernel; with SVC's default
                      # 'rbf' kernel it is ignored. Kept so best_params_ keeps the same keys.
                      'clf__degree': list(range(1, 20))}

    ## Instantiate GroupKFold to avoid data leakage - to be passed to cv
    gkf = GroupKFold(n_splits=2)

    ## Set up Randomized search CV --> modulate n_iter for "quicker" results
    clf_rand_auc = RandomizedSearchCV(estimator=clf_pipeline,
                                      param_distributions=clf_param_grid,
                                      cv=gkf, scoring='roc_auc', verbose=1, n_jobs=2, n_iter=50,
                                      random_state=SEED)

    ## Perform Group K-Cross-validation
    clf_rand_auc.fit(X_train, y_train, groups=groups)
    bestscores.append(clf_rand_auc.best_score_)
    bestparams.append(clf_rand_auc.best_params_)

    ## Validation scores
    # ROC AUC must be computed from continuous scores; feeding hard class labels
    # from predict() collapses the ROC curve to a single operating point.
    val_scores = clf_rand_auc.decision_function(X_val)

    ## AUC
    auc = roc_auc_score(y_val, val_scores)
    aucs.append(auc)

Fitting 2 folds for each of 50 candidates, totalling 100 fits


[Parallel(n_jobs=2)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=2)]: Done  46 tasks      | elapsed:   13.9s
[Parallel(n_jobs=2)]: Done 100 out of 100 | elapsed:   29.1s finished
