In [None]:
import numpy as np, pandas as pd
from sklearn import datasets
from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.linear_model import LogisticRegression
from bokeh.io import output_notebook
output_notebook()

from hyperopt import hp, tpe, rand

from cvopt import hyperoptCV

# Parameter search

In [None]:
# Load the breast-cancer dataset shipped with scikit-learn.
dataset = datasets.load_breast_cancer()

# Hold out 30% of the samples, then halve that hold-out into test and validation sets.
Xtrain, Xtest, ytrain, ytest = train_test_split(
    dataset.data, dataset.target,
    test_size=0.3, random_state=0,
)
Xtest, Xvalid, ytest, yvalid = train_test_split(
    Xtest, ytest,
    test_size=0.5, random_state=0,
)

# Shapes of the feature matrices, then of the label vectors (train, test, valid).
print(*(part.shape for part in (Xtrain, Xtest, Xvalid)))
print(*(part.shape for part in (ytrain, ytest, yvalid)))

In [None]:
# Base estimator. The solver is set explicitly because the search space below
# includes the 'l1' penalty: liblinear supports both 'l1' and 'l2', whereas the
# default solver of recent scikit-learn releases (lbfgs) rejects 'l1'.
estimator = LogisticRegression(solver="liblinear")

# Search space: each hp.choice entry lists the candidate values for one hyperparameter.
param_distributions = {
    'random_state':0,
    'n_jobs':1,
    'penalty': hp.choice("penalty", ['l1', 'l2']),
    'C': hp.choice("C", [1e-3, 1e-2, 1e-1, 1e-0, 1e1, 1e2, 1e3]),
    'tol' : hp.choice("tol", [1e-4, 1e-3, 1e-2]),
    'class_weight' : hp.choice("class_weight", [None, "balanced"]),
    }

hpcv = hyperoptCV(estimator, param_distributions, 
                  scoring="roc_auc",                                             # Objective of the search
                  cv=StratifiedKFold(n_splits=3, shuffle=True, random_state=0), # Cross-validation setting
                  max_evals=32,                                                  # Number of search iterations
                  n_jobs=3,                                                      # Number of jobs to run in parallel
                  random_state=0,                                                # Seed for hyperopt
                  verbose=2,                                                     # 0: no status display, 1: status on stdout, 2: status as a graph
                  logdir=None,                                                   # If a path is given, the log is saved there
                  model_id=None,                                                 # Used for the estimator's directory and file name when saving
                  save_estimator=False
                  )

# Run the search on the training split, passing the validation split alongside it.
hpcv.fit(Xtrain, ytrain, validation_data=(Xvalid, yvalid))
ytest_pred = hpcv.predict(Xtest)

In [None]:
# Preview the first five rows of the search log as a table.
pd.DataFrame(data=hpcv.cv_results_).head(n=5)

# Feature selection (& parameter search)

In [None]:
# Same search configuration as the parameter search; feature selection is
# switched on by passing `feature_groups` to fit() below.
hpcv = hyperoptCV(estimator, param_distributions, 
                  scoring="roc_auc",
                  cv=StratifiedKFold(n_splits=3, shuffle=True, random_state=0), 
                  max_evals=32,
                  n_jobs=3,
                  random_state=0,
                  verbose=2, 
                  logdir=None, 
                  model_id=None,
                  save_estimator=False
                  )

# Group labels for the features.
# The search decides, per group, whether that group is selected or not.
# A dedicated seeded generator keeps the grouping identical on every run,
# in line with the fixed seeds used everywhere else in this notebook.
feature_groups = np.random.RandomState(0).randint(0, 5, Xtrain.shape[1])

# A group label of -100 means the feature is always used.
feature_groups[0] = -100 

hpcv.fit(Xtrain, ytrain, validation_data=(Xvalid, yvalid), feature_groups=feature_groups)

# When predicting, the selection result is applied.
# (Features that were not selected are excluded internally.)
ytest_pred = hpcv.predict(Xtest)

# Cross validation class list
* hyperoptCV: based on hyperopt

In [None]:
# Display the current `estimator` object as this cell's output.
estimator

In [None]:
# Base estimator. liblinear is requested explicitly because the search space
# includes the 'l1' penalty, which the default solver of recent scikit-learn
# releases (lbfgs) does not support.
estimator = LogisticRegression(solver="liblinear")

# Search space: each hp.choice entry lists the candidate values for one hyperparameter.
param_distributions = {
    'random_state':0,
    'n_jobs':1,
    'penalty': hp.choice("penalty", ['l1', 'l2']),
    'C': hp.choice("C", [1e-3, 1e-2, 1e-1, 1e-0, 1e1, 1e2, 1e3]),
    'tol' : hp.choice("tol", [1e-4, 1e-3, 1e-2]),
    'class_weight' : hp.choice("class_weight", [None, "balanced"]),
    }

# Minimal usage: only the estimator and the search space are supplied.
hpcv = hyperoptCV(estimator, param_distributions)
hpcv.fit(Xtrain, ytrain)

In [None]:
# Base estimator. liblinear is requested explicitly because the search space
# includes the 'l1' penalty, which the default solver of recent scikit-learn
# releases (lbfgs) does not support.
estimator = LogisticRegression(solver="liblinear")

# Search space: each hp.choice entry lists the candidate values for one hyperparameter.
param_distributions = {
    'random_state':0,
    'n_jobs':1,
    'penalty': hp.choice("penalty", ['l1', 'l2']),
    'C': hp.choice("C", [1e-3, 1e-2, 1e-1, 1e-0, 1e1, 1e2, 1e3]),
    'tol' : hp.choice("tol", [1e-4, 1e-3, 1e-2]),
    'class_weight' : hp.choice("class_weight", [None, "balanced"]),
    }

hpcv = hyperoptCV(estimator, param_distributions, 
                  scoring="roc_auc",                                             # Objective of the search
                  cv=StratifiedKFold(n_splits=3, shuffle=True, random_state=0), # Cross-validation setting
                  max_evals=32,                                                  # Number of search iterations
                  n_jobs=3,                                                      # Number of jobs to run in parallel
                  random_state=0,                                                # Seed for hyperopt
                  verbose=2,                                                     # 0: no status display, 1: status on stdout, 2: status as a graph
                  logdir=None,                                                   # If a path is given, the log is saved there
                  model_id=None,                                                 # Used for the estimator's directory and file name when saving
                  save_estimator=False
                  )

# Fit on the training split only; no validation data is passed in this variant.
hpcv.fit(Xtrain, ytrain)
#ytest_pred = hpcv.predict(Xtest)