In [None]:
from sklearn import datasets
from hyperopt import fmin, tpe, hp, STATUS_OK, Trials
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.datasets import make_moons, make_circles, make_classification
from sklearn.neural_network import MLPClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.gaussian_process.kernels import RBF
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, ExtraTreesClassifier, AdaBoostClassifier
from sklearn.naive_bayes import GaussianNB, BernoulliNB
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
from sklearn.model_selection import cross_val_score

from sklearn.linear_model import LogisticRegression
from mlxtend.classifier import StackingClassifier
from sklearn.model_selection import GridSearchCV

# Load scikit-learn's digits dataset and bind the feature matrix / label vector
# to the module-level names X and y that the cells below read.
digits = datasets.load_digits()
X, y = digits.data, digits.target
print(X.shape, y.shape)

Hyperopt example

In [None]:
def hyperopt_train_test(params):
    """Score one sampled configuration with cross-validation.

    Parameters
    ----------
    params : dict
        One sample from the search space. ``params['type']`` selects the
        classifier family; the remaining entries are forwarded to the
        estimator's constructor. For ``'randomforest'`` the ``'scale'`` and
        ``'normalize'`` entries are consumed here instead of being forwarded.
        The dict is not modified.

    Returns
    -------
    float
        Mean ``cross_val_score`` of the classifier on the module-level
        ``X`` / ``y``, or ``0`` when ``params['type']`` is not recognised.

    Raises
    ------
    KeyError
        If ``params`` has no ``'type'`` entry.
    """
    # Work on a private copy so the caller's dict keeps its 'type' entry.
    kwargs = dict(params)
    t = kwargs.pop('type')
    if t == 'naive_bayes':
        clf = BernoulliNB(**kwargs)
    elif t == 'svm':
        clf = SVC(**kwargs)
    elif t == 'GradientBoostingClassifier':
        clf = GradientBoostingClassifier(**kwargs)
    elif t == 'dtree':
        clf = DecisionTreeClassifier(**kwargs)
    elif t == 'knn':
        clf = KNeighborsClassifier(**kwargs)
    elif t == 'randomforest':
        # Imported locally: only this branch needs them.
        from sklearn.pipeline import make_pipeline
        from sklearn.preprocessing import Normalizer

        # 'scale' / 'normalize' are 0/1 flags in the search space, not
        # RandomForestClassifier arguments. They are treated here as
        # preprocessing toggles -- NOTE(review): confirm intended semantics.
        steps = []
        if kwargs.pop('normalize', 0):
            steps.append(Normalizer())
        if kwargs.pop('scale', 0):
            steps.append(StandardScaler())
        steps.append(RandomForestClassifier(**kwargs))
        # Preprocessing lives inside the pipeline so it is refit on each CV fold.
        clf = make_pipeline(*steps)
    else:
        # Unknown family: report the worst possible accuracy.
        return 0
    return cross_val_score(clf, X, y).mean()

# Search space: hyperopt first chooses one classifier family, then samples the
# values listed for that family. 'type' names the family; the other keys are
# the sampled values that accompany it.
space = hp.choice('classifier_type', [
    dict(
        type='naive_bayes',
        alpha=hp.uniform('alpha', 0.0, 2.0),
    ),
    dict(
        type='svm',
        C=hp.uniform('C', 0, 10.0),
        kernel=hp.choice('kernel', ['linear', 'rbf']),
        gamma=hp.uniform('gamma', 0, 20.0),
    ),
    # tpot best: GradientBoostingClassifier__learning_rate=DEFAULT, GradientBoostingClassifier__max_depth=4,
    # GradientBoostingClassifier__max_features=0.3, GradientBoostingClassifier__min_samples_leaf=12,
    # GradientBoostingClassifier__min_samples_split=19, GradientBoostingClassifier__n_estimators=DEFAULT,
    # GradientBoostingClassifier__subsample=0.45)
    dict(
        type='GradientBoostingClassifier',
        learning_rate=hp.uniform('learning_rate', 0.1, 3.0),
        max_depth=hp.choice('max_depth', range(1,10)),
        max_features=hp.uniform('max_features', 0.1,1.0),
        min_samples_leaf=hp.choice('min_samples_leaf', range(1,30)),
        min_samples_split=hp.choice('min_samples_split', range(2,30)),
        n_estimators=hp.choice('n_estimators', range(10,200)),
        subsample=hp.uniform('subsample', 0.1, 1.0),
    ),
    # Labels carry a leading underscore where the plain name is already used
    # by the gradient-boosting entries above.
    dict(
        type='randomforest',
        max_depth=hp.choice('_max_depth', range(1,20)),
        max_features=hp.choice('_max_features', range(1,5)),
        n_estimators=hp.choice('_n_estimators', range(1,20)),
        criterion=hp.choice('criterion', ["gini", "entropy"]),
        scale=hp.choice('scale', [0, 1]),
        normalize=hp.choice('normalize', [0, 1]),
    ),
    dict(
        type='knn',
        n_neighbors=hp.choice('knn_n_neighbors', range(1,50)),
    ),
])

# Disabled alternative: a search space restricted to GradientBoostingClassifier.
# It is wrapped in a bare string literal, so it is never evaluated as code; to
# use it, remove the surrounding triple quotes (it would then rebind `space`).
'''
space = hp.choice('classifier_type', [
    { #tpot best: GradientBoostingClassifier__learning_rate=DEFAULT, GradientBoostingClassifier__max_depth=4,
      # GradientBoostingClassifier__max_features=0.3, GradientBoostingClassifier__min_samples_leaf=12,
      # GradientBoostingClassifier__min_samples_split=19, GradientBoostingClassifier__n_estimators=DEFAULT,
      # GradientBoostingClassifier__subsample=0.45)
        'type': 'GradientBoostingClassifier',
        'learning_rate': hp.uniform('learning_rate', 0.1, 3.0),
        'max_depth': hp.choice('max_depth', range(1,10)),
        'max_features': hp.uniform('max_features', 0.1,1.0),
        'min_samples_leaf': hp.choice('min_samples_leaf', range(1,30)),
        'min_samples_split': hp.choice('min_samples_split', range(2,30)),
        'n_estimators': hp.choice('n_estimators', range(10,200)),
        'subsample': hp.uniform('subsample', 0.1, 1.0),
    }
])
'''



# Progress trackers updated by the objective `f` on every evaluation.
count = 0     # number of configurations evaluated so far
best_acc = 0  # highest cross-validated accuracy seen so far


def f(params):
    """Hyperopt objective: evaluate one configuration and log progress.

    Parameters
    ----------
    params : dict
        One sample from the search space; must contain a ``'type'`` entry.

    Returns
    -------
    dict
        ``{'loss': -acc, 'status': STATUS_OK}``. The accuracy is negated
        because the value is reported as a loss.

    Notes
    -----
    The running best accuracy lives in ``best_acc`` rather than ``best``,
    because ``best`` is rebound to the result of ``fmin`` once the search
    finishes; sharing the name would make later calls compare a float
    against that result.
    """
    global best_acc, count
    count += 1
    # Hand the scorer a copy so `params` stays intact for the log lines below.
    acc = hyperopt_train_test(params.copy())
    if acc > best_acc:
        # Single-argument print() behaves the same on Python 2 and Python 3.
        print('new best: {} using {}'.format(acc, params['type']))
        best_acc = acc
    if count % 50 == 0:
        print('iters: {} , acc: {} using {}'.format(count, acc, params))
    return {'loss': -acc, 'status': STATUS_OK}

# Run the TPE search over `space`, minimising the loss returned by `f`.
# `trials` is handed to fmin so the search history is available afterwards.
trials = Trials()
best = fmin(f, space, algo=tpe.suggest, max_evals=3500, trials=trials, verbose=3)
# NOTE: for hp.choice parameters the dict returned by fmin holds the index of
# the selected option rather than the option itself (hyperopt's space_eval
# maps it back to concrete values).
# Single-argument print() calls run unchanged on Python 2 and Python 3.
print('best:')
print(best)

*TPOT example*: genetic-algorithm search over pipelines

In [None]:
from tpot import TPOTClassifier
from sklearn.datasets import load_digits
from sklearn.model_selection import train_test_split


In [None]:
# Hold out 25% of the samples for the final score. Fixed seeds make both the
# split and TPOT's stochastic search repeatable across reruns.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, train_size=0.75, test_size=0.25, random_state=42)

# Deliberately tiny search (3 individuals, 3 generations, 2-fold CV).
tpot = TPOTClassifier(population_size=3, verbosity=3, cv=2, generations=3,
                      random_state=42)
tpot.fit(X_train, y_train)
print(tpot.score(X_test, y_test))
# Export the fitted TPOT result to a Python script on disk.
tpot.export('tpot_mnist_pipeline.py')

Let's try to optimize a gradient boosting classifier with hyperopt. Hopefully we will obtain the same parameters that TPOT found.

mlxtend simple stacker with grid search

In [None]:
from sklearn.feature_selection import SelectFromModel

iris = datasets.load_iris()
# Keep only the feature columns at indices 1 and 2.
# NOTE: this rebinds the module-level X and y, which held the digits data.
X, y = iris.data[:, 1:3], iris.target

# Initializing models
# Every estimator that accepts a seed gets one (as clf2 already did), so the
# grid-search scores are repeatable across reruns.
clf1 = KNeighborsClassifier(n_neighbors=1)
clf2 = RandomForestClassifier(random_state=1)
clf3 = GaussianNB()
clf4 = GradientBoostingClassifier(max_depth=4, max_features=0.3, min_samples_leaf=12,
                                  min_samples_split=19, subsample=0.45, random_state=1)
clf5 = ExtraTreesClassifier(criterion="entropy", max_features=0.35, random_state=1)
lr = LogisticRegression(random_state=1)
# Level-1 classifiers feed their class probabilities, together with the
# original features, into the logistic-regression meta-classifier.
sclf = StackingClassifier(classifiers=[clf1, clf2, clf3, clf4, clf5],
                          meta_classifier=lr, use_probas=True, use_features_in_secondary=True)

# Grid keys follow the '<lowercased class name>__<parameter>' pattern; the
# meta-classifier's keys carry a 'meta-' prefix.
# NOTE(review): whether dual=True is accepted depends on the LogisticRegression
# solver in the installed scikit-learn -- confirm.
params = {'kneighborsclassifier__n_neighbors': [1, 5],
          'randomforestclassifier__n_estimators': [10, 50],
          'gradientboostingclassifier__max_depth': [4, 5],
          'gradientboostingclassifier__max_features': [0.3, 0.5],
          'gradientboostingclassifier__min_samples_leaf': [12, 20],
          'meta-logisticregression__C': [0.1, 10.0],
          'meta-logisticregression__dual': [True, False]}

grid = GridSearchCV(estimator=sclf,
                    param_grid=params,
                    cv=5,
                    refit=True)
grid.fit(X, y)

# Report every grid point as "<mean> +/- <half the std> <params>", then the winner.
cv_keys = ('mean_test_score', 'std_test_score', 'params')

# Pull the three result columns once and walk them in lockstep.
score_means, score_stds, param_sets = (grid.cv_results_[key] for key in cv_keys)
for mean_score, std_score, param_set in zip(score_means, score_stds, param_sets):
    print("%0.3f +/- %0.2f %r" % (mean_score, std_score / 2.0, param_set))

print('Best parameters: %s' % grid.best_params_)
print('Accuracy: %.4f' % grid.best_score_)

In [None]:
# Shell escape: print the script that the TPOT cell exported via tpot.export().
!cat tpot_mnist_pipeline.py

gcForest

1

In [None]:
from GCForest import gcForest
from sklearn.datasets import load_iris, load_digits
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

In [None]:
# shape_1X=4 declares four features per sample, so train on the full iris
# feature matrix rather than the module-level X, which the stacking cell
# reduced to two iris columns.
iris = load_iris()
X_tr, X_te, y_tr, y_te = train_test_split(iris.data, iris.target, test_size=0.33)
gcf = gcForest(shape_1X=4, window=2, tolerance=0.0)
gcf.fit(X_tr, y_tr)
pred_X = gcf.predict(X_te)
print(pred_X)
# Score the predictions against the held-out labels.
accuracy = accuracy_score(y_true=y_te, y_pred=pred_X)
print('gcForest accuracy : {}'.format(accuracy))

2

In [None]:
# shape_1X=[8, 8] describes 8x8 samples, i.e. the 64-feature digits data.
# The X_tr / X_te from the preceding cell come from the iris data, so make a
# fresh split of the digits set here; the accuracy cell below reads the
# resulting y_te and pred_X.
digits = load_digits()
X_tr, X_te, y_tr, y_te = train_test_split(digits.data, digits.target, test_size=0.33)
gcf = gcForest(shape_1X=[8,8], window=[4,6], tolerance=0.0, min_samples_mgs=10, min_samples_cascade=7)
gcf.fit(X_tr, y_tr)
pred_X = gcf.predict(X_te)
print(pred_X)

In [None]:
# Compare the predictions from the cell above with the held-out labels
# (argument order: true labels first, predictions second).
accuracy = accuracy_score(y_te, pred_X)
print('gcForest accuracy : {}'.format(accuracy))