In [194]:
# 04_grid_random_search (n)

# 1. Zaimplementuj GridSearchCV oraz RandomizedSearchCV dla datasetu wine. Uwzględnij poniższe parametry:
# - estymator: LogisticRegression(solver="liblinear")
#  - parametr C:
#     - min 1
#     - max 10 000
#     - liczba wystąpień 1 000
#  - regularyzacja l1 oraz l2

# 2. Zaimplementuj GridSearchCV (jeden na wszystkie modele) w celu znalezienia najlepszego algorytmu oraz 
# hyperparametrów dla datasetu z pkt.1:
#  - wykorzystaj estymatory:
#     - RandomForestClassifier
#     - KNeighborsClassifier
#     - LogisticRegression

# 3. Porównaj wyniki korzystając z  hyperopt-sklearn.

# Rozwiązanie prześlij jako printscreen (jpg, pdf).
# Nazwa pliku (bez polskich znaków):

# Nazwisko_Imie_04_stacj.jpg

# 1

## Grid Search

In [1]:
import numpy as np
from sklearn.datasets import load_wine
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV

In [4]:
wine = load_wine()
features = wine.data
target = wine.target

In [5]:
logistic = LogisticRegression(solver="liblinear")

In [6]:
penalty = ["l1", "l2"]

In [7]:
# https://numpy.org/doc/stable/reference/generated/numpy.logspace.html
C = np.logspace(0, 4, 1000)

In [8]:
hyperparameters = dict(C=C, penalty=penalty)

In [9]:
gridsearch = GridSearchCV(logistic, hyperparameters, cv=5, verbose=2, n_jobs=-1)

In [10]:
best_model = gridsearch.fit(features, target)

Fitting 5 folds for each of 2000 candidates, totalling 10000 fits


In [11]:
print(best_model.best_estimator_.get_params()['penalty'])
print(best_model.best_estimator_.get_params()['C'])

l2
1.6758078645307677


## Randomized Search

In [20]:
from sklearn.model_selection import RandomizedSearchCV

In [21]:
randomizedsearch = RandomizedSearchCV(
logistic,
hyperparameters,
random_state=1,
n_iter=1000,
cv=5,
verbose=0,
n_jobs=-1
)

In [22]:
best_random_model = randomizedsearch.fit(features, target)

In [23]:
print(best_random_model.best_estimator_.get_params()['penalty'])
print(best_random_model.best_estimator_.get_params()['C'])

l2
2.1693835183851844


# 2

## Pipeline

In [24]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import Pipeline

In [25]:
pipe = Pipeline([("classifier", RandomForestClassifier())])

In [26]:
search_space = [
    {"classifier": [logistic],
    "classifier__penalty": ["l1", "l2"],
    "classifier__C": np.logspace(0, 4, 10)},
    {"classifier": [RandomForestClassifier()],
    "classifier__n_estimators": [10, 50, 100],
    "classifier__max_features": [1, 2, 3]},
    {"classifier": [KNeighborsClassifier()],
    "classifier__n_neighbors": range(1, 10, 1),
    "classifier__leaf_size": [30, 60, 90]}
]

In [27]:
gridsearch = GridSearchCV(pipe, search_space, cv=5, verbose=1, n_jobs=-1)

In [28]:
best_model = gridsearch.fit(features, target)

Fitting 5 folds for each of 56 candidates, totalling 280 fits


In [29]:
print(best_model.best_estimator_.get_params()["classifier"])

RandomForestClassifier(max_features=1, n_estimators=50)


# 3

## Hyperopt

In [30]:
from sklearn.model_selection import train_test_split
from hpsklearn import HyperoptEstimator
from hpsklearn import any_classifier
from hpsklearn import any_preprocessing
from hyperopt import tpe

WARN: OMP_NUM_THREADS=None =>
... If you are using openblas if you are using openblas set OMP_NUM_THREADS=1 or risk subprocess calls hanging indefinitely


In [31]:
X_train, X_test, y_train, y_test = train_test_split(features, target, test_size=0.33, random_state=42)

In [32]:
model = HyperoptEstimator(
    classifier=any_classifier("cla"),
    preprocessing=any_preprocessing("pre"),
    algo=tpe.suggest,
    max_evals=20,
    trial_timeout=30,
    n_jobs=-1
)

In [33]:
model.fit(X_train, y_train)

100%|█████████████████████████████████████| 1/1 [00:01<00:00,  1.46s/trial, best loss: 0.125]
100%|███████████████████████| 2/2 [00:02<00:00,  2.16s/trial, best loss: 0.04166666666666663]
100%|███████████████████████| 3/3 [00:01<00:00,  1.41s/trial, best loss: 0.04166666666666663]
100%|███████████████████████████████████████| 4/4 [00:01<00:00,  1.43s/trial, best loss: 0.0]
100%|███████████████████████████████████████| 5/5 [00:01<00:00,  1.44s/trial, best loss: 0.0]
100%|███████████████████████████████████████| 6/6 [00:01<00:00,  1.41s/trial, best loss: 0.0]
100%|███████████████████████████████████████| 7/7 [00:02<00:00,  2.58s/trial, best loss: 0.0]
100%|███████████████████████████████████████| 8/8 [00:01<00:00,  1.46s/trial, best loss: 0.0]
100%|███████████████████████████████████████| 9/9 [00:01<00:00,  1.47s/trial, best loss: 0.0]
100%|█████████████████████████████████████| 10/10 [00:01<00:00,  1.99s/trial, best loss: 0.0]
100%|█████████████████████████████████████| 11/11 [00:01<00:

In [34]:
acc = model.score(X_test, y_test)
print("Accuracy: %.3f" % acc)

Accuracy: 0.949


In [35]:
print(model.best_model())

{'learner': GradientBoostingClassifier(learning_rate=0.022602027695498735, max_depth=4,
                           n_estimators=22, random_state=4,
                           subsample=0.9548867000480767), 'preprocs': (), 'ex_preprocs': ()}
