In [2]:
!sudo pip install scikit-optimize

Collecting scikit-optimize
  Downloading scikit_optimize-0.9.0-py2.py3-none-any.whl (100 kB)
[?25l[K     |███▎                            | 10 kB 24.8 MB/s eta 0:00:01[K     |██████▌                         | 20 kB 31.0 MB/s eta 0:00:01[K     |█████████▉                      | 30 kB 37.6 MB/s eta 0:00:01[K     |█████████████                   | 40 kB 38.7 MB/s eta 0:00:01[K     |████████████████▍               | 51 kB 28.4 MB/s eta 0:00:01[K     |███████████████████▋            | 61 kB 23.4 MB/s eta 0:00:01[K     |██████████████████████▉         | 71 kB 21.4 MB/s eta 0:00:01[K     |██████████████████████████▏     | 81 kB 23.3 MB/s eta 0:00:01[K     |█████████████████████████████▍  | 92 kB 25.2 MB/s eta 0:00:01[K     |████████████████████████████████| 100 kB 7.9 MB/s 
Collecting pyaml>=16.9
  Downloading pyaml-21.10.1-py2.py3-none-any.whl (24 kB)
Installing collected packages: pyaml, scikit-optimize
Successfully installed pyaml-21.10.1 scikit-optimize-0.9.0


In [3]:
import functools
import numpy as np
import pandas as pd
from datetime import timedelta
import matplotlib.pyplot as plt
from scipy.stats import uniform
from timeit import default_timer as timer

from sklearn.metrics import accuracy_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection._search import BaseSearchCV
from sklearn.experimental import enable_halving_search_cv
from sklearn.model_selection import cross_val_score, train_test_split, GridSearchCV, RandomizedSearchCV, HalvingGridSearchCV

from skopt import gp_minimize
from skopt.utils import use_named_args
from skopt.space import Integer, Categorical, Real

In [4]:
df = pd.read_csv(
    "https://raw.githubusercontent.com/erood/interviewqs.com_code_snippets/master/Datasets/winequality-red.csv",
    sep=";"
)
df

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
0,7.4,0.700,0.00,1.9,0.076,11.0,34.0,0.99780,3.51,0.56,9.4,5
1,7.8,0.880,0.00,2.6,0.098,25.0,67.0,0.99680,3.20,0.68,9.8,5
2,7.8,0.760,0.04,2.3,0.092,15.0,54.0,0.99700,3.26,0.65,9.8,5
3,11.2,0.280,0.56,1.9,0.075,17.0,60.0,0.99800,3.16,0.58,9.8,6
4,7.4,0.700,0.00,1.9,0.076,11.0,34.0,0.99780,3.51,0.56,9.4,5
...,...,...,...,...,...,...,...,...,...,...,...,...
1594,6.2,0.600,0.08,2.0,0.090,32.0,44.0,0.99490,3.45,0.58,10.5,5
1595,5.9,0.550,0.10,2.2,0.062,39.0,51.0,0.99512,3.52,0.76,11.2,6
1596,6.3,0.510,0.13,2.3,0.076,29.0,40.0,0.99574,3.42,0.75,11.0,6
1597,5.9,0.645,0.12,2.0,0.075,32.0,44.0,0.99547,3.57,0.71,10.2,5


In [5]:
df["quality"] = df.quality.where(df.quality >= 7, 0)
df["quality"] = df.quality.mask(df.quality >= 7, 1)
df.quality.value_counts()

0    1382
1     217
Name: quality, dtype: int64

In [6]:
X_train, X_test, y_train, y_test = train_test_split(df[df.columns[:-1]], df.quality, test_size=0.2)

In [7]:
def time_it(func):
    """Print the runtime of the decorated function"""
    @functools.wraps(func)
    def wrapper_timer(*args, **kwargs):
        start_time = timer()    
        value = func(*args, **kwargs)
        end_time = timer()      
        run_time = timedelta(seconds=end_time - start_time)    
        print(f"Finished {func.__name__!r} in {run_time}")
        return value
    return wrapper_timer

def validate(model):
  print(model)
  for score in ["accuracy", "precision", "recall", "f1"]:
    print(f"{score}: {cross_val_score(model, X_test, y_test, scoring=score).mean()}")

In [8]:
N_ESTIMATORS = 120

In [9]:
# No hyperparameter tuning
@time_it
def no_hyperparams_tuning():
  model = RandomForestClassifier(n_jobs=-1, n_estimators=N_ESTIMATORS)
  scores = cross_val_score(model, X_train, y_train, scoring="f1", n_jobs=-1)
  return model

nht_model = no_hyperparams_tuning()
validate(nht_model)

Finished 'no_hyperparams_tuning' in 0:00:02.910925
RandomForestClassifier(n_estimators=120, n_jobs=-1)
accuracy: 0.884375
precision: 0.6833333333333333
recall: 0.21388888888888888
f1: 0.26969696969696966


In [None]:
# Grid search cv
@time_it
def grid_search_tuning():
  model_gs = RandomForestClassifier(n_jobs=-1, n_estimators=N_ESTIMATORS)

  param_grid = {
      "criterion": ["gini", "entropy"],
      "max_depth": [2, 4, 8, 16],
      "max_features": ["sqrt", "log2", None, .8, .6, .4, .2],
      "bootstrap": [True, False],
      "max_samples": [None, .8, .6, .4, .2]
  }
  scores = ["accuracy", "precision", "recall", "f1"]
  gs = GridSearchCV(model_gs, param_grid, scoring="f1", cv=5, n_jobs=-1)
  gs.fit(X_train, y_train)
  return gs

gs = grid_search_tuning()

validate(gs.best_estimator_)

In [None]:
# Random search cv
@time_it
def random_search_tuning():
  model_rs = RandomForestClassifier(n_jobs=-1, n_estimators=N_ESTIMATORS)

  param_grid = dict(
    criterion=["gini", "entropy"],
    max_depth=uniform(loc=1, scale=31),
    max_features=uniform(),
    bootstrap=[True, False],
    max_samples=uniform()
  ) 
  scores = ["accuracy", "precision", "recall", "f1"]
  rs = RandomizedSearchCV(model_rs, param_grid, scoring="f1", n_iter=100, cv=5, n_jobs=-1)
  rs.fit(X_train, y_train)
  return rs

rs = random_search_tuning()

validate(rs.best_estimator_)

Finished 'random_search_tuning' in 0:02:52.362807
RandomForestClassifier(bootstrap=False, criterion='entropy',
                       max_depth=20.9684115368937,
                       max_features=0.3375892533569673,
                       max_samples=0.7651692445440266, n_estimators=120,
                       n_jobs=-1)
accuracy: 0.89375
precision: 0.5533333333333333
recall: 0.2523809523809524
f1: 0.29317460317460314


In [None]:
@time_it
def halving_grid_search_tuning():
  model_gs = RandomForestClassifier(n_jobs=-1, n_estimators=N_ESTIMATORS)

  param_grid = {
      "criterion": ["gini", "entropy"],
      "max_depth": [2, 4, 8, 16],
      "max_features": ["sqrt", "log2", None, .8, .6, .4, .2],
      "bootstrap": [True, False],
      "max_samples": [None, .8, .6, .4, .2]
  }
  scores = ["accuracy", "precision", "recall", "f1"]
  hgs = HalvingGridSearchCV(
      model_gs, param_grid, scoring="f1", cv=5, factor=2, resource='n_estimators', n_jobs=-1, max_resources=128
      )
  hgs.fit(X_train, y_train)
  return hgs

hgs = halving_grid_search_tuning()

validate(hgs.best_estimator_)

Finished 'halving_grid_search_tuning' in 0:10:32.223032
RandomForestClassifier(bootstrap=False, criterion='entropy', max_depth=16,
                       max_features='log2', max_samples=0.4, n_estimators=128,
                       n_jobs=-1)
accuracy: 0.890625
precision: 0.5633333333333332
recall: 0.2523809523809524
f1: 0.33999999999999997


In [20]:
@time_it
def bayesian_opt_tuning():
  search_space = [
                  Categorical(["gini", "entropy"], name="criterion"),
                  Integer(2, 16, name="max_depth"),
                  Real(0.1, 1.0, name="max_features"),
                  #Categorical([True, False], name="bootstrap"),
                  Real(0.1, 1.0, name="max_samples")
                  ]

  model_bo = RandomForestClassifier(n_jobs=-1, n_estimators=N_ESTIMATORS)

  # define the function used to evaluate a given configuration
  @use_named_args(search_space)
  def evaluate_model(**params):
    # something
    model_bo.set_params(**params)
    model_bo.fit(X_train, y_train)
    # calculate 5-fold cross validation
    result = cross_val_score(model_bo, X_train, y_train, cv=5, n_jobs=-1, scoring="f1")
    # calculate the mean of the scores
    estimate = np.mean(result)
    return 1.0 - estimate

  # perform optimization
  result = gp_minimize(evaluate_model, search_space)
  # summarizing finding:
  print('Best F1 score: %.3f' % (1.0 - result.fun))
  print('Best Parameters: %s' % result.x)
  return result

result = bayesian_opt_tuning()



Best F1 score: 0.608
Best Parameters: ['gini', 12, 0.9580420036161799, 0.9911174690768271]
Finished 'bayesian_opt_tuning' in 0:06:03.755582


In [21]:
model = RandomForestClassifier(
    criterion="gini", max_depth=12,max_features=0.9580420036161799, max_samples=0.9911174690768271,n_jobs=-1, n_estimators=N_ESTIMATORS
    )

model.fit(X_train, y_train)
validate(model)

RandomForestClassifier(max_depth=12, max_features=0.9580420036161799,
                       max_samples=0.9911174690768271, n_estimators=120,
                       n_jobs=-1)
accuracy: 0.890625
precision: 0.5566666666666668
recall: 0.33055555555555555
f1: 0.39856143856143855
