# Optimization
## a) Based on Model Selection exercise, we choose Decision Tree model to optimize with Bayesian Optimization


In [1]:
import numpy as np
import pandas as pd
import sklearn.metrics
import sklearn.model_selection
from pathlib import Path
SEED = 42 # for reproducibility

### Retrieve Data

In [2]:
df = pd.read_csv(Path("../data/clean_data.csv"))
X = df.loc[:, df.columns.drop(['label'])]
y = df.loc[:, 'label']

In [3]:
X_train, X_test, y_train, y_test = \
    sklearn.model_selection.train_test_split(X, y, stratify=y, test_size=0.2, random_state=SEED)

### Parameters to optimize

In [4]:
params = {
    #'max_features': ['auto', 'sqrt', 'log2'],
    'max_depth' : np.arange(1, 100),
    #'min_samples_split': np.arange(2, 100)
    #'criterion' :['gini', 'entropy'],
    #'max_leaf_nodes': np.arange(2, 100),
    #'min_samples_split': [2, 3, 4],
    #'ccp_alpha': [0.1, .01, .001],
    }

### Bayesian Optimization

In [5]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import RepeatedStratifiedKFold
from skopt import BayesSearchCV

# define evaluation
cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=SEED)
# define the search
search = BayesSearchCV(estimator=DecisionTreeClassifier(random_state=SEED), search_spaces=params, n_jobs=-1, cv=cv)
# perform the search
search.fit(X_train, y_train)
# report the best result
print(search.best_score_)
print(search.best_params_)
print(search.best_estimator_)



0.9870967741935484
OrderedDict([('max_depth', 52)])
DecisionTreeClassifier(max_depth=52, random_state=42)


### Evaluation on test dataset

In [6]:
# evaluate on test
predictions = search.best_estimator_.predict(X_test)
print("Accuracy score", sklearn.metrics.accuracy_score(y_test, predictions))

Accuracy score 0.974025974025974


In [7]:
# use default model with max_depth=4
clf = DecisionTreeClassifier(max_depth=4, random_state=42)
#DecisionTreeClassifier(criterion='entropy', max_depth=3, max_features='auto', random_state=42)
clf.fit(X_train, y_train)
predictions = clf.predict(X_test)
print("Accuracy score", sklearn.metrics.accuracy_score(y_test, predictions))
clf

Accuracy score 0.974025974025974


DecisionTreeClassifier(max_depth=4, random_state=42)

## b) Compare against automl using a fitting crossvalidation on resampling stategy

In [8]:
import autosklearn.classification
automl = autosklearn.classification.AutoSklearnClassifier(
    time_left_for_this_task=120,
    per_run_time_limit=30,
    disable_evaluator_output=False,
    resampling_strategy='cv',
    resampling_strategy_arguments={'folds': 10},
)
automl.fit(X_train, y_train)
print(automl.sprint_statistics())

auto-sklearn results:
  Dataset name: bb6638e8-1edd-11ed-8bc3-696cf0d06187
  Metric: accuracy
  Best validation score: 0.983713
  Number of target algorithm runs: 12
  Number of successful target algorithm runs: 11
  Number of crashed target algorithm runs: 0
  Number of target algorithms that exceeded the time limit: 1
  Number of target algorithms that exceeded the memory limit: 0



In [9]:
predictions = automl.predict(X_test)
print("Accuracy score CV", sklearn.metrics.accuracy_score(y_test, predictions))

Accuracy score CV 0.961038961038961


Perform refit. During fit(), models are fit on individual cross-validation folds. To use all available data, we call refit() which trains all models in the final ensemble on the whole dataset.

In [10]:
automl.refit(X_train.copy(), y_train.copy())
predictions = automl.predict(X_test)
print("Accuracy score CV", sklearn.metrics.accuracy_score(y_test, predictions))

Accuracy score CV 0.961038961038961


## Conclusion
Bayessian Optimization shows the same result for max_depth=52 and max_depth=4
Therefore, we choose the Decision Tree Classifier with max_depth=4 for Deployment