### Hyperparameter tuning using GridSearch

In [1]:
from sklearn.datasets import make_moons
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import accuracy_score

Load Data

In [2]:
X, y = make_moons(n_samples=10000, noise=0.4, random_state=42)

Create Train and Test Datasets

In [3]:
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

Fit GridSearch with multiple params

In [4]:
dt_params = {
    'min_samples_split': [1, 5, 10, 20, 50, 100],
    'min_samples_leaf': [1, 5, 10, 20, 50, 100],
    'max_leaf_nodes': [2, 4, 6],
    'criterion': ['gini', 'entropy'],
    'random_state': [42],
}

In [5]:
dt_clf = DecisionTreeClassifier()

In [None]:
grid_search = GridSearchCV(dt_clf, 
                           dt_params, 
                           scoring='accuracy',
                           n_jobs=-1, 
                           cv=5)
grid_search.fit(X_train, y_train)

In [9]:
grid_search.best_score_

0.8530000000000001

In [10]:
best_params = grid_search.best_params_
best_params

{'criterion': 'gini',
 'max_leaf_nodes': 4,
 'min_samples_leaf': 1,
 'min_samples_split': 5,
 'random_state': 42}

In [14]:
grid_search.best_estimator_

DecisionTreeClassifier(max_leaf_nodes=4, min_samples_split=5, random_state=42)

Fit the Decision Tree with best params and check validation accuracy

In [11]:
dt_clf = DecisionTreeClassifier(**best_params)
dt_clf.fit(X, y)

DecisionTreeClassifier(max_leaf_nodes=4, min_samples_split=5, random_state=42)

In [12]:
val_pred = dt_clf.predict(X_val)

In [13]:
# check accuracy
accuracy_score(y_val, val_pred)

0.872

### Grow a Forest with Decision Trees

In [19]:
from sklearn.model_selection import ShuffleSplit

In [20]:
n_trees = 1_000
n_instances = 100

mini_sets = []

rs = ShuffleSplit(n_splits=1000, 
                  test_size=len(X_train) - n_instances, 
                  random_state=42)

for train_index, test_index in rs.split(X_train):
    X_mini_train = X[train_index]
    y_mini_train = y[train_index]
    mini_sets.append((X_mini_train, y_mini_train))

In [25]:
from sklearn.base import clone
import numpy as np

# create forest
forest = [clone(grid_search.best_estimator_) for _ in range(n_trees)]

accuracy_scores = []

for tree, (X_mini_train, y_mini_train) in zip(forest, mini_sets):
    tree.fit(X_mini_train, y_mini_train)
    
    y_pred = tree.predict(X_val)
    accuracy_scores.append(accuracy_score(y_val, y_pred))
    
np.mean(accuracy_scores)
    

0.836552

In [26]:
import pandas as pd

In [27]:
scores = pd.DataFrame({'score': accuracy_scores})
scores.head()

Unnamed: 0,score
0,0.842
1,0.818
2,0.853
3,0.845
4,0.8285


In [28]:
scores.score.describe()

count    1000.000000
mean        0.836552
std         0.027777
min         0.680500
25%         0.822000
50%         0.844500
75%         0.857500
max         0.874000
Name: score, dtype: float64

In [29]:
scores.score.value_counts()

0.8580    20
0.8585    18
0.8540    17
0.8625    17
0.8555    16
          ..
0.7715     1
0.7965     1
0.7920     1
0.7980     1
0.8740     1
Name: score, Length: 209, dtype: int64

In [30]:
Y_pred = np.empty([n_trees, len(X_val)], dtype=np.uint8)

for tree_index, tree in enumerate(forest):
    Y_pred[tree_index] = tree.predict(X_val)

In [37]:
Y_pred.shape

(1000, 2000)

In [31]:
from scipy.stats import mode

y_pred_majority_votes, n_votes = mode(Y_pred, axis=0)

In [41]:
y_pred_majority_votes.reshape([-1]).shape

(2000,)

In [39]:
n_votes.shape

(1, 2000)

In [43]:
accuracy_score(y_val, y_pred_majority_votes.reshape([-1]))

0.869