# 7. Train and fine-tune a Decision Tree for the moons dataset

In [1]:
import numpy as np
from sklearn.datasets import make_moons
from sklearn.model_selection import train_test_split, GridSearchCV, ShuffleSplit
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score
from sklearn.base import clone # deep copy of the model in an estimator without actually copying attached data.
from scipy.stats import mode

In [2]:
# a. Build the noisy two-moons dataset: 10,000 samples, noise=0.4.
#    The seed is fixed so the notebook reproduces on Restart & Run All.
X, y = make_moons(n_samples=10000, noise=0.4, random_state=42)

# b. Hold out 20% of the samples as the test set (seeded split).
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

In [3]:
# c. Tune a Decision Tree with an exhaustive, 3-fold cross-validated grid search.
tree_clf = DecisionTreeClassifier(random_state=42)

# Search space: every max_leaf_nodes in 2..99 combined with max_depth in {2, 3, 4}
# (98 * 3 = 294 candidates).
# NOTE(review): a binary tree of depth d has at most 2**d leaves, so with
# max_depth <= 4 the max_leaf_nodes values above 16 presumably cannot change the
# fitted tree - consider narrowing one of the two ranges.
grid_params = {
    "max_leaf_nodes": list(range(2, 100)),
    "max_depth": [2, 3, 4],
}

grid_search = GridSearchCV(
    estimator=tree_clf,
    param_grid=grid_params,
    cv=3,
    verbose=1,
)
grid_search.fit(X_train, y_train)

# Display the winning estimator as the cell output.
grid_search.best_estimator_

Fitting 3 folds for each of 294 candidates, totalling 882 fits


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done 882 out of 882 | elapsed:    5.9s finished


DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=2,
                       max_features=None, max_leaf_nodes=4,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, presort=False,
                       random_state=42, splitter='best')

In [4]:
# d. Score the tuned tree on the held-out test set.
#    `best_model` is kept as a global because later cells clone it.
best_model = grid_search.best_estimator_
preds = best_model.predict(X_test)

# Fraction of test instances classified correctly (displayed as the cell output).
accuracy_score(y_true=y_test, y_pred=preds)

0.863

# 8. Grow a forest

In [5]:
# a. Draw 1,000 random subsets of the training set, 100 instances each,
#    using Scikit-Learn's ShuffleSplit.
n_trees = 1000
n_instances = 100

# Each split sends all but `n_instances` rows to its "test" side, so the
# "train" side of every split is a random subset of exactly `n_instances` rows.
split_shuff = ShuffleSplit(
    n_splits=n_trees,
    test_size=len(X_train) - n_instances,
    random_state=42,
)

# Keep only the train-side indices of each split; the complementary (large)
# index set is not needed and is discarded.
subs = [
    (X_train[subset_idx], y_train[subset_idx])
    for subset_idx, _ in split_shuff.split(X_train)
]

In [6]:
# b. Fit one Decision Tree per subset, reusing the best hyperparameters found by
#    the grid search, and record each tree's accuracy on the test set.

# clone() yields unfitted estimators carrying best_model's hyperparameters.
forest = [clone(best_model) for _ in range(n_trees)]
accuracy_scores = []

for tree, (X_subset, y_subset) in zip(forest, subs):
    # fit() returns the estimator itself, so training and prediction can be chained.
    y_pred = tree.fit(X_subset, y_subset).predict(X_test)
    accuracy_scores.append(accuracy_score(y_test, y_pred))

# Average test accuracy of the individual trees (displayed as the cell output).
np.mean(accuracy_scores)

0.831607

In [7]:
# c. For each test set instance, collect the predictions of all 1,000 trees.
#    One row per tree, one column per test instance. Labels are stored as uint8,
#    which assumes small non-negative integer class labels (make_moons yields 0/1).
y_preds = np.empty((n_trees, len(X_test)), dtype=np.uint8)
for pred_row, tree in zip(y_preds, forest):
    # `pred_row` is a view into y_preds, so this writes the row in place.
    pred_row[:] = tree.predict(X_test)

# Keep only the most frequent prediction per test instance (column-wise mode),
# using SciPy's mode() function; n_votes holds the count of the winning label.
majority_vote_pred, n_votes = mode(y_preds, axis=0)

In [8]:
# d. Score the ensemble's majority-vote labels against the test labels.
#    The vote array is flattened to 1-D so its shape matches y_test whether or
#    not mode() kept the reduced axis.
accuracy_score(y_test, majority_vote_pred.ravel())


0.869