In [14]:
# Exercise 7
from sklearn.datasets import make_moons
X, y = make_moons(n_samples=10000, noise=0.4)
X

array([[ 2.07125455,  0.25183136],
       [ 1.3960118 , -1.2022476 ],
       [-1.17899216,  0.6440481 ],
       ...,
       [ 1.53504904, -0.0767426 ],
       [ 1.37359227, -1.12989649],
       [ 0.39872715,  0.34575446]])

In [15]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [29]:
from sklearn.model_selection import GridSearchCV
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score

param_grid = {'max_depth':[6,7,8,9],
              'min_samples_split':[2,3,4],
              'max_leaf_nodes':range(10,30)
              }

# min_samples_split=2, min_samples_leaf=1, 
# max_features=None, random_state=None, max_leaf_nodes=None, 
# min_impurity_decrease=0.0, min_impurity_split=None, 

model = GridSearchCV(estimator=DecisionTreeClassifier(), param_grid=param_grid, 
             scoring='accuracy', n_jobs=-1, cv=3, refit=True, verbose=2)

model.fit(X_train, y_train)
print(model.best_params_)

print('train accuracy: ' + str(model.best_score_))
print('test accuracy: ' + str(accuracy_score(y_test , model.predict(X_test))))

Fitting 3 folds for each of 240 candidates, totalling 720 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 16 concurrent workers.
[Parallel(n_jobs=-1)]: Done   9 tasks      | elapsed:    0.5s
[Parallel(n_jobs=-1)]: Done 216 tasks      | elapsed:    0.8s


{'max_depth': 7, 'max_leaf_nodes': 15, 'min_samples_split': 2}
train accuracy: 0.8597501362592243
test accuracy: 0.843


[Parallel(n_jobs=-1)]: Done 685 tasks      | elapsed:    1.2s
[Parallel(n_jobs=-1)]: Done 720 out of 720 | elapsed:    1.2s finished


In [64]:
# Exercise 8
import numpy as np
from scipy.stats import mode
from sklearn.model_selection import ShuffleSplit
n_splits=1000
rs = ShuffleSplit(n_splits=n_splits, train_size=100, random_state=42)

forest, acc_scores = [], []
y_preds = np.zeros((n_splits, X_test.shape[0]))
for i, (train_index, _) in enumerate(rs.split(X_train)):
    tree = DecisionTreeClassifier(**model.best_params_)
    tree.fit(X_train[train_index], y_train[train_index])
    forest.append(tree)
    y_preds[i, :] = tree.predict(X_test)
    acc_scores.append(accuracy_score(y_test , model.predict(X_test)))
    
print('Average accuracy of the individual scores for each tree: ' + 
      str(np.average(acc_scores)))

mode = mode(y_preds).mode.reshape(-1)
print('Accuracy using the mode of all tree predictions together: ' + 
      str(accuracy_score(y_test , mode)))

Average accuracy of the individual scores for each tree: 0.843
Accuracy using the mode of all tree predictions together: 0.8565
