In [None]:
import numpy as np
import pandas as pd
from scipy.stats import mode

In [None]:
from sklearn.datasets import make_moons
from sklearn.model_selection import train_test_split, GridSearchCV, ShuffleSplit
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score

# Fine-tuning Decision Tree for moons dataset

Train and fine-tune a decision tree for the moons dataset by following
these steps:

a. Use make_moons(n_samples=10000, noise=0.4) to generate a
moons dataset.

b. Use train_test_split() to split the dataset into a training set and a test
set.

c. Use grid search with cross-validation (with the help of the
GridSearchCV class) to find good hyperparameter values for a
DecisionTreeClassifier. Hint: try various values for
max_leaf_nodes.

d. Train it on the full training set using these hyperparameters, and
measure your model’s performance on the test set. You should get
roughly 85% to 87% accuracy.

In [None]:
# Base estimator handed to the grid search below. random_state is fixed because
# scikit-learn shuffles the candidate features at every split, so ties between
# equally good splits could otherwise be resolved differently from run to run.
tree_clf = DecisionTreeClassifier(random_state=42)

In [None]:
# Generate the noisy two-moons data; the fixed seed makes the points repeatable
X, y = make_moons(
    n_samples=10000,
    noise=0.4,
    random_state=42,
)

# Hold out 10% of the samples for testing and keep the rest for training
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.1,
    random_state=42,
)

In [None]:
# Candidate hyperparameter values explored by the grid search:
# three depths x three leaf-node limits x two split criteria
param_grid = dict(
    max_depth=(3, 5, 7),
    max_leaf_nodes=(5, 7, 9),
    criterion=('gini', 'entropy'),
)

In [None]:
# Exhaustive search over param_grid, scoring each candidate by accuracy
# with 3-fold cross-validation on the training set
grid = GridSearchCV(
    estimator=tree_clf,
    param_grid=param_grid,
    scoring='accuracy',
    cv=3,
)
grid.fit(X_train, y_train)

In [None]:
# Hyperparameter combination that achieved the best mean cross-validated
# accuracy in the grid search (displayed as the cell's output)
grid.best_params_

{'criterion': 'gini', 'max_depth': 3, 'max_leaf_nodes': 5}

In [None]:
# Rebuild the winning tree straight from the grid-search result instead of
# retyping its hyperparameters, so this cell cannot drift out of sync with the
# search. random_state pins the per-split feature shuffling for repeatable fits.
best_tree = DecisionTreeClassifier(**grid.best_params_, random_state=42)
best_tree.fit(X_train, y_train)

# Score the refit model on the held-out test set
predictions = best_tree.predict(X_test)
print(f'Accuracy: {accuracy_score(y_test, predictions)}')

Accuracy: 0.859


# Random Forest for moons dataset

Grow a forest by following these steps:

a. Continuing the previous exercise, generate 1,000 subsets of the
training set, each containing 100 instances selected randomly. Hint:
you can use Scikit-Learn’s ShuffleSplit class for this.

b. Train one decision tree on each subset, using the best
hyperparameter values found in the previous exercise. Evaluate
these 1,000 decision trees on the test set. Since they were trained on
smaller sets, these decision trees will likely perform worse than the
first decision tree, achieving only about 80% accuracy.

c. Now comes the magic. For each test set instance, generate the
predictions of the 1,000 decision trees, and keep only the most
frequent prediction (you can use SciPy’s mode() function for this).
This approach gives you majority-vote predictions over the test set.

d. Evaluate these predictions on the test set: you should obtain a
slightly higher accuracy than your first model (about 0.5 to 1.5%
higher). Congratulations, you have trained a random forest
classifier!

In [None]:
# Rebuild the moons dataset for this exercise (fixed seed, so the points are repeatable)
X, y = make_moons(
    n_samples=10000,
    noise=0.4,
    random_state=42,
)

# 90% training / 10% test split, also seeded
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.1,
    random_state=42,
)

In [None]:
# Sampler that yields 1000 random draws of 100 training-row indices each
split = ShuffleSplit(
    n_splits=1000,
    train_size=100,
    random_state=42,
)

# Materialise every draw as an (X_subset, y_subset) pair; the complementary
# indices that split() also returns are not needed here
subsets = [
    (X_train[train_idx], y_train[train_idx])
    for train_idx, _ in split.split(X_train)
]

In [None]:
# Train one decision tree per subset and collect the fitted trees in trees[].
# The hyperparameters are taken from the grid-search result rather than retyped,
# so the forest always uses the values actually found in the previous exercise.
# random_state pins the per-split feature shuffling; the trees still differ
# because each one is fitted on a different subset.
trees = [
    DecisionTreeClassifier(**grid.best_params_, random_state=42).fit(X_subset, y_subset)
    for X_subset, y_subset in subsets
]

In [None]:
# Accuracy of one individual tree (the first one) on the test set
prediction = trees[0].predict(X_test)
print(f'Accuracy score: {accuracy_score(y_test, prediction)}')

# The exercise asks for every tree to be evaluated, so also report the average
# test accuracy across the whole collection
tree_accuracies = [accuracy_score(y_test, tree.predict(X_test)) for tree in trees]
print(f'Mean accuracy score over {len(trees)} trees: {np.mean(tree_accuracies)}')

Accuracy score: 0.796


In [None]:
# Stack every tree's test-set predictions: one row per tree, one column per test instance
predictions = np.array([tree.predict(X_test) for tree in trees])

# Majority vote: most frequent label in each column. np.ravel flattens the
# result because SciPy releases before 1.11 keep the reduced axis and return
# shape (1, n_test), which accuracy_score rejects as an inconsistent length.
final_prediction = np.ravel(mode(predictions, axis=0).mode)

# Accuracy of the majority-vote predictions on the test set
print(f'Accuracy score: {accuracy_score(y_test, final_prediction)}')

Accuracy score: 0.867
