In [None]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from sklearn import datasets

from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.tree import DecisionTreeClassifier
from scipy import stats
from treeutils import *
# One fixed seed: it initialises NumPy's global random generator here and is
# reused as the random_state of the train/validation split.
seed = 12390481
np.random.seed(seed)

# Tree ensemble for recognizing handwritten digits

In [None]:
# Load the digits dataset with pandas-backed fields and keep a handle on its
# `frame` entry.
data = datasets.load_digits(as_frame=True)
df = data.frame

In [None]:
# Hold out 20% of the samples for validation; the fixed random_state makes the
# split repeatable.
Xtrain, Xvalid, ytrain, yvalid = train_test_split(
    data['data'].to_numpy(),
    data['target'].to_numpy(),
    test_size=0.2,
    random_state=seed,
)

In [None]:
num_trees = 150
Ntrain, Nvalid = Xtrain.shape[0], Xvalid.shape[0]

# Per-tree validation accuracy, and per-tree validation predictions
# (one row per tree, one column per validation sample).
tree_acc = np.empty(num_trees)
yhat = np.empty((num_trees, Nvalid))

trees = []
for i in range(num_trees):

    # Bootstrap: draw Ntrain row indices with replacement.
    ind = np.random.choice(np.arange(Ntrain), size=Ntrain)
    X_bs, y_bs = Xtrain[ind], ytrain[ind]

    # Fit a fresh decision tree (default settings) on the resampled data.
    tree = DecisionTreeClassifier()
    tree.fit(X_bs, y_bs)
    trees.append(tree)

    # Store this tree's validation predictions and score them.
    yhat[i] = tree.predict(Xvalid)
    tree_acc[i] = accuracy_score(yvalid, yhat[i])

In [None]:
# Majority vote: for every validation sample, take the most frequent label
# across the rows (trees) of yhat, then score the voted labels.
votes = stats.mode(yhat, axis=0, keepdims=False)
y_ensemble = votes.mode
acc_ens = accuracy_score(yvalid, y_ensemble)

# Report the mean single-tree accuracy next to the ensemble accuracy.
print(f"Average individual accuracy: {np.mean(tree_acc):.2f}")
print(f"Accuracy of the ensemble: {acc_ens:.2f}")

In [None]:
# Histogram of the single-tree validation accuracies, with the mean
# single-tree accuracy (solid red) and the ensemble accuracy (dashed magenta)
# marked as vertical lines.
fig, ax = plt.subplots(figsize=(10, 5))
ax.hist(tree_acc, bins=10, color='gray')
ax.axvline(tree_acc.mean(), color='r', label='Average individual accuracy')
ax.axvline(acc_ens, linestyle='--', color='m', label='Accuracy of the ensemble')

# Keep only the bottom spine.
for side in ('top', 'left', 'right'):
    ax.spines[side].set_visible(False)

# Hide the count axis by removing its ticks (as the feature-importance stem
# plot does) rather than shrinking the tick labels to a zero font size.
ax.set_yticks([])
ax.tick_params(axis='x', labelsize=14)
ax.set_xlabel('Validation accuracy', fontsize=14)
ax.legend(fontsize=12, frameon=False);

# Random forests in scikit-learn

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Fit a 150-tree random forest and ask it to keep its out-of-bag score.
model = RandomForestClassifier(oob_score=True, n_estimators=150).fit(Xtrain, ytrain)

# Score the forest on the held-out validation split.
yhat = model.predict(Xvalid)
acc_valid = accuracy_score(yvalid, yhat)

# Report the validation accuracy alongside the out-of-bag accuracy.
print(f"Validation accuracy: {acc_valid:.2f}")
print(f"OOB accuracy: {model.oob_score_:.2f}")

In [None]:
# Display the fitted forest's feature importance array as the cell output.
model.feature_importances_ 

In [None]:
# Stem plot of the forest's feature importances, one stem per feature index.
fig, ax = plt.subplots(figsize=(10, 4))
ax.stem(model.feature_importances_)

# Bare look: no y ticks, larger x tick labels, no frame.
ax.set_yticks([])
ax.tick_params(axis='x', labelsize=14)
ax.spines[:].set_visible(False)

In [None]:
# Show the feature importances as an image by laying the values out on an
# 8x8 grid.
fig, ax = plt.subplots(figsize=(4, 4))
ax.imshow(model.feature_importances_.reshape((8, 8)))
ax.set_title('Feature importances (8x8 grid)')
ax.set_xticks([])
# The trailing ';' keeps the returned tick list out of the cell output.
ax.set_yticks([]);

### Iterate over number of trees

In [None]:
# Sweep the forest size and record, for each size, the validation accuracy
# and the out-of-bag accuracy of a freshly fitted random forest.
num_trees = np.arange(100, 300, 10)
acc_rf = np.empty(len(num_trees))
oob_rf = np.empty(len(num_trees))
for i, ntrees in enumerate(num_trees):
    # Use the swept value: a fixed n_estimators would fit the same-sized
    # forest on every iteration. int() turns the NumPy integer into a plain
    # Python int.
    model = RandomForestClassifier(n_estimators=int(ntrees), oob_score=True)
    model.fit(Xtrain, ytrain)
    acc_rf[i] = accuracy_score(yvalid, model.predict(Xvalid))
    oob_rf[i] = model.oob_score_

In [None]:
# Validation accuracy and out-of-bag accuracy as a function of forest size.
fig, ax = plt.subplots(figsize=(8, 4))
ax.plot(num_trees, acc_rf, linewidth=3, label='Validation accuracy')
ax.plot(num_trees, oob_rf, linewidth=3, label='OOB accuracy')

# Enlarge the tick labels on both axes.
plt.xticks(fontsize=14)
plt.yticks(fontsize=14)

# Dashed grid, axis label, legend, and fixed axis ranges.
ax.grid(linestyle='--')
ax.set_xlabel('Number of trees', fontsize=16)
ax.legend(fontsize=16, loc='lower right')
ax.set_xlim(90, 300)
ax.set_ylim(0.8, 1)

# Drop the frame around the plot.
ax.spines[:].set_visible(False)