In [None]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.metrics import accuracy_score
from sklearn.tree import DecisionTreeClassifier

from sklearn.model_selection import train_test_split, cross_val_score, validation_curve, learning_curve

%matplotlib inline

### Forest Cover Type Dataset
This dataset includes information on trees. More specifically, the type (which is the label to predict), shadow coverage, distance to nearby landmarks (e.g., roads), soil type, and local topography. There are 7 classes of tree types, 55 features, and a total of 15120 data points.

In [None]:
# Load the forest cover-type table from CSV; the trailing expression
# displays its (rows, columns) shape as the cell output.
forest = pd.read_csv('../Datasets/forest-cover-type.csv')
forest.shape

In [None]:
# Preview the first rows to inspect the column layout.
forest.head()

We then create two `ndarrays` $X$ and $y$ containing data points and labels.

In [None]:
# Everything except the row identifier and the label becomes the feature
# matrix; the label column alone becomes the target vector.
label_column = 'Cover_Type'
feature_frame = forest.drop(columns=['Id', label_column])
X = feature_frame.values
y = forest[label_column].values

In [None]:
# Distinct labels in y together with the number of occurrences of each.
np.unique(y, return_counts=True)

The class distribution is perfectly balanced.

We also create a train-test split with proportions $4/5-1/5$.

In [None]:
# Hold out 20% of the rows for testing; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

Next, we evaluate a tree classifier algorithm on this split.

In [None]:
# Fit an unconstrained Gini tree on the training split and report its
# accuracy on the held-out split, rounded to two decimals.
# random_state is fixed because scikit-learn permutes the features at every
# split, so ties between equally good splits would otherwise be broken
# differently on each run and the reported numbers would not be reproducible.
learner = DecisionTreeClassifier(criterion='gini', random_state=42)

learner.fit(X_train, y_train)
y_pred = learner.predict(X_test)
test_score = accuracy_score(y_test, y_pred)
np.round(test_score, decimals=2)

The resulting tree classifier is quite large and deep.

In [None]:
# (number of nodes, maximum depth) of the fitted tree, as plain Python ints.
fitted_tree = learner.tree_
int(fitted_tree.node_count), int(fitted_tree.max_depth)

We compute the 5-fold CV estimate of the accuracy (i.e., one minus the zero-one risk).

In [None]:
# 5-fold cross-validation on the full data set; display the mean of the
# per-fold scores rounded to two decimals.
scores = cross_val_score(estimator=learner, X=X, y=y, cv=5)
np.round(np.mean(scores), decimals=2)

Now we look at the learning curve from 1K to 9K examples.

In [None]:
# Training-set sizes 1000, 3000, ..., 9000, each evaluated with 3-fold CV.
sizes = range(1000, 10001, 2000)
train_size, train_score, val_score = learning_curve(
    estimator=learner,
    X=X,
    y=y,
    train_sizes=sizes,
    cv=3,
)

In [None]:
# Mean and standard deviation of the scores along axis 1 (across CV folds),
# giving one value per training size.
train_score_mean, train_score_std = train_score.mean(axis=1), train_score.std(axis=1)
val_score_mean, val_score_std = val_score.mean(axis=1), val_score.std(axis=1)

fig, ax = plt.subplots()
ax.set_title('Decision tree')
ax.grid()

curves = [
    (train_score_mean, train_score_std, "r", "Training accuracy"),
    (val_score_mean, val_score_std, "g", "CV accuracy"),
]
# Shaded bands: mean plus/minus one standard deviation.
for curve_mean, curve_std, curve_color, curve_label in curves:
    ax.fill_between(train_size, curve_mean - curve_std,
                    curve_mean + curve_std, alpha=0.1, color=curve_color)
# Mean curves, drawn after the bands and carrying the legend entries.
for curve_mean, curve_std, curve_color, curve_label in curves:
    ax.plot(train_size, curve_mean, 'o-', color=curve_color, label=curve_label)

ax.legend()
ax.set_xlabel('Training size')
ax.set_ylabel('Accuracy')
plt.show()

Zero training error indicates that the algorithm has no bias, while the large gap between training and cross-validated accuracy reveals a high variance. The final cross-validated performance is only $66\%$.

In [None]:
# Mean CV score for each training size, rounded to two decimals.
val_means_by_size = val_score.mean(axis=1)
list(np.round(val_means_by_size, decimals=2))

Let's see what happens when we constrain the depth of the tree.

In [None]:
# Sweep max_depth over 1, 6, 11, ..., 31 and score each setting with 3-fold CV.
depths = np.arange(1,33,5)
# Fixed random_state: scikit-learn permutes features at each split, so an
# unseeded tree can produce slightly different curves from run to run.
learner = DecisionTreeClassifier(random_state=42)
train_score, val_score = validation_curve(learner, X, y, param_name='max_depth', param_range=depths, cv=3)

In [None]:
# Fold-averaged accuracy as a function of the depth limit.
fig, ax = plt.subplots()
ax.set_title('Tree classifier vs. depth')
for fold_scores, curve_label in ((val_score, 'CV accuracy'),
                                 (train_score, 'Training accuracy')):
    ax.plot(depths, fold_scores.mean(axis=1), label=curve_label)
ax.legend()
ax.set_xlabel('Depth')
ax.set_ylabel('Accuracy')
plt.show()

Below depth 5 the tree underfits. Then overfitting starts. However, the CV estimate of the accuracy does not get any worse.

### MNIST dataset
Handwritten numerals. The original dataset is already split into training (60K) and test (10K) sets. For efficiency reasons, we only work with the test set, which we further split into training and test sets.

Each row of the data matrix consists of 785 values: the first value is the label (a number from 0 to 9) and the remaining 784 values are the pixel values (a number from 0 to 255).

In [None]:
# Load the MNIST test-set CSV and preview its first rows.
mnist = pd.read_csv("../Datasets/MNIST/mnist_test.csv")
mnist.head()

In [None]:
# (rows, columns) of the loaded table.
mnist.shape

We create the data matrix, the list of labels, and a train/test split with proportions $4/5-1/5$.

In [None]:
# Separate the pixel columns from the label column and expose both as arrays.
# `columns=` replaces the positional axis argument (`drop("label", 1)`), which
# pandas deprecated in 1.3 and rejects with a TypeError from 2.0 onwards.
mnist_X = mnist.drop(columns="label")
mnist_y = mnist["label"]
X = mnist_X.values
y = mnist_y.values

In [None]:
# Distinct labels in y together with the number of occurrences of each.
np.unique(y, return_counts=True)

In [None]:
# Hold out 20% of the rows for testing; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In this case, it is instructive to visualize the original images from the list of pixels.

In [None]:
# 8x8 gallery of the first 64 rows, each pixel row reshaped to a 28x28 image
# and drawn without axis ticks.
fig, axes = plt.subplots(8, 8, figsize=(6, 6))
for digit_num, ax in enumerate(axes.flat):
    grid_data = mnist_X.iloc[digit_num].values.reshape(28, 28)
    ax.imshow(grid_data, interpolation="none", cmap="bone_r")
    ax.set_xticks([])
    ax.set_yticks([])

We start by checking the performance of the standard tree classifier algorithm.

In [None]:
# Fit an unconstrained Gini tree on the training split and report its
# accuracy on the held-out split, rounded to two decimals.
# random_state is fixed because scikit-learn permutes the features at every
# split, so ties between equally good splits would otherwise be broken
# differently on each run and the reported numbers would not be reproducible.
tree_lrn = DecisionTreeClassifier(criterion='gini', random_state=42)

tree_lrn.fit(X_train, y_train)
y_pred = tree_lrn.predict(X_test)
test_score = accuracy_score(y_test, y_pred)
np.round(test_score, decimals=2)

In [None]:
# (number of nodes, maximum depth) of the fitted tree, as plain Python ints.
fitted_tree = tree_lrn.tree_
int(fitted_tree.node_count), int(fitted_tree.max_depth)

The analysis of the learning curve for the tree classifier algorithm shows essentially no bias and a large variance (see below).

In [None]:
# Training-set sizes 1000, 2000, ..., 6000, each evaluated with 3-fold CV.
sizes = range(1000, 6666, 1000)
train_size, train_score, val_score = learning_curve(
    estimator=tree_lrn,
    X=X,
    y=y,
    train_sizes=sizes,
    cv=3,
)

In [None]:
# Learning curve: mean accuracy across CV folds for each training size, with a
# shaded band of one standard deviation around each curve.
plt.title('Decision tree')
# Averaging along axis 1 collapses the per-fold scores to one value per size.
train_score_mean = np.mean(train_score, axis=1)
train_score_std = np.std(train_score, axis=1)
val_score_mean = np.mean(val_score, axis=1)
val_score_std = np.std(val_score, axis=1)
plt.grid()
plt.fill_between(train_size, train_score_mean - train_score_std,
                 train_score_mean + train_score_std, alpha=0.1,
                 color="r")
plt.fill_between(train_size, val_score_mean - val_score_std,
                 val_score_mean + val_score_std, alpha=0.1, color="g")
# Legend label matches the 'Accuracy' y-axis and the notebook's other plots
# (previously "Training score").
plt.plot(train_size, train_score_mean, 'o-', color="r",
         label="Training accuracy")
plt.plot(train_size, val_score_mean, 'o-', color="g",
         label="CV accuracy")
plt.legend()
plt.xlabel('Training size')
plt.ylabel('Accuracy')
plt.show()

In [None]:
# Sweep max_depth over 1, 6, 11, 16 and score each setting with 3-fold CV.
depths = np.arange(1,21,5)
# Fixed random_state: scikit-learn permutes features at each split, so an
# unseeded tree can produce slightly different curves from run to run.
tree_lrn = DecisionTreeClassifier(criterion='gini', random_state=42)
train_score, val_score = validation_curve(tree_lrn, X, y, param_name='max_depth', param_range=depths, cv=3)

In [None]:
# Fold-averaged accuracy as a function of the depth limit.
fig, ax = plt.subplots()
ax.set_title('Tree classifier vs. depth')
cv_mean_by_depth = val_score.mean(axis=1)
train_mean_by_depth = train_score.mean(axis=1)
ax.plot(depths, cv_mean_by_depth, label='CV accuracy')
ax.plot(depths, train_mean_by_depth, label='Training accuracy')
ax.legend()
ax.set_xlabel('Depth')
ax.set_ylabel('Accuracy')
plt.show()