# Decision Trees Learning Activity
This notebook implements decision trees.  It uses the Parkinsons dataset from the UCI Machine Learning Repository:

https://archive.ics.uci.edu/ml/datasets/Parkinsons

In [None]:
import numpy as np
import pandas as pd

# plotting
import matplotlib.pyplot as plt

# sklearn
from sklearn.model_selection import train_test_split
from sklearn import tree
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import ConfusionMatrixDisplay

## Data

In [None]:
# Read the Parkinsons dataset and discard the 'name' column before modelling
data = pd.read_csv('parkinsons.data').drop(columns=['name'])

# 'status' is the prediction target; every remaining column is a feature
y = data['status']
X = data.drop(columns=['status'])

# Hold out 33% of the rows for testing; the fixed seed keeps the split repeatable
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=0)

# Renumber the rows of each split from 0 so row labels match row positions
X_train = X_train.reset_index(drop=True)
y_train = y_train.reset_index(drop=True)
X_test = X_test.reset_index(drop=True)
y_test = y_test.reset_index(drop=True)

## Original Tree

In [None]:
# Baseline tree: classifier with a fixed seed, fitted on the full training split
orig_tree_clf = tree.DecisionTreeClassifier(random_state=0)
orig_tree_clf.fit(X_train, y_train)

In [None]:
# Report the depth of the fitted baseline tree
orig_depth = orig_tree_clf.get_depth()
print('Depth of the classification tree: ', orig_depth)

In [None]:
# display the tree
# feature_names is passed as a plain list (some scikit-learn releases reject a
# pandas Index here) and is taken from the frame the tree was fitted on.
fig, ax = plt.subplots(figsize=(20, 20))
tree.plot_tree(
    orig_tree_clf,
    feature_names=list(X_train.columns),
    class_names=['Healthy', 'Parkinsons'],
    filled=True,
    fontsize=10,
    impurity=False,
    ax=ax,
);

In [None]:
# Fraction of test samples the baseline tree classifies correctly
tree_accuracy = accuracy_score(y_true=y_test, y_pred=orig_tree_clf.predict(X_test))
print('Classification accuracy: ', tree_accuracy)

In [None]:
# confusion matrix for the baseline tree on the test set
y_pred = orig_tree_clf.predict(X_test)
# Pin the row/column order to the classifier's own class order so the matrix
# always has one row and column per class and lines up with display_labels,
# even if a class happens to be missing from y_test or y_pred.
cm = confusion_matrix(y_test, y_pred, labels=orig_tree_clf.classes_)
cm_display = ConfusionMatrixDisplay(cm, display_labels=['Healthy', 'Parkinsons'])
fig, ax = plt.subplots(figsize=(5, 5))
# trailing ';' suppresses the display object's repr in the cell output
cm_display.plot(ax=ax, xticks_rotation='vertical');

In [None]:
# Feature importances of the baseline tree, plotted in ascending order
tree_importances = pd.Series(data=orig_tree_clf.feature_importances_, index=X.columns)
fig, ax = plt.subplots(figsize=(8, 8))
tree_importances.sort_values(ascending=True).plot(kind='barh', ax=ax)
ax.set_title('Feature Importance')

## Learning Activity 1

Change the number of samples removed from the training set (`N_remove`) and observe how the tree responds.

In [None]:
# change number
N_remove = 10
# Sample row indices WITHOUT replacement so exactly N_remove distinct rows are
# dropped (sampling with replacement can repeat an index and drop fewer rows).
# Raises ValueError if N_remove exceeds the number of training rows.
# Not seeded, so each run removes a different random subset.
idx_remove = np.random.choice(X_train.shape[0], size=N_remove, replace=False)

# Drop the chosen rows from features and target, then renumber from 0
X_mod = X_train.drop(idx_remove).reset_index(drop=True)
y_mod = y_train.drop(idx_remove).reset_index(drop=True)

In [None]:
# Fit a tree with the same seed on the reduced training data
tree_clf = tree.DecisionTreeClassifier(random_state=0)
tree_clf.fit(X_mod, y_mod)

In [None]:
# Report the depth of the tree fitted on the reduced training data
mod_depth = tree_clf.get_depth()
print('Depth of the classification tree: ', mod_depth)

In [None]:
# display the tree
# feature_names is passed as a plain list (some scikit-learn releases reject a
# pandas Index here) and is taken from the frame this tree was fitted on.
fig, ax = plt.subplots(figsize=(20, 20))
tree.plot_tree(
    tree_clf,
    feature_names=list(X_mod.columns),
    class_names=['Healthy', 'Parkinsons'],
    filled=True,
    fontsize=10,
    impurity=False,
    ax=ax,
);

Evaluate on test set

In [None]:
# Fraction of test samples classified correctly by the tree fitted on the reduced data
tree_accuracy = accuracy_score(y_true=y_test, y_pred=tree_clf.predict(X_test))
print('Classification accuracy: ', tree_accuracy)

In [None]:
# confusion matrix for the tree fitted on the reduced data, on the test set
y_pred = tree_clf.predict(X_test)
# Pin the row/column order to the classifier's own class order so the matrix
# always has one row and column per class and lines up with display_labels,
# even if a class happens to be missing from y_test or y_pred.
cm = confusion_matrix(y_test, y_pred, labels=tree_clf.classes_)
cm_display = ConfusionMatrixDisplay(cm, display_labels=['Healthy', 'Parkinsons'])
fig, ax = plt.subplots(figsize=(5, 5))
# trailing ';' suppresses the display object's repr in the cell output
cm_display.plot(ax=ax, xticks_rotation='vertical');

Feature Importance

In [None]:
# Feature importances of the tree fitted on the reduced data, plotted in ascending order
tree_importances = pd.Series(data=tree_clf.feature_importances_, index=X.columns)
fig, ax = plt.subplots(figsize=(8, 8))
tree_importances.sort_values(ascending=True).plot(kind='barh', ax=ax)
ax.set_title('Feature Importance')

## Learning Activity 2

Prune the tree using different values of the cost-complexity parameter `ccp_alpha` and observe the results.

In [None]:
# Compute the cost-complexity pruning path on the training split;
# the result is indexed by 'ccp_alphas' when choosing a pruning strength
path_clf = tree.DecisionTreeClassifier(random_state=0)
alphas = path_clf.cost_complexity_pruning_path(X_train, y_train)

In [None]:
# Show the pruning-path result; as the cell's last expression it is rendered as output
alphas

In [None]:
# Which entry of the pruning path to use as ccp_alpha (change this value)
alpha_idx = 3
ccp_alphas = alphas['ccp_alphas']
# Clamp the index so a short pruning path (e.g. after a different train/test
# split) falls back to its last entry instead of raising IndexError
alpha_idx = min(alpha_idx, len(ccp_alphas) - 1)
# ccp_alpha must be non-negative, so clamp at zero
ccp_alpha = max(float(ccp_alphas[alpha_idx]), 0.0)
pruned_tree = tree.DecisionTreeClassifier(random_state=0, ccp_alpha=ccp_alpha).fit(X_train, y_train)

In [None]:
# display pruned tree
# feature_names is passed as a plain list (some scikit-learn releases reject a
# pandas Index here) and is taken from the frame the tree was fitted on.
fig, ax = plt.subplots(figsize=(20, 20))
tree.plot_tree(
    pruned_tree,
    feature_names=list(X_train.columns),
    class_names=['Healthy', 'Parkinsons'],
    filled=True,
    fontsize=10,
    impurity=False,
    ax=ax,
);

In [None]:
# Fraction of test samples the pruned tree classifies correctly
pruned_accuracy = accuracy_score(y_true=y_test, y_pred=pruned_tree.predict(X_test))
print('Classification accuracy: ', pruned_accuracy)

In [None]:
# confusion matrix for the pruned tree on the test set
y_pred = pruned_tree.predict(X_test)
# Pin the row/column order to the classifier's own class order so the matrix
# always has one row and column per class and lines up with display_labels,
# even if a class happens to be missing from y_test or y_pred.
cm = confusion_matrix(y_test, y_pred, labels=pruned_tree.classes_)
cm_display = ConfusionMatrixDisplay(cm, display_labels=['Healthy', 'Parkinsons'])
fig, ax = plt.subplots(figsize=(5, 5))
# trailing ';' suppresses the display object's repr in the cell output
cm_display.plot(ax=ax, xticks_rotation='vertical');

## Learning Activity 3

Rerun with different train/test splits and observe the response.