# **EGCO 425: Chapter 4 (Decision Trees)**

## Google Colab



In [None]:
## If using Colab: mount Google Drive and switch to the course folder

from google.colab import drive
drive.mount('/content/drive')
#drive.mount('/content/drive', force_remount=True)       ## use this variant to force a fresh mount

import os

## Single place to edit: replace Workspace/425 with your folder
WORKDIR = '/content/drive/MyDrive/Workspace/425'

## Relative paths used later (e.g. ./data/golf.csv) are resolved against this directory
os.chdir(WORKDIR)
print(os.getcwd())

In [None]:
import pandas as pd
import numpy as np

## Golf Data

In [None]:
## Load the golf data set; fields in this file are separated by ';'
GolfDF = pd.read_csv(
    './data/golf.csv',
    delimiter = ';',
)

## Summary of columns, dtypes and non-null counts
GolfDF.info()

### Note
- Decision trees can handle nominal data in principle, but sklearn's implementation requires numeric inputs, so categorical values must be encoded as numbers first
- Using one-hot encoding --> **[Manual: sklearn.preprocessing.OneHotEncoder](https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.OneHotEncoder.html#sklearn.preprocessing.OneHotEncoder)**
- Using label encoding --> **[Manual: sklearn.preprocessing.LabelEncoder](https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.LabelEncoder.html)**

In [None]:
### (1) One-hot encoding approach
## Builds `df` from GolfDF: nominal attributes become 0/1 indicator columns,
## the target 'Play' becomes integer class codes, numeric columns are kept as-is.

from sklearn.preprocessing import OneHotEncoder, LabelEncoder

nominalCols = ['Outlook', 'Wind']
numericCols = ['Temperature', 'Humidity']

## Explicit copy: pd.DataFrame(GolfDF) does not guarantee an independent frame,
## so the column assignments below could otherwise leak into the raw GolfDF
df = GolfDF.copy()

## Using one-hot encoding for attributes
## (dense output so the result can be assigned straight into DataFrame columns)
ohe = OneHotEncoder(handle_unknown='ignore', sparse_output=False)
encoded  = ohe.fit_transform( df[nominalCols] )
features = ohe.get_feature_names_out(nominalCols)
df[features] = encoded

## Using label encoding for target
le = LabelEncoder()
df['Play'] = le.fit_transform(df['Play'])

## The original nominal columns are now redundant with their indicator columns
df = df.drop(columns = nominalCols)
print(df.head())

In [None]:
### (2) Label encoding approach
## Builds `df` from GolfDF: every nominal column (attributes and target) is
## replaced by integer codes. Keeps the fitted encoders and a readable
## {original value: code} mapping per column.
## NOTE: integer codes impose an arbitrary order on nominal attributes.

from sklearn.preprocessing import LabelEncoder

nominalCols = ['Outlook', 'Wind', 'Play']
numericCols = ['Temperature', 'Humidity']

## Explicit copy: pd.DataFrame(GolfDF) does not guarantee an independent frame,
## so the in-place column replacement below could otherwise modify the raw GolfDF
df = GolfDF.copy()

label_encoders  = {}       ## encoder dict   : column name -> fitted LabelEncoder
column_mappings = {}       ## mapping dict   : column name -> {original class: code}

for col in nominalCols:
    le = LabelEncoder()
    df[col] = le.fit_transform(df[col])
    label_encoders[col] = le

    ## create mapping for current column
    ## (a class's code is its position in le.classes_)
    mapping = {original_class: encoded_value for encoded_value, original_class in enumerate(le.classes_)}
    column_mappings[col] = mapping

print(df, "\n")
print("Column Mappings \n", column_mappings)

## Decision Tree Steps
0. Prepare data set (see above)
1. Train-test split --> **[Manual: sklearn.model_selection.train_test_split](https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.train_test_split.html)**
2. Train classifier --> **[Manual: sklearn.tree.DecisionTreeClassifier](https://scikit-learn.org/stable/modules/generated/sklearn.tree.DecisionTreeClassifier.html)**
3. Apply the model to test set
4. Check performance metrics --> **[Manual: sklearn.metrics.classification_report](https://scikit-learn.org/stable/modules/generated/sklearn.metrics.classification_report.html)**, **[Manual: sklearn.metrics.confusion_matrix](https://scikit-learn.org/stable/modules/generated/sklearn.metrics.confusion_matrix.html)**, **[Manual: sklearn.metrics.ConfusionMatrixDisplay](https://scikit-learn.org/stable/modules/generated/sklearn.metrics.ConfusionMatrixDisplay.html)**
5. Interpret the model (tree visualization) --> **[Manual: sklearn.tree.export_text](https://scikit-learn.org/stable/modules/generated/sklearn.tree.export_text.html)**

In [None]:
from sklearn.tree            import DecisionTreeClassifier, export_text
from sklearn.model_selection import train_test_split
import sklearn.metrics as metrics
import matplotlib.pyplot as plt

## Train, evaluate and print a decision tree on the encoded frame `df`
## produced by one of the encoding cells above.

targetCol  = 'Play'
attributes = df.columns.difference([targetCol]).tolist()    ## every column except the target
X = df[attributes]                  ## attributes
y = df[targetCol]                   ## target

## (1) training & testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.3, random_state = 55)
print("\n Attributes (training) \n", X_train)
print("\n Target (training) \n",     y_train)
print("\n Target (testing) \n",      y_test)


## (2) train classifier by training set
dtree = DecisionTreeClassifier(random_state = 55)
dtree.fit(X_train, y_train)


## (3) predict testing set
y_pred = dtree.predict(X_test)
print("\nPredicted = ", y_pred)
print("Accuracy  = ", metrics.accuracy_score(y_test, y_pred))


## (4) performance
## Labels come from both the true and the predicted values: a class that occurs
## only in y_pred still gets a row/column in the matrix, so the tick labels
## must include it as well (np.unique(y_test) alone could be too short).
cm_labels = np.union1d(y_test, y_pred)
cm = metrics.confusion_matrix(y_test, y_pred, labels = cm_labels)
print("\n Confusion Matrix \n", cm)
## plot() creates its own figure, so no plt.figure() beforehand (it would render blank).
## Named cm_display so IPython's built-in display() is not shadowed.
cm_display = metrics.ConfusionMatrixDisplay(cm, display_labels = cm_labels)
cm_display.plot(cmap = plt.cm.Blues)
plt.show()
print("\n Performance \n", metrics.classification_report(y_test, y_pred, digits=4))


## (5) visualize tree
tree = export_text(dtree, feature_names=attributes)
print("\n Tree")
print(tree)