# 🏛 Class 7: Decision Trees 🏛

Decision trees are one of the most mature, traditional algorithms in predictive analytics. They are most often used for **classification problems**, typically when the features are **categorical**.

In [7]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

In [8]:
# Calculate entropy for a fair coin
def entropy(p):
    """Return the Shannon entropy, in bits, of a discrete distribution.

    Parameters
    ----------
    p : iterable of float
        Probability of each outcome.

    Returns
    -------
    numpy.float64
        Sum of ``-p_i * log2(p_i)`` over the non-zero entries of ``p``.
    """
    probs = np.asarray(list(p), dtype=float)
    # In floating point 0 * log2(0) is nan (0 * -inf). By convention a
    # zero-probability outcome contributes nothing to the entropy, so those
    # terms are dropped before taking the logarithm.
    nonzero = probs[probs != 0]
    H = (-nonzero * np.log2(nonzero)).sum()
    return H

p = [.5, .5]
entropy(p)

1.0

In [11]:
# Faith's function to calculate entropy
def calculate_entropy(probabilities):
    """Return the Shannon entropy, in bits, of a discrete distribution.

    Parameters
    ----------
    probabilities : iterable of float
        Probability of each outcome.

    Returns
    -------
    float
        Sum of ``-p * log2(p)`` over the non-zero probabilities
        (``0`` when there are no non-zero terms).
    """
    total = 0
    for prob in probabilities:
        # A zero-probability outcome contributes nothing to the entropy;
        # evaluating log2(0) * -0 directly would yield nan instead.
        if prob == 0:
            continue
        total += (np.log2(prob) * -prob)
    return total

print(calculate_entropy([0.5, 0.5]))
print(calculate_entropy([0.9, 0.1]))
print(calculate_entropy([0.1, 0.9]))

1.0
0.4689955935892812
0.4689955935892812


### The maximum entropy returns us the lowest possible depth for our decision tree.

In [9]:
# Load the tab-separated tennis dataset; the file has no header row, so the
# column names are supplied here.
# NOTE(review): the displayed index starts at 1, so the file presumably has a
# leading row-number column that pandas picks up as the index — confirm
# against tennis.txt.
df = pd.read_csv('tennis.txt', delimiter='\t', header=None, names=['weather', 'temp', 'humidity', 'wind', 'play'])

# Show the loaded frame
df

Unnamed: 0,weather,temp,humidity,wind,play
1,Sunny,Hot,High,Weak,No
2,Sunny,Hot,High,Strong,No
3,Overcast,Hot,High,Weak,Yes
4,Rain,Mild,High,Weak,Yes
5,Rain,Cool,Normal,Weak,Yes
6,Rain,Cool,Normal,Strong,No
7,Overcast,Cool,Normal,Strong,Yes
8,Sunny,Mild,High,Weak,No
9,Sunny,Cool,Normal,Weak,Yes
10,Rain,Mild,Normal,Weak,Yes


In [12]:
# probability that play == 'No' is 5/14
# probability that play == 'Yes' is 9/14

# pass in these probabilities to entropy function to calculate
print(calculate_entropy([(5/14), (9/14)]))

0.9402859586706311


In [13]:
# What's the probability that play == 'Yes' given that the wind is weak?
# 8 total answers: 2 no, 6 yes

df[df.wind == 'Weak']

Unnamed: 0,weather,temp,humidity,wind,play
1,Sunny,Hot,High,Weak,No
3,Overcast,Hot,High,Weak,Yes
4,Rain,Mild,High,Weak,Yes
5,Rain,Cool,Normal,Weak,Yes
8,Sunny,Mild,High,Weak,No
9,Sunny,Cool,Normal,Weak,Yes
10,Rain,Mild,Normal,Weak,Yes
13,Overcast,Hot,Normal,Weak,Yes


In [14]:
# Entropy of 'play' within the weak-wind rows: 2 'No' and 6 'Yes' out of 8
print(calculate_entropy([(2/8), 6/8]))

0.8112781244591328


In [15]:
df[df.wind == 'Strong']

Unnamed: 0,weather,temp,humidity,wind,play
2,Sunny,Hot,High,Strong,No
6,Rain,Cool,Normal,Strong,No
7,Overcast,Cool,Normal,Strong,Yes
11,Sunny,Mild,Normal,Strong,Yes
12,Overcast,Mild,High,Strong,Yes
14,Rain,Mild,High,Strong,No


In [16]:
# Entropy of 'play' within the strong-wind rows: 3 'No' and 3 'Yes' out of 6
print(calculate_entropy([(3/6), 3/6]))

1.0


### How to calculate Mutual Information / Information Gain

(entropy of 'play') - ([entropy of 'play' when 'wind' == 'Weak'] * [probability that 'wind' == 'Weak'] + [entropy of 'play' when 'wind' == 'Strong'] * [probability that 'wind' == 'Strong'])

0.94 - (0.81 * (8/14) + 1.0 * (6/14))

= 0.04857142857142849

### The feature with the highest value for information gain is the root of our decision tree!

In [17]:
from sklearn.tree import DecisionTreeClassifier
from sklearn import preprocessing
from sklearn.tree import export_graphviz
import pydotplus

# Column names for the tab-separated tennis dataset, which has no header row
column_names = ['weather', 'temp', 'humidity', 'wind', 'play']

# Reload the dataset and display the frame that was just read
data = pd.read_csv('tennis.txt', delimiter='\t', header=None, names=column_names)
data

Unnamed: 0,weather,temp,humidity,wind,play
1,Sunny,Hot,High,Weak,No
2,Sunny,Hot,High,Strong,No
3,Overcast,Hot,High,Weak,Yes
4,Rain,Mild,High,Weak,Yes
5,Rain,Cool,Normal,Weak,Yes
6,Rain,Cool,Normal,Strong,No
7,Overcast,Cool,Normal,Strong,Yes
8,Sunny,Mild,High,Weak,No
9,Sunny,Cool,Normal,Weak,Yes
10,Rain,Mild,Normal,Weak,Yes


In [18]:
# Label-encode every column: apply() hands each column to fit_transform, so
# each column gets its own integer coding starting at 0.
# NOTE(review): this reads `df` from the earlier cell rather than the `data`
# frame loaded just above — confirm that is intended.
data_encoded = df.apply(preprocessing.LabelEncoder().fit_transform)
print(data_encoded)

    weather  temp  humidity  wind  play
1         2     1         0     1     0
2         2     1         0     0     0
3         0     1         0     1     1
4         1     2         0     1     1
5         1     0         1     1     1
6         1     0         1     0     0
7         0     0         1     0     1
8         2     2         0     1     0
9         2     0         1     1     1
10        1     2         1     1     1
11        2     2         1     0     1
12        0     2         0     0     1
13        0     1         1     1     1
14        1     2         0     0     0


In [19]:
# Decision tree that splits on entropy, limited to a depth of 3.
# NOTE(review): random_state is not set (the repr below shows
# random_state=None); consider fixing it if reproducible fits matter.
clf = DecisionTreeClassifier(criterion='entropy', max_depth=3)

# Fit on the four encoded feature columns, with the encoded 'play' column as the target
clf.fit(data_encoded[['weather', 'temp', 'humidity', 'wind']], data_encoded['play'])

DecisionTreeClassifier(class_weight=None, criterion='entropy', max_depth=3,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=None,
            splitter='best')