# Decision Trees 3 (expanded)


In [12]:
from __future__ import division, unicode_literals
import math
import pandas as pd
from IPython.display import HTML, display
#import tabulate

def BigTitle(title):
    """Render ``title`` as a bold, 22px heading in the notebook output.

    ``title`` is concatenated into the markup unchanged, so it must be a string.
    """
    opening = "<div style='width: 100%; padding-bottom: 10px'><span style='font-size: 22px;font-weight:700'>"
    closing = "</span></div>"
    display(HTML(opening + title + closing))
    
def EasyRead(title, text):
    """Display ``title`` in bold followed by a readable rendering of ``text``.

    ``text`` may be:
      * a string  -- shown unchanged;
      * an int    -- shown with ``str``;
      * a float   -- rounded to 5 decimal places;
      * any other iterable of numbers -- shown as ``[ a  b  c ]``, each value
        rounded to 5 decimal places. Integer elements are converted to float
        first, and elements that are themselves lists or tuples are shown as
        a bracketed sub-list.

    ``isinstance`` is used instead of exact type comparison so that subclasses
    of the built-in types (for example ``numpy.float64``, a ``float`` subclass)
    are formatted as scalars rather than being iterated over.
    """
    gap = " &nbsp; "

    if isinstance(text, str):
        body = text
    elif isinstance(text, int):
        body = str(text)
    elif isinstance(text, float):
        body = str(round(text, 5))
    else:
        pieces = ["[ "]
        for item in text:
            if isinstance(item, (list, tuple)):
                # Nested sequence: its values are rounded but not converted to float
                pieces.append(" [ ")
                pieces.extend(str(round(inner, 5)) + gap for inner in item)
                pieces.append("] &nbsp; ")
            else:
                if isinstance(item, int):
                    # Top-level integers are shown as floats (1 -> 1.0)
                    item = float(item)
                pieces.append(str(round(item, 5)) + gap)
        pieces.append(" ] ")
        body = "".join(pieces)

    html = (
        "<div style='width: 100%; padding-bottom: 10px'><span style='font-weight:700'>"
        + title
        + ":</span> &nbsp;&nbsp;"
        + body
        + "</div>"
    )
    display(HTML(html))
    
# Smoke-test the display helpers with a heading, a list of numbers and a plain string
BigTitle(title="Test")
EasyRead(title="title", text=[1, 2, 3.568])
EasyRead(title="title", text="text")

## Variable Names
#### L: List of values
#### N: sum of values
#### P: Probabilities
A probability is the fraction of the total N contributed by a single element l of the list L<br>
&nbsp;&nbsp;&nbsp;_i.e. l/N_



In [13]:
# L holds the ten raw values; N is their total, stored as a float
L = [5554, 6296, 5590, 5707, 5496, 5027, 5469, 5817, 5434, 5610]
N = float(sum(L))
EasyRead(title="l of L", text=L)
EasyRead(title="N", text=N)

In [14]:
# Convert every value of L into its share of the total N
P = [count / N for count in L]
EasyRead(title="p of P", text=P)

## GINI Impurity
<br>
<img src="files/giniequation.png"><br>
<br>
$Gini = \sum_i p_i\,(1 - p_i)$, i.e. the sum over every probability of (probability)(1 - probability)

In [15]:
# GINI terms: one p * (1 - p) value for every probability in P
G = [prob * (1 - prob) for prob in P]

def Gini(Pl):
    """Return the Gini impurity of the probabilities in ``Pl``.

    This is the equation in the picture above: the sum of p * (1 - p) over
    every probability p.
    """
    return sum(p * (1 - p) for p in Pl)


def GiniShort(Pl):
    """Return the Gini impurity of ``Pl`` using the shorter form 1 - sum(p ** 2).

    Agrees with ``Gini`` (up to floating-point rounding) when the
    probabilities in ``Pl`` sum to 1.
    """
    return 1 - sum(p ** 2 for p in Pl)

EasyRead(title="List of GINI calcs", text=G)
EasyRead(title="Sum of GINI calcs (aka the result of GINI)", text=sum(G))
# Print both forms of the formula side by side; they may differ in the last
# digit because of floating-point rounding
print(GiniShort(P))
print(Gini(P))

0.8997037206632653
0.8997037206632652


# Entropy
<br>
<img src="files/entropy.png"><br>


In [16]:
# Raw p * log2(p) term for every probability in P.
# The Entropy function negates the sum of terms of this form.
E = [prob * math.log(prob, 2) for prob in P]
EasyRead(title="List of entropy calcs", text=E)

def Entropy(Pl):
    """Return the base-2 entropy of the probabilities in ``Pl``.

    Computes -sum(p * log2(p)). A probability of 0 contributes 0, which
    avoids evaluating log(0).
    """
    return -1 * sum(p * math.log(p, 2) if p else 0 for p in Pl)
# Entropy expects probabilities, so pass P (L normalised by N) rather than the raw counts in L
EasyRead("Entropy of L", Entropy(P))

In [17]:
def Probs(Lx):
    """Return every value of ``Lx`` divided by the sum of ``Lx``.

    Raises ``ZeroDivisionError`` when ``Lx`` is non-empty and sums to 0.
    """
    total = sum(Lx)  # computed once instead of once per element
    return [value / total for value in Lx]
# Two lists that split L: element by element, L1 + L2 equals L
L1 = [5371, 195, 779, 3959, 855, 3089, 1360, 3897, 540, 1770]
L2 = [183, 6101, 4811, 1748, 4641, 1938, 4109, 1920, 4894, 3840]

probs_before = Probs(L)
probs_first = Probs(L1)
probs_second = Probs(L2)
total_first = sum(L1)
total_second = sum(L2)

# GINI before the split, the size of each part, then GINI of each part
EasyRead(title="Probabilities of L", text=probs_before)
EasyRead(title="Before: GINI of the probabilities of L", text=Gini(probs_before))
EasyRead(title="Sum of L1", text=total_first)
EasyRead(title="Sum of L2", text=total_second)
EasyRead(title="Sum of sum(L1) + sum(L2)", text=total_first + total_second)

EasyRead(title="GINI of probabilities of L1", text=Gini(probs_first))
EasyRead(title="GINI of probabilities of L2", text=Gini(probs_second))

# The same comparison using entropy
EasyRead(title="Before: Entropy of probabilities of L ", text=Entropy(probs_before))

EasyRead(title="Entropy of probabilities of L1", text=Entropy(probs_first))
EasyRead(title="Entropy of probabilities of L2", text=Entropy(probs_second))

# Information Gain
## Gain = original impurity (entropy or Gini) - weighted sum of the split impurities


In [18]:
def WeightedSumOfSplits(PurityType,splits,weights):
    """Return the weighted sum of the impurity of every split.

    Each entry of ``splits`` is converted with ``Probs``, scored with the
    callable ``PurityType``, and multiplied by the entry of ``weights`` at the
    same position. Returns 0 when ``splits`` is empty.
    """
    return sum(
        PurityType(Probs(part)) * weights[pos]
        for pos, part in enumerate(splits)
    )
    
def GetWeights(LSplit,_L):
    """Return the weight of every split: its sum divided by the sum of ``_L``.

    ``LSplit`` is a sequence of lists of numbers and ``_L`` is the list they
    are compared against. Raises ``ZeroDivisionError`` when ``LSplit`` is
    non-empty and ``_L`` sums to 0.
    """
    total = sum(_L)  # loop-invariant, so computed once
    return [sum(part) / total for part in LSplit]

def Gain(PurityName,PurityType,splitOfL,_L):
    """Display the information gain obtained by splitting ``_L`` into ``splitOfL``.

    ``PurityName`` is the heading to show and ``PurityType`` the impurity
    callable applied to lists of probabilities. The gain is the impurity of
    ``_L`` minus the weighted sum of the impurities of the splits. Nothing is
    returned; the results are only displayed.
    """
    impurity_before = PurityType(Probs(_L))
    split_weights = GetWeights(splitOfL, _L)
    impurity_after = WeightedSumOfSplits(PurityType, splitOfL, split_weights)

    BigTitle(PurityName)
    report = (
        ("Before", impurity_before),
        ("After", impurity_after),
        ("Weights Derived", split_weights),
        ("Information Gain", impurity_before - impurity_after),
    )
    for caption, value in report:
        EasyRead(caption, value)

# Information gain of splitting L into L1 and L2, measured both ways
Gain(PurityName="GINI", PurityType=Gini, splitOfL=[L1, L2], _L=L)
Gain(PurityName="Entropy", PurityType=Entropy, splitOfL=[L1, L2], _L=L)

# Restaurant Problem
<img src="files/restprob.png">

In [19]:
# Counts before the split, followed by the three groups the split produces;
# element by element the three groups add up to Restaurant
Restaurant = [6, 6]
Split = [[2, 0], [0, 4], [4, 2]]

Gain(PurityName="Entropy", PurityType=Entropy, splitOfL=Split, _L=Restaurant)
Gain(PurityName="GINI", PurityType=Gini, splitOfL=Split, _L=Restaurant)

# Some Example Table
<img src="files/exampletable2.png">

In [20]:
# Output y:  3 F, 1 T
# NOTE(review): the numbers in this cell disagree with each other. The line
# above says 3 F / 1 T (4 rows) and Binary totals 7, while the groups of A1
# and A3 add up to [3, 2] and those of A2 to [4, 1] (5 rows each). As a
# result the weights derived by Gain do not sum to 1 -- confirm the counts
# against the table before relying on these results.
Binary = [5, 2]

# A1 -- F: 1 match, 0 mismatches; T: 2 matches, 2 mismatches
A1 = [[1, 0], [2, 2]]

# A2 -- F: 2 matches, 0 mismatches; T: 2 matches, 1 mismatch
A2 = [[2, 0], [2, 1]]

# A3 -- F: 2 matches, 1 mismatch; T: 1 match, 1 mismatch
A3 = [[2, 1], [1, 1]]

# Entropy-based gain for each candidate attribute
Gain(PurityName="Split on A1 using Entropy", PurityType=Entropy, splitOfL=A1, _L=Binary)

Gain(PurityName="Split on A2 using Entropy", PurityType=Entropy, splitOfL=A2, _L=Binary)

Gain(PurityName="Split on A3 using Entropy", PurityType=Entropy, splitOfL=A3, _L=Binary)

print("\n\n")

# GINI-based gain for the same attributes
Gain(PurityName="Split on A1 using GINI", PurityType=Gini, splitOfL=A1, _L=Binary)

Gain(PurityName="Split on A2 using GINI", PurityType=Gini, splitOfL=A2, _L=Binary)

Gain(PurityName="Split on A3 using GINI", PurityType=Gini, splitOfL=A3, _L=Binary)






<img src="./files/table.png">

In [21]:
# Output y: 5 rows of each class
Binary = [5, 5]

# For every attribute, the two groups add up to Binary element by element
A1 = [[0, 5], [5, 0]]
A2 = [[3, 3], [2, 2]]
A3 = [[2, 2], [3, 3]]

# Entropy-based gain for each candidate attribute
Gain(PurityName="Split on A1 using Entropy", PurityType=Entropy, splitOfL=A1, _L=Binary)

Gain(PurityName="Split on A2 using Entropy", PurityType=Entropy, splitOfL=A2, _L=Binary)

Gain(PurityName="Split on A3 using Entropy", PurityType=Entropy, splitOfL=A3, _L=Binary)

print("\n\n")

# GINI-based gain for the same attributes
Gain(PurityName="Split on A1 using GINI", PurityType=Gini, splitOfL=A1, _L=Binary)

Gain(PurityName="Split on A2 using GINI", PurityType=Gini, splitOfL=A2, _L=Binary)

Gain(PurityName="Split on A3 using GINI", PurityType=Gini, splitOfL=A3, _L=Binary)




