# Decision Trees

### 0) Sample Data

In [2]:
import pandas as pd
# Toy fruit dataset: 'color' and 'size' are the features, 'fruit' is the
# label. Mixing 1.5 with the integers makes 'size' a float column.
data = dict(
    color=['Purple', 'Purple', 'Yellow', 'Green', 'Green'],
    size=[1, 1.5, 3, 3, 3],
    fruit=['Grape', 'Grape', 'Lemon', 'Apple', 'Lemon'],
)

df = pd.DataFrame(data)
# Last expression of the cell: the notebook renders the first rows.
df.head()

Unnamed: 0,color,fruit,size
0,Purple,Grape,1.0
1,Purple,Grape,1.5
2,Yellow,Lemon,3.0
3,Green,Apple,3.0
4,Green,Lemon,3.0


### 1) Split the Data
Trees work by "asking questions" and splitting the data accordingly.
The objective is to end up with nodes whose data is as pure as possible.

In [33]:
class Condition:
    """
    A Condition is used to partition a dataset.

    This class just records a column name (e.g., 'color') and a
    column value (e.g., 'Green').

    Numerical questions are always >=; every other value is compared
    with ==.

    Attributes:
        column_name: Name of the column the condition tests.
        value: Value the column is compared against.
    """

    def __init__(self, column_name, value):
        self.column_name = column_name
        self.value = value

    def __str__(self):
        """Return the condition as a readable question, e.g. "Is color == Green?"."""
        # Local import keeps the class self-contained within its cell.
        import numbers

        # numbers.Real accepts int and float, and also NumPy scalars such
        # as numpy.int64, which is NOT a subclass of the built-in int and
        # would otherwise be printed as an equality question.
        operator = ">=" if isinstance(self.value, numbers.Real) else "=="
        return "Is %s %s %s?" % (self.column_name, operator, str(self.value))

In [34]:
def partition(df, condition):
    """
    Split the rows of a DataFrame in two according to a condition.

    A numeric condition value splits on ``column >= value``; any other
    value splits on ``column == value``.

    Args:
        df: pandas DataFrame to split.
        condition: Object exposing ``column_name`` and ``value``
            attributes (e.g. a Condition).

    Returns:
        A ``(true_branch, false_branch)`` tuple of DataFrames. Rows that
        do not satisfy the condition, including rows whose value in the
        column is NaN, end up in ``false_branch``.
    """
    # Local import keeps the function self-contained within its cell.
    import numbers

    column = df[condition.column_name]

    # numbers.Real accepts int and float, and also NumPy scalars such as
    # numpy.int64, which is not a subclass of the built-in int.
    if isinstance(condition.value, numbers.Real):
        matches = column >= condition.value
    else:
        matches = column == condition.value

    # Use the complement of the mask for the false branch so that NaN
    # rows (for which both >= and < are False) are not silently dropped.
    return df[matches], df[~matches]
    

In [35]:
### Tests
# Build a condition on the 'color' column with a string value and print
# its readable form (string values are rendered with ==).
cond = Condition('color', 'Green')
print(cond)

Is color == Green?


In [36]:
# Split df on the condition: first result holds the rows that satisfy it,
# second result holds the remaining rows.
green, not_green = partition(df, cond)

In [37]:
# Show the rows that satisfied the condition.
green

Unnamed: 0,color,fruit,size
3,Green,Apple,3.0
4,Green,Lemon,3.0


In [38]:
# Show the rows that did not satisfy the condition.
not_green

Unnamed: 0,color,fruit,size
0,Purple,Grape,1.0
1,Purple,Grape,1.5
2,Yellow,Lemon,3.0


### 2) Calculate impurity of node and information gain on split

In [40]:
# Distinct labels found in the 'fruit' column.
classes = df['fruit'].unique()
# Last expression of the cell: the notebook displays the array.
classes

array(['Grape', 'Lemon', 'Apple'], dtype=object)

In [None]:
# Select the rows whose 'fruit' label equals a given class. The original
# expression was left unfinished (a syntax error); the first entry of
# `classes` is used here purely as an illustrative choice.
df[df['fruit'] == classes[0]]

In [None]:
def class_counts(df, column_name):
    """
    Count how many rows carry each distinct value of a column.

    Args:
        df: pandas DataFrame holding the rows of a node.
        column_name: Name of the column to count values in.

    Returns:
        A dict mapping each distinct value to its number of rows.
        Missing values (NaN) are not counted, which is the default
        behaviour of ``Series.value_counts``.
    """
    return df[column_name].value_counts().to_dict()

In [None]:
def gini(df, column_name):
    """
    Compute the Gini impurity of a column.

    The impurity is ``1 - sum(p_i ** 2)``, where ``p_i`` is the fraction
    of rows carrying the i-th distinct value. It is 0 when every row has
    the same value and grows as the values become more mixed.

    Args:
        df: pandas DataFrame holding the rows of a node.
        column_name: Name of the label column.

    Returns:
        The impurity as a float. A node with no (non-missing) values
        is reported as 0.0.
    """
    proportions = df[column_name].value_counts(normalize=True)

    # Nothing to be impure about: avoid reporting 1.0 for an empty node.
    if proportions.empty:
        return 0.0

    return float(1.0 - (proportions ** 2).sum())

In [49]:
# Tabulate P = 100^2 / (25 + i)^2 * i for i = 0, 5, ..., 145.
for i in range(0, 150, 5):
    # Same evaluation order as a plain left-to-right reading:
    # divide first, then multiply by i.
    print("i = %s, P = %s" % (i, 100 * 100 / (25 + i) ** 2 * i))

i = 0, P = 0.0
i = 5, P = 55.55555555555556
i = 10, P = 81.63265306122449
i = 15, P = 93.75
i = 20, P = 98.76543209876543
i = 25, P = 100.0
i = 30, P = 99.17355371900827
i = 35, P = 97.22222222222221
i = 40, P = 94.67455621301775
i = 45, P = 91.83673469387756
i = 50, P = 88.88888888888889
i = 55, P = 85.9375
i = 60, P = 83.04498269896193
i = 65, P = 80.24691358024691
i = 70, P = 77.5623268698061
i = 75, P = 75.0
i = 80, P = 72.56235827664399
i = 85, P = 70.24793388429752
i = 90, P = 68.05293005671078
i = 95, P = 65.97222222222221
i = 100, P = 64.0
i = 105, P = 62.130177514792905
i = 110, P = 60.35665294924554
i = 115, P = 58.673469387755105
i = 120, P = 57.07491082045184
i = 125, P = 55.55555555555555
i = 130, P = 54.11030176899063
i = 135, P = 52.734375
i = 140, P = 51.423324150596876
i = 145, P = 50.173010380622834
