In [1]:
import pandas as pd
import numpy as np
import math
from itertools import chain

In [2]:
train = pd.read_csv("./all_data/train_c300_d100.csv")
valid = pd.read_csv("./all_data/valid_c300_d100.csv")
test  = pd.read_csv("./all_data/test_c300_d100.csv")

In [3]:
train.columns = valid.columns = test.columns = list(range(0, 501))

In [4]:
#lg(P)
#  return log_2(P) if P != 0, else return 0
def lg(P):
    return math.log(P, 2)

#
def Var_Impurity(K, K0, K1):
    return (K0 * K1) / K**2

def Entropy(K, K0, K1):
    if (K0 == 0 or K1 == 0):
        return 0
    P_0 = K0 / K
    P_1 = 1 - P_0
    return -P_0 * lg(P_0) - P_1 * lg(P_1)

#H = Current impurity, X_col = column being tested, S = data set, tA = target attribute, impF = impurity function
#Inf_Gain(H, X_col, S, tA, impF)
#  sum <- 0
#  unique <- unique values in X_col of S
#  for x in unique vals
#    Sn <- Values in S where X_col = x
#    occurences <- number of occurences of each unique value in tA
#    sum <- sum + entropy(occurences)
#  return H - sum
def Inf_Gain(H, X_col, S, tA, impF):
    sum = 0
    K = len(S)
    uniqVals = S[X_col].unique()
    for x in uniqVals:
        Sn  = S[S[X_col] == x]
        occurences = Sn[tA].value_counts()
        K0 = occurences[0] if (0 in occurences.index) else 0
        K1 = occurences[1] if (1 in occurences.index) else 0
        Kv = K0 + K1
        sum = sum + (Kv / K) * impF(Kv, K0, K1)
    return H - sum;

def Get_Best_Attribute(S, tA, cols, impF):
    H = impF(len(S), len(S[S[tA] == 0]), len(S[S[tA] == 1]))
    maxGain = (0, 0)
    for col in cols:
        newGain = Inf_Gain(H, col, S, tA, impF)
        maxGain = (col, newGain) if (newGain > maxGain[1])  else maxGain
    return maxGain[0]

#S = data, tA = target attribute, cols = columns to test on, 
#impF = impurity function
#Grow_Tree(S,tA, cols, impF)
#  s_uniq <- All unique classes in target attribute
#  if s_uniq has only one value return a leaf with that value
#  else
#    x_j <- Attribute name with highest gain
#    return node(x_j, 
#                Grow_Tree(S with x_j values == 0, tA, cols, impF),
#                Grow_Tree(S with x_j values == 1, tA, cols, impF))
def Grow_Tree(S, tA, cols, impF):
    s_uniq = S[tA].unique();
    if (len(s_uniq) == 1 and s_uniq[0] == 0):
        return (0)
    elif (len(s_uniq) == 1 and s_uniq[0] == 1):
        return (1)
    else:
        x_j = Get_Best_Attribute(S,tA,cols[cols != tA],impF)
        print(x_j)
        return (x_j,
                Grow_Tree(S[S[x_j] == 0], tA, cols, impF), 
                Grow_Tree(S[S[x_j] == 1], tA, cols, impF))

In [5]:
import copy

#Predict_Tree()
# While not at leaf:
#  If value has no nodes on left or right, break
#  Find attribute at current node x_j
#  Go left if value at attribute x_j == 0, otherwise go right
# pred = leaf.val
def pred_tree(tree, data):
    trav_tree = copy.copy(tree)
    atLeaf = False
    while True:
        if trav_tree == 0 or trav_tree == 1:
            break;
        if data[trav_tree[0]] == 0:
            trav_tree = trav_tree[1]
        else:
            trav_tree = trav_tree[2]
    return trav_tree
        
def compare_result(tree, data, tA):
    pred = pred_tree(tree, data)
    return pred == data[tA]

In [25]:
#Need to change nodes from splits to leafs
#Evaluate starting from nodes holding leaf nodes if they should be replaced
#Continue upwards, with next level of nodes being nodes holding the nodes that are holding leafs

#V = data set, tA = target attribute
#Reduced_Error_Pruning_Helper(V,tA, tree)
#Idea of algorithm is to reach the bottom nodes with leafs below them, then
#evaluate the performance benefit of replacing that node with the most common sub leaf.
#If this improves the accuracy, replace that tree node with 
def Reduced_Error_Pruning(V, tA, tree):
    treec = copy.copy(tree)
    return Reduced_Error_Pruning_Helper(V, tA, treec, ())

def Reduced_Error_Pruning_Helper(V, tA, tree, position):
    currTree = Traverse_Tree(tree, position)
    treeC    = copy.copy(tree)
    
    if (currTree == 1 or currTree == 0):
        return treeC
    #Change the sub components back to treeC
    #Issue is right now, the left side is done first and affects right side results
    if (type(currTree[1]) == tuple):
        treeC = Reduced_Error_Pruning_Helper(V, tA, tree, position + (0,))
    if (type(currTree[2]) == tuple):
        treeC = Reduced_Error_Pruning_Helper(V, tA, tree, position + (1,))
    
    currentAcc = Get_Tree_Accuracy(V, tA, treeC)
    mcl        = Most_Common_Leaf(treeC)
    newTree    = Set_Tree(treeC, mcl, copy.copy(position))
    newAcc     = Get_Tree_Accuracy(V, tA, newTree)
    print(newAcc, currentAcc)
    
    return newTree if newAcc >= currentAcc else treeC

def Traverse_Tree(tree, position):
    nTree = copy.copy(tree)
    for i in position:
        if (i == 0):
            nTree = nTree[1]
        else:
            nTree = nTree[2]
    return nTree;

def Set_Tree(tree, newSubTree, position):
    if (len(position) == 0):
        return newSubTree
    nextPos = position[0]
    position = position[1:]
    if (nextPos == 0):
        return (tree[0], Set_Tree(tree[1],newSubTree,position), tree[2])
    if (nextPos == 1):
        return (tree[0], tree[1], Set_Tree(tree[2],newSubTree,position))

def Most_Common_Leaf(tree):
    leafs = Get_All_Leafs(tree)
    return max(leafs, key=leafs.count)

def Get_All_Leafs(tree):
    if (type(tree) == tuple):
        return Get_All_Leafs(tree[1]) + Get_All_Leafs(tree[2])
    else:
        if (tree == 0):
            return [0]
        else:
            return [1]
    
def Get_Tree_Accuracy(D, tA, tree):
    results = D.apply(lambda instance: compare_result(tree, instance, tA), axis=1)
    return len(results[results == True])/len(results)

In [7]:
tree = Grow_Tree(train, 1, train.columns[train.columns != 0], Entropy)

309
62
498
132
256
85
90
2
226
385
258
31
405
8
171
5
207
428
154
31
329
39
2
71
113
384
27
172
214
2
3


In [26]:
nTree = Reduced_Error_Pruning(valid, 1, tree)

0.4371859296482412 0.46733668341708545
0.4623115577889447 0.46733668341708545
0.47738693467336685 0.46733668341708545
0.47738693467336685 0.47738693467336685
0.4472361809045226 0.47738693467336685
0.4824120603015075 0.46733668341708545
0.4824120603015075 0.4824120603015075
0.4623115577889447 0.4824120603015075
0.4623115577889447 0.46733668341708545
0.4623115577889447 0.46733668341708545
0.4723618090452261 0.46733668341708545
0.4623115577889447 0.46733668341708545
0.4623115577889447 0.46733668341708545
0.46733668341708545 0.46733668341708545
0.4623115577889447 0.46733668341708545
0.5728643216080402 0.46733668341708545


In [29]:
Get_Tree_Accuracy(train, 1, nTree)

0.507537688442211

In [12]:
Most_Common_Leaf(nTree)

0

In [19]:
tree = (tree[0], 0, tree[2])