In [1]:
import pandas as pd
import numpy as np
import math
from itertools import chain

In [2]:
# Load the training, validation and test splits of the c300_d100 data set.
# NOTE(review): read_csv treats the first row as a header by default; if these
# files have no header row, the first sample of every split is being consumed
# as column names -- confirm, and pass header=None if so.
train, valid, test = (
    pd.read_csv(csv_path)
    for csv_path in (
        "./all_data/train_c300_d100.csv",
        "./all_data/valid_c300_d100.csv",
        "./all_data/test_c300_d100.csv",
    )
)

In [288]:
# Relabel every column with its integer position (0, 1, 2, ...) so attributes
# can be addressed by number. The width is taken from each frame instead of
# being hard-coded, so the cell keeps working for data sets of other sizes.
train.columns = list(range(train.shape[1]))
valid.columns = list(range(valid.shape[1]))
test.columns = list(range(test.shape[1]))

In [292]:
#lg(P)
#  return log_2(P) if P != 0, else return 0
def lg(P):
    """Base-2 logarithm with the convention lg(0) == 0.

    The zero convention lets entropy terms of the form P * lg(P) vanish for
    an empty class instead of raising a math domain error.

    P -- a non-negative number (a negative P still raises ValueError).
    """
    if P == 0:
        return 0
    # math.log2 is more accurate than math.log(P, 2) for exact powers of two.
    return math.log2(P)

#Var_Impurity(K, K0, K1)
#  return (K0 * K1) / K^2, or 0 when K == 0
def Var_Impurity(K, K0, K1):
    """Variance impurity of a two-class sample.

    K  -- total number of examples
    K0 -- number of examples with class 0
    K1 -- number of examples with class 1

    Returns (K0 * K1) / K**2. An empty sample (K == 0) is treated as pure and
    yields 0 instead of raising ZeroDivisionError.
    """
    if K == 0:
        return 0
    return (K0 * K1) / K**2

def Entropy(K, K0, K1):
    """Binary entropy (in bits) of a sample with K0 class-0 and K1 class-1 examples.

    K is the total number of examples; the class-1 probability is taken as
    1 - K0 / K. A sample in which either class is absent has entropy 0.
    """
    is_pure = (K0 == 0 or K1 == 0)
    if is_pure:
        return 0

    p_zero = K0 / K
    p_one = 1 - p_zero
    zero_term = -p_zero * lg(p_zero)
    one_term = p_one * lg(p_one)
    return zero_term - one_term

#H = Current impurity, X_col = column being tested, S = data set, tA = target attribute, impF = impurity function
#Inf_Gain(H, X_col, S, tA, impF)
#  weighted <- 0
#  for each unique value x in column X_col of S
#    Sx <- rows of S where X_col = x
#    K0, K1 <- number of rows in Sx whose tA is 0 / 1
#    weighted <- weighted + ((K0 + K1) / |S|) * impF(K0 + K1, K0, K1)
#  return H - weighted
def Inf_Gain(H, X_col, S, tA, impF):
    """Gain obtained by splitting S on column X_col.

    The impurity of each partition (one per distinct value of X_col) is
    weighted by the share of rows of S it holds, and the weighted total is
    subtracted from the current impurity H. Only target values 0 and 1 are
    counted.
    """
    total_rows = len(S)
    weighted_impurity = 0
    for value in S[X_col].unique():
        class_counts = S[S[X_col] == value][tA].value_counts()
        seen_labels = class_counts.index
        n_zero = class_counts[0] if (0 in seen_labels) else 0
        n_one = class_counts[1] if (1 in seen_labels) else 0
        n_subset = n_zero + n_one
        weighted_impurity = weighted_impurity + (n_subset / total_rows) * impF(n_subset, n_zero, n_one)
    return H - weighted_impurity

def Get_Best_Attribute(S, tA, cols, impF):
    """Return the column in `cols` whose split of S has the highest gain.

    S    -- data set
    tA   -- target attribute (its values 0 and 1 are counted)
    cols -- iterable of candidate column names
    impF -- impurity function called as impF(K, K0, K1)

    The result is always one of the candidates: when no split has a positive
    gain the first candidate with the largest gain is returned, rather than a
    hard-coded column 0 that may not even be in `cols`. Ties keep the
    earliest candidate.

    Raises ValueError when `cols` is empty.
    """
    H = impF(len(S), len(S[S[tA] == 0]), len(S[S[tA] == 1]))
    bestCol = None
    bestGain = None
    for col in cols:
        newGain = Inf_Gain(H, col, S, tA, impF)
        # Strict '>' keeps the first candidate on ties.
        if bestGain is None or newGain > bestGain:
            bestCol, bestGain = col, newGain
    if bestGain is None:
        raise ValueError("Get_Best_Attribute: no candidate attributes in cols")
    return bestCol

#S = data, tA = target attribute, cols = columns to test on,
#impF = impurity function
#Grow_Tree(S, tA, cols, impF)
#  s_uniq <- All unique classes in target attribute
#  if s_uniq has only one value (0 or 1) return a leaf with that value
#  if S is empty or no candidate column is left return the majority-class leaf
#  x_j <- Attribute name with highest gain
#  if splitting on x_j leaves one side empty return the majority-class leaf
#  return node(x_j,
#              Grow_Tree(S with x_j values == 0, tA, cols, impF),
#              Grow_Tree(S with x_j values == 1, tA, cols, impF))
def Grow_Tree(S, tA, cols, impF):
    """Recursively grow a binary decision tree.

    S    -- data set (DataFrame)
    tA   -- target attribute, expected to hold 0/1
    cols -- candidate columns; must support boolean masking (numpy array or
            pandas Index) because tA is filtered out with cols[cols != tA]
    impF -- impurity function called as impF(K, K0, K1)

    Returns a leaf (0 or 1) or a tuple (attribute, subtree for value 0,
    subtree for value 1). The chosen attribute of every internal node is
    printed as a progress trace.

    When the data cannot be separated any further (empty S, no candidates, or
    a best split that leaves one side empty) a majority-class leaf is
    returned; ties and empty data give 0. Without these base cases the
    recursion would run on empty subsets and never terminate.
    """
    s_uniq = S[tA].unique()
    if (len(s_uniq) == 1 and s_uniq[0] == 0):
        return (0)
    elif (len(s_uniq) == 1 and s_uniq[0] == 1):
        return (1)

    s_0 = len(S[S[tA] == 0])
    s_1 = len(S[S[tA] == 1])
    majority = 1 if s_1 > s_0 else 0

    candidates = cols[cols != tA]
    if (len(S) == 0 or len(candidates) == 0):
        return majority

    x_j = Get_Best_Attribute(S, tA, candidates, impF)
    left = S[S[x_j] == 0]
    right = S[S[x_j] == 1]
    # Both sides must be non-empty, which also guarantees each recursive call
    # works on strictly fewer rows.
    if (len(left) == 0 or len(right) == 0):
        return majority

    print(x_j)
    return (x_j,
            Grow_Tree(left, tA, cols, impF),
            Grow_Tree(right, tA, cols, impF))

In [26]:
def ID3_Grow_Tree(S, tA, cols):
    """Grow a multiway decision tree using Var_Impurity, removing each used attribute.

    S    -- data set (DataFrame)
    tA   -- target attribute, expected to hold 0/1
    cols -- candidate columns; must support boolean masking (numpy array or
            pandas Index)

    Returns a leaf (0 or 1) or a tuple (attribute, child, child, ...) with
    one child per distinct value of the attribute in S, in the sorted order
    produced by np.unique.

    A majority-class leaf (ties and empty data give 0) is returned when S is
    empty, when no candidate attribute is left, or when the selected
    attribute is not one of the candidates.
    """
    s_1 = len(S[S[tA] == 1])
    s_0 = len(S[S[tA] == 0])
    majority = 1 if s_1 > s_0 else 0

    s_uniq = S[tA].unique()
    if (len(s_uniq) == 1 and s_uniq[0] == 0):
        return 0
    elif (len(s_uniq) == 1 and s_uniq[0] == 1):
        return 1

    candidates = cols[cols != tA]
    if (len(S) == 0 or len(candidates) == 0):
        return majority

    x_j = Get_Best_Attribute(S, tA, candidates, Var_Impurity)
    # Splitting on a column outside the candidates would not shrink `cols`,
    # so the recursion could not be guaranteed to terminate.
    if x_j not in candidates:
        return majority

    node = (x_j,)
    remaining = cols[cols != x_j]
    for val in np.unique(S[x_j]):
        S_v = S[S[x_j] == val]
        if len(S_v) == 0:
            # No rows compare equal to this value: fall back to the majority class.
            node = node + (majority,)
        else:
            node = node + (ID3_Grow_Tree(S_v, tA, remaining),)

    return node

In [293]:
import copy

#pred_tree(tree, data)
# While the current node is not a leaf (0 or 1):
#  Find attribute x_j at the current node
#  Go left if data[x_j] == 0, otherwise go right
# pred = value of the leaf reached
def pred_tree(tree, data):
    """Classify one instance by walking the tree down to a leaf.

    tree -- a leaf (0 or 1) or a tuple (attribute, left subtree, right subtree)
    data -- mapping from attribute name to value, e.g. one DataFrame row

    The left subtree is taken when data[attribute] == 0 and the right subtree
    for any other value. The tree is only read, so no copy is made.
    """
    node = tree
    while not (node == 0 or node == 1):
        node = node[1] if data[node[0]] == 0 else node[2]
    return node
        
def compare_result(tree, data, tA):
    """Tell whether the tree's prediction for `data` equals its label data[tA]."""
    return pred_tree(tree, data) == data[tA]

In [309]:
#Reduced-error pruning: turn split nodes into leaves, working bottom-up.
#Nodes whose children are leaves are evaluated first, then the nodes holding
#those nodes, and so on up to the root.

#V = data set, tA = target attribute
#Intended algorithm of Reduced_Error_Pruning_Helper(V, tA, tree, position):
#  if node is a leaf
#    return leaf
#  if the left child is a subtree
#    new_left_tree = Reduced_Error_Pruning_Helper on the left child
#  if the right child is a subtree
#    new_right_tree = Reduced_Error_Pruning_Helper on the right child
#  current_acc <- accuracy on V with the node kept
#  new_acc <- accuracy on V with the node replaced by its most common leaf
#  if (new_acc > current_acc)
#     return most common leaf
#  else
#     return node(attribute, new left tree, new right tree)
def Reduced_Error_Pruning(V, tA, tree):
    """Entry point: run Reduced_Error_Pruning_Helper from the root (empty position)."""
    root_position = []
    return Reduced_Error_Pruning_Helper(V, tA, tree, root_position)

def Reduced_Error_Pruning_Helper(V, tA, tree, position):
    """Prune the subtree of `tree` found at `position` and return the result.

    V        -- validation data set (DataFrame)
    tA       -- target attribute
    tree     -- the complete, unpruned tree
    position -- list of steps from the root (0 = left child, otherwise right)

    Children are pruned first. The node, with its pruned children, is then
    scored against its most common leaf value on the rows of V that reach the
    node; rows are routed as in pred_tree (attribute == 0 goes left, anything
    else goes right). The leaf replaces the node only when it is strictly
    more accurate. Rows that never reach the node get the same prediction
    either way, so this comparison is equivalent to comparing whole-tree
    accuracy.

    The depth and the split attribute of every visited node are printed as a
    progress trace.
    """
    currTree = Traverse_Tree(tree, position)

    # Leaves cannot be pruned; test this before indexing into the node.
    if (currTree == 1 or currTree == 0):
        return currTree

    print(len(position))
    print(currTree[0])

    if (type(currTree[1]) == tuple):
        currTree = (currTree[0], Reduced_Error_Pruning_Helper(V, tA, tree, position + [0]), currTree[2])
    if (type(currTree[2]) == tuple):
        currTree = (currTree[0], currTree[1], Reduced_Error_Pruning_Helper(V, tA, tree, position + [1]))

    # Keep only the validation rows that are routed to this node.
    reached = V
    node = tree
    for step in position:
        goesLeft = reached[node[0]] == 0
        if (step == 0):
            reached = reached[goesLeft]
            node = node[1]
        else:
            reached = reached[~goesLeft]
            node = node[2]

    # Without validation evidence there is no reason to change the node.
    if (len(reached) == 0):
        return currTree

    currentAcc = Get_Tree_Accuracy(reached, tA, currTree)
    mcl        = Most_Common_Leaf(currTree)
    newAcc     = Get_Tree_Accuracy(reached, tA, mcl)

    return mcl if newAcc > currentAcc else currTree
    

    
def Traverse_Tree(tree, position):
    """Follow `position` from the root of `tree` and return the node reached.

    Each step equal to 0 moves to the left child (index 1 of the node tuple);
    any other step moves to the right child (index 2). An empty position
    returns `tree` itself.
    """
    node = tree
    for step in position:
        child_index = 1 if step == 0 else 2
        node = node[child_index]
    return node

def Set_Tree(tree, newSubTree, position):
    """Return a copy of `tree` with the node at `position` replaced by `newSubTree`.

    tree       -- a leaf or a tuple (attribute, left subtree, right subtree)
    newSubTree -- replacement for the addressed node
    position   -- sequence of steps from the root; 0 selects the left child
                  and any other value the right child, as in Traverse_Tree

    Neither `tree` nor `position` is modified. An empty position returns
    `newSubTree` itself.
    """
    if (len(position) == 0):
        return newSubTree
    # Slice instead of pop() so the caller's list is left untouched.
    nextPos, rest = position[0], position[1:]
    # Recurse into the child subtree, not the node itself.
    if (nextPos == 0):
        return (tree[0], Set_Tree(tree[1], newSubTree, rest), tree[2])
    return (tree[0], tree[1], Set_Tree(tree[2], newSubTree, rest))

def Most_Common_Leaf(tree):
    """Return the leaf value that occurs most often among the leaves of `tree`.

    On a tie the value that appears first in the list returned by
    Get_All_Leafs is chosen.
    """
    leaf_values = Get_All_Leafs(tree)
    occurrences = {}
    for value in leaf_values:
        occurrences[value] = occurrences.get(value, 0) + 1
    return max(leaf_values, key=occurrences.get)

def Get_All_Leafs(tree):
    """List the leaf values of `tree` from left to right.

    A tuple is an internal node (attribute, left subtree, right subtree);
    anything else is a leaf, reported as 0 when it equals 0 and as 1
    otherwise.
    """
    collected = []
    pending = [tree]
    while pending:
        node = pending.pop()
        if type(node) == tuple:
            left, right = node[1], node[2]
            # Push right first so the left subtree is processed first.
            pending.append(right)
            pending.append(left)
        else:
            collected.append(0 if node == 0 else 1)
    return collected
    
def Get_Tree_Accuracy(D, tA, tree):
    """Fraction of the rows of D that `tree` classifies correctly.

    D    -- data set (DataFrame); each row is classified with compare_result
    tA   -- target attribute holding the true label
    tree -- decision tree, or a bare leaf value

    Returns a float in [0, 1]. An empty D yields 0.0 instead of raising
    ZeroDivisionError.
    """
    if len(D) == 0:
        return 0.0
    results = D.apply(lambda instance: compare_result(tree, instance, tA), axis=1)
    return len(results[results == True])/len(results)

In [308]:
# Grow the decision tree on the training split with the variance impurity.
# Column 1 is passed as the target attribute and column 0 is removed from the
# candidate columns.
# NOTE(review): confirm that column 1 really is the class label and that
# column 0 is meant to be excluded -- neither is evident from this notebook.
tree = Grow_Tree(train, 1, train.columns[train.columns != 0], Var_Impurity)

309
62
498
132
256
85
90
2
226
385
258
31
405
8
171
5
207
428
154
31
329
39
2
71
113
384
27
172
214
2
3


In [310]:
# NOTE(review): `i` is not read anywhere in the visible notebook -- confirm it
# is unused before removing it.
i = 0
# Prune the grown tree against the validation split (target attribute =
# column 1). The call is the cell's last expression, so the returned tree is
# shown as the cell output; it is not stored in a variable.
Reduced_Error_Pruning(valid, 1, tree)

0
309
1
62
2
498
3
132
4
256
5
85
6
90
7
2
3
226
4
385
2
258
3
31
4
405
5
8
4
171
5
5
1
207
2
428
3
154
4
31
4
329
5
39
6
2
3
71
4
113
2
384
3
27
4
172
5
214
3
2
4
3


(309,
 (62,
  (498,
   (132, (256, (85, 0, (90, 1, (2, 0, 1))), 0), 1),
   (226, 0, (385, 1, 0))),
  (258, (31, (405, (8, 1, 0), 0), (171, 1, (5, 0, 1))), 0)),
 (207,
  (428, (154, (31, 0, 1), (329, (39, 0, (2, 1, 0)), 1)), (71, (113, 0, 1), 1)),
  (384, (27, 0, (172, (214, 0, 1), 1)), (2, 1, (3, 0, 1)))))

pandas.core.indexes.base.Index