In [1]:
import logging

logger = logging.getLogger(__name__)
logger.setLevel(level = logging.INFO)
handler = logging.FileHandler("DT-prepruning.log")
handler.setLevel(logging.INFO)
formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s')
handler.setFormatter(formatter)
logger.addHandler(handler)

In [2]:
import pandas as pd
import numpy as np

attr_dict = {"age":0, "workclass":1, "fnlwgt":0, "education":1, "education-num":0, "marital-status":1, "occupation":1, "relationship":1, "race":1, "sex":1, "capital-gain":0, "capital-loss":0, "hours-per-week":0, "native-country":1, "salary":0} # 0: continuous, 1: discrete

train_data = pd.read_csv("adult.data",names=attr_dict.keys(),index_col=False)
test_data = pd.read_csv("adult.test",names=attr_dict.keys(),index_col=False,header=0)

def preprocessing(data):
    attributes = ["workclass","education","marital-status","occupation","relationship","race","sex","native-country","salary"]
    # data['capital-net'] = data[['capital-gain', 'capital-loss']].apply(lambda x: x[0]-x[1], axis=1)
    # attr_dict['capital-net'] = 0
    # attributes = list(attr_dict.keys())
    # attributes.remove("fnlwgt")
    # attributes.remove("capital-gain")
    # attributes.remove("capital-loss")
    return data[attributes]

# fill in ?
def fill_data(data):
    for a in attr_dict:
        if attr_dict[a]: # discrete
            data.loc[data[a] == " ?",a] = data[a].value_counts().argmax() # view or copy? Use loc!
        else: # continuous
            pass
            # print(a_col.value_counts())
            # a_col[a_col == " ?"] = np.mean(a_col[a_col != " ?"])

train_data = preprocessing(train_data)
test_data = preprocessing(test_data)
fill_data(train_data)
fill_data(test_data)

cut = int(0.9 * len(train_data))
# cut = int(len(train_data))
train_data, validation_data = train_data[:cut], train_data[cut:]

In [3]:
def entropy(p):
    if p.ndim == 1:
        new_p = p[p != 0]
        return -np.sum(new_p * np.log2(new_p))
    else:
        # new_p = p[(p[:,0] != 0) & (p[:,1] != 0)]
        return -np.sum(p * np.log2(p),axis=1)

def information_gain(D,a,discrete_flag=False):
    pk = D["salary"].value_counts(normalize=True).values
    if discrete_flag:
        prop_Dv = D[a].value_counts(normalize=True).values # proportion
        prob_Dv = np.array([D.loc[D[a] == av]["salary"].value_counts(normalize=True).get(" >50K",0) for av in D[a].unique()])
        pp_stack = np.column_stack((prop_Dv,prob_Dv))
        pp_stack = pp_stack[(pp_stack[:,1] != 0) & (pp_stack[:,1] != 1)]
        prop_Dv = pp_stack[:,0]
        prob_Dv = pp_stack[:,1]
        prob_Dv_neg = 1 - prob_Dv
        return (entropy(pk) - np.sum(prop_Dv * entropy(np.column_stack((prob_Dv,prob_Dv_neg)))), a)
    else:
        a_sort = sorted(D[a].unique())
        Ta = [(a_sort[i] + a_sort[i+1]) / 2 for i in range(len(a_sort)-1)]
        min_ent, min_t = 0x3f3f3f3f, a_sort[0]
        for t in Ta: # bi-partition
            prop_Dv = len(D[D[a] < t]) / len(D)
            prop_Dv = np.array([prop_Dv,1-prop_Dv])
            prob_Dv_smaller = D[D[a] < t]["salary"].value_counts(normalize=True).get(" >50K",0)
            prob_Dv_bigger = D[D[a] >= t]["salary"].value_counts(normalize=True).get(" >50K",0)
            prob_Dv = np.array([[prob_Dv_smaller,1-prob_Dv_smaller],[prob_Dv_bigger,1-prob_Dv_bigger]])
            prob_Dv = prob_Dv[(prob_Dv[:,0] != 0) & (prob_Dv[:,1] != 0)]
            if len(prob_Dv) == 0:
                sumup = 0
            else:
                sumup = np.sum(prop_Dv * entropy(prob_Dv))
            if min_ent > sumup:
                min_ent = sumup
                min_t = t
        return (entropy(pk) - min_ent, min_t)

cnt = 0
class Node:

    def __init__(self):
        self.branch = {}

    def setLeaf(self,catagory):
        global cnt
        cnt += 1
        logger.info("{} - Create leaf: {}".format(cnt,catagory))
        if cnt % 100 == 0:
            print("{} - Create leaf: {}".format(cnt,catagory),flush=True)
        self.label = "Leaf"
        self.catagory = catagory
        
    def setBranch(self,attr,value,node,branch_value=None):
        logger.info("Create branch: {} ({})".format(attr,value))
        self.label = "Branch"
        self.attr = attr
        self.branch[value] = node
        if branch_value != None:
            self.branch_value = branch_value

In [4]:
import time,sys

class ID3:

    def __init__(self,train_set=None,validation_set=None,test_set=None,attr_dict=None):
        self.train_set = train_set
        self.validation_set = validation_set
        self.test_set = test_set
        self.attr_dict = attr_dict

    def TreeGenerate(self,dataset,attributes,root=None):
        catagory = dataset["salary"].unique()
        node = Node() if root == None else root
        if len(catagory) == 1:
            node.setLeaf(catagory[0])
            return node
        if len(attributes) == 0 or np.sum([len(dataset[a].unique()) for a in attributes]) == len(attributes):
            node.setLeaf(dataset["salary"].value_counts().argmax())
            return node
        # without partition
        node.setLeaf(dataset["salary"].value_counts().argmax())
        acc_without_partition = self.validation()
        # with partition
        max_gain = (-0x3f3f3f3f,None)
        for a in attributes:
            gain = information_gain(dataset,a,self.attr_dict[a])
            if gain[0] > max_gain[0]:
                a_best, max_gain = a, gain
        if self.attr_dict[a_best]: # discrete
            for av in self.train_set[a_best].unique(): # be careful, not dataset!
                Dv = dataset[dataset[a_best] == av]
                leafnode = Node()
                leafnode.setLeaf(dataset["salary"].value_counts().argmax())
                node.setBranch(a_best,av,leafnode)
        else:
            for flag in ["Smaller","Bigger"]:
                Dv = dataset[dataset[a_best] < max_gain[1]] if flag == "Smaller" else dataset[dataset[a_best] >= max_gain[1]]
                leafnode = Node()
                leafnode.setLeaf(dataset["salary"].value_counts().argmax())
                node.setBranch(a_best,flag,leafnode,branch_value=max_gain[1])
        acc_with_partition = self.validation()
        if acc_without_partition > acc_with_partition: # pre-pruning
            node.setLeaf(dataset["salary"].value_counts().argmax())
            return node
        # true partition
        if self.attr_dict[a_best]: # discrete
            for av in self.train_set[a_best].unique(): # be careful, not dataset!
                Dv = dataset[dataset[a_best] == av]
                if len(Dv) != 0:
                    node.setBranch(a_best,av,self.TreeGenerate(Dv,attributes[attributes != a_best]))
        else:
            for flag in ["Smaller","Bigger"]:
                Dv = dataset[dataset[a_best] < max_gain[1]] if flag == "Smaller" else dataset[dataset[a_best] >= max_gain[1]]
                if len(Dv) != 0:
                    node.setBranch(a_best,flag,self.TreeGenerate(Dv,attributes),branch_value=max_gain[1])
        # acc_with_partition = self.validation()
        # if acc_without_partition > acc_with_partition: # post-pruning
        #     node.setLeaf(dataset["salary"].value_counts().argmax())
        return node

    def train(self,train_set=None):
        if train_set != None:
            self.train_set = train_set
        start_time = time.time()
        self.root = Node()
        self.root = self.TreeGenerate(self.train_set,self.train_set.columns.values[self.train_set.columns.values != "salary"],self.root)
        logger.info("Time: {:.2f}s".format(time.time()-start_time))
        print("Time: {:.2f}s".format(time.time()-start_time))

    def validation(self,validation_set=None):
        if validation_set != None:
            self.validation_set = validation_set
        acc = 0
        for i,row in self.validation_set.iterrows():
            p = self.root
            while p.label != "Leaf":
                if self.attr_dict[p.attr]: # discrete
                    p = p.branch[row[p.attr]]
                else: # continuous
                    p = p.branch["Smaller"] if row[p.attr] < p.branch_value else p.branch["Bigger"]
            if p.catagory == row["salary"]:
                acc += 1
        acc /= len(self.validation_set)
        return acc

    def test(self,test_set=None):
        if test_set != None:
            self.test_set = test_set
        acc = 0
        for i,row in self.test_set.iterrows():
            p = self.root
            while p.label != "Leaf":
                if self.attr_dict[p.attr]: # discrete
                    p = p.branch[row[p.attr]]
                else: # continuous
                    p = p.branch["Smaller"] if row[p.attr] < p.branch_value else p.branch["Bigger"]
            if p.catagory == row["salary"][:-1]: # be careful of "."
                acc += 1
        acc /= len(self.test_set)
        logger.info("Accurary: {:.2f}%".format(acc * 100))
        print("Accurary: {:.2f}%".format(acc * 100))
        return acc

In [5]:
dt = ID3(train_set=train_data,validation_set=validation_data,test_set=test_data,attr_dict=attr_dict)
dt.train()
dt.test()

100 - Create leaf:  <=50K
200 - Create leaf:  <=50K
300 - Create leaf:  <=50K
400 - Create leaf:  <=50K
500 - Create leaf:  <=50K
600 - Create leaf:  <=50K
700 - Create leaf:  <=50K
800 - Create leaf:  <=50K
900 - Create leaf:  <=50K
1000 - Create leaf:  <=50K
1100 - Create leaf:  <=50K
1200 - Create leaf:  <=50K
1300 - Create leaf:  <=50K
1400 - Create leaf:  <=50K
1500 - Create leaf:  <=50K
1600 - Create leaf:  <=50K
1700 - Create leaf:  <=50K
1800 - Create leaf:  >50K
1900 - Create leaf:  <=50K
2000 - Create leaf:  <=50K
2100 - Create leaf:  <=50K
2200 - Create leaf:  <=50K
2300 - Create leaf:  <=50K
2400 - Create leaf:  <=50K
2500 - Create leaf:  <=50K
2600 - Create leaf:  <=50K
2700 - Create leaf:  <=50K
2800 - Create leaf:  <=50K
2900 - Create leaf:  <=50K
3000 - Create leaf:  <=50K
3100 - Create leaf:  <=50K
3200 - Create leaf:  <=50K
3300 - Create leaf:  <=50K
3400 - Create leaf:  <=50K
3500 - Create leaf:  <=50K
3600 - Create leaf:  <=50K
3700 - Create leaf:  <=50K
3800 - Crea

0.8133407038879675