In [2]:
import pandas as pd
import sys
import math
import numpy as np
import graphviz
from IPython.display import display, HTML

def table_to_file(table, filename):
    """Write ``table`` to ``filename`` as tab-separated text.

    The index is not written (``index=False``). ``table`` must provide a
    pandas-style ``to_csv`` method.
    """
    table.to_csv(filename, index=False, sep="\t")

In [5]:
class Node:
    """One node of the binary tree: either an internal split or a leaf."""

    def __init__(self, nodeid, attribute, attr_type, threshold):
        # Identity and split definition, exactly as supplied by the caller.
        self.id = nodeid
        self.attr = attribute
        self.attr_type = attr_type
        self.thres = threshold

        # Children stay None until a builder attaches sub-trees.
        self.left = None
        self.right = None

        # A node is not a leaf until a builder flags it as one.
        self.leaf = False

        # Per-node values filled in after construction.
        self.predict = None
        self.corr = None
        self.count = None


def calc_pearson_corr(df, target_attrs):
    """Return the Pearson correlation between two columns of ``df``.

    ``target_attrs`` is indexed at positions 0 and 1 to obtain the two
    column names; any further entries are ignored.
    """
    first_name, second_name = target_attrs[0], target_attrs[1]
    return df[first_name].corr(df[second_name], method='pearson')

def corr_gain(df, parent_corr, parent_N, attribute, attr_type, target_attrs, threshold):
    """Score a candidate split of ``df`` on ``attribute`` at ``threshold``.

    The rows are divided into two subsets:

    * ``attr_type == 'continuous'``: rows with value ``>= threshold`` versus
      rows with value ``< threshold`` or a null value.
    * any other ``attr_type``: rows equal to ``threshold`` versus rows not
      equal to it.

    For each subset the score is
    ``(subset correlation - parent_corr) * subset rows / parent_N``; the
    larger of the two scores is returned.

    ``np.max`` propagates NaN, so the result is NaN whenever either subset's
    correlation is NaN (presumably when a subset is too small or constant to
    correlate -- verify against the data).
    """
    column = df[attribute]
    if attr_type == 'continuous':
        matched_mask = column >= threshold
        rest_mask = (column < threshold) | column.isnull()
    else:
        matched_mask = column == threshold
        rest_mask = column != threshold

    weighted_gains = []
    for subset in (df[matched_mask], df[rest_mask]):
        corr_delta = calc_pearson_corr(subset, target_attrs) - parent_corr
        weighted_gains.append(corr_delta * subset.shape[0] / parent_N)

    return np.max(weighted_gains)

def select_threshold_cont(df, parent_corr, parent_N, attribute, attr_type, predict_attr):
    """Find the cut point of a continuous attribute with the highest corr_gain.

    Parameters
    ----------
    df : DataFrame holding ``attribute`` and the columns named in ``predict_attr``.
    parent_corr, parent_N : correlation and row count of the node being split;
        passed through to ``corr_gain``.
    attribute : name of the column to split on.
    attr_type : passed through to ``corr_gain`` to select the split rule.
    predict_attr : the pair of target column names passed to ``corr_gain``.

    Returns
    -------
    (thres_val, gain_exist) : the best threshold and whether any candidate
        produced a comparable gain. When ``gain_exist`` is False,
        ``thres_val`` is the placeholder 0.
    """
    # Distinct non-null values in ascending order. Nulls are dropped because
    # NaN cannot act as a cut point and NaN entries make sorting unreliable.
    values = sorted(set(df[attribute].dropna().tolist()))

    max_ig = float("-inf")
    gain_exist = False
    thres_val = 0
    # The largest value is not tried, the same candidate range as the
    # categorical search uses.
    for thres in values[:-1]:
        ig = corr_gain(df, parent_corr, parent_N, attribute, attr_type, predict_attr, thres)
        # A NaN gain compares False here, so such candidates are skipped.
        if ig > max_ig:
            gain_exist = True
            max_ig = ig
            thres_val = thres

    if not gain_exist:
        print('No corr gain - (attr:{})'.format(attribute))
    return thres_val, gain_exist

def select_threshold_catg(df, parent_corr, parent_N, attribute, attr_type, predict_attr):
    """Find the value of a categorical attribute with the highest corr_gain.

    Every distinct value of ``attribute`` except the last one in sorted order
    is scored with ``corr_gain``.

    Returns ``(best_value, found)``. ``found`` is False when no candidate
    produced a comparable gain, in which case ``best_value`` is the
    placeholder 0 and a message is printed.
    """
    # NOTE(review): the last sorted value is never scored; confirm that is
    # intended when an attribute has more than two categories.
    candidates = sorted(set(df[attribute].tolist()))

    best_gain = float("-inf")
    best_value = 0
    found = False
    for candidate in candidates[:-1]:
        gain = corr_gain(df, parent_corr, parent_N, attribute, attr_type, predict_attr, candidate)
        # A NaN gain compares False, so it never replaces the current best.
        if gain > best_gain:
            found = True
            best_gain = gain
            best_value = candidate

    if not found:
        print('No corr gain - (attr: {})'.format(attribute))
    return best_value, found

def select_threshold(df, parent_corr, parent_N, attribute, attr_type, predict_attr):
    """Dispatch to the threshold search that matches ``attr_type``.

    ``'continuous'`` attributes use ``select_threshold_cont``; every other
    type uses ``select_threshold_catg``. All arguments are forwarded
    unchanged.

    Returns
    -------
    (thres_val, gain_exist) : as returned by the selected search function.
    """
    if attr_type == 'continuous':
        thres_val, gain_exist = select_threshold_cont(
            df, parent_corr, parent_N, attribute, attr_type, predict_attr)
    else:
        thres_val, gain_exist = select_threshold_catg(
            df, parent_corr, parent_N, attribute, attr_type, predict_attr)
    return thres_val, gain_exist

def choose_attr(df, attributes, attr_Types, predict_attr):
    """Pick the attribute and threshold whose split gives the highest corr_gain.

    Parameters
    ----------
    df : DataFrame for the node being split.
    attributes : iterable of candidate column names.
    attr_Types : table indexed by attribute name; the first value of each
        row is read as that attribute's type (e.g. ``'continuous'``).
    predict_attr : the pair of target column names whose correlation is tracked.

    Returns
    -------
    (best_attr, threshold) : ``best_attr`` is None (and ``threshold`` is 0)
        when no attribute produced a usable split.
    """
    parent_corr = calc_pearson_corr(df, predict_attr)
    parent_N = df.shape[0]
    max_info_gain = float("-inf")
    best_attr = None
    threshold = 0
    for attr in attributes:
        attr_type = attr_Types.loc[attr].values[0]
        thres, gain_exist = select_threshold(
            df, parent_corr, parent_N, attr, attr_type, predict_attr)
        if not gain_exist:
            continue

        # Re-score the chosen threshold so attributes can be compared with
        # each other on the same gain measure.
        ig = corr_gain(df, parent_corr, parent_N, attr, attr_type, predict_attr, thres)
        if ig > max_info_gain:
            max_info_gain = ig
            best_attr = attr
            threshold = thres

    return best_attr, threshold

def _make_leaf(node_id, df, predict_attr):
    """Create a leaf Node recording the correlation and row count of ``df``."""
    leaf = Node(node_id, None, None, None)
    leaf.leaf = True
    leaf.corr = calc_pearson_corr(df, predict_attr)
    leaf.count = df.shape[0]
    return leaf


def build_tree(df, cols, attr_types, predict_attr, max_leaf_size=10):
    """Recursively build the tree for ``df`` and return its root Node.

    Parameters
    ----------
    df : DataFrame with the rows belonging to this node.
    cols : candidate attribute names to split on.
    attr_types : table indexed by attribute name; the first value of each
        row is read as the attribute's type.
    predict_attr : the pair of target column names whose correlation is tracked.
    max_leaf_size : a node with this many rows or fewer becomes a leaf
        without attempting a split (default 10, the previously fixed value).

    Notes
    -----
    Node ids come from the module-level ``nodeid`` counter, which is
    incremented on every call; it must be defined (e.g. set to 0) before the
    first call.

    ``tree.left`` receives the rows that satisfy the split test (``>=`` for
    continuous attributes, ``==`` otherwise); ``tree.right`` receives the
    remaining rows, including nulls for continuous attributes.
    """
    global nodeid
    nodeid = nodeid + 1
    # Capture this node's id now: the recursive calls below advance the counter.
    node_id = nodeid

    # Leaf condition: the node is small enough that it is not split further.
    if df.shape[0] <= max_leaf_size:
        return _make_leaf(node_id, df, predict_attr)

    best_attr, threshold = choose_attr(df, cols, attr_types, predict_attr)
    if best_attr is None:
        print('best attr is none, search end.')
        return _make_leaf(node_id, df, predict_attr)

    best_attr_type = attr_types.loc[best_attr].values[0]
    tree = Node(node_id, best_attr, best_attr_type, threshold)
    tree.corr = calc_pearson_corr(df, predict_attr)
    tree.count = df.shape[0]
    print('expand tree : {} ~ threshold : {}'.format(best_attr, threshold))

    if best_attr_type == 'continuous':
        sub_1 = df[df[best_attr] >= threshold]
        sub_2 = df[(df[best_attr] < threshold) | (df[best_attr].isnull())]
    else:
        sub_1 = df[df[best_attr] == threshold]
        sub_2 = df[df[best_attr] != threshold]

    tree.left = build_tree(sub_1, cols, attr_types, predict_attr, max_leaf_size)
    tree.right = build_tree(sub_2, cols, attr_types, predict_attr, max_leaf_size)
    return tree
    
        
def print_tree(strlist, root, level):
    """Append Graphviz DOT statements describing the tree under ``root``.

    Parameters
    ----------
    strlist : list of str, extended in place with node and edge statements
        in pre-order (node, edge to left child, left subtree, edge to right
        child, right subtree).
    root : node with ``id``, ``leaf``, ``count``, ``corr``, ``left`` and
        ``right`` attributes; internal nodes also need ``attr``,
        ``attr_type`` and ``thres``.
    level : depth of ``root``; only forwarded (incremented) to recursive calls.

    Leaf labels show the row count and correlation. Internal labels show the
    split test, the row count and the correlation. Only the edges leaving
    the node with id 1 carry "True" / "False" head labels.
    """
    corr = round(root.corr, 2)
    if root.leaf:
        # "\\n" emits the two characters backslash-n, DOT's line break in a label.
        info = '{} [label="{}\\ncorr = {}", fillcolor="#4ca6e8"]'.format(
            root.id, root.count, corr)
    else:
        try:
            thres = round(root.thres, 2)
        except TypeError:
            # Non-numeric thresholds (e.g. string categories) are shown as-is.
            thres = root.thres
        operator = '>=' if root.attr_type == 'continuous' else '=='
        info = '{} [label="{} {} {}\\nn = {}\\ncorr = {}", fillcolor="#7bbeee"]'.format(
            root.id, root.attr, operator, thres, root.count, corr)

    strlist.append(info)

    for child, head_label in ((root.left, 'True'), (root.right, 'False')):
        if not child:
            continue
        # Each edge targets its own child.
        strlist.append('{} -> {}'.format(root.id, child.id))
        if root.id == 1:
            strlist.append(
                ' [labeldistance = 2.5, labelangle = 45, headlabel = "{}"]'.format(head_label))
        else:
            strlist.append(';')
        print_tree(strlist, child, level + 1)

In [7]:
# Assemble the DOT source for the tree and render it as this cell's output.
# `root` must already exist, so run the cell that calls build_tree first.
strlist = [
    'digraph Tree{',
    'node [shape=box, style="filled", color="black"]',
]
print_tree(strlist, root, 0)
strlist.append('}')

dot_str = '\n'.join(strlist)
graphviz.Source(dot_str)
        
        

NameError: name 'root' is not defined

In [6]:
# Build the correlation tree from the training data.
# build_tree numbers nodes by incrementing this global, so reset it before each build.
nodeid = 0
target_attrs = ['TPP', 'Breathrate']
# Attribute-type table indexed by attribute name; bound to the name the lines below read.
attr_types = pd.read_csv('attr_types.txt', sep='\t', index_col='attr_name')
# NOTE(review): `clean` is not defined in the visible cells; confirm that an
# earlier cell provides it.
df_train = clean('horseTrain.txt', attr_types, target_attrs)
attributes = ['K', 'Na', 'CL', 'HCO', 'Endotoxin', 'Anioingap', 'PLA2']
root = build_tree(df_train, attributes, attr_types, target_attrs)

NameError: name 'sstrlist' is not defined