In [111]:
import functools
import logging
import time

import numpy as np
import pandas as pd
# NOTE: sklearn.cross_validation was removed in scikit-learn 0.20; on newer
# versions train_test_split lives in sklearn.model_selection.
from sklearn.cross_validation import train_test_split
from sklearn.metrics import accuracy_score


In [112]:
# Number of distinct class labels in the dataset.
# NOTE(review): presumably the ten digit classes 0-9 — confirm against train.csv.
total_class = 10

In [113]:
def log(func):
    """Decorator that logs the start, end and wall-clock duration of each call.

    Messages are emitted at DEBUG level through the root logger.
    ``functools.wraps`` keeps the decorated function's ``__name__`` and
    ``__doc__`` intact, so introspection and tracebacks still show the real
    function instead of ``wrapper``.

    Args:
        func: the callable to instrument.

    Returns:
        A wrapper accepting the same arguments as ``func`` and returning
        whatever ``func`` returns. If ``func`` raises, the exception
        propagates unchanged and no "end" message is logged.
    """
    @functools.wraps(func)
    def wrapper(*args, **kwargs):
        start_time = time.time()
        # Lazy %-style arguments: the message is only formatted if DEBUG is enabled.
        logging.debug('start %s()', func.__name__)
        ret = func(*args, **kwargs)

        end_time = time.time()
        logging.debug('end %s(), cost %s seconds', func.__name__, end_time - start_time)

        return ret
    return wrapper

In [114]:
# Load the raw training CSV and split it into a label column and the remaining columns.
if __name__ == '__main__':
    # Let DEBUG records through the root logger; the @log decorator reports at DEBUG level.
    logger = logging.getLogger()
    logger.setLevel(logging.DEBUG)
    
    # NOTE(review): hard-coded absolute path on one developer's machine — the notebook
    # cannot be re-run elsewhere without editing this line.
    raw_data = pd.read_csv('/Users/guozhiqi-seven/Documents/Statistical Learning/Decision Tree/train.csv',header=0) 
    data = raw_data.values
    
    # Column 0 holds the labels; every remaining column goes into img.
    # img is not referenced again in the visible cells (a precomputed 'features.out' is loaded instead).
    img = data[:,1:]
    labels = data[:,0]   

In [115]:
# Use the already-binarized dataset directly (original note: cv2 under python3 is available).
# NOTE(review): 'features.out' is resolved relative to the current working directory;
# the code that produced it is not visible in this notebook.
features = np.loadtxt('features.out')

In [136]:
# Hold out 33% of the samples for testing (roughly a 2/3 train, 1/3 test split).
# random_state pins the shuffle so that re-running the notebook reproduces the
# same split, and therefore the same tree and accuracy.
train_features,test_features,train_labels,test_labels = train_test_split(
    features, labels, test_size=0.33, random_state=42)

In [137]:
class Tree(object):
    """A decision-tree node.

    A node whose ``node_type`` is ``'leaf'`` answers predictions with its
    ``Class``. Any other node looks up the sample's value for ``feature`` in
    ``dict`` and hands the sample on to the matching child node.
    """

    def __init__(self,node_type,Class=None,feature=None):
        """Store the node kind, its class label and its split feature index."""
        self.node_type, self.Class, self.feature = node_type, Class, feature
        # A plain dict is enough to hold the children: feature value -> child Tree.
        self.dict = dict()

    def add_tree(self,val,tree):
        """Register ``tree`` as the child reached when the split feature equals ``val``."""
        self.dict[val] = tree

    def predict(self,features):
        """Walk down from this node to a leaf and return that leaf's class.

        Raises KeyError when the sample carries a feature value for which no
        child was registered on the node being visited.
        """
        node = self
        while node.node_type != 'leaf':
            branch_key = features[node.feature]
            node = node.dict[branch_key]
        return node.Class

In [138]:
def calc_entropy(x):
    """Return the Shannon entropy, in bits, of a sequence of labels.

    H(X) = -sum_i p_i * log2(p_i), where p_i is the empirical frequency of
    the i-th distinct value in ``x``.

    Args:
        x: 1-D array-like of labels (numpy array, list, ...).

    Returns:
        float: the entropy; 0.0 for an empty or single-valued input.
    """
    values = np.asarray(x).ravel()
    # One pass yields the count of every distinct value, replacing a separate
    # boolean-mask scan of the whole array per value.
    _, counts = np.unique(values, return_counts=True)
    # Explicit float divisor: keeps true division under Python 2 as well.
    probs = counts / float(values.size)
    # "0.0 - ..." rather than unary minus so a pure set yields 0.0, not -0.0.
    return 0.0 - float(np.dot(probs, np.log2(probs)))

In [139]:
def calc_condition_entropy(x,y):
    """Return the conditional entropy H(y|x).

    For every distinct value of the feature ``x`` the entropy of the labels
    ``y`` restricted to the matching samples is computed with
    ``calc_entropy`` and weighted by the share of samples taking that value.

    x: 1-D numpy array of feature values
    y: numpy array of labels, aligned element-wise with x
    """
    distinct_values = set(x[pos] for pos in range(x.shape[0]))

    weighted_total = 0.0
    for value in distinct_values:
        matching_labels = y[x == value]
        subset_entropy = calc_entropy(matching_labels)
        share = float(matching_labels.shape[0]) / y.shape[0]
        weighted_total += share * subset_entropy

    return weighted_total
       

In [140]:
def calc_entropy_g(x,y):
    """Return the information gain g(D, A) = H(D) - H(D|A).

    x: feature values (A)
    y: labels (D)
    """
    return calc_entropy(y) - calc_condition_entropy(x, y)

In [141]:
@log
def train(train_set,train_label,features,epsilon):
    """Build a decision tree by delegating to ``recurse_train``.

    This is the entry point wrapped by ``@log``; the recursive calls inside
    ``recurse_train`` call that function directly, so only this outer call is
    logged and timed.
    """
    root = recurse_train(train_set, train_label, features, epsilon)
    return root

In [142]:
def recurse_train(train_set,train_label,features,epsilon):
    """Recursively build an ID3 decision tree (split criterion: information gain).

    Compared with the first version this no longer reads the ``total_class``
    global: the majority class is taken from the labels actually present, so
    arbitrary label values work. It also runs unchanged on Python 2 and 3
    (no ``xrange``, no list-returning ``filter``).

    Args:
        train_set: 2-D array-like, one row per sample and one column per feature.
        train_label: 1-D array-like holding one label per row of ``train_set``.
        features: iterable of column indices that may still be used for splitting.
        epsilon: minimum information gain required to split a node; when the
            best gain is below it, a leaf is returned instead.

    Returns:
        Tree: a ``'leaf'`` node carrying a class, or an ``'internal'`` node
        with one child per observed value of the chosen feature.

    Raises:
        ValueError: if ``train_label`` is empty.
    """
    LEAF = 'leaf'
    INTERNAL = 'internal'

    train_set = np.asarray(train_set)
    train_label = np.asarray(train_label)
    features = list(features)

    if train_label.shape[0] == 0:
        raise ValueError('recurse_train() needs at least one training sample')

    # Step 1: every sample belongs to the same class C_k -> leaf for that class.
    classes, counts = np.unique(train_label, return_counts=True)
    if len(classes) == 1:
        return Tree(LEAF, Class=classes[0])

    # Step 2: no feature left to split on -> leaf predicting the majority class.
    # np.unique returns sorted values and argmax the first maximum, so ties go
    # to the smallest label, as before.
    max_class = classes[np.argmax(counts)]
    if len(features) == 0:
        return Tree(LEAF, Class=max_class)

    # Step 3: pick the feature with the largest information gain g(D, A).
    max_feature = None
    max_gda = 0
    H_D = calc_entropy(train_label)

    for feature in features:
        gda = H_D - calc_condition_entropy(train_set[:, feature], train_label)

        if gda > max_gda:
            max_gda = gda
            max_feature = feature

    # Step 4: no feature gives a positive gain, or the best gain is below the
    # threshold -> leaf. (max_feature used to default to column 0 even when 0
    # was not a candidate feature.)
    if max_feature is None or max_gda < epsilon:
        return Tree(LEAF, Class=max_class)

    # Step 5: one non-empty subset, and one subtree, per value a_i of the best feature A_g.
    sub_features = [f for f in features if f != max_feature]
    tree = Tree(INTERNAL, feature=max_feature)

    feature_col = train_set[:, max_feature]

    for feature_value in np.unique(feature_col):
        # Boolean mask instead of a Python loop collecting row indices.
        mask = feature_col == feature_value
        if not mask.any():
            # Only possible for NaN, which never compares equal to itself.
            continue

        sub_tree = recurse_train(train_set[mask], train_label[mask], sub_features, epsilon)
        tree.add_tree(feature_value, sub_tree)

    return tree
    

In [143]:
@log
def predict(test_set,tree):
    """Classify every sample of ``test_set`` with ``tree``.

    Each element obtained by iterating over ``test_set`` is passed to
    ``tree.predict``; the predictions are returned, in order, as a numpy array.
    """
    return np.array([tree.predict(sample) for sample in test_set])

In [144]:
# Settings for this run, named instead of being inlined as magic numbers.
# NOTE(review): 784 presumably corresponds to 28x28 pixel images — confirm
# against the column count of features.out.
NUM_FEATURES = 784   # candidate feature (column) indices 0 .. NUM_FEATURES-1
EPSILON = 0.1        # minimum information gain required to split a node

tree = train(train_features,train_labels,list(range(NUM_FEATURES)),EPSILON)
test_predict = predict(test_features,tree)
score = accuracy_score(test_labels,test_predict)

# A parenthesised single-argument print runs under both Python 2 and Python 3;
# the message spelling ("accruacy socre") is corrected as well.
print("The accuracy score is %s" % score)

DEBUG:root:start train()
DEBUG:root:end train(), cost 198.132023096 seconds
DEBUG:root:start predict()
DEBUG:root:end predict(), cost 0.181843996048 seconds


The accruacy socre is  0.859812409812
