In [11]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline

from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split

from collections import Counter
import math
from math import log

import sys
import pprint


def create_data():
    with open('../Data/zhengqi/zhengqi_train.txt') as fr:
         dataset = pd.read_csv(fr, sep='\t')
    
    '''
    with open('zhengqi_test.txt') as fr_test:
    data_test = pd.read_csv(fr_test, sep='\t')
    '''
    
    return dataset

In [12]:
def print_node(node, depth=1):  
    if node.splitting_name is None:
        print(depth, (node.splitting_name, node.splitting_value, node.c))
    else:
        print(depth, (node.splitting_name, node.splitting_value))
        for ch in node.child:
            print_node(ch, depth+1)

In [13]:
class Node:
    def __init__(self, splitting_name=None, splitting_value=None, c=None):
        '''
        leaf: splitting_name --> None
              splitting_value --> None
              child --> None
              c --> Not None
        -------------------------------
        others: splitting_name --> Not None
                splitting_value --> Not None
                child --> Not None
                c --> Not None
        '''
        self.splitting_name = splitting_name
        self.splitting_value = splitting_value
        self.child = []
        self.c = c
    
    def add_node(self, node):
        self.child.append(node)
        
    def predict(self, test_data):
        if self.splitting_value != None:    # not the leaf nodes
            if self.splitting_value < test_data[self.splitting_name]:
                return self.child[0].predict(test_data)
            else:
                return self.child[1].predict(test_data)
        else:
            return round(self.c, 3)

In [14]:
class DTree:
    def __init__(self):
        self.tree = Node()
        
    def mse(self, r, c):
        y = np.array(r['target'])
        error = 0
        for i in y:
            error += np.power((i - c), 2)

        return error
    
    def get_best_c(self, r):
        y = r['target']
        return np.mean(y)
    
    def train(self, dataset, node, max_depth, depth):
        train_data = dataset.iloc[:, 0:-1]
        
        if train_data.shape[0] == 0:
            return
        
        child1 = Node()
        child2 = Node()
        node.add_node(child1)
        node.add_node(child2)

        error_sum = float('inf')
        for feature_name in train_data.columns.values:
            for feature_value in train_data[feature_name]:
                r1_list = [rows for index, rows in dataset.iterrows() 
                                if rows[feature_name] < feature_value]
                
                r2_list = [rows for index, rows in dataset.iterrows() 
                                if rows[feature_name] >= feature_value]
                
                r1 = pd.DataFrame(r1_list, columns=dataset.columns)
                c1 = self.get_best_c(r1)
                error1 = self.mse(r1, c1)
                
                r2 = pd.DataFrame(r2_list, columns=dataset.columns)
                c2 = self.get_best_c(r2)
                error2 = self.mse(r2, c2)
                
                if (error1 + error2) < error_sum:
                    error_sum = error1 + error2
                    node.splitting_name = feature_name
                    node.splitting_value = feature_value
                    child1.c = c1
                    child2.c = c2
        
        #print(depth, node.splitting_name, node.splitting_value)
        r1 = r1.drop(node.splitting_name, axis=1)
        r2 = r2.drop(node.splitting_name, axis=1)
        
        if depth == max_depth:
            return
             
        self.train(r1, child1, max_depth, depth+1)
        self.train(r2, child2, max_depth, depth+1)
        
        return 

            
    def fit(self, dataset, max_depth):
        initial_depth = 1
        self.train(dataset, self.tree, max_depth, initial_depth)
        
    def pruning(self):
        pass
    
    def predict(self, test_data):
        test_data = test_data.drop('target', axis=1)
        labels = test_data.columns.values
        result = []
        for index, row in test_data.iterrows():
            result.append(self.tree.predict(row))
            
        return result  

In [15]:
dataset = create_data()
rf = DTree()
max_depth = 8
rf.fit(dataset.head(50), max_depth)
print('=============================')
print_node(rf.tree)

1 V0 -0.14300000000000002
2 V10 0.073
3 V1 0.602
4 V2 1.5
5 V3 0.40700000000000003
6 V4 0.452
7 V5 -0.9009999999999999
8 V6 -1.8119999999999998
5 V3 -1.6
6 V4 0.631
7 V5 -1.757
8 V6 -0.37
4 V2 1.66
5 V3 -1.131
6 V4 0.114
7 V5 -0.9279999999999999
8 V6 0.221
3 V1 0.642
4 V4 0.332
5 V5 -1.579
6 V6 -0.02
7 V7 -0.9159999999999999
8 V9 -0.821
2 V8 -0.812
3 V27 -0.032
4 V1 -0.486
5 V37 0.315
6 V31 -0.467
7 V4 0.04
8 V2 -0.304
8 V2 0.28300000000000003
7 V2 0.235
8 V3 0.435
1 ('V0', -0.14300000000000002)
2 ('V10', 0.073)
3 ('V1', 0.602)
4 ('V2', 1.5)
5 ('V3', 0.40700000000000003)
6 (None, None, nan)
6 ('V4', 0.452)
7 (None, None, nan)
7 ('V5', -0.9009999999999999)
8 (None, None, nan)
8 ('V6', -1.8119999999999998)
9 (None, None, nan)
9 (None, None, 0.175)
5 ('V3', -1.6)
6 (None, None, nan)
6 ('V4', 0.631)
7 (None, None, nan)
7 ('V5', -1.757)
8 (None, None, nan)
8 ('V6', -0.37)
9 (None, None, nan)
9 (None, None, 0.977)
4 ('V2', 1.66)
5 (None, None, 0.11599999999999999)
5 ('V3', -1.131)
6 (None, N

In [14]:
test_data = dataset.iloc[116:121]
result = rf.predict(test_data)
print(result)
print(np.array(test_data['target']))

[-0.802, 0.96, 0.116, 0.116, 0.116]
[-0.072  0.486  0.311  0.628  0.944]


In [None]:
dataset.head()

In [21]:
for i in range(5):
    print(i)

0
1
2
3
4
