In [39]:
# ===============================================================
#
#                         Tree Definition
#
# ===============================================================
class TreeNode:
    """A decision-tree node: an internal split on one attribute, or a leaf.

    Attributes:
        attr: name of the attribute tested at this node; for a leaf it is
            overwritten with ``str(label)`` by :meth:`setLeaf`.
        son: dict mapping an attribute value to the subtree stored for it.
        isLeaf: True once :meth:`setLeaf` has been called.
        leafLabel: label stored by :meth:`setLeaf`, otherwise None.
    """

    def __init__(self, attrName):
        self.leafLabel = None
        self.isLeaf = False
        self.son = {}
        self.attr = attrName

    def getAttr(self):
        """Return the attribute name held by this node."""
        return self.attr

    def setAttr(self, attrName):
        """Replace the attribute name held by this node."""
        self.attr = attrName

    def addSon(self, attrVal, subTree):
        """Store ``subTree`` as the child reached through ``attrVal``.

        An existing child under the same value is replaced.
        """
        self.son[attrVal] = subTree

    def getAllSon(self):
        """Return the child subtrees as a new list, in insertion order."""
        return [child for child in self.son.values()]

    def getSonByAttrVal(self, attrVal):
        """Return the child stored under ``attrVal``.

        Raises:
            KeyError: if no child was added for ``attrVal``.
        """
        return self.son[attrVal]

    def getCurrentAttrDivision(self):
        """Return the attribute values that currently have a child, as a list."""
        return [value for value in self.son]

    def setLeaf(self, label):
        """Turn this node into a leaf carrying ``label``."""
        self.attr = str(label)
        self.leafLabel = label
        self.isLeaf = True


class Tree:
    """A (sub)tree identified by its root node; ``root`` is None for a null tree."""

    def __init__(self, rootNode):
        self.root = rootNode

    def add(self, nextAttrName, currentVal):
        """Attach a new one-node subtree under the root.

        Args:
            nextAttrName: attribute name given to the new child node.
            currentVal: value of the root's attribute that leads to the child.

        On a null tree nothing is added and a warning is printed.
        """
        # Identity test: `== None` would call a user-defined __eq__ on the root.
        if self.root is None:
            print("warning: attempt to add a subtree to a null tree")
            return
        subTree = Tree(TreeNode(nextAttrName))
        self.root.addSon(currentVal, subTree)

    def printTree(self):
        """Print the tree to stdout, depth first.

        A leaf prints its label. An internal node prints its attribute, then
        one ``>attr`` line per non-null child, then each child's own printout,
        then a separator line. A null tree prints nothing.
        """
        if self.root is None:
            return
        if self.root.isLeaf:
            print("Reach Leaf:%s" % self.root.leafLabel)
            return
        print("Node %s has sons:" % (self.root.getAttr()))

        subtrees = self.root.getAllSon()
        # First pass: list the direct children; null subtrees have nothing to show.
        for subtree in subtrees:
            if subtree.root is None:
                continue
            print(">" + subtree.root.getAttr())
        # Second pass: recurse (a null subtree returns immediately).
        for subtree in subtrees:
            subtree.printTree()
        if not subtrees:
            print(">None")
        print("--------------")

    def deleteTree(self):
        """Turn this tree into a null tree by dropping its root."""
        self.root = None

In [40]:
# ===============================================================
#
#                         Construct Tree v2
#
# ===============================================================
import math
def Ent(Dataset):
    """Return the base-2 Shannon entropy of the ``'label'`` column.

    Args:
        Dataset: pandas DataFrame with a ``'label'`` column.

    Returns:
        ``-sum(p * log2(p))`` over the labels present in ``Dataset``, where
        ``p`` is the label's share of all rows. An empty frame gives 0.
    """
    nrow = float(len(Dataset))
    if nrow == 0:
        return 0
    # One counting pass for all labels. Every counted label occurs at least
    # once, so p > 0 and log(p) is always defined; no small-ratio cutoff is
    # applied, because such a cutoff would discard rare classes in large frames.
    # Missing labels are not counted but still contribute to nrow.
    counts = Dataset['label'].value_counts().tolist()
    return -sum((count / nrow) * math.log(count / nrow, 2) for count in counts)

def Gain(Dataset, attrName):
    """Return the information gain of splitting ``Dataset`` on ``attrName``.

    Only the attribute values 0 and 1 are considered: the entropy of each of
    those two subsets, weighted by its share of the rows, is subtracted from
    the entropy of the whole frame.

    Args:
        Dataset: pandas DataFrame with a ``'label'`` column and an
            ``attrName`` column.
        attrName: name of the column to split on.
    """
    total = float(len(Dataset))
    remaining = Ent(Dataset)
    for branchValue in (0, 1):
        branch = Dataset[Dataset[attrName] == branchValue]
        remaining -= Ent(branch) * float(len(branch)) / total
    return remaining

def attrSelection(Dataset, Attributes, cheatMode=False):
    """Pick the attribute with the largest information gain.

    Args:
        Dataset: pandas DataFrame passed on to ``Gain``.
        Attributes: candidate attributes; each is converted with ``str()``
            to obtain its column name.
        cheatMode: when True, gains are computed on a random 30% sample of
            ``Dataset`` instead of the whole frame.

    Returns:
        Tuple ``(bestAttribute, gainOfBestAttribute)``.
    """
    if cheatMode == True:
        Dataset = Dataset.sample(frac=0.3)

    scoreOf = lambda candidate: Gain(Dataset, str(candidate))

    bestAttr = max(Attributes, key=scoreOf)
    return (bestAttr, scoreOf(bestAttr))
# =========================================================
# =========================================================

def identicalRows(df):
    """Return True when all rows of ``df`` agree on every column but the first.

    The first column is ignored. Rows are compared pairwise with their
    successor, stopping at the first mismatch; a frame with zero or one row
    yields True.
    """
    def rowTail(position):
        # Row `position` without its first column.
        return df.iloc[position, 1:]

    return all(
        rowTail(position).equals(rowTail(position + 1))
        for position in range(len(df) - 1)
    )


def treeGeneration(Dataset, Attributes, minGain=0.003):
    """Recursively build a decision tree over binary (0/1) attributes.

    Args:
        Dataset: pandas DataFrame with a ``'label'`` column plus the
            attribute columns.
        Attributes: list of candidate attribute identifiers. Each identifier
            is used both as a column position (``Dataset.iloc[:, ...]``) and,
            through ``str()``, as a column name, so the frame layout must make
            the two agree.
        minGain: pruning threshold; when the best attribute's information
            gain is below it, the node becomes a leaf.

    Returns:
        A ``Tree`` whose root is either a leaf or a node with one subtree for
        attribute value 0 and one for value 1.

    Progress lines (majority label, selected attribute, timing) are printed
    to stdout for every node that is not pure.
    """
    labels = Dataset['label']

    # 1. Create an empty node.
    node = TreeNode(attrName="unset")

    # 2. Boundary case 1: every sample belongs to the same class C,
    #    so the node becomes a leaf of class C.
    if len(set(labels)) == 1:
        node.setLeaf(label=labels.iloc[0])
        return Tree(rootNode=node)

    # 3. Boundary case 2.
    # Majority class of the dataset; fall back to the first label when the
    # mode lookup fails.
    try:
        C = pd.Series.mode(labels)[0]
    except (KeyError, IndexError):
        C = labels.iloc[0]
    print("most label=%d"%C)

    # No attribute left, or all samples take the same values on the remaining
    # attributes: label the node with the majority class.
    # Every attribute column is checked. identicalRows is not used here
    # because it skips the first column of the frame it receives, which would
    # leave one attribute out of the comparison.
    if len(Attributes) == 0 or bool(
            (Dataset.iloc[:, list(Attributes)].nunique(dropna=False) <= 1).all()):
        node.setLeaf(label=C)
        return Tree(rootNode=node)

    # 4. Choose the best splitting attribute.
    t1 = time.time()
    optAttr, gainVal = attrSelection(Dataset, Attributes, cheatMode=False)
    t2 = time.time()
    print("selected attr: %d with gain=%f,   size:%d, taking %f secs" % (optAttr,gainVal, len(Dataset), t2 - t1))
    sys.stdout.flush()
    node.setAttr(str(optAttr))

    # 4.5 Pruning: an information gain this small means the split barely
    #     helps, so stop here with a majority-class leaf.
    if gainVal < minGain:
        node.setLeaf(label=C)
        return Tree(rootNode=node)

    # 5. One branch per value of the chosen attribute. The chosen attribute
    #    is removed from the candidates handed to both branches; the list is
    #    built once because the recursion never mutates its argument.
    reducedAttributes = list(Attributes)
    reducedAttributes.remove(optAttr)
    for optAttrVal in [0, 1]:
        subDataset = Dataset[Dataset[str(optAttr)] == optAttrVal]
        if len(subDataset) == 0:
            # No sample takes this value: add a leaf with the majority class.
            leafNode = TreeNode(attrName="")
            leafNode.setLeaf(label=C)
            node.addSon(attrVal=optAttrVal, subTree=Tree(rootNode=leafNode))
        else:
            sonTree = treeGeneration(Dataset=subDataset,
                                     Attributes=reducedAttributes,
                                     minGain=minGain)
            node.addSon(attrVal=optAttrVal, subTree=sonTree)

    return Tree(rootNode=node)

In [45]:
# ===============================================================
#
#                      Train Data and Test Data
#
# ===============================================================
import sys
import copy
import time
import scipy.io
import random
# Read the MATLAB file and pull out the two arrays used below:
# 'doclabel' (document labels) and 'wordMat' (word matrix).
rawData = scipy.io.loadmat('Sogou_data/Sogou_webpage.mat')
docLabels, wordVectors = rawData['doclabel'], rawData['wordMat']
# docCnt: number of label entries; wordCnt: size of wordMat's second axis.
docCnt, wordCnt = len(docLabels), wordVectors.shape[1]


import numpy as np
import pandas as pd
# Build one frame: the 'label' column first, followed by one column per word
# named '1' .. str(wordCnt). Each frame is named after the data it holds.
df_docLabels = pd.DataFrame(data=docLabels, columns=['label'])
df_wordVectors = pd.DataFrame(wordVectors, columns=[str(x) for x in range(1, 1 + wordCnt)])
cleanData = pd.concat([df_docLabels, df_wordVectors], axis=1)


# Number of distinct labels.
labelCnt = len(set(cleanData['label']))


# Small random training sample (0.8% of the rows); every other row is test data.
train = cleanData.sample(frac=0.008)
test = cleanData.drop(train.index)

print("train size=%d"%len(train))
print("test size=%d"%len(test))





train size=115
test size=14285


In [57]:
import random
import pickle

# Use 80% of the data for training and keep the remaining 20% for testing.
# NOTE: this rebinds `train` and `test`, the same names the "Train Model"
# cell further down reads.
train = cleanData.sample(frac=0.8)
test = cleanData.drop(train.index)

# Ten 20% subsamples of `train`, one per seed 1..10. The seed is passed as
# random_state because random.seed() does not influence DataFrame.sample
# (pandas does not draw from Python's `random` module).
trainSubsets = [train.sample(frac=0.2, random_state=seed) for seed in range(1, 11)]
(train1, train2, train3, train4, train5,
 train6, train7, train8, train9, train10) = trainSubsets

# Persist each subsample as train<k>.dataframe.pkl; `with` closes the file
# even if pickling fails.
for subsetId, subset in enumerate(trainSubsets, start=1):
    with open('train%d.dataframe.pkl' % subsetId, 'wb') as fp:
        pickle.dump(subset, fp)



In [46]:
# ===============================================================
#
#                         Train Model
#
# ===============================================================

# Build the tree on `train` using attributes 1..1200, time the run,
# then print the resulting tree.
candidateAttributes = [attrId for attrId in range(1, 1201)]
trainBegin = time.time()
dct1 = treeGeneration(Dataset=train, Attributes=candidateAttributes)
trainEnd = time.time()
trainSeconds = trainEnd - trainBegin
print("train finished, taking %f secs" % trainSeconds)
dct1.printTree()

most label=4
selected attr: 473 with gain=0.438428,   size:115, taking 6.540391 secs
most label=7
selected attr: 368 with gain=0.480262,   size:96, taking 5.942796 secs
most label=1
selected attr: 384 with gain=0.391291,   size:82, taking 5.610421 secs
most label=1
selected attr: 26 with gain=0.343265,   size:70, taking 5.322189 secs
most label=1
selected attr: 51 with gain=0.480147,   size:42, taking 4.039757 secs
most label=9
selected attr: 13 with gain=0.618110,   size:28, taking 3.240665 secs
most label=9
selected attr: 192 with gain=0.571212,   size:16, taking 2.389350 secs
most label=9
selected attr: 5 with gain=0.466663,   size:13, taking 2.065490 secs
most label=4
selected attr: 4 with gain=0.918296,   size:6, taking 1.914090 secs
most label=5
selected attr: 8 with gain=1.000000,   size:2, taking 1.378168 secs
most label=4
selected attr: 39 with gain=1.000000,   size:4, taking 1.434312 secs
most label=6
selected attr: 1 with gain=0.918296,   size:3, taking 1.490462 secs
most la

In [47]:
# ===============================================================
#
#                         Predictor Definition
#
# ===============================================================
def predict(decisionTree, Dataset):
    """Classify every row of ``Dataset`` with ``decisionTree``.

    For each row the tree is walked from the root: at every non-leaf node the
    row's value for the node's attribute selects the child subtree, until a
    leaf is reached.

    Args:
        decisionTree: tree object exposing ``root``.
        Dataset: pandas DataFrame containing the attribute columns named by
            the tree's nodes.

    Returns:
        List with one leaf label per row, in row order.

    Raises:
        KeyError: if a row holds a value for which a node has no child.
    """
    labelsOut = []
    for position in range(len(Dataset)):
        sample = Dataset.iloc[position]
        cursor = decisionTree.root
        while not cursor.isLeaf:
            branchValue = sample[cursor.getAttr()]
            cursor = cursor.getSonByAttrVal(branchValue).root
        labelsOut.append(cursor.leafLabel)
    return labelsOut


# Predict the held-out rows and report the share of correct predictions.
res = predict(dct1, test)
correct = list(test['label'].values)
matches = [guess == truth for guess, truth in zip(res, correct)]
accuracy = sum(matches) / len(correct)
print(accuracy)

0.421841092055


ImportError: No module named 'cPickle'

In [None]:
import json
# Bare reference to json.dump: the function is looked up but never called,
# so this cell serializes nothing.
json.dump