In [89]:
class TreeNode:
    def __init__(self, attrName):
        self.attr=attrName
        self.son= dict()
        self.isLeaf=False
        self.leafLabel=None
        
    def getAttr(self):
        return self.attr
    
    def setAttr(self, attrName):
        self.attr=attrName
    
    def addSon(self, attrVal, subTree):
        self.son[attrVal]=subTree
    
    def getAllSon(self):
        return list(self.son.values())
    
    def getSonByAttrVal(self, attrVal):
        return self.son[attrVal]
    
    def getCurrentAttrDivision(self):
        return list(self.son.keys())
    
    def setLeaf(self, label):
        self.isLeaf=True
        self.leafLabel=label
        
        
class Tree:
    def __init__(self, rootNode):
        self.root=rootNode
        
    def add(self,nextAttrName,currentVal):
        if self.root==None:
            print("warning: attempt to add a subtree to a null tree")
            return
        subTree=Tree(TreeNode(nextAttrName))
        self.root.addSon(currentVal,subTree)
            
    def printTree(self):
        if self.root==None:
            return
        if self.root.isLeaf:
            print("Reach Leaf:%s"%self.root.leafLabel)
            return
        print("Node %s has sons:"%(self.root.getAttr()))
        
        for subtree in self.root.getAllSon():
            if subtree.root==None:
                continue
            print(">"+subtree.root.getAttr())
        subtrees=self.root.getAllSon()
        for subtree in subtrees:
            subtree.printTree()
        if subtrees==[]:
            print(">None")
        print("--------------")
        
    def deleteTree(self):
        self.root=None
        
t=Tree(TreeNode("纹理"))
t.add(nextAttrName="根蒂", currentVal=1)
t.add(nextAttrName="触感", currentVal=2)
t.add(nextAttrName="颜色", currentVal=3)
print(t.root.getCurrentAttrDivision())


s1=t.root.getSonByAttrVal(1)
s1.add(nextAttrName="leaf", currentVal=1)
s1.add(nextAttrName="色泽", currentVal=2)
s1.add(nextAttrName="leaf", currentVal=3)
s1.root.getSonByAttrVal(1).root.setLeaf("好瓜")
s1.root.getSonByAttrVal(3).root.setLeaf("坏瓜")
#t.printTree()

print("\n\n&&&&&&&&&&&&&&&&&&&&&&&&&")
s1.root.getSonByAttrVal(2).deleteTree()
#t.printTree()


[1, 2, 3]


&&&&&&&&&&&&&&&&&&&&&&&&&


In [48]:

import scipy.io
rawData = scipy.io.loadmat('Sogou_data/Sogou_webpage.mat')
docLabels   = rawData['doclabel']
wordVectors = rawData['wordMat']
docCnt=len(docLabels)
wordCnt =wordVectors.shape[1]


import numpy as np
import pandas as pd
df_wordVectors = pd.DataFrame(data=docLabels, columns=['label'])
df_docLabels   = pd.DataFrame(wordVectors,columns=[str(x) for x in range(1,1+wordCnt)])
cleanData = pd.concat([df_wordVectors, df_docLabels], axis=1)

labelCnt=len(set(cleanData['label']))


train = cleanData.sample(frac=0.8)
test = cleanData.drop(train.index)

print("train size=%d"%len(train))
print("test size=%d"%len(test))
train.shape
#train[train['label']==9]
#train

train size=11520
test size=2880


(11520, 1201)

In [90]:
def gini(Dataset):
    labels=list(set(Dataset['label']))
    nrow=float(len(Dataset))
    def ratioSqr(label):
        return (float(sum(Dataset['label']==label))/nrow)**2
    giniVal=1.0-sum(map(ratioSqr,labels))
    return giniVal
    
def giniIndex(Dataset, attrName):
    giniIndexVal=0.0
    nrow=float(len(Dataset))
    
    subDataset= Dataset[Dataset[attrName]==0]
    nrowSub = float(len(subDataset))
    giniIndexVal += gini(subDataset)/nrowSub
    
    subDataset= Dataset[Dataset[attrName]==1]
    nrowSub = float(len(subDataset))
    giniIndexVal += gini(subDataset)/nrowSub
    
    return giniIndexVal*nrow

def attrSelection(Dataset, Attributes, cheatMode=False):
    if cheatMode==True:
        Dataset=Dataset.sample(frac=0.3)
        
    def fucktion(attr):
        return giniIndex(Dataset,str(attr))
    optAttr=min(Attributes, key=fucktion)
    return optAttr

In [91]:
def identicalRows(df):
    for rowID in range(len(df)-1):
        if not df.iloc[rowID,1:].equals(df.iloc[rowID+1,1:]):
            return False
    return True
    

def treeGeneration(Dataset, Attributes):
    labels=train['label']
    
    # 1. 生成空节点node
    node = TreeNode(attrName="") 
    #tree = Tree(rootNode=node)
    
    # 2. Boundary Case 1
    if len(set(labels))==1: # 如果Dataset中的样本全属于同一类别C
        C = labels.iloc[0]
        node.sefLeaf(label=C)       # 将node标记成C类叶子节点
        tree = Tree(rootNode=node)
        return tree
        
    # 3. Boundary Case 2
    # 找到Dataset中样本数最多的类
    try:
        C=pd.Series.mode(labels)[0]
    except KeyError:
        C=labels.iloc[0]
    except IndexError:
        C=labels.iloc[0]
        
    if Attributes==[] or identicalRows(Dataset.iloc[:, Attributes]) :# 如果Attributes=∅,或者Dataset在Attributes的这几个属性上取值相同
        node.sefLeaf(label=C)       # 将node标记成叶子节点，其类别是Dataset中样本数最多的类
        tree = Tree(rootNode=node)
        return tree
    
    # 4. 选择最优划分属性: optAttr
    optAttr=attrSelection(Dataset, Attributes, cheatMode=True)
    print("selected attr: %d, batch size:%d"%(optAttr,len(Dataset)))
    # 5. 为optAttr属性的每一个值
    for optAttrVal in [0,1]:
        subDataset = Dataset[Dataset[str(optAttr)]==optAttrVal]
        if len(subDataset)==0: #为node加上一个分支，分支为叶子节点，叶子节点类别标记为Dataset中样本数最多的类
            leafNode = TreeNode(attrName="")
            leafNode.setLeaf(label=C) 
            leafNodeTree = Tree(rootNode=leafNode)
            node.addSon(attrVal=optAttrVal, subTree=leafNodeTree)
        else:
            sonTree=treeGeneration(Dataset=subDataset, Attributes=Attributes.remove(optAttr))
            node.addSon(attrVal=optAttrVal, subTree=sonTree)
        
    tree = Tree(rootNode=node) 
    return tree

In [92]:
treeGeneration(Dataset=train, Attributes=list(range(1,1+1200)))

selected attr: 977, batch size:11520


ValueError: Location based indexing can only have [integer, integer slice (START point is INCLUDED, END point is EXCLUDED), listlike of integers, boolean array] types

In [72]:
def gini_deprecated(Dataset):
    labels=list(set(Dataset['label']))
    nrow=float(len(Dataset))
    giniVal=1.0
    for label in labels:
        percent=float(sum(Dataset['label']==label))/nrow
        giniVal=giniVal-percent**2
    return giniVal

def gini(Dataset):
    labels=list(set(Dataset['label']))
    nrow=float(len(Dataset))
    def ratioSqr(label):
        return (float(sum(Dataset['label']==label))/nrow)**2
    giniVal=1.0-sum(map(ratioSqr,labels))
    return giniVal

def giniIndex_deprecated(Dataset, attrName):
    giniIndexVal=0.0
    nrow=float(len(Dataset))
    for attrVal in [0,1]:
        subDataset= Dataset[Dataset[attrName]==attrVal]
        nrowSub = float(len(subDataset))
        giniIndexVal += gini(subDataset)*nrow/nrowSub
    return giniIndexVal
    
def giniIndex(Dataset, attrName):
    giniIndexVal=0.0
    nrow=float(len(Dataset))
    
    subDataset= Dataset[Dataset[attrName]==0]
    nrowSub = float(len(subDataset))
    giniIndexVal += gini(subDataset)/nrowSub
    
    subDataset= Dataset[Dataset[attrName]==1]
    nrowSub = float(len(subDataset))
    giniIndexVal += gini(subDataset)/nrowSub
    
    return giniIndexVal*nrow

def attrSelection_deprecated(Dataset, Attributes):
    minVal= 5.0
    optAttr = None
    for attr in Attributes:
        val=giniIndex(Dataset, str(attr))
        if val<minVal:
            minVal=val
            optAttr=attr
    return (optAttr,minVal)

def attrSelection(Dataset, Attributes):
    def fucktion(attr):
        return giniIndex(Dataset,str(attr))
    optAttr=min(Attributes, key=fucktion)
    return optAttr

import time
t1=time.time()
s=attrSelection(train,[1,2,3,4])
t2=time.time()
print((t2-t1)*1200)
print(s)

# 一次变量选择需要4分钟！！！！！


572.7278709411621
1
csdfsdfsdfsdfdsfgds
0.05815553665161133
11520
0.07921242713928223
11520


In [495]:
def identicalRows(df):
    for rowID in range(len(df)-1):
        if not df.iloc[rowID,1:].equals(df.iloc[rowID+1,1:]):
            return False
    return True


lst=np.random.randn(1, 5)
df2 = pd.DataFrame(lst)
df3 = pd.DataFrame(lst)

df = df2.append(df3)
df = df.append(pd.DataFrame(lst))
df = df.append(pd.DataFrame(np.random.randn(1, 5)))

identicalRows(df)

False

In [30]:
# 快速找最小值
from functools import reduce
import numpy as np
import time
import math

l=np.random.rand(1,1000000)*100
l=(l.tolist()[0])

def fuck(x):
    return math.sin((x+10)**2)
    
    
mina=sorted(l)[0]


t1=time.time()
mini=min(l, key=fuck)
t2=time.time()
print(mini)
print(t2-t1)
print("----------")


t1=time.time()
mina=min(map(fuck,l))
t2=time.time()
print(mina)
print(t2-t1)
print("----------")

t1=time.time()
minVal=999999999
optNum=None
for num in l:
    val=fuck(num)
    if val<minVal:
        minVal=val
        optNum=num
t2=time.time()
print(optNum)
print(t2-t1)
print("----------")



t1=time.time()
mini=reduce(lambda a,b: a if fuck(a) < fuck(b) else b, l[1:], l[0])
t2=time.time()
print(mini)
print(t2-t1)
print("----------")



48.932424677758746
0.42061781883239746
----------
-0.9999999999999897
0.4216275215148926
----------
48.932424677758746
0.4655306339263916
----------
48.932424677758746
0.8728272914886475
----------


In [46]:
# 快速求和
#import itertools
#from itertools import map

xlist=np.random.rand(1,1000000)*100
xlist=(xlist.tolist()[0])

def fuck(x):
    return (math.sin((x+10)**2))**2

t1=time.time()
he=1-sum(map(fuck, xlist))
t2=time.time()
print(he)
print(t2-t1)
print("====================")


t1=time.time()
he=0.0
for x in xlist:
    he+=fuck(x)
he=1-he
t2=time.time()
print(he)
print(t2-t1)
print("====================")


-499254.9876227111
0.5740292072296143
-499254.9876227111
0.6207835674285889


In [95]:
att=[11]
att.remove(11)
att

[]

In [101]:
import math
math.log(2,2)

1.0

In [None]:
import