In [1]:
import numpy as np, pandas as pd

In [124]:
################################################################################
def prepData(dataPathDir, fieldNames, featSlices, labelName, sep=','):
	raw = np.genfromtxt(dataPathDir, delimiter=sep, dtype=None,
                        names=fieldNames, encoding='utf-8') # read dlm file
	if isinstance(featSlices, slice):
		dataFeats = fieldNames[featSlices] # list of feature names
	else:
		dataFeats = [fieldNames[i] for i in featSlices]
	return raw[dataFeats],dataFeats,raw[labelName]

def errRate(pred, actual, categorical=True):
	if categorical: # if categ., return classification err rate
		return sum(pred!=actual) / pred.size
	else: # if numeric, return RMSE
		return np.linalg.norm(pred-actual)/np.sqrt(pred.size)
    
def getXVFolds(dataMat, classVec, nFolds=5, categorical=False):
	''' Cut N-fold cross validation of the data set
	Given a data matrix, a class vector, and the number of folds, the function
	randomly cuts a 5-fold cross validation. If the data is categorical, 
	stratified sampling is used.
	'''
	
	idx = np.arange(dataMat.shape[0]) # construct index of data rows
	if categorical:
		unqs = np.unique(classVec)
		tmpHold = [None] * len(unqs)
		for n,k in enumerate(unqs):
			grpIdx = idx[classVec==k] # idx of all elems in current class
			np.random.shuffle(grpIdx) # permutate idx for random selection
			tmpHold[n] = np.array_split(grpIdx, nFolds) # split: N equals
		chunks = [np.hstack(k) for k in zip(*tmpHold)] # concat sub chunks
	else:
		np.random.shuffle(idx) # random shuffle data
		chunks = np.array_split(idx, nFolds) # split into N equal sized chunks

	return chunks # return the indices for folds

def splitData(data, labels):
    x = getXVFolds(data, labels, nFolds=10, categorical=True)
    pruneData,pruneLabel = data[x[0]], labels[x[0]]
    xvSetIdx = np.hstack(x[1:]) # rest of the data
    xvData,xvLabel = data[xvSetIdx], labels[xvSetIdx]
    
    xvFolds = getCrossValidFolds(xvData, xvLabel, categorical=True)
    return (xvData,xvLabel),(xvFolds),(pruneData,pruneLabel)

In [134]:
################################################################################
carPath = './data/car.data'
carNames = ['buying','maint','doors','persons','lug_boot','safety','accept']
carData, carFeats, acceptVec = prepData(carPath, carNames, slice(-1), 'accept')

In [142]:
(xvData,xvLabel),xvFolds,(prnData,prnLabel) = splitData(carData, acceptVec)
vldData,vldLabel = xvData[xvFolds[0]],xvLabel[xvFolds[0]]
tmpIdx = np.hstack(xvFolds[1:])
trainData,trainLabel = xvData[tmpIdx],xvLabel[tmpIdx]

In [216]:
z = TrainDTree(trainData, trainLabel)
z.combineChildNodes()
print(z)

Attribute [safety]:
    value = low, class: unacc
    value = high, Attribute [persons]:
        value = 4, Attribute [buying]:
            value = med, Attribute [maint]:
                value = low, Attribute [lug_boot]:
                    value = big, class: vgood
                    value = med, class: vgood
                    value = small, class: good
                value = high, class: acc
                value = med, Attribute [lug_boot]:
                    value = big, class: vgood
                    value = med, Attribute [doors]:
                        value = 2, class: acc
                        value = 3, class: acc
                        value = 5more, class: vgood
                    value = small, class: acc
                value = vhigh, class: acc
            value = high, Attribute [maint]:
                value = med, class: acc
                value = high, class: acc
                value = low, class: acc
                value = vhigh, class: unacc
      

In [217]:
pred = PredictDTree(z, vldData)
np.mean(pred==vldLabel)

0.9038461538461539

In [218]:
PruneDTree(z, prnData, prnLabel)
len(z.__str__().split('\n'))

254

In [219]:
pred = PredictDTree(z, vldData)
np.mean(pred==vldLabel)

0.8974358974358975

In [None]:
list(zip(*np.unique(ringLabel, return_counts=True)))

In [212]:
################################################################################
class DTnode:
    def __init__(self, attrib, splitPt=None, majority=None):
        self.attrib = attrib
        self.preEval = majority # early evaluation based on training majority
        self.splitPoint = splitPt # if None, then categorical
        self.children = dict()
        self.nCorr = -1 # for storing error info for pruning

    def addChild(self, node, val):
        self.children[val] = node
        
    def isLeaf(self):
        return len(self.children) == 0
        
    def getChild(self, val):
        return self.children[val]
    
    def getValues(self):
        return self.children.keys()
    
    def getChildCorrNum(self):
        return [nd.nCorr for k,nd in self.children.items()]
    
    def makeLeafNode(self, attrib, nCorrPred=-1):
        self.attrib = attrib
        self.preEval = None
        self.children = dict()
        self.nCorr = nCorrPred
    
    def combineChildNodes(self):
        if self.isLeaf():
            return set([self.attrib])
        subLabl = set()
        for k,child in self.children.items(): # loop over all child nodes
            subLabl.update( child.combineChildNodes() )
        if len(subLabl) == 1: # only one class for all child nodes
            self.makeLeafNode( next(iter(subLabl)) )
        return subLabl

    def __repre__(self):
        if self.isLeaf():
            childTxt = 'terminal'
        else:
            childTxt = 'child: ' + str(list(self.children.keys()))
        return '[Node for %s, %s ]'%(self.attrib, childTxt)
    
    def toStr(self, level=0):
        if self.isLeaf():
            return 'class: %s\n' % self.attrib
        else:
            ret = 'Attribute [' + self.attrib + "]:\n"
            nx = level + 1
            for key in self.children:
                if self.splitPoint is None: # categorical var
                    txt = '= %s'%key
                else: # numerical var
                    txt = '%s %f'%(key,self.splitPoint)
                ret += " "*nx*4 + 'value %s, '%txt \
                    + self.children[key].toStr(nx)
            return ret
    
    def __str__(self):
        return self.toStr()

In [4]:
a = DTnode('feat1', 2.45)
a.addChild(DTnode('feat2'), '<')
a.addChild(DTnode('class1'), '>')
a.getChild('<').addChild(DTnode('class2'), 'a')
a.getChild('<').addChild(DTnode('class2'), 'b')
print(a)

a.combineChildNodes()
print(a)

Attribute [feat1]:
    value < 2.450000, Attribute [feat2]:
        value = a, class: class2
        value = b, class: class2
    value > 2.450000, class: class1

Attribute [feat1]:
    value < 2.450000, class: class2
    value > 2.450000, class: class1



In [5]:
def Entropy(array):
    counts = np.unique(array, return_counts=True)[1]
    probs = counts / counts.sum()
    return -(probs*np.log2(probs)).sum()

def IntInfo(counts):
    s = sum(counts)
    return -np.sum(np.log2(counts)*counts)/s + np.log2(s)

################################################################################



In [6]:
def getSplitPoints(data,labels):
    srtIdx = np.argsort(data) # get sorted index for data vector
    srtdData = data[srtIdx] # data in sorted order
    midpoints = (srtdData[:-1] + srtdData[1:])/2 # midpoints between data pts
    srtdLabls = labels[srtIdx] # rearrange labels by sorted data order
    diffLabel = srtdLabls[:-1] != srtdLabls[1:] # find midpt where labels changed
    return midpoints[diffLabel] # return midpts where labels are different

def getBestSplitInfo(data, labels, splitPts):
    bestEntropy = np.Inf
    bestPoint = None
    bestPr = -1
    for n,pt in enumerate(splitPts):
        LT = data < pt
        prLT = sum(LT) / data.size
        ent = prLT*Entropy(labels[LT]) + (1-prLT)*Entropy(labels[~LT])
        if ent < bestEntropy:
            bestEntropy = ent
            bestPoint = pt
            bestPr = prLT
            
    if (bestPr-0) < np.finfo(bestPr.dtype).eps: # if homogenous data, prob=0
        intrInfo = 0
    else:
        intrInfo = -bestPr*np.log2(bestPr) - (1-bestPr)*np.log2(1-bestPr)
    return bestEntropy,bestPoint,intrInfo

################################################################################

def SplitInfo(xs, ys):
    if np.issubdtype(xs.dtype, np.number): # numeric features
        splitPts = getSplitPoints(xs, ys)
        meanEnt,splitPt,intrinsVal = getBestSplitInfo(xs, ys, splitPts)
    else: # categorical features
        vals, Ns = np.unique(xs, return_counts=True)
        meanEnt = sum(Ns/len(ys) * [Entropy(ys[xs==v]) for v in vals])
        intrinsVal = Entropy(xs)
        splitPt = None
    return meanEnt,intrinsVal,splitPt

def selectBestFeature(data, labels, useRatio=True):
    features = data.dtype.names
    info = Entropy(labels)
    gains = np.empty(len(features))
    gainRatios = np.empty(len(features))
    splitPts = [None] * len(features)
    for n,feat in enumerate(features):
        expEntropy,intrnVal,splitPts[n] = SplitInfo(data[feat], labels)
        gains[n] = info-expEntropy
        gainRatios[n] = gains[n] / (0.01+intrnVal if useRatio else 1)
        #print("%s exp ent: %f"%(feat,gainRatios[n]))
    maxN = np.argmax(gainRatios)
    return features[maxN],gainRatios[maxN],splitPts[maxN],gains[maxN]

In [210]:
################################################################################
def TrainDTree(allData, allLabels, minGain=0):
    def c4_5(idx, featureSet, defLabel):
        if sum(idx) == 0:# empty data, class = default label
            return DTnode(defLabel)
        data,labels = allData[idx][list(featureSet)],allLabels[idx]
        
        (values,counts) = np.unique(labels, return_counts=True)
        majority = str(values[np.argmax(counts)]) # get majority class as default
        if len(counts)==1 or not featureSet: # homogenous or no attribs
            return DTnode(majority)
        
        bestFeat,gainRatio,splitPt,gain = selectBestFeature(data,labels)
        #print('GainR=%f, Gain=%f'%(gainRatio,gain))
        if gain < minGain: # early stopping if gain < defined thresh
            return DTnode(majority)
        
        featSubset = featureSet - set([bestFeat])
        node = DTnode(bestFeat, splitPt, majority)
        if splitPt is None: # no split point, categorical feature
            for val in set(data[bestFeat]):
                subIdx = idx[data[bestFeat] == val]
                child = c4_5(subIdx, featSubset, majority)
                node.addChild(child, val)
        else: # numerical feature, 2 child nodes
            lessThan = data[bestFeat] < splitPt
            child = c4_5(idx[lessThan], featSubset, majority)
            node.addChild(child, '<')
            child = c4_5(idx[~lessThan], featSubset, majority)
            node.addChild(child, '>=')
        return node
################################################################################

    uniqLabels,uniqCounts = np.unique(allLabels, return_counts=True)
    labelMajority = uniqLabels[np.argmax(uniqCounts)]
    allFeatures = set(allData.dtype.names) # set of all features
    allIdx = np.arange(allData.size) # numeric idx of all rows
    return c4_5(allIdx, allFeatures, labelMajority) # root of DTree

In [8]:
################################################################################
abalonePath = './data/abalone.data'
abaloneNames = ['sex', 'length', 'diameter', 'height', 'wholeHt',
                'shuckWt', 'visceraWt', 'shellWt', 'rings']
abaloneData, abaloneFeats, ringVec = prepData(abalonePath,
                                              abaloneNames, slice(-1),'rings')
ringLabel = ringVec.astype(str)
ringLabel[ringVec<=5] = '<5'
ringLabel[ringVec>=16] = '16+'

In [11]:
testData = abaloneData[np.arange(1000)]
testLabel = ringLabel[np.arange(1000)]

(1000,)

In [153]:
testTree = TrainDTree(testData, testLabel)
print(testTree)

Attribute [Outlook]:
    value = Rainy, Attribute [Windy]:
        value = True, class: N
        value = False, class: P
    value = Sunny, Attribute [Humidity]:
        value = Normal, class: P
        value = High, class: N
    value = Overcast, class: P



In [11]:
np.seterr(all='raise')

{'divide': 'warn', 'over': 'warn', 'under': 'ignore', 'invalid': 'warn'}

In [215]:
def PredictDTree(tree, data):
    def classify(node, idx):
        ndData = data[idx] # portion of data for this node
        if node.isLeaf():
            result[idx] = node.attrib
            return
        else:
            feat = node.attrib
            if node.splitPoint is None: # categorical
                for k in node.getValues():
                    subIdx = idx[ ndData[feat]==k ]
                    classify(node.getChild(k), subIdx)
            else: # numeric
                lessThan = ndData[feat]<node.splitPoint
                classify(node.getChild('<'), idx[lessThan])
                classify(node.getChild('>='), idx[~lessThan])
            return

    allIdx = np.arange(data.size)
    result = np.empty(data.size, object)
    classify(tree, allIdx)
    return result

In [214]:
################################################################################    
def PruneDTree(tree, data, actuals):
    def prune(node, idx):
        ndData,ndActs = data[idx],actuals[idx] # data and labels for the node
        if node.isLeaf():
            node.nCorr = sum(node.attrib==ndActs) # save nCorrect preds
            return
        else:
            feat = node.attrib
            if node.splitPoint is None: # categorical
                for k in node.getValues():
                    subIdx = idx[ ndData[feat]==k ]
                    prune(node.getChild(k), subIdx)
            else: # numeric
                lessThan = ndData[feat]<node.splitPoint
                prune(node.getChild('<'), idx[lessThan])
                prune(node.getChild('>='), idx[~lessThan])
            
            node.nCorr = sum(node.getChildCorrNum()) # sum childNodes error nums
            nCorrNaive = sum(ndActs==node.preEval) # nCorr using label majority
            if nCorrNaive > node.nCorr: # if majority class is better than C4.5
                node.makeLeafNode(node.preEval, nCorrNaive)
            return
                
################################################################################    
    allIdx = np.arange(data.size)
    prune(tree, allIdx)
    return

In [63]:
x = np.array(range(10))
y = np.array(['A', 'B', 'A', 'A', 'A', 'B', 'A', 'B', 'B', 'A'])

getBestSplitInfo(x,y, getSplitPts(x,y))


(0.8464393446710154, 4.5, 1.0)

In [53]:
a = np.array( [10,10,20,10,20,20,20,30, 30,50,40,40] )
aCounts = np.unique(a,return_counts=True)[1]
z

[10, 10, 20, 10, 20, 20, 20, 30, 30, 50, 40, 40]

In [14]:
raw = np.array([('Sunny', 'Hot', 'High', 'False', 'N'),
                ('Sunny', 'Hot', 'High', 'True', 'N'),
                ('Overcast', 'Hot', 'High', 'False', 'P'), 
                ('Rainy', 'Mild', 'High', 'False', 'P'), 
                ('Rainy', 'Cool', 'Normal', 'False', 'P'), 
                ('Rainy', 'Cool', 'Normal', 'True', 'N'), 
                ('Overcast', 'Cool', 'Normal', 'True', 'P'), 
                ('Sunny', 'Mild', 'High', 'False', 'N'), 
                ('Sunny', 'Cool', 'Normal', 'False', 'P'), 
                ('Rainy', 'Mild', 'Normal', 'False', 'P'), 
                ('Sunny', 'Mild', 'Normal', 'True', 'P'),
                ('Overcast', 'Mild', 'High', 'True', 'P'),
                ('Overcast', 'Hot', 'Normal', 'False', 'P'), 
                ('Rainy', 'Mild', 'High', 'True', 'N')],
               dtype=[('Outlook','U8'), ('Temperature','U4'), ('Humidity','U6'),
                      ('Windy','U5'), ('Class','U1')]
            )
testData = raw[['Outlook','Temperature','Humidity','Windy']]
testLabel = raw['Class']

SplitInfo(testData['Temperature'], testLabel)
selectBestFeature(testData, testLabel, useRatio=False)

('Outlook', 0.24674981977443933, None, 0.24674981977443933)

In [16]:
testTree = TrainDTree(testData,testLabel)
print(testTree)

Attribute [Outlook]:
    value = Sunny, Attribute [Humidity]:
        value = High, class: N
        value = Normal, class: P
    value = Rainy, Attribute [Windy]:
        value = False, class: P
        value = True, class: N
    value = Overcast, class: P



In [17]:
val = np.array([('Sunny', 'Hot', 'High', 'False', 'N'),
                ('Sunny', 'Hot', 'High', 'True', 'P'),
                ('Overcast', 'Hot', 'High', 'False', 'P'), 
                ('Rainy', 'Mild', 'High', 'False', 'P')],
               dtype=[('Outlook','U8'), ('Temperature','U4'), ('Humidity','U6'),
                      ('Windy','U5'), ('Class','U1')]
            )
valData = val[['Outlook','Temperature','Humidity','Windy']]
valAct = val['Class']

In [22]:
pred = PredictDTree(testTree, valData)

In [2]:
np.fabs(-2)

NameError: name 'np' is not defined