In [4]:
import numpy as np, pandas as pd

In [6]:
################################################################################
def prepData(dataPathDir, fieldNames, featSlices, labelName,
             sep=',', transf=None):
    """Read a delimited text file and split it into features and labels.

    Parameters
    ----------
    dataPathDir : path or file-like
        Handed straight to ``np.genfromtxt``.
    fieldNames : sequence of str
        Column names, in file order.
    featSlices : slice or iterable of int
        Positions within ``fieldNames`` of the feature columns.
    labelName : str
        Name of the column to return as the label vector.
    sep : str, optional
        Field delimiter (default ``','``).
    transf : optional
        Currently unused; kept so existing call sites keep working.

    Returns
    -------
    tuple
        ``(features, featureNames, labels)`` where ``features`` is the
        structured array restricted to the feature columns,
        ``featureNames`` is the list of their names and ``labels`` is the
        ``labelName`` column.
    """
    raw = np.genfromtxt(dataPathDir, delimiter=sep, dtype=None,
                        names=fieldNames, encoding='utf-8')  # read dlm file
    if isinstance(featSlices, slice):
        # Slicing a tuple yields a tuple, which a structured array rejects
        # as a multi-field index; always hand it a list.
        dataFeats = list(fieldNames[featSlices])
    else:
        dataFeats = [fieldNames[i] for i in featSlices]
    return raw[dataFeats], dataFeats, raw[labelName]

def errRate(pred, actual, categorical=True):
    """Error between predictions and ground truth.

    Parameters
    ----------
    pred, actual : array-like
        Predicted and true values, of matching shape.
    categorical : bool, optional
        If True (default) return the misclassification rate, i.e. the
        fraction of positions where ``pred`` and ``actual`` differ.
        If False return the root-mean-square error.

    Returns
    -------
    float
    """
    # Coerce so that plain Python lists compare element-wise too.
    pred = np.asarray(pred)
    actual = np.asarray(actual)
    if categorical:  # classification error rate
        return np.count_nonzero(pred != actual) / pred.size
    # numeric target: RMSE = ||pred - actual||_2 / sqrt(n)
    return np.linalg.norm(pred - actual) / np.sqrt(pred.size)
    
def getCrossValidFolds(dataMat, classVec, nFolds=5, categorical=False):
    """Randomly partition the row indices of a data set into folds.

    The rows of ``dataMat`` are indexed 0..n-1 and dealt into ``nFolds``
    groups of (near-)equal size using NumPy's global random state.

    When ``categorical`` is true the split is stratified: the indices of
    each distinct value in ``classVec`` are shuffled and split into
    ``nFolds`` pieces on their own, and fold *i* is the concatenation of
    the *i*-th piece of every class.

    Returns a list of ``nFolds`` index arrays.
    """
    rowIdx = np.arange(dataMat.shape[0])  # one index per data row

    if not categorical:
        np.random.shuffle(rowIdx)  # shuffle everything, then cut
        return np.array_split(rowIdx, nFolds)

    perClass = []
    for label in np.unique(classVec):
        members = rowIdx[classVec == label]  # rows belonging to this class
        np.random.shuffle(members)  # random order within the class
        perClass.append(np.array_split(members, nFolds))
    # stitch the i-th piece of every class together into fold i
    return [np.hstack(pieces) for pieces in zip(*perClass)]

In [91]:
# Tally how many samples carry each distinct ring-class label, as (label, count) pairs.
# NOTE(review): `ringLabel` is assigned in the In [7] cell, which appears further down
# in this file, so this cell fails on a fresh top-to-bottom run.
list(zip(*np.unique(ringLabel, return_counts=True)))

[('10', 634),
 ('11', 487),
 ('12', 267),
 ('13', 203),
 ('14', 126),
 ('15', 103),
 ('16+', 261),
 ('6', 259),
 ('7', 391),
 ('8', 568),
 ('9', 689),
 ('<5', 189)]

In [71]:
# Read the abalone file directly with genfromtxt and display only the feature columns.
# NOTE(review): abalonePath, abaloneNames and abaloneFeats are assigned in the In [7]
# cell, which appears further down in this file.
z = np.genfromtxt(abalonePath, delimiter=',', dtype=None, names=abaloneNames, encoding='utf-8')
z[abaloneFeats]

array([('M', 0.455, 0.365, 0.095, 0.514 , 0.2245, 0.101 , 0.15 ),
       ('M', 0.35 , 0.265, 0.09 , 0.2255, 0.0995, 0.0485, 0.07 ),
       ('F', 0.53 , 0.42 , 0.135, 0.677 , 0.2565, 0.1415, 0.21 ), ...,
       ('M', 0.6  , 0.475, 0.205, 1.176 , 0.5255, 0.2875, 0.308),
       ('F', 0.625, 0.485, 0.15 , 1.0945, 0.531 , 0.261 , 0.296),
       ('M', 0.71 , 0.555, 0.195, 1.9485, 0.9455, 0.3765, 0.495)],
      dtype=[('sex', '<U1'), ('length', '<f8'), ('diameter', '<f8'), ('height', '<f8'), ('wholeHt', '<f8'), ('shuckWt', '<f8'), ('visceraWt', '<f8'), ('shellWt', '<f8')])

In [13]:
class DTnode:
    """A node of a decision tree.

    A node starts out as a leaf and becomes an internal node as soon as a
    child is attached. ``toStr`` prints ``attrib`` as the class for a leaf
    and as the attribute name for an internal node. Children are keyed by
    the value that leads to them.
    """

    def __init__(self, attrib, compare=0):
        self.attrib = attrib
        self.isLeaf = True
        # Stored but not used by any method of this class; presumably the
        # threshold for numeric splits -- TODO confirm against callers.
        self.compareTo = compare
        self.children = dict()

    def addChild(self, node, val):
        """Attach ``node`` as the child reached through value ``val``."""
        self.isLeaf = False
        self.children[val] = node

    def getChild(self, val):
        """Return the child stored under ``val`` (KeyError if absent)."""
        return self.children[val]

    def getValues(self):
        """Return a view of the values that have a child attached."""
        return self.children.keys()

    def __repr__(self):
        if self.isLeaf:
            childTxt = 'terminal'
        else:
            childTxt = 'child: ' + str(list(self.children.keys()))
        return '[Node for %s, %s ]'%(self.attrib, childTxt)

    # The method was originally misspelled, so Python never used it as the
    # repr; keep the old name as an alias for any explicit callers.
    __repre__ = __repr__

    def toStr(self, level=0):
        """Render the subtree rooted here as indented, multi-line text.

        ``level`` is the depth of this node; each child line is indented
        two spaces per level below it.
        """
        if self.isLeaf:
            return 'class: %s\n' % (self.attrib,)
        nx = level + 1
        # %-formatting (not '+') so non-string attributes render as well
        ret = 'Attribute [%s]:\n' % (self.attrib,)
        for key, child in self.children.items():
            ret += ' ' * (nx * 2) + 'value %s, ' % (key,) + child.toStr(nx)
        return ret

    def __str__(self):
        return self.toStr()
        

In [21]:
def Entropy(array):
    """Shannon entropy, in bits, of the empirical distribution of ``array``.

    Each distinct value is weighted by its relative frequency.
    """
    _, freq = np.unique(array, return_counts=True)
    p = freq / freq.sum()  # relative frequency of every distinct value
    return -np.sum(p * np.log2(p))

def IntInfo(counts):
    """Intrinsic (split) information, in bits, of a partition.

    Given group sizes ``c_i`` with total ``s``, evaluates
    ``-sum_i (c_i/s) * log2(c_i/s)`` in the algebraically equal form
    ``log2(s) - sum_i c_i*log2(c_i) / s``.

    Parameters
    ----------
    counts : array-like of numbers
        Size of each group. Empty groups (count 0) contribute nothing,
        following the convention ``0 * log2(0) = 0``.

    Returns
    -------
    float
        ``0.0`` when there are no non-empty groups.
    """
    counts = np.asarray(counts, dtype=float)
    counts = counts[counts != 0]  # log2(0) * 0 would otherwise yield nan
    if counts.size == 0:
        return 0.0
    s = counts.sum()
    return -np.sum(np.log2(counts)*counts)/s + np.log2(s)

################################################################################



In [7]:
################################################################################
# Location of the comma-separated abalone data file and its column names, in file order.
abalonePath = './data/abalone.data'
abaloneNames = ['sex', 'length', 'diameter', 'height', 'wholeHt',
                'shuckWt', 'visceraWt', 'shellWt', 'rings']
# Load the file; slice(-1) selects every column except the last ('rings') as features,
# and 'rings' is returned separately as the target vector.
abaloneData, abaloneFeats, ringVec = prepData(abalonePath,
                                              abaloneNames, slice(-1),'rings')
# Turn the ring counts into string class labels, pooling both tails into one bucket each.
ringLabel = ringVec.astype(str)
ringLabel[ringVec<=5] = '<5' # note: this bucket also contains ring count 5 itself
ringLabel[ringVec>=16] = '16+'

In [27]:
# Sanity check: Python's built-in sum over a boolean array counts the True entries.
a = np.array([True, False, False, True])
sum(a)

2

In [50]:
def getSplitPts(data,labels):
    """Candidate thresholds for splitting a numeric feature.

    Sorts ``data``, and for every pair of neighbours in sorted order whose
    labels differ returns the midpoint between the two data values.
    Neighbours with equal data values but different labels yield that
    shared value as a "midpoint".
    """
    order = np.argsort(data)  # permutation that sorts the feature values
    xs = data[order]
    ys = labels[order]  # labels carried along in the same order
    labelChanges = ys[:-1] != ys[1:]  # True where neighbours disagree
    mids = (xs[:-1] + xs[1:]) / 2  # halfway between each neighbouring pair
    return mids[labelChanges]

def getBestSplitInfo(data, labels, splitPts):
    """Pick the threshold that minimises the weighted label entropy.

    For every candidate ``pt`` in ``splitPts`` the samples are divided into
    ``data < pt`` and ``data >= pt``; the score of the candidate is the
    size-weighted mean of the label entropies of the two sides.

    Parameters
    ----------
    data : ndarray
        Numeric feature values.
    labels : ndarray
        Class labels aligned with ``data``.
    splitPts : iterable of numbers
        Candidate thresholds.

    Returns
    -------
    tuple
        ``(bestEntropy, bestPoint, intrInfo)``: the lowest weighted entropy
        found, the threshold achieving it, and the intrinsic information
        (binary entropy of the fraction of samples below the threshold).
        With no candidates this is ``(np.inf, None, 0.0)``.
    """
    bestEntropy = np.inf
    bestPoint = None
    bestPr = None
    for pt in splitPts:
        LT = data < pt
        prLT = np.count_nonzero(LT) / data.size  # share of samples below pt
        ent = prLT*Entropy(labels[LT]) + (1-prLT)*Entropy(labels[~LT])
        if ent < bestEntropy:
            bestEntropy = ent
            bestPoint = pt
            bestPr = prLT
    if bestPr is None or bestPr <= 0 or bestPr >= 1:
        # No split, or everything landed on one side: a single group
        # carries no split information (and log2(0) would give nan).
        intrInfo = 0.0
    else:
        intrInfo = -bestPr*np.log2(bestPr) - (1-bestPr)*np.log2(1-bestPr)
    return bestEntropy,bestPoint,intrInfo

def SplitInfo(xs, ys):
    """Entropy after splitting on one feature, plus its intrinsic information.

    Parameters
    ----------
    xs : ndarray
        Values of a single feature. A numeric dtype is split on the best
        threshold; any other dtype is split into one group per distinct
        value.
    ys : ndarray
        Class labels aligned with ``xs``.

    Returns
    -------
    tuple
        ``(meanEnt, intrinsInfo, splitPt)``: the size-weighted mean label
        entropy of the resulting groups, the intrinsic information of the
        split, and the chosen threshold (``None`` for categorical
        features).
    """
    if np.issubdtype(xs.dtype, np.number): # numeric features
        splitPts = getSplitPts(xs, ys)
        # getBestSplitInfo returns (entropy, threshold, intrinsic info)
        meanEnt,splitPt,intrinsInfo = getBestSplitInfo(xs, ys, splitPts)
    else: # categorical features
        vals, Ns = np.unique(xs, return_counts=True)
        groupEnts = np.array([Entropy(ys[xs==v]) for v in vals])
        meanEnt = np.sum(Ns/len(ys) * groupEnts)
        intrinsInfo = Entropy(xs)
        splitPt = None
    return meanEnt,intrinsInfo,splitPt



In [53]:
# Toy label vector and the number of occurrences of each distinct value in it.
a = np.array( [10,10,20,10,20,20,20,30, 30,50,40,40] )
aCounts = np.unique(a,return_counts=True)[1]
# NOTE(review): `z` is not assigned in this cell. The only assignment visible in this
# file is the genfromtxt result in the In [71] cell, yet the saved output is a plain
# list with the same values as `a` -- presumably stale kernel state; confirm.
z

[10, 10, 20, 10, 20, 20, 20, 30, 30, 50, 40, 40]

In [52]:
# Element-wise product of `z` with itself, i.e. the square of every entry.
# NOTE(review): `z` is not assigned in this cell; the result depends on whatever the
# kernel held at execution time.
np.array(z) * z

array([ 100,  100,  400,  100,  400,  400,  400,  900,  900, 2500, 1600,
       1600])

In [53]:
# Demo: pair sorted random feature values with the toy labels in `a`, then list the
# candidate split points. No random seed is set, so the values differ on every run.
x = np.random.rand(len(a))
x.sort()
print(list(zip(x,a)))

getSplitPts(x,a)

[(0.1409071979108386, 10), (0.22956749465992676, 10), (0.24396124711773237, 20), (0.2848779032897587, 10), (0.43729318365272074, 20), (0.5744742472226008, 20), (0.7917719120373218, 20), (0.8103201792707221, 30), (0.8234293458874453, 30), (0.8484049041354896, 50), (0.9442117713873419, 40), (0.993248981122507, 40)]


array([0.23676437, 0.26441958, 0.36108554, 0.80104605, 0.83591713,
       0.89630834])