In [2]:
import numpy as np, pandas as pd

In [3]:
def prepData(dataPathDir, fieldNames, featSlices, labelName,
			 sep=',', transf=None):
	'''Read a delimited file and split it into features and labels.

	Parameters
	----------
	dataPathDir : path of the delimited file, passed to pandas.read_csv
	fieldNames  : list of column names assigned to the file's columns
	featSlices  : slice, or iterable of integer positions, selecting the
	              feature columns out of fieldNames
	labelName   : name of the column holding the labels
	sep         : field delimiter (default ',')
	transf      : None   -> features are returned untouched
	              'std'  -> (x - mean) / standard deviation, per column
	              'rescale' -> (x - min) / (max - min), per column
	              (matched case-insensitively)

	Returns
	-------
	(dataMat, dataFeats, labels): feature matrix, list of feature names,
	and the label column as an array.

	Raises
	------
	ValueError if transf names an unknown transformation.

	Constant columns (zero standard deviation / zero range) are mapped to 0
	instead of producing NaN or inf from a division by zero.
	'''
	raw = pd.read_csv(dataPathDir, sep=sep, names=fieldNames) # read dlm file
	if isinstance(featSlices, slice):
		dataFeats = fieldNames[featSlices] # list of feature names
	else:
		dataFeats = [fieldNames[i] for i in featSlices]

	featFrame = raw[dataFeats] # select the feature columns once
	featVals = featFrame.values
	mode = None if transf is None else transf.lower()

	if mode is None: # no transformation
		dataMat = featVals
	elif mode == 'std': # standardize: [x-E(x)]/S(x)
		meanVals = featFrame.mean().values
		stdVals = featFrame.std().values
		# a zero spread means x - mean is already 0; divide by 1 instead
		stdVals = np.where(stdVals == 0, 1.0, stdVals)
		dataMat = (featVals - meanVals) / stdVals
	elif mode == 'rescale': # rescale to [0,1]: (x-min)/range(x)
		mins = featFrame.min().values
		spans = featFrame.max().values - mins
		# a zero range means x - min is already 0; divide by 1 instead
		spans = np.where(spans == 0, 1, spans)
		dataMat = (featVals - mins) / spans
	else: # error out
		raise ValueError('No such transformation available')
	return dataMat, dataFeats, raw[labelName].values

def errRate(pred, actual, categorical=True):
	'''Return the error of predictions against the actual values.

	pred, actual : arrays or plain sequences (lists/tuples) of predictions
	               and ground-truth values
	categorical  : if True, return the misclassification rate (number of
	               mismatches divided by pred.size); otherwise return the
	               root mean squared error.
	'''
	# coerce so that lists/tuples work, not only ndarrays
	pred = np.asarray(pred)
	actual = np.asarray(actual)
	if categorical: # if categ., return classification err rate
		return np.sum(pred != actual) / pred.size
	# if numeric, return RMSE: ||pred-actual|| / sqrt(n)
	return np.linalg.norm(pred - actual) / np.sqrt(pred.size)
    
def getCrossValidFolds(dataMat, classVec, nFolds=5, categorical=False):
	''' Cut N-fold cross validation of the data set
	Given a data matrix, a class vector, and the number of folds, the function
	randomly partitions the row indices of the data matrix into nFolds chunks.
	If the data is categorical, stratified sampling is used: the rows of each
	class are shuffled and split on their own, and fold i is the concatenation
	of the i-th piece of every class.

	Returns a list of index arrays, one per fold.
	'''

	rowIds = np.arange(dataMat.shape[0]) # one index per data row

	if not categorical:
		np.random.shuffle(rowIds) # random order of all rows
		return np.array_split(rowIds, nFolds) # N (near-)equal chunks

	perClass = [] # for every class: its nFolds index pieces
	for lbl in np.unique(classVec):
		members = rowIds[classVec == lbl] # rows belonging to this class
		np.random.shuffle(members) # random order within the class
		perClass.append(np.array_split(members, nFolds))

	# glue the i-th piece of every class together to form fold i
	return [np.hstack(pieces) for pieces in zip(*perClass)]

In [13]:
class DTnode:
    '''Node of a decision tree.

    attrib holds the attribute tested at this node or, for a leaf, the
    class label (see toStr). Children are stored in a dict keyed by the
    attribute value that leads to them. A node is a leaf until a child is
    added.
    '''

    def __init__(self, attrib):
        self.attrib = attrib
        self.isLeaf = True      # flips to False on the first addChild
        self.children = dict()  # attribute value -> child DTnode

    def addChild(self, node, val):
        '''Attach node as the child reached through attribute value val.'''
        self.isLeaf = False
        self.children[val] = node

    def getChild(self, val):
        '''Return the child stored for val (KeyError if there is none).'''
        return self.children[val]

    def getValues(self):
        '''Return a view of the attribute values that have a child.'''
        return self.children.keys()

    def __repr__(self):
        '''Short one-line description: attribute plus child values.'''
        if self.isLeaf:
            childTxt = 'terminal'
        else:
            childTxt = 'child: ' + str(list(self.children.keys()))
        return '[Node for %s, %s ]' % (self.attrib, childTxt)

    # The method was originally spelled __repre__, which Python never calls
    # for repr(); keep the old name as an alias for any explicit callers.
    __repre__ = __repr__

    def toStr(nd, level=0):
        '''Render the subtree rooted at this node as indented text.

        nd is the node itself (it plays the role of self); level is the
        depth of this node and controls the indentation of its children
        (two spaces per level).
        '''
        if nd.isLeaf:
            return 'class: %s\n' % (nd.attrib,)
        # %-formatting so non-string attributes are rendered as well
        ret = 'Attribute [%s]:\n' % (nd.attrib,)
        nx = level + 1
        for key, child in nd.children.items():
            # recurse through the method; a bare toStr name is not visible
            # from inside the class body
            ret += ' ' * (nx * 2) + 'value %s, ' % (key,) + child.toStr(nx)
        return ret

    def __str__(self):
        return self.toStr()
        

In [50]:
def Entropy(probs):
    '''Return the Shannon entropy (in bits) of a probability vector.

    probs : array or sequence of probabilities.

    Computes sum(p * log2(1/p)), i.e. -sum(p * log2(p)), which is
    non-negative. Zero probabilities contribute nothing (the limit of
    p*log2(p) as p -> 0 is 0) instead of turning the result into NaN.
    '''
    probs = np.asarray(probs, dtype=float)
    nonzero = probs[probs != 0] # skip p == 0 terms: 0 * log2(0) would be NaN
    # log2(1/p) rather than -log2(p) so that a certain outcome gives 0.0, not -0.0
    return np.sum(nonzero * np.log2(1.0 / nonzero))

def getProbs(labels):
    '''Return the distinct labels and the relative frequency of each one.

    The result is a pair (unique labels, counts divided by len(labels)).
    '''
    classes, freq = np.unique(labels, return_counts=True)
    total = len(labels)
    return classes, freq / total

def getSplitPts(data,labels):
    '''Return candidate split points for one feature.

    The feature values are sorted (labels are reordered along with them);
    for every pair of neighbouring sorted values whose labels differ, the
    midpoint of the two values is returned.
    '''
    order = np.argsort(data)
    xs, ys = data[order], labels[order]
    lower, upper = xs[:-1], xs[1:]       # each value and its successor
    labelChanges = ys[:-1] != ys[1:]     # True where the label switches
    return ((lower + upper) / 2)[labelChanges]

In [20]:
# Sanity check: a single outcome with probability 1 (recorded output: 0.0).
Entropy(np.array([1]))

0.0

In [28]:
# Toy label vector; not used in this cell, but read by the In [53] cell below.
a = np.array( [10,10,20,10,20,20,20,30, 30,50,40,40] )
# All labels identical -> a single class with probability 1.
x,y = getProbs(np.array([1,1,1,1,1]))
Entropy(y)

0.0

In [53]:
# One random feature value per label in `a`; no seed is set, so values differ per run.
x = np.random.rand(len(a))
x.sort() # sort in place so the printed (x, a) pairs are in ascending x order
print(list(zip(x,a)))

# Midpoints between neighbouring x values whose labels in `a` differ.
getSplitPts(x,a)

[(0.1409071979108386, 10), (0.22956749465992676, 10), (0.24396124711773237, 20), (0.2848779032897587, 10), (0.43729318365272074, 20), (0.5744742472226008, 20), (0.7917719120373218, 20), (0.8103201792707221, 30), (0.8234293458874453, 30), (0.8484049041354896, 50), (0.9442117713873419, 40), (0.993248981122507, 40)]


array([0.23676437, 0.26441958, 0.36108554, 0.80104605, 0.83591713,
       0.89630834])