In [1]:
import numpy as np
import random
from statistics import mode
import csv

In [2]:
def retList(filename):
    f = open(filename,'r')
    dataString = f.readlines()
    data = []
    for i in range(len(dataString)):
        data.append(dataString[i].split())
    return np.array(data,dtype=float)

def accuracy(label, predicted):
    N= len(predicted)
    crt = 0
    for i in range(N):
        if label[i] == predicted[i]:
            crt=crt+1
    if N==0:
        raise ValueError("Input array size must be greater than 0")
    return crt/N

def randomList(n, bool):
    if bool:
        return np.random.choice(n-1,n//3)
    else :
        return np.array(range(n-1))

In [3]:
def giniIndex(data):
    pos, neg = 0,0
    for i in data:
        if i[-1]==0:
            neg=neg+1
        else :
            pos=pos+1
    
    pos=pos/(pos+neg)
    neg=1-pos
    return 1-pos*pos-neg*neg

def entropy(data): 
    if len(data)==0:
        return 0   
    n0,n1=0,0
    for i in data:
        if i[-1]==0:
            n0=n0+1
        else:
            n1=n1+1
    p1=n0/(n1+n0)
    p2=1-p1
    if p1 != 0:
        p1=-p1*np.log(p1)
    if p2 != 0:
        p2=-p2*np.log(p2)
    return p1+p2



In [4]:
def BestSplit(data, numThreshold, func, featureList):
    InfoGain = 0
    Left=None
    Right=None
    Threshold=None
    Feature=None
    for feature in featureList:
        minVal = np.min(data[:,feature])
        maxVal = np.max(data[:,feature])
        Tvals = np.linspace(minVal,maxVal,numThreshold+1,endpoint=False)[1:]

        for threshold in Tvals:
            left=[]
            right=[]
            for instance in data:
                if instance[feature]<threshold:
                    left.append(instance)
                else :
                    right.append(instance)
            if len(left)==len(data) or len(right)==len(data):
                continue
            infogain = func(data)-len(left)/len(data)*func(left)-len(right)/len(data)*func(right)
            if infogain > InfoGain:
                Threshold = threshold
                Feature = feature
                InfoGain = infogain
                Left= np.array(left)
                Right = np.array(right)
    # if len(Left)==len(data) or len(Right)==len(data):
    #     Left=None
    #     Feature=None
    #     Threshold=None
    #     Right = None
    return {'feature':Feature, 'threshold': Threshold, 'left':Left, 'right':Right}




In [5]:
class DecisionTree():
    tree={}

    def learn(self, train, numThreshold, func, randomFeatures):
        tree = {'feature':None, 'threshold':None, 'leftTree':None,'rightTree':None, 'value':None}

        if func(train)!=0:
            split=BestSplit(train,numThreshold,func,randomList(len(train[0]),randomFeatures))
            tree['feature']=split['feature']
            tree['threshold']=split['threshold']
            if tree['feature'] is None:
                tree['value'] = mode(train[:,-1])
                return tree
            else:
                tree['leftTree'] = DecisionTree.learn(self,split['left'],numThreshold,func,randomFeatures)
                tree['rightTree'] = DecisionTree.learn(self,split['right'],numThreshold,func,randomFeatures)
                return tree
        else :
            tree['value']=mode(train[:,-1])
            return tree
    
    def classify(self,tree,instance):
        if tree['value'] is not None:
            return tree['value']
        else:
            if instance[tree['feature']] < tree['threshold']:
                return self.classify(tree['leftTree'],instance)
            else :
                return self.classify(tree['rightTree'],instance)
    
def runDecisionTree(train, test, numThreshold=10, func=entropy, randomFeatures=False):
    Tree=DecisionTree()
    tree= Tree.learn(train,numThreshold,func,randomFeatures)
    predicted = []
    for instance in test:
        predicted.append(Tree.classify(tree,instance[:-1]))
    print(accuracy(test[:,-1],predicted))


In [6]:
# class randomForest():
#     forest=[]
#     def build(self,numTrees, train,numThreshold, func):
        

In [10]:
data = retList("data.txt")
random.shuffle(data)
n=len(data)
train = np.array([x for i, x in enumerate(data) if i % 10 > 2 ])
test = np.array([x for i, x in enumerate(data) if i % 10 <= 2])

# Tree= DecisionTree()
# tree= Tree.learn(train,10,giniIndex,False)
# print(tree)
# print(len(train[:len(train)//2]))
runDecisionTree(train,test,10,entropy, False)

1610
0.9717595944967415


In [8]:
with open("wine-dataset.csv") as f:
    next(f, None)
    Tdata=[]
    for line in csv.reader(f,delimiter=","):
        row=[float(x) for x in line]
        Tdata.append(row)
data = np.array(Tdata)
random.shuffle(data)
n=len(data)
train= data[:int(0.7*n)]
np.random.shuffle(train)
test= data[int(0.7*n):]
# Tree= DecisionTree()
# tree= Tree.learn(train,10,giniIndex,False)
# print(tree)
runDecisionTree(train,test,10)

0.8639455782312925
