In [5]:
import numpy as np
import pandas as pd


In [6]:
def entropy(col):
    """Return the Shannon entropy, in bits, of the values in ``col``.

    Parameters
    ----------
    col : 1-D array-like exposing ``.shape`` (e.g. a pandas Series or a
        NumPy array) holding the class labels.

    Returns
    -------
    float
        ``sum(-p * log2(p))`` over the relative frequency ``p`` of each
        distinct value.  An empty input yields ``0.0``.
    """
    total = float(col.shape[0])
    _, frequencies = np.unique(col, return_counts=True)
    result = 0.0
    # Accumulate one term per distinct value, in np.unique's sorted order.
    for freq in frequencies:
        prob = freq / total
        result -= prob * np.log2(prob)
    return result
# entropy() expects a pandas Series (a 1-D NumPy array also works).

In [7]:
# Takes pandas DataFrames as input (both xdata and ydata).
def divide_data(xdata,ydata,fkey,fval):
    """Split features and labels on the threshold ``xdata[fkey] > fval``.

    Rows whose ``fkey`` value is strictly greater than ``fval`` go to the
    right partition; every other row (including rows where the comparison
    is False because the value is NaN) goes to the left partition.

    Parameters
    ----------
    xdata : pandas.DataFrame
        Feature rows.
    ydata : pandas.DataFrame
        Label rows; must have the same number of rows as ``xdata`` and is
        matched to it by position.
    fkey : column label
        Column of ``xdata`` to threshold on.
    fval : scalar
        Threshold value.

    Returns
    -------
    tuple
        ``(x_left, y_left, x_right, y_right)`` as DataFrames.  Each keeps the
        columns, dtypes and index labels of its source; a side that receives
        no rows is an empty DataFrame with the same columns.
    """
    # One vectorised comparison replaces the per-row DataFrame.append loop:
    # append() no longer exists in pandas >= 2.0 and was quadratic anyway.
    # A plain boolean ndarray selects rows by position, so the split no
    # longer depends on the frames carrying a 0..n-1 index.
    goes_right = np.asarray(xdata[fkey] > fval, dtype=bool)
    goes_left = ~goes_right

    x_left = xdata.loc[goes_left]
    y_left = ydata.loc[goes_left]
    x_right = xdata.loc[goes_right]
    y_right = ydata.loc[goes_right]
    return x_left,y_left,x_right,y_right
# Takes pandas DataFrames as input; the first column of ydata is used as the label.
def information_gain(xdata,ydata,fkey,fval):
    """Return the information gain of splitting on ``xdata[fkey] > fval``.

    The gain is the entropy of the labels before the split minus the
    size-weighted entropies of the left and right partitions produced by
    ``divide_data``.  Only the first column of ``ydata`` is used as the label.

    Parameters
    ----------
    xdata, ydata : pandas.DataFrame
        Feature rows and their label rows.
    fkey : column label
        Feature column to split on.
    fval : scalar
        Threshold; rows with a strictly greater value go right.

    Returns
    -------
    float or int
        The information gain, or the sentinel ``-1000000`` when either side
        of the split is empty (which includes an empty ``xdata``), so that
        such a split never wins an ``argmax`` over candidate features.
    """
    x_left,y_left,x_right,y_right=divide_data(xdata,ydata,fkey,fval)

    n_left=x_left.shape[0]
    n_right=x_right.shape[0]
    # Check for a degenerate split before dividing: with empty input the
    # total is 0 and computing the weights first would raise ZeroDivisionError.
    if n_left==0 or n_right==0:
        return -1000000

    total=float(xdata.shape[0])
    w_left=n_left/total
    w_right=n_right/total

    ylabel=ydata.columns[0]
    parent_entropy=entropy(ydata[ylabel])
    children_entropy=w_left*entropy(y_left[ylabel]) + w_right*entropy(y_right[ylabel])
    return parent_entropy-children_entropy

In [8]:
class DecisionTree:
    """Binary decision tree that splits each node at a feature's mean value.

    Every node stores the feature (``fkey``) and threshold (``fval``) chosen
    for it, the majority label of the training rows that reached it
    (``target``), and optional ``left`` / ``right`` child nodes.  A node with
    no children is a leaf.

    Labels are summarised with ``mean() >= 0.5``, so the label column is
    presumably binary 0/1 -- confirm against the training data.
    """

    def __init__(self,max_depth=5,depth=0):
        """Create an untrained node.

        Parameters
        ----------
        max_depth : int
            Depth at which nodes stop splitting and become leaves.
        depth : int
            Depth of this node (the root is 0).
        """
        self.left=None
        self.right=None
        self.depth=depth
        self.fkey=None
        self.fval=None
        self.max_depth=max_depth
        self.target=None

    def _majority_label(self,y_train):
        """Return 1 if the mean of the first label column is >= 0.5, else 0."""
        ylabel=y_train.columns[0]
        return 1 if y_train[ylabel].mean()>=0.5 else 0

    def train(self,x_train,y_train):
        """Fit this node, and recursively its children, to the given rows.

        Parameters
        ----------
        x_train : pandas.DataFrame
            Feature rows; every column is a candidate split feature.
        y_train : pandas.DataFrame
            Label rows; only the first column is used.

        The feature with the highest information gain (thresholded at its
        mean) is stored in ``fkey`` / ``fval``.  The node becomes a leaf when
        ``max_depth`` is reached or when the chosen split leaves one side
        empty; otherwise the rows are split and a child is trained per side.
        """
        # The prediction for this node is the same in every branch below, so
        # compute it once.  Doing so also fixes the degenerate-split branch,
        # which previously assigned a misspelled attribute and left ``target``
        # as None whenever the majority label was 0.
        self.target=self._majority_label(y_train)

        features=list(x_train.columns)
        info_gains=[
            information_gain(x_train,y_train,feature,x_train[feature].mean())
            for feature in features
        ]
        self.fkey=features[int(np.argmax(info_gains))]
        self.fval=x_train[self.fkey].mean()
        print('Feature:',self.fkey)

        # Depth limit reached: stay a leaf and skip the split, whose result
        # would be thrown away.
        if(self.depth>=self.max_depth):
            return

        #Split The Data
        x_left,y_left,x_right,y_right=divide_data(x_train,y_train,self.fkey,self.fval)

        # No feature separates the rows any further: stay a leaf.
        if(x_left.shape[0]==0 or x_right.shape[0]==0):
            return

        x_left=x_left.reset_index(drop=True)
        y_left=y_left.reset_index(drop=True)
        x_right=x_right.reset_index(drop=True)
        y_right=y_right.reset_index(drop=True)

        #Recursive
        self.left=DecisionTree(depth=self.depth+1,max_depth=self.max_depth)
        self.left.train(x_left,y_left)
        self.right=DecisionTree(depth=self.depth+1,max_depth=self.max_depth)
        self.right.train(x_right,y_right)
        return

    def predict(self,test):
        """Return the predicted label for a single sample.

        Parameters
        ----------
        test : mapping-like
            One sample for which ``test[fkey]`` yields a value comparable to
            the stored threshold, e.g. one row of the training-format
            DataFrame (a Series).  NOTE(review): the comparison result is used
            directly in an ``if``, so a multi-row DataFrame is not supported.

        Returns
        -------
        int
            The ``target`` of the leaf the sample is routed to.
        """
        # Values strictly above the threshold go right, everything else left.
        if(test[self.fkey] > self.fval):
            branch=self.right
        else:
            branch=self.left
        # A missing subtree means this node is a leaf.
        if(branch is None):
            return self.target
        return branch.predict(test)
    