In [4]:
import pandas as pd

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt


In [5]:
def entropy(sequence):
    items, counts = np.unique(sequence, return_counts=True)
    probs = counts/np.sum(counts)
    return -np.sum(probs * np.log2(probs))

In [6]:
def infogain(X, y, label):
    pivot = X[label].mean() # just an assumption
    
    left = y.loc[X[label] < pivot]
    right = y.loc[X[label] >= pivot]
    
    prob_l, prob_r = len(left)/len(y), len(right)/len(y)
    
    return entropy(y) - prob_l * entropy(left) - prob_r * entropy(right)

In [7]:
data = pd.read_csv("processed.csv")

In [8]:
data

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch
0,0.0,3.0,1.0,22.00,1.0,0.0
1,1.0,1.0,0.0,38.00,1.0,0.0
2,1.0,3.0,0.0,26.00,0.0,0.0
3,1.0,1.0,0.0,35.00,1.0,0.0
4,0.0,3.0,1.0,35.00,0.0,0.0
...,...,...,...,...,...,...
886,0.0,2.0,1.0,27.00,0.0,0.0
887,1.0,1.0,0.0,19.00,0.0,0.0
888,0.0,3.0,0.0,21.75,1.0,2.0
889,1.0,1.0,1.0,26.00,0.0,0.0


In [10]:
X, y = data.drop(["Survived"], axis=1), data.Survived

from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)

In [12]:
for col in X.columns:
    print(col, infogain(X, y, col))

Pclass 0.07579362743608165
Sex 0.2176601066606143
Age 0.001105606725090691
SibSp 0.009584541813400127
Parch 0.015380754493137666


In [21]:
class Node:
    
    def __init__(self, label=None, pivot=None, result=None):
        self.label = label
        self.pivot = pivot
        self.result = result
        
        self.left = None
        self.right = None
        
    def __repr__(self):
        if self.label:
            return "{} : {}".format(self.label, self.pivot)
        else:
            return ["Dead", "Alive"][int(self.result > .5)]

In [98]:
class CustomDecisionTree:
    
    def __init__(self, max_depth=5):
        self.max_depth = max_depth
    
    def fit(self, X, y):
        self.root = self.fit_rec(X, y, 0)
    
    def fit_rec(self, X, y, depth):
        if depth == self.max_depth:
            node = Node(result=y.mean())
            return node
        
        gains = []
        
        for col in X.columns:
            gains.append([infogain(X, y, col), col])
            
        max_gain, selected_column = sorted(gains)[-1]
        
        if max_gain <= 0:
            node = Node(result=y.mean())
            return node
        
        
        pivot = X[selected_column].mean()
        left, right = X[selected_column] < pivot, X[selected_column] >= pivot
        
        X_left, X_right = X.loc[left], X.loc[right]
        y_left, y_right = y.loc[left], y.loc[right]
        
        node = Node(label=selected_column, pivot=pivot)
        node.left = self.fit_rec(X_left, y_left, depth + 1)
        node.right = self.fit_rec(X_right, y_right, depth + 1)
        
        return node
    
    def display(self, node, indent=""):
        if node == None:
            return
        
        print(indent, node)
        self.display(node.left, indent + "\t")
        self.display(node.right, indent + "\t")
        
    def predict_point(self, node, row):
        if node.result != None:
            return node.result
        
        if row[node.label] < node.pivot:
            return self.predict_point(node.left, row)
        else:
            return self.predict_point(node.right, row)
    
    
    def predict(self, X):
        
        result = []
        for index, row in X.iterrows():
            result.append(self.predict_point(self.root, row))
            
        return (np.array(result) > .5).astype(int)
            
        
    def score(self, X, y):
        yp = self.predict(X)
        return (yp == y).mean()

In [99]:
model = CustomDecisionTree()

In [100]:
model.fit(X_train, y_train)

In [101]:
model.root.right

Pclass : 2.4128205128205127

In [102]:
model.display(model.root)

 Sex : 0.6543624161073825
	 Pclass : 2.1941747572815533
		 SibSp : 0.514018691588785
			 Age : 33.09950766705985
				 Alive
				 Age : 43.15084033613444
					 Alive
					 Alive
			 Age : 29.644444444444446
				 SibSp : 1.2916666666666667
					 Alive
					 Alive
				 Alive
		 SibSp : 0.9797979797979798
			 Parch : 0.5555555555555556
				 Age : 22.243589743589745
					 Alive
					 Dead
				 Parch : 2.0
					 Alive
					 Alive
			 Parch : 1.1111111111111112
				 Age : 23.225
					 Alive
					 Dead
				 SibSp : 3.6666666666666665
					 Dead
					 Dead
	 Pclass : 2.4128205128205127
		 Parch : 0.20915032679738563
			 Pclass : 1.5
				 Age : 41.06155321782178
					 Dead
					 Dead
				 SibSp : 0.234375
					 Dead
					 Dead
			 Age : 29.06
				 Age : 11.192307692307692
					 Alive
					 Dead
				 Pclass : 1.25
					 Dead
					 Dead
		 Parch : 0.22784810126582278
			 Age : 28.509728039643676
				 SibSp : 0.17266187050359713
					 Dead
					 Dead
				 Age : 38.475806451612904
					 Dead
					 D

In [103]:
model.predict(X_test.iloc[:10])

array([0, 0, 0, 1, 1, 1, 1, 0, 1, 1])

In [104]:
y_test[:10]

709    1.0
439    0.0
840    0.0
720    1.0
39     1.0
290    1.0
300    1.0
333    0.0
208    1.0
136    1.0
Name: Survived, dtype: float64

In [105]:
model.score(X_test, y_test)

0.823728813559322