In [34]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder
import numpy as np
from sklearn.model_selection import train_test_split

In [35]:
# Read the Titanic training CSV; the path is relative to the kernel's working directory.
TRAIN_CSV = "../datasets/titanic/train.csv"
df = pd.read_csv(TRAIN_CSV)

In [36]:
# List the column names of the raw frame before choosing features.
df.columns

Index(['PassengerId', 'Survived', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp',
       'Parch', 'Ticket', 'Fare', 'Cabin', 'Embarked'],
      dtype='object')

In [37]:
# Keep only the target (Survived) and the features used below; the listed columns are discarded.
unused_columns = ['PassengerId', 'Name', 'Ticket', 'Fare', 'Cabin', 'Embarked']
data = df.drop(labels=unused_columns, axis=1)

In [38]:
# Inspect dtypes and non-null counts; in the recorded output only Age has missing values (714 of 891).
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 6 columns):
Survived    891 non-null int64
Pclass      891 non-null int64
Sex         891 non-null object
Age         714 non-null float64
SibSp       891 non-null int64
Parch       891 non-null int64
dtypes: float64(1), int64(4), object(1)
memory usage: 41.8+ KB


In [39]:
# Replace the string-valued Sex column with integer codes.
# NOTE(review): presumably female -> 0 and male -> 1 (sorted class order); confirm via le.classes_.
le = LabelEncoder()
le.fit(data["Sex"])
data["Sex"] = le.transform(data["Sex"])

In [40]:
# Column means per (Sex, Pclass) group, computed only over rows without missing values.
complete_rows = data.dropna()
out = complete_rows.groupby(["Sex", "Pclass"]).mean()

In [41]:
# Turn the (Sex, Pclass) group index back into ordinary columns so rows can be filtered by value.
sheet = out.reset_index()

In [42]:
# Sanity check: mean age of the Sex == 0, Pclass == 1 group.
# The column is selected by name rather than by position, so the lookup
# does not silently change meaning if the column order of `sheet` changes.
sheet.loc[(sheet["Sex"]==0) & (sheet["Pclass"]==1), "Age"].iloc[0]

34.61176470588235

In [43]:
def set_age(person, age_table=None):
    """Fill a missing ``Age`` with the mean age of the row's (Sex, Pclass) group.

    Parameters
    ----------
    person : pandas.Series
        One passenger row carrying "Sex", "Pclass" and "Age" entries.
    age_table : pandas.DataFrame, optional
        Lookup table with "Sex", "Pclass" and "Age" columns, one row per
        group. When omitted, the notebook-level ``sheet`` frame is used.

    Returns
    -------
    pandas.Series
        ``person`` itself when its age is present; otherwise a copy with
        "Age" filled in. The row passed in is never modified.

    Raises
    ------
    IndexError
        If the table has no row for the passenger's (Sex, Pclass) pair.
    """
    if not pd.isna(person["Age"]):
        return person

    table = sheet if age_table is None else age_table
    same_group = (table["Sex"] == person["Sex"]) & (table["Pclass"] == person["Pclass"])

    # Work on a copy: DataFrame.apply does not support functions that mutate
    # the object they are handed.
    filled = person.copy()
    # Pick the column by name, not by position, so the result does not depend
    # on the column order of the lookup table.
    filled["Age"] = table.loc[same_group, "Age"].iloc[0]
    return filled

In [56]:
# Apply set_age to every row (axis=1) to impute missing ages.
# The info() summary recorded after this cell shows every column coming back as float64.
result = data.apply(set_age, axis=1)

In [57]:
# Confirm that Age has no missing values left after imputation.
result.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 6 columns):
Survived    891 non-null float64
Pclass      891 non-null float64
Sex         891 non-null float64
Age         891 non-null float64
SibSp       891 non-null float64
Parch       891 non-null float64
dtypes: float64(6)
memory usage: 41.8 KB


In [58]:
# Bucket Age into decades: divide by 10 and truncate to an integer.
# NOTE: this overwrites result["Age"], so running the cell twice divides by 10 again.
age_in_decades = result["Age"] / 10
result["Age"] = age_in_decades.astype(int)

In [59]:
# Feature matrix: every column of `result` except the target.
X = result.drop(labels="Survived", axis=1)

In [60]:
# Target vector: the Survived column.
y = result["Survived"]

In [61]:
# Hold out 33% of the rows for testing; the fixed random_state makes the split repeatable.
# (A stray "..." REPL continuation prompt was removed from the argument line: it only
# ran because IPython strips pasted prompts, and is a syntax error in plain Python.)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.33, random_state=42)

In [70]:
# Entropy of the full target column.
# NOTE(review): entropy is defined in a later cell of this notebook, so on a fresh
# kernel this cell fails unless that defining cell has been run first.
entropy(y)

0.9607079018756469

In [82]:
# Print the information gain of a mean split on each feature column.
# NOTE(review): info_gain is defined in a later cell; run that cell first on a fresh kernel.
for col in X.columns:
    gain = info_gain(X, y, col)
    print(col, gain)

Pclass 0.07579362743608165
Sex 0.2176601066606142
Age 0.0011056067250906354
SibSp 0.009584541813400071
Parch 0.015380754493137583


In [83]:
# Edge-case check: entropy of an empty label list.
entropy([])

-0.0

In [93]:
class Node:
    """One node of a binary decision tree.

    A node built with ``label`` and ``pivot`` represents a split; a node
    built with only ``result`` represents a leaf. Children are attached
    afterwards through ``left`` and ``right``.
    """

    def __init__(self, label=None, pivot=None, result=None):
        """Store the split description (``label``, ``pivot``) or leaf ``result``."""
        self.label = label      # column name the node splits on; None for a leaf
        self.pivot = pivot      # threshold paired with `label`
        self.result = result    # value stored at a leaf

        self.left = None
        self.right = None

    def __repr__(self):
        """Return "label : pivot" for a split, or the leaf result thresholded at 0.5."""
        # Identity comparison: `== None` can be overridden by the label's type.
        if self.label is not None:
            return "{} : {}".format(self.label, self.pivot)
        # A node constructed with no arguments has nothing to compare against
        # 0.5; repr must not raise, so describe it instead.
        if self.result is None:
            return "<empty node>"
        return str(self.result > 0.5)
    

In [94]:
def entropy(y):
    """Shannon entropy, in bits, of the value distribution in ``y``.

    Parameters
    ----------
    y : array-like
        Labels; anything ``numpy.asarray`` accepts.

    Returns
    -------
    float
        ``-sum(p * log2(p))`` over the distinct values of ``y``. Empty and
        single-valued input both give exactly ``0.0`` (never ``-0.0``).
    """
    labels = np.asarray(y)
    if labels.size == 0:
        return 0.0

    _, counts = np.unique(labels, return_counts=True)
    probabs = counts / labels.size
    # Negating a zero sum yields -0.0; adding 0.0 normalises it to +0.0.
    return float(-np.sum(probabs * np.log2(probabs))) + 0.0

def info_gain(X, y, label):
    """Information gain of splitting ``y`` at the mean of column ``X[label]``.

    Rows whose value is below the column mean form the left side; all
    remaining rows (including rows where the comparison is False because
    the value is NaN) form the right side.

    Parameters
    ----------
    X : pandas.DataFrame
        Feature frame containing column ``label``.
    y : pandas.Series
        Target values, indexed like ``X``.
    label : hashable
        Name of the column to split on.

    Returns
    -------
    float
        ``entropy(y)`` minus the size-weighted entropy of the two sides.
        ``0.0`` when ``y`` is empty or when the split leaves one side empty.
    """
    total = len(y)
    if total == 0:
        # Nothing to split; also avoids dividing by zero below.
        return 0.0

    pivot = X[label].mean()

    left_cut = X[label] < pivot
    # Complement rather than `>= pivot`, so the two sides always partition
    # the rows and `1 - p_left` is the true weight of the right side.
    right_cut = ~left_cut

    y_left, y_right = y.loc[left_cut], y.loc[right_cut]
    if len(y_left) == 0 or len(y_right) == 0:
        # A one-sided split separates nothing (e.g. constant or all-NaN column).
        return 0.0

    p_left = len(y_left) / total
    p_right = 1 - p_left

    return entropy(y) - (p_left * entropy(y_left) + p_right * entropy(y_right))

class CustomDecisionTree:
    """Binary decision tree that splits each node at a column's mean value.

    At every node the column with the highest ``info_gain`` is selected;
    rows below that column's mean go to the left child and the remaining
    rows to the right child. Leaves store the mean of the target values
    that reached them.
    """

    def __init__(self, max_depth=5):
        """Create an unfitted tree; growth stops when the depth budget reaches 0."""
        self.root = None
        self.max_depth = max_depth

    def fit(self, X, y):
        """Build the tree from feature frame ``X`` and target series ``y``.

        Raises
        ------
        ValueError
            If ``y`` is empty.
        """
        if len(y) == 0:
            raise ValueError("cannot fit a decision tree on empty data")

        self.root = self.rec_fit(X, y, self.max_depth)

    def rec_fit(self, X, y, max_depth):
        """Recursively build and return the subtree for the rows in ``X`` / ``y``.

        ``max_depth`` is the remaining depth budget. A leaf holding
        ``y.mean()`` is returned when the budget is 0, when ``X`` has no
        columns, when no column has positive gain, or when the chosen split
        would leave one side empty.
        """
        if max_depth == 0 or len(X.columns) == 0:
            return Node(result=y.mean())

        # Highest gain wins; ties fall back to comparing column names, which
        # is what taking the first element of a descending sort also did.
        selected_gain, selected_col = max(
            (info_gain(X, y, col), col) for col in X.columns
        )

        if selected_gain <= 0:
            return Node(result=y.mean())

        pivot = X[selected_col].mean()

        left_cut = X[selected_col] < pivot
        # Complement of the left side, so every row lands in exactly one child.
        right_cut = ~left_cut

        # Never recurse into an empty child.
        if not left_cut.any() or left_cut.all():
            return Node(result=y.mean())

        node = Node(label=selected_col, pivot=pivot)
        node.left = self.rec_fit(X.loc[left_cut], y.loc[left_cut], max_depth - 1)
        node.right = self.rec_fit(X.loc[right_cut], y.loc[right_cut], max_depth - 1)

        return node

    def display(self):
        """Print the tree in pre-order, one node per line, one tab per level."""
        self.rec_display(self.root, "")

    def rec_display(self, node, indent):
        """Print ``node`` prefixed by ``indent``, then its children one tab deeper."""
        if node is None:
            return

        print(indent, node)
        self.rec_display(node.left, indent + "\t")
        self.rec_display(node.right, indent + "\t")
        
        
        
        

In [101]:
# Build an unfitted tree with max_depth=3.
model = CustomDecisionTree(max_depth=3)

In [102]:
# Grow the tree on the training split only.
model.fit(X_train, y_train)

In [103]:
# Print the fitted tree: "column : pivot" for splits, True/False for leaves, indented by depth.
model.display()

 Sex : 0.6543624161073825
	 Pclass : 2.1941747572815533
		 SibSp : 0.514018691588785
			 True
			 True
		 SibSp : 0.9797979797979798
			 True
			 False
	 Pclass : 2.4128205128205127
		 Parch : 0.20915032679738563
			 False
			 False
		 Parch : 0.22784810126582278
			 False
			 False
