#### Titanic survivor classifier using decision tree

In [20]:
### Extracting the data
import pandas as pds

# Read the raw passenger table from disk, then preview its first ten rows.
unstrData = pds.read_csv(filepath_or_buffer='Train.csv')
unstrData.head(10)

Unnamed: 0,pclass,survived,name,sex,age,sibsp,parch,ticket,fare,cabin,embarked,boat,body,home.dest
0,3.0,0.0,"O'Donoghue, Ms. Bridget",female,,0.0,0.0,364856,7.75,,Q,,,
1,2.0,0.0,"Morley, Mr. Henry Samuel (""Mr Henry Marshall"")",male,39.0,0.0,0.0,250655,26.0,,S,,,
2,2.0,1.0,"Smith, Miss. Marion Elsie",female,40.0,0.0,0.0,31418,13.0,,S,9,,
3,3.0,1.0,"Goldsmith, Mrs. Frank John (Emily Alice Brown)",female,31.0,1.0,1.0,363291,20.525,,S,C D,,"Strood, Kent, England Detroit, MI"
4,3.0,1.0,"McCoy, Miss. Agnes",female,,2.0,0.0,367226,23.25,,Q,16,,
5,2.0,0.0,"Gaskell, Mr. Alfred",male,16.0,0.0,0.0,239865,26.0,,S,,,"Liverpool / Montreal, PQ"
6,2.0,0.0,"Phillips, Mr. Escott Robert",male,43.0,0.0,1.0,S.O./P.P. 2,21.0,,S,,,"Ilfracombe, Devon"
7,1.0,1.0,"Leader, Dr. Alice (Farnham)",female,49.0,0.0,0.0,17465,25.9292,D17,S,8,,"New York, NY"
8,1.0,0.0,"Brandeis, Mr. Emil",male,48.0,0.0,0.0,PC 17591,50.4958,B10,C,,208.0,"Omaha, NE"
9,2.0,0.0,"Wheeler, Mr. Edwin ""Frederick""",male,,0.0,0.0,SC/PARIS 2159,12.875,,S,,,


In [21]:
### Feature selection
# Columns that are dropped before modelling; every other column is kept.
columns_to_remove = [
    "name",
    "ticket",
    "embarked",
    "boat",
    "body",
    "home.dest",
    "cabin",
    "fare",
]
cleanedData = unstrData.drop(columns=columns_to_remove)
# Summary statistics of the remaining numeric columns.
cleanedData.describe()

Unnamed: 0,pclass,survived,age,sibsp,parch
count,1009.0,1009.0,812.0,1009.0,1009.0
mean,2.295342,0.378593,29.838978,0.521308,0.39445
std,0.835704,0.485277,14.540244,1.077269,0.888087
min,1.0,0.0,0.3333,0.0,0.0
25%,2.0,0.0,21.0,0.0,0.0
50%,3.0,0.0,28.0,0.0,0.0
75%,3.0,1.0,39.0,1.0,0.0
max,3.0,1.0,80.0,8.0,9.0


In [25]:
from sklearn.preprocessing import LabelEncoder
# Replace the `sex` strings with integer codes so the tree models can use them
# (in the preview below: female -> 0, male -> 1).
le = LabelEncoder()
cleanedData["sex"] = le.fit_transform(cleanedData["sex"])

# Impute missing ages with the mean age.
# The fill is restricted to the `age` column on purpose: a bare
# `fillna(<age mean>)` would write the age mean into a NaN of *any* column
# (e.g. `survived` or `pclass`), silently corrupting it.
cleanedData = cleanedData.fillna({"age": cleanedData["age"].mean()})
cleanedData.head(n=10)

Unnamed: 0,pclass,survived,sex,age,sibsp,parch
0,3.0,0.0,0,29.838978,0.0,0.0
1,2.0,0.0,1,39.0,0.0,0.0
2,2.0,1.0,0,40.0,0.0,0.0
3,3.0,1.0,0,31.0,1.0,1.0
4,3.0,1.0,0,29.838978,2.0,0.0
5,2.0,0.0,1,16.0,0.0,0.0
6,2.0,0.0,1,43.0,0.0,1.0
7,1.0,1.0,0,49.0,0.0,0.0
8,1.0,0.0,1,48.0,0.0,0.0
9,2.0,0.0,1,29.838978,0.0,0.0


#### Decision tree using scikit-learn

In [48]:
# Names of the model inputs and of the label column.
input_cols = ["pclass", "sex", "age", "sibsp", "parch"]
output_cols = ["survived"]

# X: feature frame restricted to the input columns; Y: the label Series.
X = cleanedData[input_cols]
Y = cleanedData["survived"]

from sklearn.tree import DecisionTreeClassifier
# Fit a depth-limited scikit-learn tree with a fixed seed.
de = DecisionTreeClassifier(random_state=3, max_depth=5)
de.fit(X, Y)

# NOTE: the score is computed on the very rows the tree was fitted on,
# so it is a training accuracy, not a held-out estimate.
de_correct = (de.predict(X) == Y).sum()
print("The accuracy of the model using DecisionTreeClassifier {}".format(de_correct / Y.shape[0]))

The accuracy of the model using DecisionTreeClassifier 0.8166501486620417


In [50]:
from sklearn.ensemble import RandomForestClassifier
# Fit a random forest of 20 depth-limited trees.
# `random_state` is fixed (same seed as the decision-tree cell) so that the
# printed accuracy is reproducible across kernel restarts; without it the
# forest is re-randomised on every run.
# NOTE(review): the name `re` shadows the standard-library `re` module for the
# rest of the notebook; it is kept here only so later references still work.
re = RandomForestClassifier(criterion='gini', n_estimators=20, max_depth=10, random_state=3)
re.fit(X, Y)

# NOTE: accuracy is measured on the rows used for fitting (training accuracy).
print("The accuracy of the model using RandomForestClassifier {}".format((re.predict(X) == Y).sum() / Y.shape[0]))

The accuracy of the model using RandomForestClassifier 0.8800792864222002


#### Custom decision tree class

In [61]:
import numpy as np

def entropy(col):
    """Return the Shannon entropy (in bits) of the values in ``col``.

    ``col`` must expose ``.shape`` (e.g. a NumPy array or a pandas Series).
    The result is ``-sum(p * log2(p))`` over the relative frequency ``p`` of
    each distinct value; an empty ``col`` yields ``0.0``.
    """
    _, frequencies = np.unique(col, return_counts=True)
    total = float(col.shape[0])

    ent = 0.0
    for freq in frequencies:
        share = freq / total
        ent -= share * np.log2(share)

    return ent

def divide_data(x_data,fkey,fval):
    """Split a DataFrame in two around a threshold on one column.

    Parameters
    ----------
    x_data : pandas.DataFrame
        Rows to partition. Any index is accepted.
    fkey : str
        Name of the column to compare.
    fval : scalar
        Threshold value.

    Returns
    -------
    (x_left, x_right) : tuple of pandas.DataFrame
        ``x_right`` holds the rows where ``x_data[fkey] > fval``; ``x_left``
        holds every other row (including rows whose value is NaN, because the
        comparison is False for them). Both keep the original columns, dtypes
        and index labels, and either may be empty.
    """
    # One vectorised comparison instead of appending row by row:
    # ``DataFrame.append`` no longer exists in pandas >= 2.0, was quadratic,
    # and positional ``.loc[ix]`` lookups only worked on a 0..n-1 index.
    goes_right = x_data[fkey] > fval
    x_left = x_data[~goes_right]
    x_right = x_data[goes_right]

    return x_left,x_right

def information_gain(x_data,fkey,fval,target="survived"):
    """Information gain of splitting ``x_data`` on ``fkey`` at ``fval``.

    The frame is partitioned with ``divide_data`` and the gain is the entropy
    of the label column before the split minus the size-weighted entropy of
    the two sides.

    Parameters
    ----------
    x_data : pandas.DataFrame
        Rows to evaluate; must contain ``fkey`` and the ``target`` column.
    fkey : str
        Column to split on.
    fval : scalar
        Threshold passed to ``divide_data``.
    target : str, optional
        Name of the label column. Defaults to ``"survived"``.

    Returns
    -------
    float or int
        The information gain, or the sentinel ``-1000000`` when the split
        leaves one side empty (this includes an empty ``x_data``).
    """
    left,right = divide_data(x_data,fkey,fval)

    # Degenerate split: everything landed on one side (or there were no rows).
    # Checked before any division so an empty frame cannot divide by zero.
    if left.shape[0] == 0 or right.shape[0] == 0:
        return -1000000 #Min Information Gain

    # Fraction of the samples that went to each side.
    total = float(x_data.shape[0])
    l = left.shape[0]/total
    r = right.shape[0]/total

    parent_entropy = entropy(x_data[target])
    children_entropy = l*entropy(left[target]) + r*entropy(right[target])
    return parent_entropy - children_entropy

class DecisionTree:
    """Recursive binary decision tree that splits each node at a feature mean.

    At every node the candidate features are scored with the module-level
    ``information_gain`` function, using the feature's mean as threshold; the
    best-scoring feature becomes the node's split. Rows whose value is greater
    than the threshold go to the right child, all others to the left child.
    Every node stores the majority label of its training rows, read from the
    ``survived`` column: ``"Survive"`` when the mean is >= 0.5, else ``"Dead"``.
    """

    # Feature columns considered for a split when none are passed in.
    DEFAULT_FEATURES = ('pclass','sex','age','sibsp','parch')

    #Constructor
    def __init__(self,depth=0,max_depth=5,features=None):
        """Create an untrained node.

        depth     -- depth of this node (the root is 0).
        max_depth -- nodes at this depth or deeper are never split.
        features  -- optional iterable of column names to consider for
                     splits; defaults to ``DEFAULT_FEATURES``.
        """
        self.left = None        # child for rows with value <= fval
        self.right = None       # child for rows with value > fval
        self.fkey = None        # name of the feature this node splits on
        self.fval = None        # threshold (mean of fkey over the node's rows)
        self.max_depth = max_depth
        self.depth = depth
        self.target = None      # majority label; None until train() has run
        self.features = list(self.DEFAULT_FEATURES if features is None else features)

    @staticmethod
    def _majority_label(frame):
        """Return "Survive" if at least half of ``frame.survived`` is 1, else "Dead"."""
        if frame.survived.mean() >= 0.5:
            return "Survive"
        return "Dead"

    def train(self,X_train):
        """Fit this node, and recursively its children, on ``X_train``.

        ``X_train`` is a DataFrame holding the feature columns and a
        ``survived`` column. Relies on the module-level ``information_gain``
        and ``divide_data`` functions. Prints the chosen feature of every
        node visited.
        """
        # Forget children from an earlier fit so a retrain cannot leave a
        # stale subtree behind when this node turns out to be a leaf.
        self.left = None
        self.right = None

        # Every node, leaf or not, carries the majority label of its rows.
        self.target = self._majority_label(X_train)

        # Score each candidate feature using its mean as the threshold.
        info_gains = [
            information_gain(X_train,name,X_train[name].mean())
            for name in self.features
        ]

        self.fkey = self.features[np.argmax(info_gains)]
        self.fval = X_train[self.fkey].mean()
        print("Making Tree Features is",self.fkey)

        #Split Data
        data_left,data_right = divide_data(X_train,self.fkey,self.fval)
        data_left = data_left.reset_index(drop=True)
        data_right = data_right.reset_index(drop=True)

        #Truly a leaf node: the best split leaves one side empty
        if data_left.shape[0] == 0 or data_right.shape[0] == 0:
            return
        #Stop early when depth >= max depth
        if self.depth >= self.max_depth:
            return

        #Recursive Case
        self.left = DecisionTree(depth=self.depth+1,max_depth=self.max_depth,features=self.features)
        self.left.train(data_left)

        self.right = DecisionTree(depth=self.depth+1,max_depth=self.max_depth,features=self.features)
        self.right.train(data_right)
        return

    def predict(self,test):
        """Return "Survive" or "Dead" for one sample.

        ``test`` is any mapping-like row (for example a pandas Series) that
        can be indexed by feature name. Raises ``ValueError`` if the tree has
        not been trained.
        """
        if self.target is None:
            raise ValueError("DecisionTree.predict() called before train()")

        # A leaf answers with its own label; it does not need a split feature.
        if self.left is None or self.right is None:
            return self.target

        if test[self.fkey]>self.fval:
            #go to right
            return self.right.predict(test)
        return self.left.predict(test)

In [62]:
# Positional 70/30 split in file order (no shuffling): the first 70% of rows
# are used for training, the remainder for testing.
split = int(0.7 * cleanedData.shape[0])
train_data = cleanedData.iloc[:split]
# Renumber the held-out rows from 0 so they can be addressed by position later.
test_data = cleanedData.iloc[split:].reset_index(drop=True)

In [63]:
# Grow the custom tree on the training portion, using the constructor's
# default settings spelled out explicitly (root depth 0, at most 5 levels).
dt = DecisionTree(depth=0, max_depth=5)
dt.train(train_data)

Making Tree Features is sex
Making Tree Features is pclass
Making Tree Features is parch
Making Tree Features is pclass
Making Tree Features is sibsp
Making Tree Features is age
Making Tree Features is age
Making Tree Features is age
Making Tree Features is age
Making Tree Features is age
Making Tree Features is parch
Making Tree Features is pclass
Making Tree Features is age
Making Tree Features is age
Making Tree Features is pclass
Making Tree Features is age
Making Tree Features is age
Making Tree Features is sibsp
Making Tree Features is parch
Making Tree Features is age
Making Tree Features is age
Making Tree Features is age
Making Tree Features is parch
Making Tree Features is age
Making Tree Features is age
Making Tree Features is parch
Making Tree Features is parch
Making Tree Features is age
Making Tree Features is sibsp
Making Tree Features is parch
Making Tree Features is age
Making Tree Features is age
Making Tree Features is parch
Making Tree Features is pclass
Making Tree

In [64]:
# Classify every held-out row with the custom tree and show the labels.
y_pred = [dt.predict(test_data.loc[row]) for row in range(test_data.shape[0])]
print(y_pred)

['Dead', 'Survive', 'Dead', 'Dead', 'Dead', 'Dead', 'Survive', 'Survive', 'Dead', 'Dead', 'Dead', 'Survive', 'Survive', 'Dead', 'Dead', 'Dead', 'Survive', 'Dead', 'Dead', 'Dead', 'Dead', 'Dead', 'Survive', 'Dead', 'Dead', 'Dead', 'Dead', 'Dead', 'Survive', 'Survive', 'Dead', 'Survive', 'Dead', 'Dead', 'Dead', 'Dead', 'Survive', 'Survive', 'Dead', 'Survive', 'Survive', 'Survive', 'Survive', 'Survive', 'Dead', 'Dead', 'Dead', 'Dead', 'Survive', 'Dead', 'Dead', 'Dead', 'Dead', 'Dead', 'Survive', 'Dead', 'Survive', 'Survive', 'Dead', 'Dead', 'Dead', 'Dead', 'Dead', 'Survive', 'Survive', 'Dead', 'Dead', 'Dead', 'Dead', 'Dead', 'Survive', 'Dead', 'Dead', 'Dead', 'Survive', 'Survive', 'Dead', 'Dead', 'Dead', 'Survive', 'Survive', 'Dead', 'Dead', 'Dead', 'Dead', 'Dead', 'Dead', 'Dead', 'Dead', 'Survive', 'Survive', 'Dead', 'Survive', 'Dead', 'Dead', 'Dead', 'Dead', 'Dead', 'Survive', 'Dead', 'Survive', 'Survive', 'Dead', 'Dead', 'Dead', 'Survive', 'Dead', 'Survive', 'Dead', 'Survive', 'Dead', 