In [1]:
import numpy as np
import pandas as pd

from sklearn.preprocessing import LabelEncoder
from sklearn.tree import DecisionTreeClassifier

In [2]:
df_train = pd.read_csv('train.csv')
df_test = pd.read_csv('test.csv')

df_train.head(n=10)

Unnamed: 0,pclass,survived,name,sex,age,sibsp,parch,ticket,fare,cabin,embarked,boat,body,home.dest
0,3.0,0.0,"O'Donoghue, Ms. Bridget",female,,0.0,0.0,364856,7.75,,Q,,,
1,2.0,0.0,"Morley, Mr. Henry Samuel (""Mr Henry Marshall"")",male,39.0,0.0,0.0,250655,26.0,,S,,,
2,2.0,1.0,"Smith, Miss. Marion Elsie",female,40.0,0.0,0.0,31418,13.0,,S,9,,
3,3.0,1.0,"Goldsmith, Mrs. Frank John (Emily Alice Brown)",female,31.0,1.0,1.0,363291,20.525,,S,C D,,"Strood, Kent, England Detroit, MI"
4,3.0,1.0,"McCoy, Miss. Agnes",female,,2.0,0.0,367226,23.25,,Q,16,,
5,2.0,0.0,"Gaskell, Mr. Alfred",male,16.0,0.0,0.0,239865,26.0,,S,,,"Liverpool / Montreal, PQ"
6,2.0,0.0,"Phillips, Mr. Escott Robert",male,43.0,0.0,1.0,S.O./P.P. 2,21.0,,S,,,"Ilfracombe, Devon"
7,1.0,1.0,"Leader, Dr. Alice (Farnham)",female,49.0,0.0,0.0,17465,25.9292,D17,S,8,,"New York, NY"
8,1.0,0.0,"Brandeis, Mr. Emil",male,48.0,0.0,0.0,PC 17591,50.4958,B10,C,,208.0,"Omaha, NE"
9,2.0,0.0,"Wheeler, Mr. Edwin ""Frederick""",male,,0.0,0.0,SC/PARIS 2159,12.875,,S,,,


In [3]:
df_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1009 entries, 0 to 1008
Data columns (total 14 columns):
pclass       1009 non-null float64
survived     1009 non-null float64
name         1009 non-null object
sex          1009 non-null object
age          812 non-null float64
sibsp        1009 non-null float64
parch        1009 non-null float64
ticket       1009 non-null object
fare         1008 non-null float64
cabin        229 non-null object
embarked     1008 non-null object
boat         374 non-null object
body         98 non-null float64
home.dest    582 non-null object
dtypes: float64(7), object(7)
memory usage: 110.5+ KB


In [4]:
# Clean Train and test data

features_to_drop = ['name', 'ticket', 'cabin', 'embarked', 'boat', 'body', 'home.dest']
df_train = df_train.drop(columns=features_to_drop)
df_test = df_test.drop(columns=features_to_drop)

ua = df_train['age'].mean()
uf = df_train['fare'].mean()

values = {'age': ua, 'fare': uf}
df_train.fillna(values, inplace=True)
df_test.fillna(values, inplace=True)

le = LabelEncoder()
df_train['sex'] = le.fit_transform(df_train['sex'])
df_test['sex'] = le.transform(df_test['sex'])

In [7]:
df_train.head()

Unnamed: 0,pclass,survived,sex,age,sibsp,parch,fare
0,3.0,0.0,0,29.838978,0.0,0.0,7.75
1,2.0,0.0,1,39.0,0.0,0.0,26.0
2,2.0,1.0,0,40.0,0.0,0.0,13.0
3,3.0,1.0,0,31.0,1.0,1.0,20.525
4,3.0,1.0,0,29.838978,2.0,0.0,23.25


In [6]:
df_test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 300 entries, 0 to 299
Data columns (total 6 columns):
pclass    300 non-null float64
sex       300 non-null int32
age       300 non-null float64
sibsp     300 non-null float64
parch     300 non-null float64
fare      300 non-null float64
dtypes: float64(5), int32(1)
memory usage: 13.0 KB


In [8]:
inp_cols = ['pclass', 'sex', 'age', 'sibsp', 'parch', 'fare']
out_col = ['survived']

In [9]:
# Entropy of a system
def entropy(S):
    
    arr = np.unique(S, return_counts=True)[1]
    N = float(S.shape[0])
    
    ent = 0.0
    for ix in arr:
        ent += (ix/N)*np.log2(ix/N)
    return -ent

# Information Gain
def info_gain(S, A, threshold):
    
    ig = entropy(S)
    m = A.shape[0]
    
    S_ = S[(A <= threshold)]
    ig -= (S_.shape[0]/m) * entropy(S_)
    S_ = S[(A > threshold)]
    ig -= (S_.shape[0]/m) * entropy(S_)
    
    return ig

In [10]:
for fx in inp_cols:
    print(fx)
    print(info_gain(df_train[out_col], df_train[fx], df_train[fx].mean()))

pclass
0.055456910002982474
sex
0.19274737190850932
age
0.001955929827451075
sibsp
0.006492394392888956
parch
0.019756080122948216
fare
0.042427934014281715


In [11]:
class DecisionTree:
    
    def __init__(self, depth=0, max_depth=5):
        self.left = None
        self.right = None
        self.fkey = None
        self.fval = None
        self.target = None
        self.max_depth = max_depth
        self.depth = depth
    
    def train(self, df_train):
        
        info_gains = []
        for fx in inp_cols:
            ig = info_gain(df_train[out_col], df_train[fx], df_train[fx].mean())
            info_gains.append(ig)
        
        self.fkey = inp_cols[np.argmax(info_gains)]
        self.fval = df_train[self.fkey].mean()
        
        # Split dataset
        df_left = df_train[(df_train[self.fkey] <= self.fval)]
        df_left = df_left.reset_index(drop=True)
        
        df_right = df_train[(df_train[self.fkey] > self.fval)]
        df_right = df_right.reset_index(drop=True)
        
        # Set target at every node
        if df_train[out_col].mean()[0] >= 0.5:
            self.target = 1
        else:
            self.target = 0
        
        # Base cases
        # 1. Leaf Node
        if df_left.shape[0] == 0 or df_right.shape[0] == 0:
            return
        
        # 2. Stop early when depth >= max_depth
        if self.depth >= self.max_depth:
            return
        
        # Recursive case
        self.left = DecisionTree(depth=self.depth+1, max_depth=self.max_depth)
        self.left.train(df_left)
        self.right = DecisionTree(depth=self.depth+1, max_depth=self.max_depth)
        self.right.train(df_right)
        return
    
    
    def predict(self, test):
        if int(test[self.fkey]) > self.fval:
            if self.right is None:
                return self.target
            return self.right.predict(test)
        else:
            if self.left is None:
                return self.target
            return self.left.predict(test)

In [12]:
# Create decision tree class object and train
dt = DecisionTree()
dt.train(df_train)

In [13]:
print(dt.fkey, dt.fval)
print(dt.left.fkey)
print(dt.right.fkey)

sex 0.6422200198216056
pclass
fare


In [14]:
y_pred = []
test = df_train.drop(out_col, axis=1)
for ix in range(test.shape[0]):
    y_pred.append(dt.predict(test.loc[ix]))

In [15]:
acc = (np.array(y_pred).reshape((-1, 1)) == df_train[out_col]).mean()
print(acc)

survived    0.801784
dtype: float64


## Decision Tree Using Sklearn

In [16]:
dt_clf = DecisionTreeClassifier(criterion='entropy', max_depth=5, min_samples_split=2)
dt_clf.fit(df_train[inp_cols], df_train[out_col])

DecisionTreeClassifier(ccp_alpha=0.0, class_weight=None, criterion='entropy',
                       max_depth=5, max_features=None, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, presort='deprecated',
                       random_state=None, splitter='best')

In [17]:
# Testing and scoring

print(dt_clf.score(df_train[inp_cols], df_train[out_col]))
y_pred = dt_clf.predict(df_test[inp_cols])

df_pred=pd.DataFrame(dt_clf.predict(df_test[inp_cols]),columns=['survived'])
df_pred.to_csv("y_pred.csv",index_label='Id')

0.8275520317145689
