In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [2]:
data = pd.read_csv("titanic.csv")

In [3]:
data.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [4]:
data.shape

(891, 12)

In [5]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
PassengerId    891 non-null int64
Survived       891 non-null int64
Pclass         891 non-null int64
Name           891 non-null object
Sex            891 non-null object
Age            714 non-null float64
SibSp          891 non-null int64
Parch          891 non-null int64
Ticket         891 non-null object
Fare           891 non-null float64
Cabin          204 non-null object
Embarked       889 non-null object
dtypes: float64(2), int64(5), object(5)
memory usage: 83.7+ KB


In [6]:
data.columns

Index(['PassengerId', 'Survived', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp',
       'Parch', 'Ticket', 'Fare', 'Cabin', 'Embarked'],
      dtype='object')

In [7]:
columns_to_drop = ["PassengerId", "Name", "Ticket", "Cabin", "Embarked"]

In [8]:
data_clean = data.drop(columns=columns_to_drop)

In [9]:
data_clean

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare
0,0,3,male,22.0,1,0,7.2500
1,1,1,female,38.0,1,0,71.2833
2,1,3,female,26.0,0,0,7.9250
3,1,1,female,35.0,1,0,53.1000
4,0,3,male,35.0,0,0,8.0500
...,...,...,...,...,...,...,...
886,0,2,male,27.0,0,0,13.0000
887,1,1,female,19.0,0,0,30.0000
888,0,3,female,,1,2,23.4500
889,1,1,male,26.0,0,0,30.0000


In [10]:
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()

In [11]:
data_clean['Sex'] =  le.fit_transform(data_clean['Sex'])

In [12]:
data_clean

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare
0,0,3,1,22.0,1,0,7.2500
1,1,1,0,38.0,1,0,71.2833
2,1,3,0,26.0,0,0,7.9250
3,1,1,0,35.0,1,0,53.1000
4,0,3,1,35.0,0,0,8.0500
...,...,...,...,...,...,...,...
886,0,2,1,27.0,0,0,13.0000
887,1,1,0,19.0,0,0,30.0000
888,0,3,0,,1,2,23.4500
889,1,1,1,26.0,0,0,30.0000


In [13]:
data_clean.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 7 columns):
Survived    891 non-null int64
Pclass      891 non-null int64
Sex         891 non-null int64
Age         714 non-null float64
SibSp       891 non-null int64
Parch       891 non-null int64
Fare        891 non-null float64
dtypes: float64(2), int64(5)
memory usage: 48.9 KB


In [14]:
data_clean['Age'].median()

28.0

In [15]:
data_clean['Age'] = data_clean['Age'].fillna(value= data_clean['Age'].median())

In [16]:
data_clean.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 7 columns):
Survived    891 non-null int64
Pclass      891 non-null int64
Sex         891 non-null int64
Age         891 non-null float64
SibSp       891 non-null int64
Parch       891 non-null int64
Fare        891 non-null float64
dtypes: float64(2), int64(5)
memory usage: 48.9 KB


In [17]:
input_cols = ["Pclass", "Sex", "Age", "SibSp", "Parch", "Fare"]
output_cols = ['Survived']

In [18]:
X = data_clean[input_cols]
y = data_clean[output_cols]

### Entropy

In [19]:
def entropy(col):
#     unq, counts = np.unique(col, return_counts = True)
#     N = float(np.sum(counts))
    
#     ent = 0.0
    
#     for i in range(len(unq)):
#         p = counts[i]/N
#         ent += p*np.log2(p)
        
    
    unq, counts= np.unique(col, return_counts=True)
    probabs = counts/ np.sum(counts)
    ent  = np.sum(probabs * np.log2(probabs))
    
    return -ent

In [20]:
def divide_data(data, fkey, fval):
    x_right = pd.DataFrame([], columns= data.columns)
    x_left =  pd.DataFrame([], columns= data.columns)
    
    
    for i in range(data.shape[0]):
        val = data[fkey].loc[i]
        
        if val>=fval:
            x_right = x_right.append(data.loc[i])
        else:
            x_left = x_left.append(data.loc[i])
            
    return x_left, x_right

In [21]:
divide_data(data_clean[:10],'Sex', data_clean[:10]['Sex'].mean() )

(   Survived  Pclass  Sex   Age  SibSp  Parch     Fare
 1       1.0     1.0  0.0  38.0    1.0    0.0  71.2833
 2       1.0     3.0  0.0  26.0    0.0    0.0   7.9250
 3       1.0     1.0  0.0  35.0    1.0    0.0  53.1000
 8       1.0     3.0  0.0  27.0    0.0    2.0  11.1333
 9       1.0     2.0  0.0  14.0    1.0    0.0  30.0708,
    Survived  Pclass  Sex   Age  SibSp  Parch     Fare
 0       0.0     3.0  1.0  22.0    1.0    0.0   7.2500
 4       0.0     3.0  1.0  35.0    0.0    0.0   8.0500
 5       0.0     3.0  1.0  28.0    0.0    0.0   8.4583
 6       0.0     1.0  1.0  54.0    0.0    0.0  51.8625
 7       0.0     3.0  1.0   2.0    3.0    1.0  21.0750)

### Information gain

In [22]:
def information_gain(data,  fkey, fval):
    left, right = divide_data(data, fkey, fval)
    
    # left total points and right total points
    l = float( left.shape[0] / data.shape[0] ) 
    r = float( right.shape[0] / data.shape[0] )
    
    IG = entropy(data['Survived'])  - ( l*entropy(left['Survived']) + r*entropy(right['Survived']))
    
    return IG

In [23]:
X.columns

Index(['Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare'], dtype='object')

In [24]:
for col in X.columns:
    ig = information_gain(data_clean, col, data_clean[col].mean() )
    print(col , ig)

Pclass 0.07579362743608165
Sex 0.2176601066606142
Age 0.0008836151229467681
SibSp 0.009584541813400071
Parch 0.015380754493137694
Fare 0.042140692838995464


## Decision Tree Class

In [25]:
class DecisionTree:
    
    def __init__(self, depth = 0, max_depth = 5):
        self.left = None
        self.right = None
        self.fkey = None
        self.fval = None
        self.depth = depth
        self.max_depth = max_depth
        self.target = None
        
        
    def train(self, data):
        features = ['Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare']
        info_gains = []
        for f in features:
            ig = information_gain(data, f, data[f].mean())
            info_gains.append(ig)
        
        self.fkey = features[np.argmax(info_gains)]
        self.fval = data[self.fkey].mean()
        
        print(" Creating DT with feature - {}, depth - {}".format(self.fkey, self.depth))
        
        data_left, data_right = divide_data(data, self.fkey, self.fval)
        
        data_left = data_left.reset_index(drop = True)
        data_right = data_right.reset_index(drop = True)
        
    
    
        ######### RECURSION ##########
        
        # case - 1 Actually a Pure Node
        if data_left.shape[0] == 0 or data_right.shape[0] == 0:
            
            self.target = data['Survived'].iloc[0]
              
            return 
        
        # case - 2 Max-depth => Early Stoppinh
        if (self.depth >= self.max_depth):
            
            if data['Survived'].mean() >= 0.5 :
                self.target = 1
            else:
                self.target = 0
                
            return 
        
        # recursion
        
        self.left = DecisionTree(depth=self.depth + 1, max_depth= self.max_depth)
        self.left.train(data_left)
        
        self.right = DecisionTree(depth= self.depth + 1, max_depth= self.max_depth)
        self.right.train(data_right)
        
        
        # bcoz i want to give target to each node
        if data['Survived'].mean() >= 0.5 :
            self.target = 1
        else:
            self.target = 0

        return 
    
    
    
    def predict(self, test):
        
        if test[self.fkey] > self.fval:
            # right side
            if self.right is None:
                return self.target
            
            return self.right.predict(test)
            
        else:
            # left side
            if self.left is None:
                return self.target
            
            return self.left.predict(test)
            

In [26]:
data_clean.shape

(891, 7)

In [27]:
split_val = int(X.shape[0] * 0.7)

In [28]:
train_data = data_clean[:split_val]
test_data = data_clean[split_val:]

In [29]:
train_data.shape

(623, 7)

In [30]:
test_data.shape

(268, 7)

In [31]:
dt = DecisionTree()

In [32]:
dt.train(train_data)

 Creating DT with feature - Sex, depth - 0
 Creating DT with feature - Pclass, depth - 1
 Creating DT with feature - Age, depth - 2
 Creating DT with feature - SibSp, depth - 3
 Creating DT with feature - Pclass, depth - 4
 Creating DT with feature - Pclass, depth - 5
 Creating DT with feature - Age, depth - 5
 Creating DT with feature - SibSp, depth - 4
 Creating DT with feature - Parch, depth - 5
 Creating DT with feature - Pclass, depth - 5
 Creating DT with feature - SibSp, depth - 3
 Creating DT with feature - Fare, depth - 4
 Creating DT with feature - Parch, depth - 5
 Creating DT with feature - Pclass, depth - 5
 Creating DT with feature - Pclass, depth - 4
 Creating DT with feature - Pclass, depth - 5
 Creating DT with feature - Pclass, depth - 5
 Creating DT with feature - Parch, depth - 2
 Creating DT with feature - Age, depth - 3
 Creating DT with feature - Fare, depth - 4
 Creating DT with feature - Fare, depth - 5
 Creating DT with feature - SibSp, depth - 5
 Creating DT 

In [33]:
dt.left.fkey

'Pclass'

## Predict

In [34]:
y_pred = []


for i in range(test_data.shape[0]):
    p = dt.predict(test_data.iloc[i])
    y_pred.append(p)

In [42]:
y_pred = np.array(y_pred, dtype= "uint")

In [41]:
y_test = test_data['Survived'].values

In [45]:
(y_pred == y_test).sum()/y_test.shape[0]

0.8171641791044776

# Decision Tree with sklearn

In [46]:
from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor

In [51]:
model = DecisionTreeClassifier(max_depth=5, )

In [52]:
model.fit(train_data.drop(columns=['Survived']), train_data['Survived'])

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=5,
                       max_features=None, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, presort=False,
                       random_state=None, splitter='best')

In [53]:
model.score(test_data.drop(columns=['Survived']), test_data['Survived'])

0.8171641791044776

In [50]:
model.score(train_data.drop(columns=['Survived']), train_data['Survived'])

0.9807383627608347