# PreProcessing Data

In [1]:
import numpy as np
import pandas as pd

In [2]:
data = pd.read_csv("Train_Titanic.csv")
data_test = pd.read_csv("Test_Titanic.csv")
data.head()

Unnamed: 0,pclass,survived,name,sex,age,sibsp,parch,ticket,fare,cabin,embarked,boat,body,home.dest
0,3.0,0.0,"O'Donoghue, Ms. Bridget",female,,0.0,0.0,364856,7.75,,Q,,,
1,2.0,0.0,"Morley, Mr. Henry Samuel (""Mr Henry Marshall"")",male,39.0,0.0,0.0,250655,26.0,,S,,,
2,2.0,1.0,"Smith, Miss. Marion Elsie",female,40.0,0.0,0.0,31418,13.0,,S,9,,
3,3.0,1.0,"Goldsmith, Mrs. Frank John (Emily Alice Brown)",female,31.0,1.0,1.0,363291,20.525,,S,C D,,"Strood, Kent, England Detroit, MI"
4,3.0,1.0,"McCoy, Miss. Agnes",female,,2.0,0.0,367226,23.25,,Q,16,,


In [3]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1009 entries, 0 to 1008
Data columns (total 14 columns):
pclass       1009 non-null float64
survived     1009 non-null float64
name         1009 non-null object
sex          1009 non-null object
age          812 non-null float64
sibsp        1009 non-null float64
parch        1009 non-null float64
ticket       1009 non-null object
fare         1008 non-null float64
cabin        229 non-null object
embarked     1008 non-null object
boat         374 non-null object
body         98 non-null float64
home.dest    582 non-null object
dtypes: float64(7), object(7)
memory usage: 110.4+ KB


In [4]:
column_to_drop = ["boat","body","name","ticket","cabin","embarked","home.dest"]
data_clean = data.drop(column_to_drop, axis=1)
data_clean.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1009 entries, 0 to 1008
Data columns (total 7 columns):
pclass      1009 non-null float64
survived    1009 non-null float64
sex         1009 non-null object
age         812 non-null float64
sibsp       1009 non-null float64
parch       1009 non-null float64
fare        1008 non-null float64
dtypes: float64(6), object(1)
memory usage: 55.3+ KB


In [5]:
from sklearn.preprocessing import LabelEncoder

In [14]:
le = LabelEncoder()
data_clean["sex"] = le.fit_transform(data_clean["sex"])

In [16]:
data_clean = data_clean.fillna(data_clean["age"].mean())
data_clean = data_clean.fillna(data_clean["fare"].mean())
data_clean.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1009 entries, 0 to 1008
Data columns (total 7 columns):
pclass      1009 non-null float64
survived    1009 non-null float64
sex         1009 non-null int64
age         1009 non-null float64
sibsp       1009 non-null float64
parch       1009 non-null float64
fare        1009 non-null float64
dtypes: float64(6), int64(1)
memory usage: 55.3 KB


In [17]:
input_col = ["pclass","sex","age","sibsp","parch","fare"]
output_col = ["survived"]
X = data_clean[input_col]
Y = data_clean[output_col]
print(X.shape,Y.shape)

(1009, 6) (1009, 1)


# Implementation DT

In [18]:
def entropy(col):
    counts = np.unique(col, return_counts=True)
    N = float(col.shape[0])
    ent = 0.0
    
    for ix in counts[1]:
        p = ix/N
        ent += (-1.0*p*np.log2(p))
    return ent

In [19]:
def divide(x_data,fkey,fval):
    x_right = pd.DataFrame([],columns = x_data.columns)
    x_left = pd.DataFrame([],columns = x_data.columns)
    
    for ix in range(x_data.shape[0]):
        val = x_data[fkey].loc[ix]
        
        if val>fval:
            x_right = x_right.append(x_data.loc[ix])
        else:
            x_left = x_left.append(x_data.loc[ix])
    return x_left,x_right

In [20]:
def information_gain(x_data,fkey,fval):
    left,right = divide(x_data,fkey,fval)
    
    l = float(left.shape[0]/x_data.shape[0])
    r = float(right.shape[0]/x_data.shape[0])
    
    if left.shape[0] == 0 or right.shape[0] == 0:
        return -1000000
    i_gan = entropy(x_data.survived)-(l*entropy(left.survived)+r*entropy(right.survived))
    return i_gan

In [21]:
for fx in X.columns:
    print(fx)
    print(information_gain(data_clean,fx,data_clean[fx].mean()))

pclass
0.055456910002982474
sex
0.19274737190850932
age
0.0010525742338489685
sibsp
0.006492394392888956
parch
0.01975608012294816
fare
0.04242793401428169


In [48]:
class DecisionTree:
    
    #Contructor
    def __init__(self,depth=0,max_depth=5):
        self.left = None
        self.right = None
        self.fkey = None
        self.fval = None
        self.max_depth = max_depth
        self.depth  = depth
        self.target = None
    
    def train(self,X_train):
        
        features = ["pclass","sex","age","sibsp","parch","fare"]
        info_gain = []
        
        for ix in features:
            i_gain = information_gain(X_train,ix,X_train[ix].mean())
            info_gain.append(i_gain)
        
        self.fkey = features[np.argmax(info_gain)]
        self.fval = X_train[self.fkey].mean()
        print("Making Features is",self.fkey)
        
        #split 
        data_left,data_right = divide(X_train,self.fkey,self.fval)
        data_left = data_left.reset_index(drop = True)
        data_right = data_right.reset_index(drop = True)
        
        if data_left.shape[0] == 0 or data_right.shape[0] == 0 :
            if X_train.survived.mean() >= 0.5:
                self.target = 1.0
            else:
                self.target = 0.0
            return
        
        #Stop Early
        if(self.depth>= self.max_depth):
            if X_train.survived.mean() >= 0.5:
                self.target = 1.0
            else:
                self.target = 0.0
            return
        
        #Recursive Case
        
        self.left = DecisionTree(depth=self.depth+1,max_depth = self.max_depth)
        self.left.train(data_left)
        self.right = DecisionTree(depth=self.depth+1,max_depth = self.max_depth)
        self.right.train(data_right)
        
        
        if X_train.survived.mean() >= 0.5:
                self.target = 1.0
        else:
                self.target = 0.0
        return
    
    
    def predict(self,test):
        
        if test[self.fkey]>self.fval:
            if self.right is None:
                return self.target
            return self.right.predict(test)
        
        else:
            if self.left is None:
                return self.target
            return self.left.predict(test)

In [23]:
split = int(0.7*data_clean.shape[0])
train_data = data_clean[:split]
test_data = data_clean[split:]
test_data = test_data.reset_index(drop = True)

# Training of Model

In [49]:
dt = DecisionTree()
dt.train(train_data)

Making Features is sex
Making Features is pclass
Making Features is parch
Making Features is fare
Making Features is fare
Making Features is fare
Making Features is fare
Making Features is age
Making Features is age
Making Features is age
Making Features is fare
Making Features is pclass
Making Features is age
Making Features is age
Making Features is age
Making Features is age
Making Features is age
Making Features is sibsp
Making Features is fare
Making Features is fare
Making Features is parch
Making Features is age
Making Features is age
Making Features is age
Making Features is parch
Making Features is fare
Making Features is parch
Making Features is age
Making Features is fare
Making Features is fare
Making Features is age
Making Features is age
Making Features is fare
Making Features is parch
Making Features is sibsp
Making Features is age
Making Features is age
Making Features is pclass
Making Features is age
Making Features is age
Making Features is pclass
Making Features is f

In [26]:
y_pred = []
for ix in range(test_data.shape[0]):
    y_pred.append(dt.predict(test_data.loc[ix]))

In [27]:
y_val=y_pred
type(y_val)
np.sum(test_data["survived"]==y_val)/test_data["survived"].shape[0]

0.7557755775577558

# Test Data

In [None]:
test_data.info()

In [28]:
data_clean_test = data_test.drop(column_to_drop, axis=1)
data_clean_test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 300 entries, 0 to 299
Data columns (total 6 columns):
pclass    300 non-null float64
sex       300 non-null object
age       234 non-null float64
sibsp     300 non-null float64
parch     300 non-null float64
fare      300 non-null float64
dtypes: float64(5), object(1)
memory usage: 14.1+ KB


In [31]:
data_clean_test["sex"] = le.fit_transform(data_clean_test["sex"])
data_clean_test = data_clean_test.fillna(data_clean_test["age"].mean())
data_clean_test.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 300 entries, 0 to 299
Data columns (total 6 columns):
pclass    300 non-null float64
sex       300 non-null int64
age       300 non-null float64
sibsp     300 non-null float64
parch     300 non-null float64
fare      300 non-null float64
dtypes: float64(5), int64(1)
memory usage: 14.1 KB


In [50]:
y_pred_test = []
for ix in range(data_clean_test.shape[0]):
    y_pred_test.append(dt.predict(data_clean_test.loc[ix]))

In [51]:
print(y_pred_test)

[0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 1.0, 1.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 1.0, 1.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 1.0, 1.0, 0.0, 1.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 1.0, 1.0, 1.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 1.0, 1.0, 0.0, 0.0, 0.0, 1.0, 0.0, 1.0, 1.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 1.0, 0.0, 1.0, 1.0, 0.0, 1.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 1.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 1.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 1.0, 1.0, 0.0, 0.0, 0.0, 1.0, 1.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 1.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 1.0, 0.0, 0.0, 1.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0,

In [52]:
pd.DataFrame(y_pred_test)

Unnamed: 0,0
0,0.0
1,0.0
2,0.0
3,0.0
4,0.0
5,1.0
6,0.0
7,0.0
8,0.0
9,0.0


In [35]:


import csv



In [53]:
with open('titanic_Submission_DT.csv','w', newline='') as f:
    fieldname = ['Id','survived']
    thewriter = csv.DictWriter(f,fieldnames = fieldname)
    thewriter.writeheader()
    for i in range(data_clean_test.shape[0]):
        thewriter.writerow({'Id':i,'survived':str(y_pred_test[i])})


# Decision Tree using SKLearn


In [37]:
from sklearn.tree import DecisionTreeClassifier

In [38]:
sk_tree = DecisionTreeClassifier(criterion='entropy', max_depth=5)

In [41]:
sk_tree.fit(train_data[input_col],train_data[output_col])

DecisionTreeClassifier(class_weight=None, criterion='entropy', max_depth=5,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=None,
            splitter='best')

In [42]:
sk_tree.score(test_data[input_col],test_data[output_col])

0.7755775577557755

In [43]:
y_test = sk_tree.predict(data_clean_test[input_col])

In [45]:
np.sum(y_test==y_pred_test)/y_test.shape[0]

0.9433333333333334

In [46]:
pd.DataFrame(y_test)

Unnamed: 0,0
0,0.0
1,0.0
2,0.0
3,0.0
4,0.0
5,1.0
6,0.0
7,0.0
8,0.0
9,0.0


# Saving the Output into .csv format

In [47]:
with open('titanic_Submission.csv','w', newline='') as f:
    fieldname = ['Id','survived']
    thewriter = csv.DictWriter(f,fieldnames = fieldname)
    thewriter.writeheader()
    for i in range(data_clean_test.shape[0]):
        thewriter.writerow({'Id':i,'survived':str(y_test[i])})
