In [1]:
import pandas as pd
import seaborn as sns
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn import preprocessing
from sklearn import tree

In [2]:
dataset = sns.load_dataset('titanic')
dataset

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone
0,0,3,male,22.0,1,0,7.2500,S,Third,man,True,,Southampton,no,False
1,1,1,female,38.0,1,0,71.2833,C,First,woman,False,C,Cherbourg,yes,False
2,1,3,female,26.0,0,0,7.9250,S,Third,woman,False,,Southampton,yes,True
3,1,1,female,35.0,1,0,53.1000,S,First,woman,False,C,Southampton,yes,False
4,0,3,male,35.0,0,0,8.0500,S,Third,man,True,,Southampton,no,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
886,0,2,male,27.0,0,0,13.0000,S,Second,man,True,,Southampton,no,True
887,1,1,female,19.0,0,0,30.0000,S,First,woman,False,B,Southampton,yes,True
888,0,3,female,,1,2,23.4500,S,Third,woman,False,,Southampton,no,False
889,1,1,male,26.0,0,0,30.0000,C,First,man,True,C,Cherbourg,yes,True


In [7]:
# Note that the survived column and alive column is consider as closely related variable
# For example, if the passanger is survived, most probably they are still alive (1), while not survived, the alive column definitely is no.
# Therefore, the column 'alive' will be removed.
# Same to column 'sex' and 'who' and 'adult_male'

data = dataset.drop(columns=['alive','who','adult_male'])
data

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,deck,embark_town,alone
0,0,3,male,22.0,1,0,7.2500,S,Third,,Southampton,False
1,1,1,female,38.0,1,0,71.2833,C,First,C,Cherbourg,False
2,1,3,female,26.0,0,0,7.9250,S,Third,,Southampton,True
3,1,1,female,35.0,1,0,53.1000,S,First,C,Southampton,False
4,0,3,male,35.0,0,0,8.0500,S,Third,,Southampton,True
...,...,...,...,...,...,...,...,...,...,...,...,...
886,0,2,male,27.0,0,0,13.0000,S,Second,,Southampton,True
887,1,1,female,19.0,0,0,30.0000,S,First,B,Southampton,True
888,0,3,female,,1,2,23.4500,S,Third,,Southampton,False
889,1,1,male,26.0,0,0,30.0000,C,First,C,Cherbourg,True


In [6]:
data['embarked'].unique()

array(['S', 'C', 'Q', nan], dtype=object)

In [5]:
data['embark_town'].unique()

array(['Southampton', 'Cherbourg', 'Queenstown', nan], dtype=object)

In [8]:
data['deck'].unique()

[NaN, 'C', 'E', 'G', 'D', 'A', 'B', 'F']
Categories (7, object): ['C', 'E', 'G', 'D', 'A', 'B', 'F']

In [9]:
categorical_col = ['sex','embarked','class','alone','embark_town','deck']
data[categorical_col] = data[categorical_col].astype(str)
data

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,deck,embark_town,alone
0,0,3,male,22.0,1,0,7.2500,S,Third,,Southampton,False
1,1,1,female,38.0,1,0,71.2833,C,First,C,Cherbourg,False
2,1,3,female,26.0,0,0,7.9250,S,Third,,Southampton,True
3,1,1,female,35.0,1,0,53.1000,S,First,C,Southampton,False
4,0,3,male,35.0,0,0,8.0500,S,Third,,Southampton,True
...,...,...,...,...,...,...,...,...,...,...,...,...
886,0,2,male,27.0,0,0,13.0000,S,Second,,Southampton,True
887,1,1,female,19.0,0,0,30.0000,S,First,B,Southampton,True
888,0,3,female,,1,2,23.4500,S,Third,,Southampton,False
889,1,1,male,26.0,0,0,30.0000,C,First,C,Cherbourg,True


In [10]:
le = preprocessing.LabelEncoder()
for col in categorical_col:
  data[col] = le.fit_transform(data[col])

data

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,deck,embark_town,alone
0,0,3,1,22.0,1,0,7.2500,2,2,7,2,0
1,1,1,0,38.0,1,0,71.2833,0,0,2,0,0
2,1,3,0,26.0,0,0,7.9250,2,2,7,2,1
3,1,1,0,35.0,1,0,53.1000,2,0,2,2,0
4,0,3,1,35.0,0,0,8.0500,2,2,7,2,1
...,...,...,...,...,...,...,...,...,...,...,...,...
886,0,2,1,27.0,0,0,13.0000,2,1,7,2,1
887,1,1,0,19.0,0,0,30.0000,2,0,1,2,1
888,0,3,0,,1,2,23.4500,2,2,7,2,0
889,1,1,1,26.0,0,0,30.0000,0,0,2,0,1


In [12]:
# check if any NaN
data.isna().sum()

survived         0
pclass           0
sex              0
age            177
sibsp            0
parch            0
fare             0
embarked         0
class            0
deck             0
embark_town      0
alone            0
dtype: int64

In [13]:
# Impute the age with mean value
data['age']=data['age'].replace({np.nan:data['age'].mean()})
data.isna().sum()

survived       0
pclass         0
sex            0
age            0
sibsp          0
parch          0
fare           0
embarked       0
class          0
deck           0
embark_town    0
alone          0
dtype: int64

In [15]:
# check the data imputation
data['age']

0      22.000000
1      38.000000
2      26.000000
3      35.000000
4      35.000000
         ...    
886    27.000000
887    19.000000
888    29.699118
889    26.000000
890    32.000000
Name: age, Length: 891, dtype: float64

In [16]:
train_data = data.sample(frac=0.8, random_state=0)
test_data = data.drop(train_data.index)

print('Shape of training data :',train_data.shape)
print('Shape of testing data :',test_data.shape)

Shape of training data : (713, 12)
Shape of testing data : (178, 12)


In [17]:
train_data

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,deck,embark_town,alone
495,0,3,1,29.699118,0,0,14.4583,0,2,7,0,1
648,0,3,1,29.699118,0,0,7.5500,2,2,7,2,1
278,0,3,1,7.000000,4,1,29.1250,1,2,7,1,0
31,1,1,0,29.699118,1,0,146.5208,0,0,1,0,0
255,1,3,0,29.000000,0,2,15.2458,0,2,7,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...
25,1,3,0,38.000000,1,5,31.3875,2,2,7,2,0
110,0,1,1,47.000000,0,0,52.0000,2,0,2,2,1
149,0,2,1,42.000000,0,0,13.0000,2,1,7,2,1
152,0,3,1,55.500000,0,0,8.0500,2,2,7,2,1


In [18]:
# Since the survived column is the variable that we want to predict
# Thus, we make it as train label and test label
train_x = train_data.drop(columns=['survived'],axis=1)
train_y = train_data['survived']

test_x = test_data.drop(columns=['survived'],axis=1)
test_y = test_data['survived']

In [19]:
train_x

Unnamed: 0,pclass,sex,age,sibsp,parch,fare,embarked,class,deck,embark_town,alone
495,3,1,29.699118,0,0,14.4583,0,2,7,0,1
648,3,1,29.699118,0,0,7.5500,2,2,7,2,1
278,3,1,7.000000,4,1,29.1250,1,2,7,1,0
31,1,0,29.699118,1,0,146.5208,0,0,1,0,0
255,3,0,29.000000,0,2,15.2458,0,2,7,0,0
...,...,...,...,...,...,...,...,...,...,...,...
25,3,0,38.000000,1,5,31.3875,2,2,7,2,0
110,1,1,47.000000,0,0,52.0000,2,0,2,2,1
149,2,1,42.000000,0,0,13.0000,2,1,7,2,1
152,3,1,55.500000,0,0,8.0500,2,2,7,2,1


In [21]:
gini_decision_tree_model = tree.DecisionTreeClassifier(criterion='gini')
gini_decision_tree_model.fit(train_x, train_y)
gini_decision_tree_model.score(train_x, train_y)

0.9915848527349228

In [22]:
# Predict if the passanger is survived
prediction = gini_decision_tree_model.predict(test_x)
prediction

array([1, 1, 1, 1, 1, 1, 0, 1, 0, 0, 1, 1, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0,
       0, 0, 1, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 1, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 1,
       0, 1, 1, 0, 0, 1, 0, 0, 0, 1, 1, 1, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0,
       0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 1, 0, 1, 0, 0, 1, 0,
       1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1,
       0, 0, 1, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 1,
       0, 1, 1, 0, 0, 1, 0, 1, 1, 0, 1, 0, 0, 1, 0, 0, 1, 0, 1, 1, 1, 0,
       1, 1])

In [23]:
# check accuracy classification score
gini_test_accuracy = accuracy_score(test_y,prediction)
print('Gini test accuracy: ', gini_test_accuracy)

Test accuracy:  0.7808988764044944


In [24]:
entropy_decision_tree_model = tree.DecisionTreeClassifier(criterion='entropy')
entropy_decision_tree_model.fit(train_x, train_y)
entropy_decision_tree_model.score(train_x, train_y)
entropy_prediction = entropy_decision_tree_model.predict(test_x)
entropy_prediction

array([1, 1, 1, 0, 1, 1, 0, 0, 0, 1, 1, 1, 1, 1, 0, 0, 0, 1, 1, 0, 0, 0,
       0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0, 0, 1, 1, 1, 1, 0, 1, 1, 0,
       0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 1,
       0, 1, 1, 0, 0, 1, 0, 0, 0, 1, 1, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0,
       0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0,
       0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1,
       0, 0, 1, 0, 0, 0, 1, 0, 0, 1, 1, 0, 1, 0, 0, 0, 1, 1, 1, 1, 0, 1,
       0, 1, 1, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 1, 0, 0, 1, 1, 1, 1, 1, 0,
       1, 1])

In [25]:
entropy_test_accuracy = accuracy_score(test_y,entropy_prediction)
print('Entropy test accuracy: ', entropy_test_accuracy)

Entropy test accuracy:  0.7752808988764045


The function to measure the quality of a split. Supported criteria are “gini” for the Gini impurity and “entropy” for the information gain. In overall, the criterion that using **Gini is slightly better than entropy based on test accuracy**.