In [1]:
# import the library 
import pandas as pd
titanic = pd.read_csv('./titanic.csv')
titanic.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [2]:
# explore the data to estimate if we have enough (statistically relevant) data for both classes
titanic.groupby('Survived').count()

Unnamed: 0_level_0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
Survived,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
0,549,549,549,549,424,549,549,549,549,68,549
1,342,342,342,342,290,342,342,342,342,136,340


In [3]:
# We drop clearly irrelevant attributes. Pay attention for bias! Don't let your own opinion play. 
titanic = titanic.drop(['PassengerId','Name','Ticket','Fare','Cabin','Embarked'],axis=1)
titanic.head()

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch
0,0,3,male,22.0,1,0
1,1,1,female,38.0,1,0
2,1,3,female,26.0,0,0
3,1,1,female,35.0,1,0
4,0,3,male,35.0,0,0


In [4]:
print('Before')
print(titanic.count())
print()

# drop all lines that contain empty (null or NaN) values
titanic = titanic.dropna()

print('After')
print(titanic.count())

Before
Survived    891
Pclass      891
Sex         891
Age         714
SibSp       891
Parch       891
dtype: int64

After
Survived    714
Pclass      714
Sex         714
Age         714
SibSp       714
Parch       714
dtype: int64


In [5]:
# see what remains
titanic.groupby('Survived').count()

Unnamed: 0_level_0,Pclass,Sex,Age,SibSp,Parch
Survived,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
0,424,424,424,424,424
1,290,290,290,290,290


In [6]:
import numpy as np
titanic['Sex'] = np.where(titanic['Sex']>='male', 1, 2)
titanic.head()

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch
0,0,3,1,22.0,1,0
1,1,1,2,38.0,1,0
2,1,3,2,26.0,0,0
3,1,1,2,35.0,1,0
4,0,3,1,35.0,0,0


In [7]:
from sklearn.model_selection import train_test_split
X = titanic.drop('Survived',axis=1)
y = titanic['Survived']
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.30)

In [8]:
from sklearn.ensemble import RandomForestClassifier
model = RandomForestClassifier(n_estimators=300)
model.fit(X_train, y_train)

  from numpy.core.umath_tests import inner1d


RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=300, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

In [9]:
y_test2 = model.predict(X_test)

In [10]:
from sklearn.metrics import accuracy_score
accuracy_score(y_test, y_test2)

0.7767441860465116

In [11]:
print(X_train.columns)
print(model.feature_importances_)

Index(['Pclass', 'Sex', 'Age', 'SibSp', 'Parch'], dtype='object')
[0.1719585  0.29684675 0.41942283 0.06475223 0.0470197 ]


In [12]:
# we now combine those two collections into a dataframe
pd.DataFrame(model.feature_importances_,columns=['Importance'],index=X_train.columns).sort_values(by='Importance',ascending=False)

Unnamed: 0,Importance
Age,0.419423
Sex,0.296847
Pclass,0.171959
SibSp,0.064752
Parch,0.04702


In [13]:
# Determine the false negative rate: what's the proportion of the passengers 
# who survived that we declared death. 
results = pd.DataFrame({'true':y_test,'estimated':y_test2})

results['TP'] = np.where((results['true'] == 1) & (results['estimated'] == 1),1,0)
results['TN'] = np.where((results['true'] == 0) & (results['estimated'] == 0),1,0)
results['FP'] = np.where((results['true'] == 0) & (results['estimated'] == 1),1,0)
results['FN'] = np.where((results['true'] == 1) & (results['estimated'] == 0),1,0)

FNrate = results['FN'].sum()/(results['FN'].sum() + results['TP'].sum())
print(FNrate)

0.25555555555555554
