# Bayes' Theorem classification application on Titanic Data

We predict if a passenger is survived or not using __Bayes' theorem__. We use __GaussianNB__ library in __Scikit Library__ for this purpose.
<font color=red>
- In this program, I've included not only 'Sex', but also other simple features Pclass, SibSp, Parch, Embarked for fitting into the model (of course without much investigation into these features like if they are skewed or not, etc.)<br/>
- I've also not included unrelated features such as PassengerId, Name, Ticket.
- I could have included features such as Fare(after bucketing) and Cabin (after imputing, bucketing) but I've not to simplify the effort.
    </font>

In [1]:
import numpy as np
import pandas as pd

from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score

In [99]:
titanic_train = pd.read_csv('Data/titanic_train.csv')
titanic_train.sample(10)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
528,529,0,3,"Salonen, Mr. Johan Werner",male,39.0,0,0,3101296,7.925,,S
145,146,0,2,"Nicholls, Mr. Joseph Charles",male,19.0,1,1,C.A. 33112,36.75,,S
442,443,0,3,"Petterson, Mr. Johan Emil",male,25.0,1,0,347076,7.775,,S
310,311,1,1,"Hays, Miss. Margaret Bechstein",female,24.0,0,0,11767,83.1583,C54,C
382,383,0,3,"Tikkanen, Mr. Juho",male,32.0,0,0,STON/O 2. 3101293,7.925,,S
466,467,0,2,"Campbell, Mr. William",male,,0,0,239853,0.0,,S
162,163,0,3,"Bengtsson, Mr. John Viktor",male,26.0,0,0,347068,7.775,,S
273,274,0,1,"Natsch, Mr. Charles H",male,37.0,0,1,PC 17596,29.7,C118,C
763,764,1,1,"Carter, Mrs. William Ernest (Lucile Polk)",female,36.0,1,2,113760,120.0,B96 B98,S
428,429,0,3,"Flynn, Mr. James",male,,0,0,364851,7.75,,Q


In [90]:
titanic_train.shape

(891, 12)

In [100]:
# We don't use all the columns but 'Sex' and 'Survived'
titanic_train = titanic_train[['Sex', 'Pclass', 'SibSp', 'Parch', 'Embarked', 'Survived']]
titanic_train.head()

Unnamed: 0,Sex,Pclass,SibSp,Parch,Embarked,Survived
0,male,3,1,0,S,0
1,female,1,1,0,C,1
2,female,3,0,0,S,1
3,female,1,1,0,S,1
4,male,3,0,0,S,0


In [101]:
type(titanic_train['Sex'])

pandas.core.series.Series

In [102]:
titanic_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 6 columns):
Sex         891 non-null object
Pclass      891 non-null int64
SibSp       891 non-null int64
Parch       891 non-null int64
Embarked    889 non-null object
Survived    891 non-null int64
dtypes: int64(4), object(2)
memory usage: 41.9+ KB


In [103]:
# Let us convert the feature 'Sex' into 'Category'
titanic_train['Sex'] = titanic_train['Sex'].astype('category', copy=False).cat.codes

In [104]:
# Let us convert the feature 'Sex' into 'Category'
titanic_train['Embarked'] = titanic_train['Embarked'].astype('category', copy=False).cat.codes

In [95]:
type(titanic_train['Sex'])

pandas.core.series.Series

In [96]:
titanic_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 5 columns):
Sex         891 non-null int8
Pclass      891 non-null int64
SibSp       891 non-null int64
Parch       891 non-null int64
Survived    891 non-null int64
dtypes: int64(4), int8(1)
memory usage: 28.8 KB


In [97]:
titanic_train.head()

Unnamed: 0,Sex,Pclass,SibSp,Parch,Survived
0,1,3,1,0,0
1,0,1,1,0,1
2,0,3,0,0,1
3,0,1,1,0,1
4,1,3,0,0,0


In [105]:
# Let us check for null values, if any
titanic_train.isnull().any()

Sex         False
Pclass      False
SibSp       False
Parch       False
Embarked    False
Survived    False
dtype: bool

In [54]:
# Let us drop the samples having 'null' values
titanic_train = titanic_train.dropna()

In [55]:
# Let us check the dimensions
titanic_train.shape

(891, 3)

In [106]:
# Let us separate the data into 'features' and 'Label'
# Note, we are leaving the 'Label' in features set purposefully as we need to calculate apriori probabilities
features = titanic_train[['Sex', 'Pclass', 'SibSp', 'Parch', 'Embarked']]
label = titanic_train['Survived']

In [107]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(features, label, test_size=0.2)

In [108]:
X_train.shape, X_test.shape

((712, 5), (179, 5))

In [109]:
model = GaussianNB()

In [110]:
model.fit(X_train, y_train)

GaussianNB(priors=None, var_smoothing=1e-09)

In [111]:
y_pred = model.predict(X_test)

In [112]:
accuracy_score(y_test, y_pred)

0.8100558659217877

In [113]:
# Let us predict the 'Survivals' on actual test set
# Read titanic_test dataset for which no Survival information is known and to be found
titanic_test = pd.read_csv('Data/titanic_test.csv')
titanic_test.head()

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,892,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,,Q
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0,,S
2,894,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,,Q
3,895,3,"Wirz, Mr. Albert",male,27.0,0,0,315154,8.6625,,S
4,896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,3101298,12.2875,,S


In [114]:
Actual_test = titanic_test[['Sex', 'Pclass', 'SibSp', 'Parch', 'Embarked']]
Actual_test['Sex'] = titanic_test['Sex'].astype('category', copy=False).cat.codes
Actual_test['Embarked'] = titanic_test['Embarked'].astype('category', copy=False).cat.codes
Actual_test

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  This is separate from the ipykernel package so we can avoid doing imports until


Unnamed: 0,Sex,Pclass,SibSp,Parch,Embarked
0,1,3,0,0,1
1,0,3,1,0,2
2,1,2,0,0,1
3,1,3,0,0,2
4,0,3,1,1,2
...,...,...,...,...,...
413,1,3,0,0,2
414,0,1,0,0,0
415,1,3,0,0,2
416,1,3,0,0,2


In [115]:
Actual_pred = model.predict(Actual_test)

In [116]:
Actual_pred

array([0, 1, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 1, 1, 0, 0, 1, 1, 1, 0,
       1, 1, 1, 0, 1, 0, 0, 0, 0, 0, 1, 1, 1, 0, 1, 1, 0, 0, 0, 0, 0, 1,
       1, 0, 1, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1,
       1, 0, 1, 1, 1, 0, 1, 1, 1, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1,
       1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0,
       0, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 1,
       0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1,
       0, 0, 1, 1, 0, 1, 1, 0, 1, 0, 0, 1, 1, 0, 1, 1, 0, 0, 0, 0, 0, 1,
       1, 1, 1, 1, 0, 1, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1,
       0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0,
       1, 0, 1, 0, 1, 1, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 1, 1, 1, 1,
       1, 0, 0, 0, 1, 0, 1, 1, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1,
       0, 0, 0, 0, 1, 0, 1, 0, 1, 1, 0, 1, 0, 0, 0, 0, 1, 0, 1, 1, 1, 0,
       0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0,

In [117]:
df1 = titanic_test

In [118]:
df2 = pd.DataFrame(Actual_pred, columns=['Survived'])

In [119]:
res = pd.concat([df1, df2], axis=1)
res

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Survived
0,892,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,,Q,0
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0000,,S,1
2,894,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,,Q,0
3,895,3,"Wirz, Mr. Albert",male,27.0,0,0,315154,8.6625,,S,0
4,896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,3101298,12.2875,,S,1
...,...,...,...,...,...,...,...,...,...,...,...,...
413,1305,3,"Spector, Mr. Woolf",male,,0,0,A.5. 3236,8.0500,,S,0
414,1306,1,"Oliva y Ocana, Dona. Fermina",female,39.0,0,0,PC 17758,108.9000,C105,C,1
415,1307,3,"Saether, Mr. Simon Sivertsen",male,38.5,0,0,SOTON/O.Q. 3101262,7.2500,,S,0
416,1308,3,"Ware, Mr. Frederick",male,,0,0,359309,8.0500,,S,0
