# Titanic Bayes

In [2]:
#import libraries
import numpy as np
import pandas as pd
import sklearn

from sklearn.naive_bayes import GaussianNB   
from sklearn.naive_bayes import BernoulliNB
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score

In [3]:
#load data
filename = "datasets/titanic.xls"
df = pd.read_excel(filename)

#preview data
df.head()

Unnamed: 0,pclass,survived,name,sex,age,sibsp,parch,ticket,fare,cabin,embarked,boat,body,home.dest
0,1,1,"Allen, Miss. Elisabeth Walton",female,29.0,0,0,24160,211.3375,B5,S,2.0,,"St Louis, MO"
1,1,1,"Allison, Master. Hudson Trevor",male,0.9167,1,2,113781,151.55,C22 C26,S,11.0,,"Montreal, PQ / Chesterville, ON"
2,1,0,"Allison, Miss. Helen Loraine",female,2.0,1,2,113781,151.55,C22 C26,S,,,"Montreal, PQ / Chesterville, ON"
3,1,0,"Allison, Mr. Hudson Joshua Creighton",male,30.0,1,2,113781,151.55,C22 C26,S,,135.0,"Montreal, PQ / Chesterville, ON"
4,1,0,"Allison, Mrs. Hudson J C (Bessie Waldo Daniels)",female,25.0,1,2,113781,151.55,C22 C26,S,,,"Montreal, PQ / Chesterville, ON"


In [6]:
#get total number of missing values for each column
df.isnull().sum()

pclass          0
survived        0
name            0
sex             0
age           263
sibsp           0
parch           0
ticket          0
fare            1
cabin        1014
embarked        2
boat          823
body         1188
home.dest     564
dtype: int64

We're gonna clean up the age and embarked columns

In [7]:
#rows where the age is missing
missing_age = df.loc[df['age'].isnull()]
missing_age.head()

Unnamed: 0,pclass,survived,name,sex,age,sibsp,parch,ticket,fare,cabin,embarked,boat,body,home.dest
15,1,0,"Baumann, Mr. John D",male,,0,0,PC 17318,25.925,,S,,,"New York, NY"
37,1,1,"Bradley, Mr. George (""George Arthur Brayton"")",male,,0,0,111427,26.55,,S,9.0,,"Los Angeles, CA"
40,1,0,"Brewe, Dr. Arthur Jackson",male,,0,0,112379,39.6,,C,,,"Philadelphia, PA"
46,1,0,"Cairns, Mr. Alexander",male,,0,0,113798,31.0,,S,,,
59,1,1,"Cassebeer, Mrs. Henry Arthur Jr (Eleanor Genev...",female,,0,0,17770,27.7208,,C,5.0,,"New York, NY"


In [8]:
#table of avg age of passenger by survival status, sex, and passenger class
df.groupby(['survived', 'sex', 'pclass'])['age'].mean()

survived  sex     pclass
0         female  1         35.200000
                  2         34.090909
                  3         23.418750
          male    1         43.658163
                  2         33.092593
                  3         26.679598
1         female  1         37.109375
                  2         26.711051
                  3         20.814815
          male    1         36.168240
                  2         17.449274
                  3         22.436441
Name: age, dtype: float64

In [9]:
#fill missing values for age based on survival status, sex, and passenger class
df['age'].fillna(df.groupby(['survived', 'sex', 'pclass'])['age'].transform('mean'), inplace=True)

In [10]:
#get index numbers of missing rows - we'll use this later
mals = list(missing_age.index)

In [11]:
#verify filled missing values 
df.iloc[mals].head()

Unnamed: 0,pclass,survived,name,sex,age,sibsp,parch,ticket,fare,cabin,embarked,boat,body,home.dest
15,1,0,"Baumann, Mr. John D",male,43.658163,0,0,PC 17318,25.925,,S,,,"New York, NY"
37,1,1,"Bradley, Mr. George (""George Arthur Brayton"")",male,36.16824,0,0,111427,26.55,,S,9.0,,"Los Angeles, CA"
40,1,0,"Brewe, Dr. Arthur Jackson",male,43.658163,0,0,112379,39.6,,C,,,"Philadelphia, PA"
46,1,0,"Cairns, Mr. Alexander",male,43.658163,0,0,113798,31.0,,S,,,
59,1,1,"Cassebeer, Mrs. Henry Arthur Jr (Eleanor Genev...",female,37.109375,0,0,17770,27.7208,,C,5.0,,"New York, NY"


In [12]:
#verify there are no more missing age values
df.isnull().sum()

pclass          0
survived        0
name            0
sex             0
age             0
sibsp           0
parch           0
ticket          0
fare            1
cabin        1014
embarked        2
boat          823
body         1188
home.dest     564
dtype: int64

In [13]:
#missing values for 'embarked'
embark = df.loc[df['embarked'].isnull()]
embark

Unnamed: 0,pclass,survived,name,sex,age,sibsp,parch,ticket,fare,cabin,embarked,boat,body,home.dest
168,1,1,"Icard, Miss. Amelie",female,38.0,0,0,113572,80.0,B28,,6,,
284,1,1,"Stone, Mrs. George Nelson (Martha Evelyn)",female,62.0,0,0,113572,80.0,B28,,6,,"Cincinatti, OH"


In [14]:
#save index fo rmissing values to verify later
embarkls = list(embark.index)

In [15]:
#only 2 missing values so we'll fill with most common embarkation point
df['embarked'].value_counts()

S    914
C    270
Q    123
Name: embarked, dtype: int64

In [16]:
#fill missing values
df['embarked'].fillna('S', inplace=True)

In [17]:
#check that they're filled
df.iloc[embarkls]

Unnamed: 0,pclass,survived,name,sex,age,sibsp,parch,ticket,fare,cabin,embarked,boat,body,home.dest
168,1,1,"Icard, Miss. Amelie",female,38.0,0,0,113572,80.0,B28,S,6,,
284,1,1,"Stone, Mrs. George Nelson (Martha Evelyn)",female,62.0,0,0,113572,80.0,B28,S,6,,"Cincinatti, OH"


In [18]:
df.isnull().sum()

pclass          0
survived        0
name            0
sex             0
age             0
sibsp           0
parch           0
ticket          0
fare            1
cabin        1014
embarked        0
boat          823
body         1188
home.dest     564
dtype: int64

In [19]:
#drop columns we're not gonna use in the model
modeldf = df.drop(['name','ticket','fare', 'cabin', 'boat', 'body', 'home.dest'], axis=1)

In [20]:
modeldf.head()

Unnamed: 0,pclass,survived,sex,age,sibsp,parch,embarked
0,1,1,female,29.0,0,0,S
1,1,1,male,0.9167,1,2,S
2,1,0,female,2.0,1,2,S
3,1,0,male,30.0,1,2,S
4,1,0,female,25.0,1,2,S


In [21]:
#Create dummy variables for categorical values
#dummy variables for passenger class embarkation port
#get_dummies will auto-drop columns that dummies were created from
modeldf = pd.get_dummies(data=modeldf, columns=['pclass','embarked'])
modeldf.head()

Unnamed: 0,survived,sex,age,sibsp,parch,pclass_1,pclass_2,pclass_3,embarked_C,embarked_Q,embarked_S
0,1,female,29.0,0,0,1,0,0,0,0,1
1,1,male,0.9167,1,2,1,0,0,0,0,1
2,0,female,2.0,1,2,1,0,0,0,0,1
3,0,male,30.0,1,2,1,0,0,0,0,1
4,0,female,25.0,1,2,1,0,0,0,0,1


In [22]:
#change sex values to binary
#female=0, male=1
modeldf['sex'] = modeldf['sex'].map({'female':0, 'male':1})
modeldf.head()

Unnamed: 0,survived,sex,age,sibsp,parch,pclass_1,pclass_2,pclass_3,embarked_C,embarked_Q,embarked_S
0,1,0,29.0,0,0,1,0,0,0,0,1
1,1,1,0.9167,1,2,1,0,0,0,0,1
2,0,0,2.0,1,2,1,0,0,0,0,1
3,0,1,30.0,1,2,1,0,0,0,0,1
4,0,0,25.0,1,2,1,0,0,0,0,1


In [23]:
#create new column based on number of family members
#drop sibsp and parch columns
modeldf['family_num'] = modeldf['sibsp'] + modeldf['parch']
modeldf.drop(['sibsp', 'parch'], axis=1, inplace=True)
modeldf.head()

Unnamed: 0,survived,sex,age,pclass_1,pclass_2,pclass_3,embarked_C,embarked_Q,embarked_S,family_num
0,1,0,29.0,1,0,0,0,0,1,0
1,1,1,0.9167,1,0,0,0,0,1,3
2,0,0,2.0,1,0,0,0,0,1,3
3,0,1,30.0,1,0,0,0,0,1,3
4,0,0,25.0,1,0,0,0,0,1,3


In [25]:
#create a column for people traveling alone
modeldf['TravelAlone']=np.where((modeldf['family_num'] > 0), 0, 1)
modeldf.head()

Unnamed: 0,survived,sex,age,pclass_1,pclass_2,pclass_3,embarked_C,embarked_Q,embarked_S,family_num,TravelAlone
0,1,0,29.0,1,0,0,0,0,1,0,1
1,1,1,0.9167,1,0,0,0,0,1,3,0
2,0,0,2.0,1,0,0,0,0,1,3,0
3,0,1,30.0,1,0,0,0,0,1,3,0
4,0,0,25.0,1,0,0,0,0,1,3,0


In [26]:
#Split data into train and test

#extract target variable
#make copy of 'survived' column
y = modeldf['survived']

#copy of modeldf without 'survived' column
X = modeldf.drop(['survived'], axis=1)

In [28]:
#80% for training data, 20% for test data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=80)

### Gaussian Naïve Bayes

In [29]:
#initialize Gaussian Bayes classifier
gnb = GaussianNB()

In [30]:
#train the model to learn trends
gnb.fit(X_train, y_train)

GaussianNB(priors=None, var_smoothing=1e-09)

In [31]:
#predictive score of the model on the training data
gnb.score(X_train, y_train)

0.778414517669532

In [32]:
#test the model on unseen data
#score predictive values in variable
y_pred = gnb.predict(X_test)

In [33]:
#Confusion matrix shows which values model predicted correctly vs incorrectly

cm = pd.DataFrame(
    confusion_matrix(y_test, y_pred),
    columns=['Predicted Not Survived', 'Predicted Survived'],
    index=['True Not Survived', 'True Survived']
)

cm

Unnamed: 0,Predicted Not Survived,Predicted Survived
True Not Survived,143,23
True Survived,27,69


In [34]:
#predictive score of the model on the test data
gnb.score(X_test, y_test)

0.8091603053435115

In [35]:
#predictive score of the model for each predictive category
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.84      0.86      0.85       166
           1       0.75      0.72      0.73        96

   micro avg       0.81      0.81      0.81       262
   macro avg       0.80      0.79      0.79       262
weighted avg       0.81      0.81      0.81       262



### Bernoulli's Naïve Bayes

In [37]:
#initialize Bernoulli Naïve Bayes function to a variable
bnb = BernoulliNB()

In [38]:
#build the model with training data
bnb.fit(X_train, y_train)

BernoulliNB(alpha=1.0, binarize=0.0, class_prior=None, fit_prior=True)

In [39]:
#model's predictive score on the training data
bnb.score(X_train, y_train)

0.7468958930276982

In [40]:
#test the model on unseen data
#score predictive values in variable
y_pred = gnb.predict(X_test)

In [41]:
#Confusion matrix shows which values model predicted correctly vs incorrectly

cm = pd.DataFrame(
    confusion_matrix(y_test, y_pred),
    columns=['Predicted Not Survived', 'Predicted Survived'],
    index=['True Not Survived', 'True Survived']
)

cm

Unnamed: 0,Predicted Not Survived,Predicted Survived
True Not Survived,143,23
True Survived,27,69


In [42]:
#predictive score of the model on the test data
bnb.score(X_test, y_test)

0.7824427480916031

In [43]:
#predictive score of the model for each predictive category
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.84      0.86      0.85       166
           1       0.75      0.72      0.73        96

   micro avg       0.81      0.81      0.81       262
   macro avg       0.80      0.79      0.79       262
weighted avg       0.81      0.81      0.81       262



The Gaussian model seems to perform better. The decision tree seems to be the best predictor of survival. 