In [1]:
import pandas as pd
import numpy as np

from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score

In [2]:
# Read the passenger table from disk and preview ten randomly drawn rows.
titanic_df = pd.read_csv("data/titanic.csv")
titanic_df.sample(n=10)

Unnamed: 0,Survived,Pclass,Name,Sex,Age,Siblings/Spouses Aboard,Parents/Children Aboard,Ticket,Fare,Cabin,Embarked,Boat,Home/Destination
610,0.0,3.0,"Ahlin, Mrs. Johan (Johanna Persdotter Larsson)",female,40.0,1.0,0.0,7546,9.475,,S,,"Sweden Akeley, MN"
853,0.0,3.0,"Harmer, Mr. Abraham (David Lishin)",male,25.0,0.0,0.0,374887,7.25,,S,B,
855,0.0,3.0,"Hassan, Mr. Houssein G N",male,11.0,0.0,0.0,2699,18.7875,,C,,
768,0.0,3.0,"Denkoff, Mr. Mitto",male,,0.0,0.0,349225,7.8958,,S,,"Bulgaria Coon Rapids, IA"
637,0.0,3.0,"Aronsson, Mr. Ernst Axel Algot",male,24.0,0.0,0.0,349911,7.775,,S,,"Sweden Joliet, IL"
378,0.0,2.0,"Collyer, Mr. Harvey",male,31.0,1.0,1.0,C.A. 31921,26.25,,S,,"Bishopstoke, Hants / Fayette Valley, ID"
981,1.0,3.0,"Lundstrom, Mr. Thure Edvin",male,32.0,0.0,0.0,350403,7.5792,,S,15,
33,1.0,1.0,"Bonnell, Miss. Elizabeth",female,58.0,0.0,0.0,113783,26.55,C103,S,8,"Birkdale, England Cleveland, Ohio"
1186,0.0,3.0,"Samaan, Mr. Youssef",male,,2.0,0.0,2662,21.6792,,C,,
1157,0.0,3.0,"Rosblom, Mr. Viktor Richard",male,18.0,1.0,1.0,370129,20.2125,,S,,


In [3]:
# (rows, columns) of the frame as loaded from the CSV.
titanic_df.shape

(1310, 13)

In [4]:
# Keep only the single predictor ("Sex") and the target ("Survived").
# The explicit copy makes the result an independent frame, so the column
# assignment performed in the next cell does not write into a slice of the
# loaded data (which is what triggers pandas' SettingWithCopyWarning).
titanic_df = titanic_df[["Sex", "Survived"]].copy()
titanic_df.head()

Unnamed: 0,Sex,Survived
0,female,1.0
1,male,1.0
2,female,0.0
3,male,0.0
4,female,0.0


In [5]:
# Replace the Sex strings with integer category codes. Categories are ordered
# alphabetically, so "female" becomes 0 and "male" becomes 1.
# `assign` builds a new frame rather than writing a column into what may be a
# slice of the loaded data, and the `copy=False` hint is dropped because the
# converted series is consumed immediately.
# NOTE: `.cat.codes` encodes a missing value as -1, so a missing Sex is no
# longer reported by a subsequent null check.
titanic_df = titanic_df.assign(Sex=titanic_df["Sex"].astype("category").cat.codes)
titanic_df.head()

Unnamed: 0,Sex,Survived
0,0,1.0
1,1,1.0
2,0,0.0
3,1,0.0
4,0,0.0


In [6]:
# Per column: does at least one row hold a missing value?
titanic_df.isna().any(axis="index")

Sex         False
Survived     True
dtype: bool

In [7]:
# Discard every row that contains a missing value, then show the new size.
titanic_df = titanic_df.dropna(axis="index", how="any")
titanic_df.shape

(1309, 2)

In [8]:
# Target vector.
label = titanic_df["Survived"]
# "Survived" stays next to "Sex" at this point so that per-sex survival rates
# can be read from the split frames; it is dropped from X_train / X_test
# before the model is fitted.
# NOTE(review): this frame contains the target, so it must never be passed to
# `fit` / `predict` without that drop.
features = titanic_df.loc[:, ["Sex", "Survived"]]

In [9]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for evaluation. The fixed `random_state` makes the
# shuffle repeatable, so the counts, percentages and accuracy computed from
# this split are the same after every kernel restart.
X_train, X_test, y_train, y_test = train_test_split(
    features, label, test_size=0.2, random_state=42
)

In [10]:
# Shapes of the training features and the training labels.
X_train.shape, y_train.shape

((1047, 2), (1047,))

In [11]:
# Shapes of the held-out features and the held-out labels.
X_test.shape, y_test.shape

((262, 2), (262,))

In [12]:
# Number of training rows for each distinct Survived value.
survival_num_train = y_train.value_counts()

In [13]:
# Display the per-class counts of the training labels.
survival_num_train

0.0    647
1.0    400
Name: Survived, dtype: int64

In [14]:
# Share of training rows with Survived == 1, expressed as a percentage.
survival_prob_train = survival_num_train.loc[1.0] / y_train.shape[0] * 100

In [15]:
# Display the training-set survival percentage.
survival_prob_train

38.2043935052531

In [16]:
# Number of held-out rows for each distinct Survived value.
survival_num_test = y_test.value_counts()

In [17]:
# Display the per-class counts of the held-out labels.
survival_num_test

0.0    162
1.0    100
Name: Survived, dtype: int64

In [18]:
# Share of held-out rows with Survived == 1, expressed as a percentage.
survival_prob_test = survival_num_test.loc[1.0] / y_test.shape[0] * 100

In [19]:
# Display the held-out survival percentage.
survival_prob_test

38.16793893129771

In [20]:
# Held-out rows whose encoded Sex is 1 (male), followed by the subset's shape.
X_test_men = X_test[X_test["Sex"].eq(1)]
X_test_men.shape

(170, 2)

In [21]:
# Preview the first rows of the Sex == 1 subset.
X_test_men.head()

Unnamed: 0,Sex,Survived
799,1,0.0
995,1,0.0
404,1,0.0
892,1,0.0
1263,1,0.0


In [22]:
# Held-out rows whose encoded Sex is 0 (female), followed by the subset's shape.
X_test_women = X_test[X_test["Sex"].eq(0)]
X_test_women.shape

(92, 2)

In [23]:
# Preview the first rows of the Sex == 0 subset.
X_test_women.head()

Unnamed: 0,Sex,Survived
582,0,1.0
624,0,0.0
353,0,1.0
213,0,1.0
780,0,1.0


In [24]:
# Per-class Survived counts among the held-out men.
survival_num_men_test = X_test_men.loc[:, "Survived"].value_counts()
survival_num_men_test

0.0    140
1.0     30
Name: Survived, dtype: int64

In [25]:
# Percentage of held-out men with Survived == 1.
survival_prob_men_test = survival_num_men_test.loc[1.0] / len(X_test_men) * 100
survival_prob_men_test

17.647058823529413

In [26]:
# Per-class Survived counts among the held-out women.
survival_num_women_test = X_test_women.loc[:, "Survived"].value_counts()
survival_num_women_test

1.0    70
0.0    22
Name: Survived, dtype: int64

In [27]:
# Percentage of held-out women with Survived == 1.
survival_prob_women_test = survival_num_women_test.loc[1.0] / len(X_test_women) * 100
survival_prob_women_test

76.08695652173914

In [28]:
# Remove the target column from both feature frames so the model only ever
# sees "Sex".
X_train = X_train.drop(columns="Survived")
X_test = X_test.drop(columns="Survived")

In [29]:
# Shapes of the feature frames after the Survived column has been removed.
X_train.shape, X_test.shape

((1047, 1), (262, 1))

In [30]:
# Gaussian naive Bayes classifier with default settings.
# NOTE(review): the only feature is the binary Sex code; a Bernoulli or
# categorical naive Bayes variant may be a more natural fit — confirm that
# GaussianNB is the intended choice.
model = GaussianNB()

In [31]:
# Train the classifier on the training features and labels.
model.fit(X_train, y_train)

GaussianNB()

In [32]:
# Predicted Survived value for every held-out row.
y_pred = model.predict(X_test)

In [33]:
# Fraction of held-out rows whose prediction matches the true label.
accuracy_score(y_true=y_test, y_pred=y_pred)

0.8015267175572519

In [34]:
# Put the true labels (aligned on the row index) and the predictions next to
# the Sex feature for inspection.
# NOTE(review): X_test now carries two extra columns, so re-running the
# prediction cell against it would likely fail on the feature count.
X_test = X_test.assign(
    **{"Actual Survived": y_test, "Predicted Survived": y_pred}
)

In [35]:
# Preview the held-out rows with actual and predicted labels side by side.
X_test.head()

Unnamed: 0,Sex,Actual Survived,Predicted Survived
799,1,0.0,0.0
995,1,0.0,0.0
404,1,0.0,0.0
582,0,1.0,1.0
892,1,0.0,0.0


In [36]:
# Rebuild the per-sex subsets so they include the actual/predicted columns
# (Sex code 1 = male, 0 = female).
X_test_men = X_test[X_test["Sex"].eq(1)]
X_test_women = X_test[X_test["Sex"].eq(0)]

In [37]:
# Accuracy restricted to the held-out men.
accuracy_score(
    y_true=X_test_men["Actual Survived"],
    y_pred=X_test_men["Predicted Survived"],
)

0.8235294117647058