# NaiveBayesClassifier

In [1]:
import pandas as pd
import numpy as np
import os

from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score

## Merge all data into one CSV

In [2]:
# look the files
path = "./Datasets/titanic"
files = [file for file in os.listdir(path) if not file.startswith('.')]

# make a empty data frame
all_data = pd.DataFrame()

# concat each file
for file in files:
    current_data = pd.read_csv(path+"/"+file)
    all_data = pd.concat([all_data, current_data])
    
all_data.to_csv("./Datasets/all_titanic_data.csv", index=False)

## Read updated dataframe

In [3]:
df = pd.read_csv("./Datasets/all_titanic_data.csv")

## Check dimensions

In [4]:
df.head()

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Survived
0,892,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,,Q,
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0,,S,
2,894,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,,Q,
3,895,3,"Wirz, Mr. Albert",male,27.0,0,0,315154,8.6625,,S,
4,896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,3101298,12.2875,,S,


In [5]:
df.tail()

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Survived
1304,887,2,"Montvila, Rev. Juozas",male,27.0,0,0,211536,13.0,,S,0.0
1305,888,1,"Graham, Miss. Margaret Edith",female,19.0,0,0,112053,30.0,B42,S,1.0
1306,889,3,"Johnston, Miss. Catherine Helen ""Carrie""",female,,1,2,W./C. 6607,23.45,,S,0.0
1307,890,1,"Behr, Mr. Karl Howell",male,26.0,0,0,111369,30.0,C148,C,1.0
1308,891,3,"Dooley, Mr. Patrick",male,32.0,0,0,370376,7.75,,Q,0.0


In [6]:
df.sample(10)

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Survived
1222,805,3,"Hedman, Mr. Oskar Arvid",male,27.0,0,0,347089,6.975,,S,1.0
1063,646,1,"Harper, Mr. Henry Sleeper",male,48.0,1,0,PC 17572,76.7292,D33,C,1.0
953,536,2,"Hart, Miss. Eva Miriam",female,7.0,0,2,F.C.C. 13529,26.25,,S,1.0
158,1050,1,"Borebank, Mr. John James",male,42.0,0,0,110489,26.55,D22,S,
1229,812,3,"Lester, Mr. James",male,39.0,0,0,A/4 48871,24.15,,S,0.0
1284,867,2,"Duran y More, Miss. Asuncion",female,27.0,1,0,SC/PARIS 2149,13.8583,,C,1.0
613,196,1,"Lurette, Miss. Elise",female,58.0,0,0,PC 17569,146.5208,B80,C,1.0
635,218,2,"Jacobsohn, Mr. Sidney Samuel",male,42.0,1,0,243847,27.0,,S,0.0
225,1117,3,"Moubarek, Mrs. George (Omine Amenia"" Alexander)""",female,,0,2,2661,15.2458,,C,
1195,778,3,"Emanuel, Miss. Virginia Ethel",female,5.0,0,0,364516,12.475,,S,1.0


In [7]:
df.shape

(1309, 12)

## Use only sex and survived

In [8]:
df = df[['Sex', 'Survived']]

In [9]:
df.head()

Unnamed: 0,Sex,Survived
0,male,
1,female,
2,male,
3,male,
4,female,


## Change sex to categoric value

In [10]:
df['Sex'] = df['Sex'].astype('category', copy=False).cat.codes

In [11]:
df.head()

Unnamed: 0,Sex,Survived
0,1,
1,0,
2,1,
3,1,
4,0,


## Check for Null values

In [12]:
df.isnull().any()

Sex         False
Survived     True
dtype: bool

### Drop null values

In [13]:
df = df.dropna()

In [14]:
df.shape

(891, 2)

In [15]:
df.reset_index(drop = True)

Unnamed: 0,Sex,Survived
0,1,0.0
1,0,1.0
2,0,1.0
3,0,1.0
4,1,0.0
...,...,...
886,1,0.0
887,0,1.0
888,0,0.0
889,1,1.0


## Machine Learning Model

In [16]:
features = df[['Sex','Survived']]
label = df['Survived']

### Train & Test split

In [17]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(features,
                                                    label,
                                                    test_size = 0.20,
                                                    random_state = 101)

### Examinate

In [18]:
X_train.shape, X_test.shape

((712, 2), (179, 2))

In [19]:
y_train.shape, y_test.shape

((712,), (179,))

#### Survival Train Stats

In [20]:
survival_num_train = y_train.value_counts()
survival_num_train

0.0    450
1.0    262
Name: Survived, dtype: int64

In [21]:
survival_prob_train = (survival_num_train[1]/ len(y_train))*100
survival_prob_train

36.79775280898877

#### Survival Test Stats

In [22]:
survival_num_test = y_test.value_counts()
survival_num_test

0.0    99
1.0    80
Name: Survived, dtype: int64

In [23]:
survival_prob_test = (survival_num_test[1]/len(y_test))*100
survival_prob_test

44.6927374301676

#### Sex ratio test

In [24]:
x_test_men = X_test.loc[X_test['Sex']==1]
x_test_men.shape

(114, 2)

In [25]:
x_test_women = X_test.loc[X_test['Sex']==0]
x_test_women.shape

(65, 2)

#### Men Survived

In [26]:
survival_men_test = x_test_men['Survived'].value_counts()
survival_men_test

0.0    88
1.0    26
Name: Survived, dtype: int64

In [27]:
survival_prob_men_test = (survival_men_test[1]/len(x_test_men['Survived']))*100
survival_prob_men_test

22.807017543859647

#### Women survived

In [28]:
survival_women_test = x_test_women['Survived'].value_counts()
survival_women_test

1.0    54
0.0    11
Name: Survived, dtype: int64

In [29]:
survival_prob_women_test = (survival_women_test[1]/len(x_test_women['Survived']))*100
survival_prob_women_test

83.07692307692308

## Training model

In [30]:
X_train = X_train.drop('Survived', axis=1)
X_test = X_test.drop('Survived', axis=1)

## GaussianNB

In [31]:
model = GaussianNB()

### Fit

In [32]:
model.fit(X_train, y_train)

GaussianNB()

### Predict

In [33]:
y_pred = model.predict(X_test)

### Accuracy score

In [34]:
accuracy_score(y_test, y_pred)

0.7932960893854749

## Compare values

In [35]:
X_test['Actual Survived'] = y_test
X_test['Predicted Survived'] = y_pred

In [36]:
X_test

Unnamed: 0,Sex,Actual Survived,Predicted Survived
749,1,0.0,0.0
1118,0,1.0,1.0
1166,1,0.0,0.0
1169,1,1.0,0.0
899,1,0.0,0.0
...,...,...,...
572,1,0.0,0.0
872,1,0.0,0.0
1297,0,1.0,1.0
1189,1,0.0,0.0


In [39]:
x_test_men = X_test.loc[X_test['Sex'] == 1]
x_test_women = X_test.loc[X_test['Sex'] == 0]

In [40]:
accuracy_score(x_test_men['Actual Survived'], x_test_men['Predicted Survived'])

0.7719298245614035

In [41]:
accuracy_score(x_test_women['Actual Survived'], x_test_women['Predicted Survived'])

0.8307692307692308