# Important Library

In [110]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import BernoulliNB
from sklearn.metrics import accuracy_score , confusion_matrix

# Loading Data Set

In [111]:
data = pd.read_csv("train.csv")

In [112]:
data.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [113]:
data.shape

(891, 12)

In [114]:
data.isna().sum()

PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64

In [115]:
data1 = data.drop(['PassengerId','Name','Ticket','Ticket','Cabin'],axis=1)

In [116]:
data1.head()

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
0,0,3,male,22.0,1,0,7.25,S
1,1,1,female,38.0,1,0,71.2833,C
2,1,3,female,26.0,0,0,7.925,S
3,1,1,female,35.0,1,0,53.1,S
4,0,3,male,35.0,0,0,8.05,S


# Converting categorical variable into numerical variable

In [117]:
le = LabelEncoder()

In [118]:
data1['Sex'] = le.fit_transform(data1['Sex'])
data1['Embarked'] = le.fit_transform(data1['Embarked'].astype('str'))

In [119]:
data1.head(2)

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
0,0,3,1,22.0,1,0,7.25,2
1,1,1,0,38.0,1,0,71.2833,0


# Handling  missing value

In [120]:
data1['Age'].mean()

29.69911764705882

In [121]:
new_age_var = np.where(data1['Age'],29,data1['Age'])

In [122]:
data1['Age'] = new_age_var

In [123]:
data1.isna().sum()

Survived    0
Pclass      0
Sex         0
Age         0
SibSp       0
Parch       0
Fare        0
Embarked    0
dtype: int64

In [124]:
data1.head()

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
0,0,3,1,29.0,1,0,7.25,2
1,1,1,0,29.0,1,0,71.2833,0
2,1,3,0,29.0,0,0,7.925,2
3,1,1,0,29.0,1,0,53.1,2
4,0,3,1,29.0,0,0,8.05,2


# Converting numerical varible into categorical

In [125]:
l = []
for i in data1['Age']:
    if i >30 :
        l.append(1)
    else:
        l.append(0)
        

In [126]:
data1['Age'] = pd.DataFrame(l)

In [127]:
data1['Age'].value_counts()

0    891
Name: Age, dtype: int64

In [128]:
data1['Fare'].mean()

32.204207968574636

In [129]:
l1 = []
for i in data1['Fare']:
    if i >33 :
        l1.append(1)
    else:
        l1.append(0)

In [130]:
data1['Fare'] = pd.DataFrame(l1)

In [131]:
data1['Fare'].value_counts()

0    685
1    206
Name: Fare, dtype: int64

In [132]:
data1.head()

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
0,0,3,1,0,1,0,0,2
1,1,1,0,0,1,0,1,0
2,1,3,0,0,0,0,0,2
3,1,1,0,0,1,0,1,2
4,0,3,1,0,0,0,0,2


# Spliting data into train and test

In [133]:
X = data1.drop('Survived',axis=1)
y = data1['Survived']

In [134]:
X_train,X_test , y_train , y_test = train_test_split(X,y,test_size=.25,random_state=0)

In [135]:
print(X_train.shape,X_test.shape,y_train.shape,y_test.shape)

(668, 7) (223, 7) (668,) (223,)


# Naive Bayes classification

In [136]:
nb = BernoulliNB()

In [137]:
y_pred = nb.fit(X_train , y_train).predict(X_test)

In [138]:
print("Accuracy : ",accuracy_score(y_test,y_pred,normalize=True))

Accuracy :  0.7488789237668162


In [139]:
print("Confusion_matrix :")
print(confusion_matrix(y_test,y_pred))

Confusion_matrix :
[[104  35]
 [ 21  63]]


In [140]:
#Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare','Embarked'


# General Function for Naive Bayes Classification

In [141]:
def NaiveBayes(X,y):
    X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=.25,random_state=0)
    y_pred = nb.fit(X_train , y_train).predict(X_test)
    print("Accuracy : ",accuracy_score(y_test,y_pred,normalize=True))
    print("Confusion_matrix :")
    print(confusion_matrix(y_test,y_pred))
    

In [142]:
# DV -> Pclass 
# IDV -> remaining
X = data1.drop('Pclass',axis=1)
y = data1['Pclass']

In [143]:
NaiveBayes(X,y)

Accuracy :  0.6771300448430493
Confusion_matrix :
[[ 37   0  16]
 [  4  10  32]
 [ 13   7 104]]


In [144]:
# DV ->  Sex
# IDV -> remaining
X = data1.drop('Sex',axis=1)
y = data1['Sex']

In [145]:
NaiveBayes(X,y)

Accuracy :  0.7085201793721974
Confusion_matrix :
[[ 45  38]
 [ 27 113]]


In [146]:
# DV -> Age 
# IDV -> remaining
X = data1.drop('Age',axis=1)
y = data1['Age']

In [147]:
NaiveBayes(X,y)

Accuracy :  1.0
Confusion_matrix :
[[223]]


In [148]:
# DV -> SibSp 
# IDV -> remaining
X = data1.drop('SibSp',axis=1)
y = data1['SibSp']

In [149]:
NaiveBayes(X,y)

Accuracy :  0.6771300448430493
Confusion_matrix :
[[129  21   0   0   0   0]
 [ 34  22   0   0   0   0]
 [  2   3   0   0   0   0]
 [  5   1   0   0   0   0]
 [  2   3   0   0   0   0]
 [  0   1   0   0   0   0]]


In [150]:
# DV -> Parch 
# IDV -> remaining
X = data1.drop('Parch',axis=1)
y = data1['Parch']

In [151]:
NaiveBayes(X,y)

Accuracy :  0.7354260089686099
Confusion_matrix :
[[154  15   2   0   0]
 [ 23  10   0   0   0]
 [ 13   3   0   0   0]
 [  2   0   0   0   0]
 [  1   0   0   0   0]]


In [152]:
# DV -> Fare 
# IDV -> remaining
X = data1.drop('Fare',axis=1)
y = data1['Fare']

In [153]:
NaiveBayes(X,y)

Accuracy :  0.7309417040358744
Confusion_matrix :
[[145  31]
 [ 29  18]]


In [154]:
# DV -> Embarked
# IDV -> remaining
X = data1.drop('Embarked',axis=1)
y = data1['Embarked']

In [155]:
NaiveBayes(X,y)

Accuracy :  0.6905829596412556
Confusion_matrix :
[[  0   0  52]
 [  0   0  17]
 [  0   0 154]]
