# Niave Bayes Example Notebook

### Notebook describes how to calculate accuracy, split datasets, create a confusion matrix, use sklearn, spark, and perform the calculation by hand with Niave Bayes with different methods.

Table of Contents:
- Read in data
- Perform calcualtion by hand
- Use sklearn
- Use Spark


In [1]:
from io import StringIO
import requests
import json
import pandas as pd



df_data_1 = pd.read_csv('../data/titanic/train.csv')
df_data_1.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [9]:
# Niave Bayes Model by Hand!
countDict = {}
target = df_data_1['Survived']
count = df_data_1['Survived'].count()
target_countDict = target.value_counts().to_dict()
input_features = df_data_1[['Pclass', 'Sex', 'SibSp', 'Parch', 'Embarked', 'Survived']]

# Make Model
for val in input_features['Survived'].unique():
    countDict[val]={}
    filterDF = input_features[input_features['Survived'] == val]
    for column in filterDF:
        if column != 'Survived':
            mike = filterDF[column].value_counts().to_dict()
            countDict[val][column] = mike
print("Survived (Target): ", target_countDict)
print("Input Features (Training): ", countDict)


# Predict new value
test = [2, 'female', 2, 3, 'C']
result={}
for val in target_countDict.keys():
    print("\n", val)
    result[val] = target_countDict[val] / count
    print(result[val])
    for i, item in enumerate(test):
        print(val, list(countDict[0].keys())[i]) 
        print(countDict[val][list(countDict[0].keys())[i]][item] / target_countDict[val])
        result[val] = result[val] * (countDict[val][list(countDict[0].keys())[i]][item] / target_countDict[val])
        print(result[val])

# Get max
max = ('', 0)
for r in result:
    max=(r, result[r])
print("\nPrediction:\n","  Input: ", test, "\n   Output: ", max)

Survived (Target):  {0: 549, 1: 342}
Input Features (Training):  {0: {'Pclass': {3: 372, 2: 97, 1: 80}, 'Sex': {'male': 468, 'female': 81}, 'SibSp': {0: 398, 1: 97, 2: 15, 4: 15, 3: 12, 8: 7, 5: 5}, 'Parch': {0: 445, 1: 53, 2: 40, 4: 4, 5: 4, 3: 2, 6: 1}, 'Embarked': {'S': 427, 'C': 75, 'Q': 47}}, 1: {'Pclass': {1: 136, 3: 119, 2: 87}, 'Sex': {'female': 233, 'male': 109}, 'SibSp': {0: 210, 1: 112, 2: 13, 3: 4, 4: 3}, 'Parch': {0: 233, 1: 65, 2: 40, 3: 3, 5: 1}, 'Embarked': {'S': 217, 'C': 93, 'Q': 30}}}

 0
0.6161616161616161
0 Pclass
0.1766848816029144
0.10886644219977554
0 Sex
0.14754098360655737
0.016062261963901307
0 SibSp
0.0273224043715847
0.0004388596165000357
0 Parch
0.0036429872495446266
1.5987599872496747e-06
0 Embarked
0.1366120218579235
2.1840983432372608e-07

 1
0.3838383838383838
1 Pclass
0.2543859649122807
0.09764309764309764
1 Sex
0.6812865497076024
0.06652292909602851
1 SibSp
0.038011695906432746
0.002528649351603423
1 Parch
0.008771929824561403
2.218113466318792e-05
1

In [10]:
import numpy as np
df_data_1['Sex'] = np.where(df_data_1['Sex'] == 'male', 0, 1)
df_data_1['Embarked'] = df_data_1['Embarked'].map({'S': 0, 'C': 1, 'Q': 2})
df_data_1['Age'] = df_data_1['Age'].fillna(df_data_1['Age'].mean())
print(df_data_1['Embarked'].mode())
df_data_1['Embarked'] = df_data_1['Embarked'].fillna(0)
df_data_1.head()

0    0.0
dtype: float64


Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",0,22.0,1,0,A/5 21171,7.25,,0.0
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",1,38.0,1,0,PC 17599,71.2833,C85,1.0
2,3,1,3,"Heikkinen, Miss. Laina",1,26.0,0,0,STON/O2. 3101282,7.925,,0.0
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",1,35.0,1,0,113803,53.1,C123,0.0
4,5,0,3,"Allen, Mr. William Henry",0,35.0,0,0,373450,8.05,,0.0


In [11]:
from sklearn.naive_bayes import GaussianNB
import numpy as np
from sklearn import preprocessing
from sklearn.metrics import confusion_matrix

#Create a Gaussian Classifier
model = GaussianNB()

In [12]:
# Separate into feature set and target
target = df_data_1['Survived'].values
inputData = df_data_1[['Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare', 'Embarked']].values

# Train & Predict (Use training data)
model.fit(inputData, target)
pred = model.predict(inputData)

#Show results
print("Number of mislabeled items out of %d: %d"
      % (inputData.shape[0],(target != pred).sum()))
print("Number of correctly labeled items out of %d: %d"
      % (inputData.shape[0],(target == pred).sum()))
cnf_matrix = confusion_matrix(target, pred)
print("Confusion Matrix:\n",cnf_matrix)

Number of mislabeled items out of 891: 184
Number of correctly labeled items out of 891: 707
Confusion Matrix:
 [[463  86]
 [ 98 244]]


In [6]:
# Randomize and split
df_data_1 = df_data_1.sample(frac=1)
split = int(df_data_1['PassengerId'].count()*0.7)
test = df_data_1.iloc[split:, :]
train = df_data_1.iloc[:split, :]

# Separate into feature set and target
test_target = test['Survived'].values
test_inputData = test[['Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare', 'Embarked']].values
train_target = train['Survived'].values
train_inputData = train[['Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare', 'Embarked']].values

# Train & Predict (Use training data)
model.fit(train_inputData, train_target)
pred = model.predict(test_inputData)

# Show results
print("Number of mislabeled items out of %d: %d"
      % (test_inputData.shape[0],(test_target != pred).sum()))
print("Number of correctly labeled items out of %d: %d"
      % (test_inputData.shape[0],(test_target == pred).sum()))
cnf_matrix = confusion_matrix(test_target, pred)
print("Confusion Matrix:\n",cnf_matrix)

Number of mislabeled items out of 268: 57
Number of correctly labeled items out of 268: 211
Confusion Matrix:
 [[132  37]
 [ 20  79]]


In [23]:
import pyspark
from pyspark.sql import SparkSession

spark = SparkSession.builder.appName('MG').getOrCreate()

#df = spark.read.csv("/Users/megildei@us.ibm.com/Downloads/titanic/train.csv")
df_data_2 = spark.read.format("csv").option("header", "true").load("/Users/megildei@us.ibm.com/Downloads/titanic/train.csv")

df_data_2.head(5)

[Row(PassengerId='1', Survived='0', Pclass='3', Name='Braund, Mr. Owen Harris', Sex='male', Age='22', SibSp='1', Parch='0', Ticket='A/5 21171', Fare='7.25', Cabin=None, Embarked='S'),
 Row(PassengerId='2', Survived='1', Pclass='1', Name='Cumings, Mrs. John Bradley (Florence Briggs Thayer)', Sex='female', Age='38', SibSp='1', Parch='0', Ticket='PC 17599', Fare='71.2833', Cabin='C85', Embarked='C'),
 Row(PassengerId='3', Survived='1', Pclass='3', Name='Heikkinen, Miss. Laina', Sex='female', Age='26', SibSp='0', Parch='0', Ticket='STON/O2. 3101282', Fare='7.925', Cabin=None, Embarked='S'),
 Row(PassengerId='4', Survived='1', Pclass='1', Name='Futrelle, Mrs. Jacques Heath (Lily May Peel)', Sex='female', Age='35', SibSp='1', Parch='0', Ticket='113803', Fare='53.1', Cabin='C123', Embarked='S'),
 Row(PassengerId='5', Survived='0', Pclass='3', Name='Allen, Mr. William Henry', Sex='male', Age='35', SibSp='0', Parch='0', Ticket='373450', Fare='8.05', Cabin=None, Embarked='S')]

In [24]:
from pyspark.mllib.classification import NaiveBayes, NaiveBayesModel
from pyspark.mllib.util import MLUtils
from pyspark.mllib.regression import LabeledPoint
from pyspark.sql.functions import *



# Find null/nan values
df_data_2.select([count(when(isnull(c), c)).alias(c) for c in df_data_2.columns]).show()
df_data_2.select([count(when(isnan(c), c)).alias(c) for c in df_data_2.columns]).show()

# Clean up data
data = df_data_2.withColumn('Sex', regexp_replace('Sex', 'male', '0')).withColumn('Sex', regexp_replace('Sex', 'female', '1')).withColumn('Sex', regexp_replace('Sex', 'fe0', '1'))
data = data.withColumn('Embarked', regexp_replace('Embarked', 'Q', '0')).withColumn('Embarked', regexp_replace('Embarked', 'S', '1')).withColumn('Embarked', regexp_replace('Embarked', 'C', '2'))
data = data.fillna({'Age': 29, 'Embarked': 'S'})
data = data.select("Pclass", "Sex", "Age", "SibSp", "Parch", "Embarked", "Survived").rdd
data = data.map(lambda seq: LabeledPoint(seq[-1],seq[:-2]))


# Split training (60%) / test (40%)
training, test = data.randomSplit([0.6, 0.4])

data.take(5)

+-----------+--------+------+----+---+---+-----+-----+------+----+-----+--------+
|PassengerId|Survived|Pclass|Name|Sex|Age|SibSp|Parch|Ticket|Fare|Cabin|Embarked|
+-----------+--------+------+----+---+---+-----+-----+------+----+-----+--------+
|          0|       0|     0|   0|  0|177|    0|    0|     0|   0|  687|       2|
+-----------+--------+------+----+---+---+-----+-----+------+----+-----+--------+

+-----------+--------+------+----+---+---+-----+-----+------+----+-----+--------+
|PassengerId|Survived|Pclass|Name|Sex|Age|SibSp|Parch|Ticket|Fare|Cabin|Embarked|
+-----------+--------+------+----+---+---+-----+-----+------+----+-----+--------+
|          0|       0|     0|   0|  0|  0|    0|    0|     0|   0|    0|       0|
+-----------+--------+------+----+---+---+-----+-----+------+----+-----+--------+



[LabeledPoint(0.0, [3.0,0.0,22.0,1.0,0.0]),
 LabeledPoint(1.0, [1.0,1.0,38.0,1.0,0.0]),
 LabeledPoint(1.0, [3.0,1.0,26.0,0.0,0.0]),
 LabeledPoint(1.0, [1.0,1.0,35.0,1.0,0.0]),
 LabeledPoint(0.0, [3.0,0.0,35.0,0.0,0.0])]

In [25]:
# Train
model = NaiveBayes.train(training, 1.0)

# Predict and test
predictionAndLabel = test.map(lambda p: (model.predict(p.features), p.label))
accuracy = 1.0 * predictionAndLabel.filter(lambda x: x[0] == x[1]).count() / test.count()
print('Accuracy: %f' % accuracy)
TP = predictionAndLabel.filter(lambda x: x[0] == 1 and x[1] == 1).count()
FP = predictionAndLabel.filter(lambda x: x[0] == 1 and x[1] == 0).count()
TN = predictionAndLabel.filter(lambda x: x[0] == 0 and x[1] == 0).count()
FN = predictionAndLabel.filter(lambda x: x[0] == 0 and x[1] == 1).count()
print('Confusion Matrix:\n %d  | %d \n %d  | %d' % (TP, FP, FN, TN))

Accuracy: 0.797844
Confusion Matrix:
 97  | 31 
 44  | 199
