# Naive Bayes Classifier

> Called "naive" because of the simplifying assumption that the features are conditionally independent of each other given the class.

- https://machinelearningmastery.com/naive-bayes-classifier-scratch-python/

In [63]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

from pprint import pprint

# typing
from typing import List, Dict, Tuple
import numpy.typing as npt

sns.set()
plt.rcParams["figure.figsize"] = (10, 4)

## Sample Data

> Generate sample data for the algorithm


In [64]:
# Toy flu data set: four categorical symptoms plus the `have_flu` label.
# The last row carries no label -- it is the case we want to classify.
data = {
    "chills": ['Y', 'Y', 'Y', 'N', 'N', 'N', 'N', 'Y', 'Y'],
    "runny_nose": ['N', 'Y', 'N', 'Y', 'N', 'Y', 'Y', 'Y', 'N'],
    "headache": ['Mild', 'No', 'Strong', 'Mild', 'No', 'Strong', 'Strong', 'Mild', 'Mild'],
    "fever": ['Y', 'N', 'Y', 'Y', 'N', 'Y', 'N', 'Y', 'N'],
    "have_flu": ['No', 'Yes', 'Yes', 'Yes', 'No', 'Yes', 'No', 'Yes', None],
}
# Converting the whole frame turns each column into its own categorical.
df = pd.DataFrame(data).astype("category")

# Slices (rather than a scalar index) keep both parts as DataFrames.
train = df.iloc[:-1]
test = df.iloc[-1:]
train.shape, test.shape

((8, 5), (1, 5))

In [65]:
train

Unnamed: 0,chills,runny_nose,headache,fever,have_flu
0,Y,N,Mild,Y,No
1,Y,Y,No,N,Yes
2,Y,N,Strong,Y,Yes
3,N,Y,Mild,Y,Yes
4,N,N,No,N,No
5,N,Y,Strong,Y,Yes
6,N,Y,Strong,N,No
7,Y,Y,Mild,Y,Yes


In [66]:
test

Unnamed: 0,chills,runny_nose,headache,fever,have_flu
8,Y,N,Mild,N,


## Modelling with Naive Bayes

- Using `scikit-learn` function
- Implementing with Python from Scratch
- ...

### NaiveBayesClassifier

- https://scikit-learn.org/stable/modules/naive_bayes.html

### From Scratch

In [67]:
# 1. Lay out the probability table: class -> column -> category -> probability.
#    Every entry starts as None and is filled in by the cells below.
probs_dict = {}

for target in train['have_flu'].cat.categories:
    probs_dict[target] = {
        col: dict.fromkeys(df[col].cat.categories)
        for col in train.columns
    }

pprint(probs_dict)

{'No': {'chills': {'N': None, 'Y': None},
        'fever': {'N': None, 'Y': None},
        'have_flu': {'No': None, 'Yes': None},
        'headache': {'Mild': None, 'No': None, 'Strong': None},
        'runny_nose': {'N': None, 'Y': None}},
 'Yes': {'chills': {'N': None, 'Y': None},
         'fever': {'N': None, 'Y': None},
         'have_flu': {'No': None, 'Yes': None},
         'headache': {'Mild': None, 'No': None, 'Strong': None},
         'runny_nose': {'N': None, 'Y': None}}}


In [68]:
# 2. Priors P(class): the share of training rows carrying each label,
#    i.e. what we believe about the target before "seeing" any feature.
n_train = len(train)

for target in train['have_flu'].cat.categories:
    n_in_class = (train['have_flu'] == target).sum()
    probs_dict[target]['have_flu'] = n_in_class / n_train

pprint(probs_dict)

{'No': {'chills': {'N': None, 'Y': None},
        'fever': {'N': None, 'Y': None},
        'have_flu': 0.375,
        'headache': {'Mild': None, 'No': None, 'Strong': None},
        'runny_nose': {'N': None, 'Y': None}},
 'Yes': {'chills': {'N': None, 'Y': None},
         'fever': {'N': None, 'Y': None},
         'have_flu': 0.625,
         'headache': {'Mild': None, 'No': None, 'Strong': None},
         'runny_nose': {'N': None, 'Y': None}}}


In [69]:
# 3. For each class, estimate the conditional probabilities (likelihoods)
#    p(x_i = category | class) as relative frequencies.
#    Only the training rows are used: fitting on `df` would also include the
#    held-out row, which is excluded today only because its label is missing.

for target in train['have_flu'].cat.categories:
    tmp_probs = probs_dict[target]
    # training rows that belong to the current class
    tmp_df = train[train['have_flu'] == target]

    # every feature column, i.e. everything except the target
    for col in train.columns.drop('have_flu'):

        # every category the feature can take, including ones unseen in this class
        for category in train[col].cat.categories:
            # share of the class rows in which the feature equals `category`
            tmp_probs[col][category] = (tmp_df[col] == category).sum() / tmp_df.shape[0]

pprint(probs_dict)

{'No': {'chills': {'N': 0.6666666666666666, 'Y': 0.3333333333333333},
        'fever': {'N': 0.6666666666666666, 'Y': 0.3333333333333333},
        'have_flu': 0.375,
        'headache': {'Mild': 0.3333333333333333,
                     'No': 0.3333333333333333,
                     'Strong': 0.3333333333333333},
        'runny_nose': {'N': 0.6666666666666666, 'Y': 0.3333333333333333}},
 'Yes': {'chills': {'N': 0.4, 'Y': 0.6},
         'fever': {'N': 0.2, 'Y': 0.8},
         'have_flu': 0.625,
         'headache': {'Mild': 0.4, 'No': 0.2, 'Strong': 0.4},
         'runny_nose': {'N': 0.2, 'Y': 0.8}}}


In [70]:
# Score the held-out case per class: the product of the feature likelihoods
# times the class prior (an unnormalized posterior).
for target in df['have_flu'].cat.categories:
    tmp_probs = probs_dict[target]
    feature_cols = test.columns.drop('have_flu')

    # likelihoods first, in column order ...
    prob = 1
    for col in feature_cols:
        observed = test[col].values[0]
        prob = prob * tmp_probs[col][observed]
    # ... then the prior
    prob = prob * tmp_probs['have_flu']

    pprint(': '.join([target, str(prob)]))

'No: 0.018518518518518517'
'Yes: 0.006'


### From Scratch with collections

In [71]:
from collections import Counter, defaultdict

In [72]:
# Class labels of the training rows (still a pandas Categorical).
target = train["have_flu"].values
target

['No', 'Yes', 'Yes', 'Yes', 'No', 'Yes', 'No', 'Yes']
Categories (2, object): ['No', 'Yes']

In [73]:
# Feature matrix of the training rows: everything except the label column.
data = train.drop(columns="have_flu").values
data

array([['Y', 'N', 'Mild', 'Y'],
       ['Y', 'Y', 'No', 'N'],
       ['Y', 'N', 'Strong', 'Y'],
       ['N', 'Y', 'Mild', 'Y'],
       ['N', 'N', 'No', 'N'],
       ['N', 'Y', 'Strong', 'Y'],
       ['N', 'Y', 'Strong', 'N'],
       ['Y', 'Y', 'Mild', 'Y']], dtype=object)

In [74]:
def occurrences(outcome: List[str]) -> Dict[str, float]:
    """Return the relative frequency of each distinct value in ``outcome``.

    Parameters
    ----------
    outcome
        Sized iterable of hashable values, e.g. the class labels or the
        observed values of one categorical feature.

    Returns
    -------
    dict
        Maps every distinct value to ``count / len(outcome)``, in order of
        first appearance. An empty input yields an empty dict.
    """
    no_of_examples = len(outcome)
    # Counter gives the raw counts; dividing by the total turns them into proportions.
    return {
        value: count / no_of_examples
        for value, count in Counter(outcome).items()
    }

occurrences(target)

{'No': 0.375, 'Yes': 0.625}

In [75]:
classes = np.unique(train.have_flu)  # the distinct class labels
rows, cols = data.shape

# one {column index -> list of observed values} mapping per class
likelihoods = {cls: defaultdict(list) for cls in classes}

for cls in classes:
    # keep only the samples that belong to the current class
    row_indices, = np.where(target == cls)
    subset = data[row_indices, :]
    r, c = subset.shape
    # collect, column by column, the values observed within this class
    for j in range(c):
        likelihoods[cls][j].extend(subset[:, j].tolist())

pprint(likelihoods)

{'No': defaultdict(<class 'list'>,
                   {0: ['Y', 'N', 'N'],
                    1: ['N', 'N', 'Y'],
                    2: ['Mild', 'No', 'Strong'],
                    3: ['Y', 'N', 'N']}),
 'Yes': defaultdict(<class 'list'>,
                    {0: ['Y', 'Y', 'N', 'N', 'Y'],
                     1: ['Y', 'N', 'Y', 'Y', 'Y'],
                     2: ['No', 'Strong', 'Mild', 'Strong', 'Mild'],
                     3: ['N', 'Y', 'Y', 'Y', 'Y']})}


In [76]:
# Turn each per-class list of observed values into conditional probabilities
# p(value | class).
for cls in classes:
    # Iterate over the columns actually stored for this class instead of
    # relying on a loop variable left over from an earlier cell.
    for j, observed in likelihoods[cls].items():
        # Only raw value lists are converted, so re-running the cell leaves
        # already-computed probability tables untouched.
        if isinstance(observed, list):
            likelihoods[cls][j] = occurrences(observed)

pprint(likelihoods)

{'No': defaultdict(<class 'list'>,
                   {0: {'N': 0.6666666666666666, 'Y': 0.3333333333333333},
                    1: {'N': 0.6666666666666666, 'Y': 0.3333333333333333},
                    2: {'Mild': 0.3333333333333333,
                        'No': 0.3333333333333333,
                        'Strong': 0.3333333333333333},
                    3: {'N': 0.6666666666666666, 'Y': 0.3333333333333333}}),
 'Yes': defaultdict(<class 'list'>,
                    {0: {'N': 0.4, 'Y': 0.6},
                     1: {'N': 0.2, 'Y': 0.8},
                     2: {'Mild': 0.4, 'No': 0.2, 'Strong': 0.4},
                     3: {'N': 0.2, 'Y': 0.8}})}


In [77]:
# Priors P(class) and the feature values of the case to classify.
class_probabilities = occurrences(target)
test_array = test.drop("have_flu", axis=1).values[0]

# Unnormalized posterior per class: prior times the product of the likelihoods.
prediction = {}
for cls in classes:
    class_probability = class_probabilities[cls]
    for i, value in enumerate(test_array):
        # a value never observed for this class contributes a likelihood of 0
        class_probability *= likelihoods[cls][i].get(value, 0)
    # store once per class, after all features have been folded in; this also
    # records the bare prior when there are no feature columns at all
    prediction[cls] = class_probability

prediction

{'No': 0.018518518518518517, 'Yes': 0.006000000000000002}

## Sample Data with Continuous Variables

In [78]:
# Toy data with continuous features: `gender` is the target,
# `height`, `weight` and `foot_size` are the features.
df = pd.DataFrame(
    {
        'gender': ['male', 'male', 'male', 'male', 'female', 'female', 'female', 'female'],
        'height': [6, 5.92, 5.58, 5.92, 5, 5.5, 5.42, 5.75],
        'weight': [180, 190, 170, 165, 100, 150, 130, 150],
        'foot_size': [12, 11, 12, 10, 6, 8, 7, 9],
    }
).astype({'gender': 'category'})

df

Unnamed: 0,gender,height,weight,foot_size
0,male,6.0,180,12
1,male,5.92,190,11
2,male,5.58,170,12
3,male,5.92,165,10
4,female,5.0,100,6
5,female,5.5,150,8
6,female,5.42,130,7
7,female,5.75,150,9


In [79]:
# A single unlabeled observation to classify (features only, no `gender`).
test = pd.DataFrame({'height': [6], 'weight': [130], 'foot_size': [8]})

# View the data
test

Unnamed: 0,height,weight,foot_size
0,6,130,8


In [80]:
def gaussian_func(x, mu, sigma):
    """Evaluate the normal (Gaussian) probability density at ``x``.

    Parameters
    ----------
    x
        Point (or array of points) at which the density is evaluated.
    mu
        Mean of the distribution.
    sigma
        Standard deviation of the distribution; it is used as a divisor.

    Returns
    -------
    The density ``1 / (sqrt(2*pi) * sigma) * exp(-(x - mu)**2 / (2 * sigma**2))``.
    """
    normalizer = 1 / (np.sqrt(2 * np.pi) * sigma)
    exponent = -((x - mu) ** 2) / (2 * sigma ** 2)
    return normalizer * np.exp(exponent)

# 1/(np.sqrt(2*np.pi*variance_y)) * np.exp((-(x-mean_y)**2)/(2*variance_y))

## Modelling with Naive Bayes

### From Scratch

In [81]:
# Lay out the parameter table: class -> column -> [mean, standard deviation].
# Every column gets its own fresh [None, None] placeholder.
probs_dict = {}

for target in df['gender'].cat.categories:
    probs_dict[target] = {col: [None, None] for col in df.columns}

pprint(probs_dict)

{'female': {'foot_size': [None, None],
            'gender': [None, None],
            'height': [None, None],
            'weight': [None, None]},
 'male': {'foot_size': [None, None],
          'gender': [None, None],
          'height': [None, None],
          'weight': [None, None]}}


In [82]:
# Priors P(class): the share of rows carrying each gender label.
n_rows = len(df)
for target in df['gender'].cat.categories:
    n_in_class = (df['gender'] == target).sum()
    probs_dict[target]['gender'] = n_in_class / n_rows
pprint(probs_dict)

{'female': {'foot_size': [None, None],
            'gender': 0.5,
            'height': [None, None],
            'weight': [None, None]},
 'male': {'foot_size': [None, None],
          'gender': 0.5,
          'height': [None, None],
          'weight': [None, None]}}


In [83]:
# Fit one Gaussian per (class, feature): store [mean, std] of the feature
# computed over the rows of that class only.
for target in df['gender'].cat.categories:
    tmp_probs = probs_dict[target]
    tmp_df = df.loc[df['gender'] == target]
    # every feature column, i.e. everything except the target
    for col in df.columns.drop('gender'):
        feature = tmp_df[col]
        # index 0 holds the mean, index 1 the standard deviation
        tmp_probs[col] = [feature.mean(), feature.std()]
pprint(probs_dict)

{'female': {'foot_size': [7.5, 1.2909944487358056],
            'gender': 0.5,
            'height': [5.4175, 0.3118092365533773],
            'weight': [132.5, 23.629078131263043]},
 'male': {'foot_size': [11.25, 0.9574271077563381],
          'gender': 0.5,
          'height': [5.855, 0.18717193521821937],
          'weight': [176.25, 11.086778913041726]}}


In [84]:
# Score the observation per class: the product of the per-feature Gaussian
# densities times the class prior (an unnormalized posterior).
for target in df['gender'].cat.categories:
    tmp_probs = probs_dict[target]
    prob = 1
    # `test` holds feature columns only, so every column contributes a likelihood
    for col in test.columns:
        mu, sigma = tmp_probs[col]
        prob = prob * gaussian_func(test[col].values[0], mu, sigma)
    # finally weigh by the prior
    prob = prob * tmp_probs['gender']
    print(': '.join([target, str(prob)]))

female: 0.0005377909183630024
male: 6.197071843878095e-09
