# Naive Bayes

In [1]:
from seaborn import load_dataset
from sklearn.model_selection import train_test_split
import numpy as np
# Load only the two columns this notebook works with: the label to predict
# (species) and the single feature used to predict it (island).
penguins = load_dataset(
    'penguins',
    usecols=['species', 'island'],
)

In [2]:
# Peek at a few random rows. Seeding the sample keeps the displayed rows
# identical across Restart & Run All.
penguins.sample(5, random_state=42)

Unnamed: 0,species,island
193,Chinstrap,Dream
111,Adelie,Biscoe
108,Adelie,Biscoe
304,Gentoo,Biscoe
69,Adelie,Torgersen


## Train-Test Split

In [3]:
# Split the rows into a training frame and a held-out test frame. The fixed
# random_state makes the shuffle, and every number derived from it, repeatable.
train, test = train_test_split(
    penguins,
    random_state=42,
)

## Theory

Predict species $s$ based on island $i$.

Ultimately, I want to know $P(s | i)$.

$\rightarrow$ Use Bayes's Theorem!

$\large P(s | i) = \frac{P(s)P(i | s)}{P(i)}$.

## Priors

In [4]:
# Number of training penguins per species; these counts are the numerators
# of the prior probabilities P(species).
vc = train.loc[:, 'species'].value_counts()
vc

Adelie       112
Gentoo        96
Chinstrap     50
Name: species, dtype: int64

In [5]:
# Prior P(species) = (count of that species) / (count of all species),
# computed for Adelie, Gentoo and Chinstrap in that order.
prob_ad, prob_gt, prob_cs = (
    vc[name] / vc.sum()
    for name in ('Adelie', 'Gentoo', 'Chinstrap')
)

In [6]:
# Map each species label to its prior explicitly. Pairing by position with
# vc.index would silently mislabel the priors whenever the frequency order
# of the species differs from Adelie, Gentoo, Chinstrap (e.g. another split).
priors = {
    'Adelie': prob_ad,
    'Gentoo': prob_gt,
    'Chinstrap': prob_cs,
}
priors

{'Adelie': 0.43410852713178294,
 'Gentoo': 0.37209302325581395,
 'Chinstrap': 0.1937984496124031}

## Likelihoods

In [7]:
# One training sub-frame per species, used to inspect where each species lives.
adelie, gentoo, chinstrap = (
    train.loc[train['species'].eq(name)]
    for name in ('Adelie', 'Gentoo', 'Chinstrap')
)

In [8]:
# Island counts within each species (the raw material for P(island | species)).
# Each heading and its table are printed on separate lines via sep='\n'.
print(' Adelie:', adelie['island'].value_counts(), sep='\n')
print('\n Gentoo:', gentoo['island'].value_counts(), sep='\n')
print('\n Chinstrap:', chinstrap['island'].value_counts(), sep='\n')

 Adelie:
Dream        43
Torgersen    37
Biscoe       32
Name: island, dtype: int64

 Gentoo:
Biscoe    96
Name: island, dtype: int64

 Chinstrap:
Dream    50
Name: island, dtype: int64


In [9]:
# Likelihoods P(island | species), keyed as "<island>_given_<species>".
likelihoods = {}

for species in train['species'].unique():
    # The species subset and its island counts do not depend on the inner
    # loop, so compute them once per species.
    spec_group = train[train['species'] == species]
    total = len(spec_group)
    island_counts = spec_group['island'].value_counts()

    for island in train['island'].unique():
        # An island where this species was never observed has no entry in
        # the counts; test for that explicitly instead of catching every
        # possible exception with a bare `except`.
        if island in island_counts.index:
            likelihoods[f"{island}_given_{species}"] = island_counts[island] / total
        else:
            likelihoods[f"{island}_given_{species}"] = 0

likelihoods

{'Torgersen_given_Adelie': 0.33035714285714285,
 'Biscoe_given_Adelie': 0.2857142857142857,
 'Dream_given_Adelie': 0.38392857142857145,
 'Torgersen_given_Gentoo': 0,
 'Biscoe_given_Gentoo': 1.0,
 'Dream_given_Gentoo': 0,
 'Torgersen_given_Chinstrap': 0,
 'Biscoe_given_Chinstrap': 0,
 'Dream_given_Chinstrap': 1.0}

## Probability of Evidence (Island)

In [10]:
# Evidence P(island) by the law of total probability:
#   P(island) = sum over species of P(species) * P(island | species)
# np.sum is used for the total; the posterior cells divide a plain list by
# these values.
scalers = {
    island: np.sum([
        priors[species] * likelihoods[f'{island}_given_{species}']
        for species in ('Adelie', 'Gentoo', 'Chinstrap')
    ])
    for island in train['island'].unique()
}

In [11]:
# Sanity check: the island probabilities form a distribution, so they must
# total 1 (up to floating-point rounding).
assert np.isclose(sum(scalers.values()), 1.0), "P(island) values do not sum to 1"
scalers

{'Torgersen': 0.1434108527131783,
 'Biscoe': 0.49612403100775193,
 'Dream': 0.3604651162790698}

## Posteriors

In [12]:
# Numerators of Bayes's theorem for island = Biscoe: P(s) * P(Biscoe | s).
# Despite the *_given_* names these are unnormalised joint probabilities;
# only posts_bis holds the actual posteriors.
ad_given_bis = priors['Adelie'] * likelihoods['Biscoe_given_Adelie']
gt_given_bis = priors['Gentoo'] * likelihoods['Biscoe_given_Gentoo']
cs_given_bis = priors['Chinstrap'] * likelihoods['Biscoe_given_Chinstrap']

# Normalise by P(Biscoe); order is [Adelie, Gentoo, Chinstrap].
# Build the array explicitly: dividing a plain list by the scaler only works
# when the scaler happens to be a NumPy scalar.
posts_bis = (
    np.array([ad_given_bis, gt_given_bis, cs_given_bis]) / scalers['Biscoe']
)

In [13]:
# Numerators of Bayes's theorem for island = Dream: P(s) * P(Dream | s).
# Despite the *_given_* names these are unnormalised joint probabilities;
# only posts_drm holds the actual posteriors.
ad_given_drm = priors['Adelie'] * likelihoods['Dream_given_Adelie']
gt_given_drm = priors['Gentoo'] * likelihoods['Dream_given_Gentoo']
cs_given_drm = priors['Chinstrap'] * likelihoods['Dream_given_Chinstrap']

# Normalise by P(Dream); order is [Adelie, Gentoo, Chinstrap].
# Build the array explicitly: dividing a plain list by the scaler only works
# when the scaler happens to be a NumPy scalar.
posts_drm = (
    np.array([ad_given_drm, gt_given_drm, cs_given_drm]) / scalers['Dream']
)

In [14]:
# Numerators of Bayes's theorem for island = Torgersen: P(s) * P(Torgersen | s).
# Despite the *_given_* names these are unnormalised joint probabilities;
# only posts_tor holds the actual posteriors.
ad_given_tor = priors['Adelie'] * likelihoods['Torgersen_given_Adelie']
gt_given_tor = priors['Gentoo'] * likelihoods['Torgersen_given_Gentoo']
cs_given_tor = priors['Chinstrap'] * likelihoods['Torgersen_given_Chinstrap']

# Normalise by P(Torgersen); order is [Adelie, Gentoo, Chinstrap].
# Build the array explicitly: dividing a plain list by the scaler only works
# when the scaler happens to be a NumPy scalar.
posts_tor = (
    np.array([ad_given_tor, gt_given_tor, cs_given_tor]) / scalers['Torgersen']
)

## Prediction

### Input: Biscoe

Input is: Biscoe. What do we predict?

Ans.: Compare
- P(Adelie | Biscoe)
- P(Gentoo | Biscoe)
- P(Chinstrap | Biscoe)

In [15]:
# Posterior for each species given island = Biscoe
# (posts_bis is ordered Adelie, Gentoo, Chinstrap).
print(
    "probs:\n"
    f"Adelie: {posts_bis[0]},\n"
    f"Gentoo: {posts_bis[1]},\n"
    f"Chinstrap: {posts_bis[2]}\n"
)

probs:
Adelie: 0.25,
Gentoo: 0.75,
Chinstrap: 0.0



So we'll predict Gentoo!

### Input: Dream

Input is: Dream. What do we predict?

Ans.: Compare
- P(Adelie | Dream)
- P(Gentoo | Dream)
- P(Chinstrap | Dream)

In [16]:
# Posterior for each species given island = Dream
# (posts_drm is ordered Adelie, Gentoo, Chinstrap).
print(
    "probs:\n"
    f"Adelie: {posts_drm[0]},\n"
    f"Gentoo: {posts_drm[1]},\n"
    f"Chinstrap: {posts_drm[2]}\n"
)

probs:
Adelie: 0.4623655913978495,
Gentoo: 0.0,
Chinstrap: 0.5376344086021505



So we'll predict Chinstrap!

### Input: Torgersen

Input is: Torgersen. What do we predict?

Ans.: Compare
- P(Adelie | Torgersen)
- P(Gentoo | Torgersen)
- P(Chinstrap | Torgersen)

In [17]:
# Posterior for each species given island = Torgersen
# (posts_tor is ordered Adelie, Gentoo, Chinstrap).
print(
    "probs:\n"
    f"Adelie: {posts_tor[0]},\n"
    f"Gentoo: {posts_tor[1]},\n"
    f"Chinstrap: {posts_tor[2]}\n"
)

probs:
Adelie: 1.0,
Gentoo: 0.0,
Chinstrap: 0.0



So we'll predict Adelie!

## Evaluate

In [18]:
# Apply the decision rule read off the posteriors (each island maps to its
# most probable species) to the training rows, then score it.
train_copy = train.assign(
    preds=train['island'].map({'Biscoe': 'Gentoo',
                               'Dream': 'Chinstrap',
                               'Torgersen': 'Adelie'}),
)
train_copy = train_copy.assign(
    correct=train_copy['species'].eq(train_copy['preds']),
)

# Accuracy = correctly predicted rows / all rows.
print(
    "Training Accuracy:\n"
    f"{train_copy['correct'].sum() / len(train_copy)}\n"
)

Training Accuracy:
0.7093023255813954



In [19]:
# Apply the same island -> species decision rule to the held-out rows to
# see how well it generalises.
test_copy = test.assign(
    preds=test['island'].map({'Biscoe': 'Gentoo',
                              'Dream': 'Chinstrap',
                              'Torgersen': 'Adelie'}),
)
test_copy = test_copy.assign(
    correct=test_copy['species'].eq(test_copy['preds']),
)

# Accuracy = correctly predicted rows / all rows.
print(
    "Testing Accuracy:\n"
    f"{test_copy['correct'].sum() / len(test_copy)}\n"
)

Testing Accuracy:
0.7093023255813954



## Faster Way / Verify Bayes's Theorem

Couldn't we just calculate the posteriors directly?

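Yes! With a single categorical feature, $P(s | i)$ is simply the fraction of training penguins on island $i$ that belong to species $s$, so counting within each island reproduces the posteriors from Bayes's Theorem (up to floating-point rounding).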

In [20]:
# Posteriors P(species | island) by direct counting, keyed as
# "<species>_given_<island>".
posteriors = {}

for island in train['island'].unique():
    # The island subset and its species counts do not depend on the inner
    # loop, so compute them once per island.
    isl_group = train[train['island'] == island]
    total = len(isl_group)
    species_counts = isl_group['species'].value_counts()

    for species in train['species'].unique():
        # A species never observed on this island has no entry in the
        # counts; test for that explicitly instead of catching every
        # possible exception with a bare `except`.
        if species in species_counts.index:
            posteriors[f"{species}_given_{island}"] = species_counts[species] / total
        else:
            posteriors[f"{species}_given_{island}"] = 0

posteriors

{'Adelie_given_Torgersen': 1.0,
 'Gentoo_given_Torgersen': 0,
 'Chinstrap_given_Torgersen': 0,
 'Adelie_given_Biscoe': 0.25,
 'Gentoo_given_Biscoe': 0.75,
 'Chinstrap_given_Biscoe': 0,
 'Adelie_given_Dream': 0.46236559139784944,
 'Gentoo_given_Dream': 0,
 'Chinstrap_given_Dream': 0.5376344086021505}