<a href="https://colab.research.google.com/github/garya171/pmlbe/blob/main/ch_2_part1.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Import Standard Python Modules  

In [1]:
import torch
import numpy as np
import matplotlib.pyplot as plt
import torch
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
import sklearn
%matplotlib inline

# Import Extra Python Modules  

In [2]:
from collections import defaultdict

# Data Loading

In [3]:
# Toy training set: each row is one sample, each column one binary (0/1) feature.
X_train = np.array(
    [
        [0, 1, 1],
        [0, 0, 1],
        [0, 0, 0],
        [1, 1, 0],
    ]
)
# Class label for each row of X_train, listed in row order.
Y_train = ['Y', 'N', 'Y', 'Y']
# A single unseen sample to classify.
X_test = np.array([[1, 0, 0]])

# Data Preprocessing and Feature Engineering

## Scan the Data

## Visualize the Data

## Dealing with Missing Data

### Imputing Data with the Mean, Median, or Mode

## Label Encoding

In [4]:
def get_label_indices(labels):
    """
    Map every distinct label to the positions at which it occurs
    @param labels: iterable of labels, one entry per sample
    @return: defaultdict(list), {label: [positions of that label, in ascending order]}
    """
    positions_by_label = defaultdict(list)
    for position, sample_label in enumerate(labels):
        # defaultdict creates the empty list the first time a label is seen
        positions_by_label[sample_label].append(position)
    return positions_by_label

In [5]:
# Group the training-sample positions by class label; the grouping is reused
# below to estimate the prior and the likelihood.
label_indices = get_label_indices(Y_train)
print('label_indices:\n', label_indices)

label_indices:
 defaultdict(<class 'list'>, {'Y': [0, 2, 3], 'N': [1]})


### One-hot Encoding, Sparse Matrix

### Dense Embedding

## Scaling

### Gaussian: mean = 0, variance = 1

### Interquartile Range

## Feature Engineering

### Polynomial Transformation

### Binning

## Combining Models

### Voting and Averaging

### Bagging / Bootstrap Aggregating

### Boosting

### Stacking

# Create and Run the Engine

In [23]:
def get_prior(label_indices):
    """
    Estimate the prior of each class as its share of the training samples
    @param label_indices: dict, {class label: [indices of the samples in that class]}
    @return: dictionary, with class label as key and the fraction of all
    samples that belong to that class as the value
    """
    class_counts = {}
    for label, indices in label_indices.items():
        class_counts[label] = len(indices)
    # Shows the raw per-class sample counts before they are turned into fractions.
    print('prior:', class_counts)
    n_samples = sum(class_counts.values())
    return {label: count / n_samples for label, count in class_counts.items()}

In [24]:
# Class priors estimated from the label grouping built earlier in the notebook.
prior = get_prior(label_indices)
print('Prior:', prior)

prior: {'Y': 3, 'N': 1}
Prior: {'Y': 0.75, 'N': 0.25}


In [25]:
def get_likelihood(features, label_indices, smoothing=0):
    """
    Estimate, for every class, the probability that each feature is set
    @param features: 2-D array of 0/1 features, one row per sample
    @param label_indices: dict, {class label: [row indices of the samples in that class]}
    @param smoothing: additive smoothing parameter, added to every per-class
                      feature count (default 0 means no smoothing)
    @return: dictionary, with class as key and the vector of conditional
             probabilities P(feature|class) as value
    """
    likelihood = {}
    for label, indices in label_indices.items():
        # Per-feature count of set values among this class's rows, plus smoothing.
        smoothed_counts = features[indices, :].sum(axis=0) + smoothing
        print('likelihood[label]:', smoothed_counts)
        # Smoothing is added twice to the denominator: once for each of the
        # two values a binary feature can take.
        denominator = len(indices) + 2 * smoothing
        likelihood[label] = smoothed_counts / denominator
    return likelihood

In [26]:
print('label_indices:\n', label_indices)
# smoothing = 1 adds one to every per-class feature count, so no conditional
# probability is estimated as exactly 0 or 1.
smoothing = 1
likelihood = get_likelihood(X_train, label_indices, smoothing)
print('Likelihood:\n', likelihood)

label_indices:
 defaultdict(<class 'list'>, {'Y': [0, 2, 3], 'N': [1]})
likelihood[label]: [2 3 2]
likelihood[label]: [1 1 2]
Likelihood:
 {'Y': array([0.4, 0.6, 0.4]), 'N': array([0.33333333, 0.33333333, 0.66666667])}


In [27]:
def get_posterior(X, prior, likelihood):
    """
    Compute the posterior class probabilities of each testing sample
    @param X: testing samples, an iterable of 0/1 feature vectors
    @param prior: dictionary, with class label as key, corresponding prior as the value
    @param likelihood: dictionary, with class label as key, corresponding
                       conditional probability vector P(feature|class) as value
    @return: list with one dictionary per testing sample, each mapping class
             label to the normalized posterior of that class

    NOTE(review): if every class score of a sample is 0 the normalization
    divides by zero; callers should use smoothing to avoid that.
    """
    results = []
    for sample in X:
        # Start from the prior; the posterior is proportional to prior * likelihood.
        scores = prior.copy()
        for label, feature_probs in likelihood.items():
            for position, is_set in enumerate(sample):
                if is_set:
                    factor = feature_probs[position]
                else:
                    # Probability that the feature is absent for this class.
                    factor = 1 - feature_probs[position]
                scores[label] *= factor
        # Rescale so that the scores of one sample sum to 1.
        total = sum(scores.values())
        for label in scores:
            if scores[label] != float('inf'):
                scores[label] /= total
            else:
                # An infinite score is reported as certainty for that class.
                scores[label] = 1.0
        results.append(scores)
    return results

In [28]:
# Posterior class probabilities for each row of X_test, using the prior and
# likelihood computed in the cells above.
posterior = get_posterior(X_test, prior, likelihood)
print('Posterior:\n', posterior)

Posterior:
 [{'Y': 0.795417348608838, 'N': 0.20458265139116205}]


In [31]:
####
#### Implementing Naïve Bayes with scikit-learn
####

from sklearn.naive_bayes import BernoulliNB

In [32]:
# alpha=1.0 is the same additive smoothing value used in the hand-written
# version above (smoothing = 1); per the scikit-learn docs, fit_prior=True
# makes the classifier learn the class priors from the training labels.
clf = BernoulliNB(alpha=1.0, fit_prior=True)
clf.fit(X_train, Y_train)

In [33]:
# One row of class probabilities per test sample; per the scikit-learn docs the
# columns follow the order of clf.classes_.
pred_prob = clf.predict_proba(X_test)
print('[scikit-learn] Predicted probabilities:\n', pred_prob)

[scikit-learn] Predicted probabilities:
 [[0.20458265 0.79541735]]


In [34]:
# Predicted class label for each test sample.
pred = clf.predict(X_test)
print('[scikit-learn] Prediction:', pred)

[scikit-learn] Prediction: ['Y']
