In [1]:
import matplotlib.pyplot as plt
import numpy as np
import scipy
import scipy.special
import sklearn.datasets as skdata

# Helper Functions


In [2]:
def load_iris():
    """Load the Iris dataset through scikit-learn.

    Returns
    -------
    D : the ``"data"`` entry of the scikit-learn bunch, transposed.
    L : the ``"target"`` entry of the same bunch.
    """
    # Load the dataset once and read both fields from the same object.
    iris = skdata.load_iris()
    return iris["data"].T, iris["target"]


def split_db_2to1(D, L, seed=0):
    """Randomly split a dataset into 2/3 training and 1/3 evaluation samples.

    Parameters
    ----------
    D : 2-D array whose columns are the samples (it is indexed as ``D[:, idx]``).
    L : 1-D array of labels, indexed with the same sample indices as ``D``.
    seed : seed of the shuffling; the same seed always yields the same split.

    Returns
    -------
    ``(DTR, LTR), (DTE, LTE)``: training data/labels and evaluation data/labels.
    """
    n_samples = D.shape[1]
    nTrain = int(n_samples * 2.0 / 3.0)

    # A private legacy RandomState seeded with `seed` produces the same
    # permutation as np.random.seed(seed) + np.random.permutation(...), but
    # does not reseed the process-wide generator as a side effect.
    idx = np.random.RandomState(seed).permutation(n_samples)
    idxTrain, idxTest = idx[:nTrain], idx[nTrain:]

    return (D[:, idxTrain], L[idxTrain]), (D[:, idxTest], L[idxTest])


def logpdf_GAU_ND(X, mu, C):
    """Log-density of a multivariate Gaussian, evaluated at every column of X.

    Parameters
    ----------
    X : array of samples, one per column; promoted to 2-D with
        ``np.atleast_2d``, so scalar / 1-D input is treated as a single row.
    mu : mean, either a column vector of shape (d, 1) or a flat vector of
        shape (d,) (a scalar is accepted for 1-dimensional data).
    C : covariance matrix of shape (d, d) (a scalar is accepted for
        1-dimensional data).

    Returns
    -------
    1-D array with one log-density per column of ``X``.
    """
    X = np.atleast_2d(X)
    C = np.atleast_2d(C)
    mu = np.atleast_1d(mu)
    if mu.ndim == 1:
        # A flat mean must become a column: left as (d,), it would broadcast
        # along the sample axis of X (silently wrong whenever N == d).
        mu = mu.reshape(-1, 1)

    centered = X - mu
    # Solve C z = (x - mu) rather than forming inv(C) explicitly, then take
    # the column-wise dot product to get (x - mu)^T C^-1 (x - mu) per sample.
    mahalanobis = np.einsum("ij,ij->j", centered, np.linalg.solve(C, centered))
    log_det = np.linalg.slogdet(C)[1]

    return -0.5 * (X.shape[0] * np.log(2 * np.pi) + log_det + mahalanobis)


def log_likelihood_GAU_ND(X, mu, C):
    """Total Gaussian log-likelihood of X: the per-sample log-densities
    returned by ``logpdf_GAU_ND(X, mu, C)``, summed into a single number."""
    per_sample = logpdf_GAU_ND(X, mu, C)
    return per_sample.sum()

# Load the reference solutions used to test the results


In [3]:
# Reference arrays that the cells below compare their results against with
# np.allclose. The directory is named once so it can be changed in one place.
SOLUTIONS_DIR = "solutions"

# Binary task: log-likelihood ratios
llr_MVG = np.load(f"{SOLUTIONS_DIR}/llr_MVG.npy")

# Log marginal densities
logMarginal_MVG = np.load(f"{SOLUTIONS_DIR}/logMarginal_MVG.npy")
logMarginal_NaiveBayes = np.load(f"{SOLUTIONS_DIR}/logMarginal_NaiveBayes.npy")
logMarginal_TiedMVG = np.load(f"{SOLUTIONS_DIR}/logMarginal_TiedMVG.npy")

# Log posteriors
logPosterior_MVG = np.load(f"{SOLUTIONS_DIR}/logPosterior_MVG.npy")
logPosterior_NaiveBayes = np.load(f"{SOLUTIONS_DIR}/logPosterior_NaiveBayes.npy")
logPosterior_TiedMVG = np.load(f"{SOLUTIONS_DIR}/logPosterior_TiedMVG.npy")

# Log joint densities
logSJoint_MVG = np.load(f"{SOLUTIONS_DIR}/logSJoint_MVG.npy")
logSJoint_NaiveBayes = np.load(f"{SOLUTIONS_DIR}/logSJoint_NaiveBayes.npy")
logSJoint_TiedMVG = np.load(f"{SOLUTIONS_DIR}/logSJoint_TiedMVG.npy")

# Posteriors
Posterior_MVG = np.load(f"{SOLUTIONS_DIR}/Posterior_MVG.npy")
Posterior_NaiveBayes = np.load(f"{SOLUTIONS_DIR}/Posterior_NaiveBayes.npy")
Posterior_TiedMVG = np.load(f"{SOLUTIONS_DIR}/Posterior_TiedMVG.npy")

# Joint densities
SJoint_MVG = np.load(f"{SOLUTIONS_DIR}/SJoint_MVG.npy")
SJoint_NaiveBayes = np.load(f"{SOLUTIONS_DIR}/SJoint_NaiveBayes.npy")
SJoint_TiedMVG = np.load(f"{SOLUTIONS_DIR}/SJoint_TiedMVG.npy")

In [4]:
# Load the full dataset, then split it with the default seed.
D, L = load_iris()
train_split, eval_split = split_db_2to1(D, L)

# DTR and LTR are training data and labels, DTE and LTE are evaluation data and labels
DTR, LTR = train_split
DTE, LTE = eval_split

# Multivariate Gaussian Classifier

We compute the ML estimates for the classifier parameters $(\mu_0, \Sigma_0), (\mu_1, \Sigma_1), (\mu_2, \Sigma_2)$ using the training data. We then compute the likelihoods:

$f_{\boldsymbol{X} \mid C}\left(\boldsymbol{x}_t \mid c\right)=\mathcal{N}\left(\boldsymbol{x}_t \mid \boldsymbol{\mu}_c^*, \boldsymbol{\Sigma}_c^*\right)$


In [11]:
# The classes the model knows about are those seen in the *training* labels;
# evaluation labels must not be used to define the model.
classes = np.unique(LTR)

# One row per class, one column per evaluation sample.
S = np.zeros((len(classes), DTE.shape[1]))

print(DTE.shape)

for i, label in enumerate(classes):
    # Training samples of the current class (selected once, used twice).
    DTR_c = DTR[:, LTR == label]

    # ML estimates: empirical mean and biased (1/N) covariance.
    mu = np.mean(DTR_c, axis=1, keepdims=True)
    C = np.cov(DTR_c, bias=True)
    print(f"Class {i}: mu = {mu}, C = {C}")

    log_pdf = logpdf_GAU_ND(DTE, mu, C)

    likelihood = np.exp(log_pdf)

    print(f"Likelihoods of class {i} = {likelihood}")

    S[i, :] = likelihood

(4, 50)
Class 0: mu = [[4.96129032]
 [3.42903226]
 [1.46451613]
 [0.2483871 ]], C = [[0.13140479 0.11370447 0.02862643 0.01187305]
 [0.11370447 0.16270552 0.01844953 0.01117586]
 [0.02862643 0.01844953 0.03583767 0.00526535]
 [0.01187305 0.01117586 0.00526535 0.0108845 ]]
Likelihoods of class 0 = [4.75727828e+000 3.12730541e+000 8.18872691e-062 6.74418077e-182
 4.25115977e-209 1.11347892e+001 8.20663072e+000 5.33350172e+000
 5.65483426e-075 4.14843994e-073 2.02642087e+000 3.77771958e-001
 9.83889710e-067 7.82296707e-001 1.46007373e-245 4.78978412e-034
 1.84384639e-183 1.67069032e-072 4.56056472e-001 4.42425541e-106
 1.72117487e-002 8.08671263e-213 8.79017787e+000 6.13458968e+000
 1.33960789e-172 1.07167476e+001 1.80360838e-176 5.20856446e-061
 3.67983764e-052 9.13121973e-037 1.77931741e-155 5.64070816e-152
 1.69675933e-099 1.26208799e-107 2.40102284e+000 3.49099515e-043
 7.08090459e-176 3.20056873e-127 1.08532794e+001 1.51993171e-077
 2.16083562e-060 1.00671519e-105 3.56156641e-076 2.2

## Matrix of Joint Densities

$f_{\boldsymbol{X}, C}\left(\boldsymbol{x}_t, c\right)=f_{\boldsymbol{X} \mid C}\left(\boldsymbol{x}_t \mid c\right) P_C(c)$


In [47]:
# Multiply each row by the prior (1/3)
# Uniform class prior P(c) = 1 / (number of classes); with the three Iris
# classes this is the 1/3 used by the reference solution.
SJoint = S / len(classes)

assert np.allclose(SJoint, SJoint_MVG)

[4.75727828e+000 3.12730541e+000 8.18872691e-062 6.74418077e-182
 4.25115977e-209 1.11347892e+001 8.20663072e+000 5.33350172e+000
 5.65483426e-075 4.14843994e-073 2.02642087e+000 3.77771958e-001
 9.83889710e-067 7.82296707e-001 1.46007373e-245 4.78978412e-034
 1.84384639e-183 1.67069032e-072 4.56056472e-001 4.42425541e-106
 1.72117487e-002 8.08671263e-213 8.79017787e+000 6.13458968e+000
 1.33960789e-172 1.07167476e+001 1.80360838e-176 5.20856446e-061
 3.67983764e-052 9.13121973e-037 1.77931741e-155 5.64070816e-152
 1.69675933e-099 1.26208799e-107 2.40102284e+000 3.49099515e-043
 7.08090459e-176 3.20056873e-127 1.08532794e+001 1.51993171e-077
 2.16083562e-060 1.00671519e-105 3.56156641e-076 2.27780042e-001
 4.92487274e+000 3.15819965e+000 3.71452449e-141 6.06520927e-053
 3.16498493e-212 9.29266297e+000]
--------------
[1.58575943e+000 1.04243514e+000 2.72957564e-062 2.24806026e-182
 1.41705326e-209 3.71159641e+000 2.73554357e+000 1.77783391e+000
 1.88494475e-075 1.38281331e-073 6.754736

## Class Posterior Probabilities

$P\left(C=c \mid \boldsymbol{X}=\boldsymbol{x}_t\right)=\frac{f_{\boldsymbol{X}, C}\left(\boldsymbol{x}_t, c\right)}{\sum_{c^{\prime}} f_{\boldsymbol{X}, C}\left(\boldsymbol{x}_t, c^{\prime}\right)}$


In [48]:
# Marginal density of each evaluation sample: sum of the joint over the class rows.
SMarginal = SJoint.sum(axis=0)

# Normalise every column of the joint by its marginal to get the posteriors.
SPost = SJoint / SMarginal

assert np.allclose(SPost, Posterior_MVG)

In [49]:
# For each sample, pick the row (class index) with the highest posterior.
predictions = SPost.argmax(axis=0)

print(f"Predictions: {predictions}")

Predictions: [0 0 1 2 2 0 0 0 1 1 0 0 1 0 2 1 2 1 0 2 0 2 0 0 2 0 2 1 1 1 2 2 2 1 0 1 2
 2 0 1 1 2 1 0 0 0 2 1 2 0]


### Accuracy


In [52]:
# Fraction of evaluation samples whose predicted class equals the true label.
acc = (predictions == LTE).mean()

print(f"Accuracy: {acc}")

Accuracy: 0.96


### Error Rate


In [53]:
# The error rate is the complement of the accuracy.
err = 1.0 - acc

print(f"Error rate: {err}")

Error rate: 0.040000000000000036


## Joint Log Density

$l_c=\log f_{\boldsymbol{X}, C}\left(\boldsymbol{x}_t, c\right)=\log f_{\boldsymbol{X} \mid C}\left(\boldsymbol{x}_t \mid c\right)+\log P_C(c)$


In [55]:
# Work directly in the log domain: l_c = log f(x | c) + log P(c).
# Taking np.log of the exponentiated densities would turn any likelihood that
# underflows to 0 into -inf, which is exactly what log-densities avoid.
logS = np.vstack(
    [
        logpdf_GAU_ND(
            DTE,
            np.mean(DTR[:, LTR == label], axis=1, keepdims=True),
            np.cov(DTR[:, LTR == label], bias=True),
        )
        for label in classes
    ]
)

# Uniform prior over the classes.
logSJoint = logS + np.log(1.0 / len(classes))

assert np.allclose(logSJoint, logSJoint_MVG)

## Log Marginal Density

### log-sum-exp trick

$\log f_{\boldsymbol{X}}\left(\boldsymbol{x}_t\right)=\log \sum_c e^{l_c}$

Computing the exponential might result in numerical errors, so we rewrite the sum as:

$\log \sum_c e^{l_c}=l+\log \sum_c e^{l_c-l}$

where $l=\max_c l_c$


In [58]:
# Log-sum-exp over axis 0 (the class rows of logSJoint) gives one log marginal
# density per evaluation sample.
logSMarginal = scipy.special.logsumexp(logSJoint, axis=0)

# Check against the reference solution.
assert np.allclose(logSMarginal, logMarginal_MVG)

## Log Posterior

$\log P\left(C=c \mid \boldsymbol{X}=\boldsymbol{x}_t\right)=\log f_{\boldsymbol{X}, C}\left(\boldsymbol{x}_t, c\right)-\log f_{\boldsymbol{X}}\left(\boldsymbol{x}_t\right)$


In [60]:
# Subtract the per-sample log marginal (broadcast over the class rows) from the
# log joint to obtain the log posteriors.
logSPost = logSJoint - logSMarginal

assert np.allclose(logSPost, logPosterior_MVG)

# Back to the probability domain; checked against the reference posteriors.
SPost = np.exp(logSPost)

assert np.allclose(SPost, Posterior_MVG)

# Naive Bayes Gaussian Classifier

The Naive Bayes version of the Multivariate Gaussian classifier is simply a Gaussian classifier whose covariance matrices are diagonal. The ML solution for the covariance matrices is therefore:

$\operatorname{diag}\left(\boldsymbol{\Sigma}_c^*\right)=\operatorname{diag}\left[\frac{1}{N_c} \sum_i\left(\boldsymbol{x}_{c, i}-\boldsymbol{\mu}_c^*\right)\left(\boldsymbol{x}_{c, i}-\boldsymbol{\mu}_c^*\right)^T\right]$


In [65]:
# Fresh score matrix for this model (one row per class, one column per
# evaluation sample), so the array filled by the MVG cell is not overwritten
# in place and this cell can be re-run on its own.
S = np.zeros((len(classes), DTE.shape[1]))

for i, label in enumerate(classes):
    # Training samples of the current class (selected once, used twice).
    DTR_c = DTR[:, LTR == label]
    mu = np.mean(DTR_c, axis=1, keepdims=True)
    C = np.cov(DTR_c, bias=True)

    # Make the covariance matrix diagonal
    C = np.diag(np.diag(C))

    log_pdf = logpdf_GAU_ND(DTE, mu, C)
    likelihood = np.exp(log_pdf)
    S[i, :] = likelihood

# Uniform class prior P(c) = 1 / (number of classes), i.e. 1/3 here
SJoint = S / len(classes)

assert np.allclose(SJoint, SJoint_NaiveBayes)

# Compute the marginal likelihood
SMarginal = np.sum(SJoint, axis=0)

assert np.allclose(SMarginal, np.exp(logMarginal_NaiveBayes))

# Compute the posterior
SPost = SJoint / SMarginal

assert np.allclose(SPost, Posterior_NaiveBayes)

# Compute the predictions
predictions = np.argmax(SPost, axis=0)
accuracy = np.mean(predictions == LTE)
error_rate = 1 - accuracy

print(f"Accuracy: {accuracy}")
print(f"Error rate: {error_rate}")

Accuracy: 0.96
Error rate: 0.040000000000000036


# Tied Covariance Gaussian Classifier

Similar to the previous classifier, the only ML estimate that changes is the covariance matrix. The ML estimate for the covariance matrix is:

$\boldsymbol{\Sigma}^*=\frac{1}{N} \sum_c \sum_i\left(\boldsymbol{x}_{c, i}-\boldsymbol{\mu}_c^*\right)\left(\boldsymbol{x}_{c, i}-\boldsymbol{\mu}_c^*\right)^T$

(We notice that this is the within-class scatter matrix.)


In [83]:
# Number of training samples in each class; used to weight the class covariances.
weights = np.array([np.sum(LTR == c) for c in classes])

# Shared covariance: sample-count-weighted average of the biased (1/N_c)
# per-class covariance matrices.
Sw = np.average(
    [np.atleast_2d(np.cov(DTR[:, LTR == c], bias=True)) for c in classes],
    axis=0,
    weights=weights,
)

# Fresh score matrix for this model, so earlier results are not overwritten in
# place and this cell can be re-run on its own.
S = np.zeros((len(classes), DTE.shape[1]))

for i, label in enumerate(classes):
    # Local name for the class subset: reusing `D` here would clobber the
    # full dataset loaded at the top of the notebook.
    DTR_c = DTR[:, LTR == label]
    mu = np.mean(DTR_c, axis=1, keepdims=True)

    log_pdf = logpdf_GAU_ND(DTE, mu, Sw)
    likelihood = np.exp(log_pdf)
    S[i, :] = likelihood

# Uniform class prior P(c) = 1 / (number of classes), i.e. 1/3 here
SJoint = S / len(classes)

assert np.allclose(SJoint, SJoint_TiedMVG)

# Compute the marginal likelihood
SMarginal = np.sum(SJoint, axis=0)

assert np.allclose(SMarginal, np.exp(logMarginal_TiedMVG))

# Compute the posterior
SPost = SJoint / SMarginal

assert np.allclose(SPost, Posterior_TiedMVG)

# Compute the predictions
predictions = np.argmax(SPost, axis=0)
accuracy = np.mean(predictions == LTE)
error_rate = 1 - accuracy

print(f"Accuracy: {accuracy}")
print(f"Error rate: {error_rate}")

Accuracy: 0.98
Error rate: 0.020000000000000018


# Binary tasks: log-likelihood Ratios and MVG

We can proceed in the same way, but for binary tasks we can cast the classification as a comparison of a score, the _log-likelihood ratio_, with a threshold _t_ that depends on the class priors.

Assuming that class 2 is the _true_ class and class 1 is the _false_ class, the log-likelihood ratio is:

$s(x_t) = llr(x_t) = \log \frac{f_{\boldsymbol{X}|C}(x_t | 2)}{f_{\boldsymbol{X}|C}(x_t | 1)} = \log \frac{\mathcal{N}(x_t | {\mu}_2, {\Sigma}_2)}{\mathcal{N}(x_t | {\mu}_1, {\Sigma}_1)} = \log \mathcal{N}(x_t | {\mu}_2, {\Sigma}_2) - \log \mathcal{N}(x_t | {\mu}_1, {\Sigma}_1)$


In [107]:
# Restrict the dataset to class 1 and 2 (Versicolor and Virginica)

DIris, LIris = load_iris()

# Keep only the samples whose label is not 0, i.e. classes 1 and 2.
keep = LIris != 0
D, L = DIris[:, keep], LIris[keep]

# Training / validation split of the two-class subset.
(DTR, LTR), (DVAL, LVAL) = split_db_2to1(D, L)

In [108]:
# The classes of the model come from the *training* labels, not from the
# validation labels.
classes = np.unique(LTR)

# Class-conditional log-densities: one row per class, one column per
# validation sample.
S = np.zeros((len(classes), DVAL.shape[1]))

for i, label in enumerate(classes):
    # Training samples of the current class (selected once, used twice).
    DTR_c = DTR[:, LTR == label]
    mu = np.mean(DTR_c, axis=1, keepdims=True)
    C = np.cov(DTR_c, bias=True)

    S[i, :] = logpdf_GAU_ND(DVAL, mu, C)

# Compute the log-likelihood ratio: row 1 (the larger label) minus row 0
llr = S[1] - S[0]

assert np.allclose(llr, llr_MVG)

## Threshold

We assume uniform priors $P(C = 2) = P(C = 1) = 0.5$ so the threshold becomes `0`:

$t = -\log \frac{P(C = 2)}{P(C = 1)} = 0$


In [109]:
# Predict the "true" class whenever the score exceeds the threshold 0.
predictions = llr > 0

# Boolean ground truth: which validation samples carry label 2.
is_true_class = LVAL == 2
accuracy = (predictions == is_true_class).mean()
error_rate = 1 - accuracy

print(f"Accuracy: {accuracy}")
print(f"Error rate: {error_rate}")

Accuracy: 0.9117647058823529
Error rate: 0.08823529411764708
