# 3.5 Assignment: Bayes Classifiers as Linear Discriminants

- Name: Congxin (David) Xu
- Computing ID: cx2rx

## Linear Discriminant Analysis Code.py

In [4]:
# This is code for Linear Discriminant Analysis
# Written by William F Basener
# University of Virginia, School of Data Science
# For use in teaching Bayesian Machine Learning
#
# The code currently computes the maximum likelihood classification
# Student is to add method to compute posterior probabilities and 
#   maximum probability classification

import pandas as pd
import numpy as np


def multivariate_gaussian_pdf(X, MU, SIGMA):
    """ Returns the pdf of a multivariate Gaussian distribution at one point.

    Adapted from Data Blog
    https://xavierbourretsicotte.github.io/MLE_Multivariate_Gaussian.html
    Maximum Likelihood Estimator: Multivariate Gaussian Distribution by
    Xavier Bourret Sicotte, Fri 22 June 2018

    Parameters:
     - X: the observation, any array-like with p values (reshaped to p x 1)
     - MU: the mean, any array-like with p values (reshaped to p x 1)
     - SIGMA: the p x p covariance matrix

    Returns:
        The density value as a plain Python float.

    Raises:
        numpy.linalg.LinAlgError if SIGMA cannot be inverted. """
    # Initialize and reshape (asarray lets callers pass plain lists too)
    X = np.asarray(X, dtype=float).reshape(-1, 1)
    MU = np.asarray(MU, dtype=float).reshape(-1, 1)
    SIGMA = np.asarray(SIGMA, dtype=float)
    p, _ = SIGMA.shape

    # Compute values
    SIGMA_inv = np.linalg.inv(SIGMA)
    denominator = np.sqrt((2 * np.pi) ** p * np.linalg.det(SIGMA))
    deviation = X - MU
    # The quadratic form is a 1 x 1 array; pull the scalar out explicitly,
    # because calling float() on an array with ndim > 0 is deprecated in
    # recent NumPy releases.
    quadratic_form = (deviation.T @ SIGMA_inv @ deviation).item()
    exponent = -(1 / 2) * quadratic_form

    # Return result
    return float((1. / denominator) * np.exp(exponent))


class LDA:
    """ Creates a class for Linear Discriminant Analysis
    Input:
        fname = file name for a csv file (passed straight to
        pandas.read_csv), must have one column labeled "Class" and the
        rest numeric data
    Attributes:
        data_labels = the 'Class' column of the csv
        data = the remaining columns as a numpy array
        num_rows, num_cols = shape of data
        class_names = sorted unique class labels
        num_obs = dict: class label -> number of observations
        means = dict: class label -> per-column mean of that class
        cov = single covariance matrix shared by every class
    Methods:
        compute_likelihoods: given an input observation, computes the
            likelihood for each class and the maximum likelihood class
        compute_probabilities: given an input observation and prior
            probabilities, computes the posterior probabilities for each
            class and most probable class"""

    def __init__(self, fname):
        # reads the data and computes the statistics needed for classification

        # read the data as a Pandas data frame
        df = pd.read_csv(fname)

        # separate the class labels from the rest of the data
        # we are assuming the column name with class labels is 'Class'
        # and all other columns are numeric
        self.data_labels = df['Class']
        self.data = np.asarray(df.drop('Class', axis=1, inplace=False))

        # get information about the dimensions the data
        self.num_rows, self.num_cols = self.data.shape

        # get the class names (sorted unique labels)
        self.class_names = np.unique(self.data_labels)

        # per-class statistics, gathered in one pass over the classes
        labels = np.asarray(self.data_labels)
        self.num_obs = dict()
        self.means = dict()
        self.cov = np.zeros([self.num_cols, self.num_cols])
        for name in self.class_names:
            class_rows = self.data[labels == name, :]
            self.num_obs[name] = len(class_rows)
            self.means[name] = np.mean(class_rows, 0)
            # the shared covariance is the average of the per-class sample
            # covariances (np.cov), weighted by class size
            self.cov = self.cov + self.num_obs[name] * \
                np.cov(np.transpose(class_rows))
        self.cov = self.cov / self.num_rows

    def _class_likelihoods(self, x):
        # likelihood of x under each class, in the order of self.class_names
        return np.array(
            [multivariate_gaussian_pdf(x, self.means[name], self.cov)
             for name in self.class_names],
            dtype=float)

    def _print_ranking(self, heading, scores):
        # print the top class, then every class with its score, best first;
        # labels go through str() so non-string class labels also print
        order = np.argsort(scores)[::-1]
        print(f'LDA Predicted Class: {self.class_names[order[0]]!s}')
        print(f'LDA Class {heading}:')
        for i in order:
            print(f'{self.class_names[i]!s}: {scores[i]!s}')

    def compute_likelihoods(self, x):
        # compute and output the likelihood of each class and the maximum
        # likelihood class
        # x: sequence with one value per data column
        # returns: array of likelihoods ordered like self.class_names,
        #          or -1 (after printing a message) if x has the wrong length

        # check that the input data x has the correct number of values
        if len(x) != self.num_cols:
            print('Data vector has wrong number of values.')
            return -1

        # reformat x as a numpy array, in case the user input a list
        x = np.asarray(x)

        likelihoods = self._class_likelihoods(x)
        self._print_ranking('Likelihoods', likelihoods)

        # return the likelihoods
        return likelihoods

    def compute_probabilities(self, x, priors):
        # compute and output the posterior probability of each class and the
        # maximum probability class
        # x: sequence with one value per data column
        # priors: mapping class label -> prior probability; must contain
        #         every label in self.class_names (KeyError otherwise)
        # returns: array of posteriors ordered like self.class_names,
        #          or -1 (after printing a message) if x has the wrong length

        # check that the input data x has the correct number of values
        if len(x) != self.num_cols:
            print('Data vector has wrong number of values.')
            return -1

        # reformat x as a numpy array, in case the user input a list
        x = np.asarray(x)

        # Bayes rule: posterior = likelihood * prior / sum(likelihood * prior)
        likelihoods = self._class_likelihoods(x)
        prior_values = np.array(
            [priors[name] for name in self.class_names], dtype=float)
        weighted = likelihoods * prior_values
        # if every weighted likelihood is zero this division yields NaN
        probabilities = weighted / weighted.sum()

        self._print_ranking('Probabilities', probabilities)

        # return the probabilities
        return probabilities

In [6]:
# Fit the LDA model on the iris csv.
model_lda = LDA('Exercise3.2_iris_data.csv')

# Inputs for the demo: one observation and equal priors for all three classes.
Iris_setosa_observation = [5.1, 3.5, 1.4, 0.2]
uninformative_priors = dict.fromkeys(
    ("Iris-setosa", "Iris-versicolor", "Iris-virginica"), 1 / 3)

# Report class likelihoods, then posteriors, then the model object itself,
# with a separator line before each section.
print("=========================================")
model_lda.compute_likelihoods(Iris_setosa_observation)
print("=========================================")
model_lda.compute_probabilities(Iris_setosa_observation, uninformative_priors)
print("=========================================")
print(model_lda)

LDA Predicted Class: Iris-setosa
LDA Class Likelihoods:
Iris-setosa: 3.1595383581646694
Iris-versicolor: 9.957554809677293e-22
Iris-virginica: 5.2550710115931905e-42
LDA Predicted Class: Iris-setosa
LDA Class Probabilities:
Iris-setosa: 1.0
Iris-versicolor: 3.151585352317576e-22
Iris-virginica: 1.6632401369691825e-42
<__main__.LDA object at 0x00000191E7EE69C8>


## Quadratic Discriminant Analysis Code.py

In [7]:
# This is code for Quadratic Discriminant Analysis
# Written by William F Basener
# University of Virginia, School of Data Science
# For use in teaching Bayesian Machine Learning
#
# The code currently computes the maximum likelihood classification
# Student is to add method to compute posterior probabilities and maximum probability classification

import pandas as pd
import numpy as np


def multivariate_gaussian_pdf(X, MU, SIGMA):
    """Returns the pdf of a multivariate Gaussian distribution at one point.

       Adapted from Data Blog https://xavierbourretsicotte.github.io/MLE_Multivariate_Gaussian.html
       Maximum Likelihood Estimator: Multivariate Gaussian Distribution
       by Xavier Bourret Sicotte, Fri 22 June 2018

     - X, MU are array-likes with p values each (reshaped to p x 1 vectors)
     - SIGMA is a p x p matrix
     - the result is a plain Python float
     - raises numpy.linalg.LinAlgError if SIGMA cannot be inverted"""
    # Initialize and reshape (asarray lets callers pass plain lists too)
    X = np.asarray(X, dtype=float).reshape(-1, 1)
    MU = np.asarray(MU, dtype=float).reshape(-1, 1)
    SIGMA = np.asarray(SIGMA, dtype=float)
    p, _ = SIGMA.shape

    # Compute values
    SIGMA_inv = np.linalg.inv(SIGMA)
    denominator = np.sqrt((2 * np.pi) ** p * np.linalg.det(SIGMA))
    centered = X - MU
    # The quadratic form is a 1 x 1 array; take the scalar with .item()
    # because float() on an array with ndim > 0 is deprecated in recent
    # NumPy releases.
    exponent = -(1 / 2) * (centered.T @ SIGMA_inv @ centered).item()

    # Return result
    return float((1. / denominator) * np.exp(exponent))


class QDA:
    """Creates a class for Quadratic Discriminant Analysis
    Input:
        fname = file name for a csv file (passed straight to pandas.read_csv),
            must have one column labeled "Class" and the rest numeric data
    Attributes:
        data_labels = the 'Class' column of the csv
        data = the remaining columns as a numpy array
        num_rows, num_cols = shape of data
        class_names = sorted unique class labels
        num_obs = dict: class label -> number of observations
        means = dict: class label -> per-column mean of that class
        covs = dict: class label -> sample covariance matrix of that class
    Methods:
        compute_likelihoods: given an input observation, computes the likelihood
            for each class and the maximum likelihood class
        compute_probabilities: given an input observation and prior probabilities,
            computes the posterior probabilities for each class and most probable class"""

    def __init__(self, fname):
        # reads the data and computes the statistics needed for classification

        # read the data as a Pandas data frame
        df = pd.read_csv(fname)

        # separate the class labels from the rest of the data
        # we are assuming the column name with class labels is 'Class'
        # and all other columns are numeric
        self.data_labels = df['Class']
        self.data = np.asarray(df.drop('Class', axis=1, inplace=False))

        # get information about the dimensions the data
        self.num_rows, self.num_cols = self.data.shape

        # get the class names (sorted unique labels)
        self.class_names = np.unique(self.data_labels)

        # per-class count, mean and covariance, gathered in one pass
        labels = np.asarray(self.data_labels)
        self.num_obs = dict()
        self.means = dict()
        self.covs = dict()
        for name in self.class_names:
            class_rows = self.data[labels == name, :]
            self.num_obs[name] = len(class_rows)
            self.means[name] = np.mean(class_rows, 0)
            self.covs[name] = np.cov(np.transpose(class_rows))

    def _class_likelihoods(self, x):
        # likelihood of x under each class (each with its own covariance),
        # in the order of self.class_names
        return np.array(
            [multivariate_gaussian_pdf(x, self.means[name], self.covs[name])
             for name in self.class_names],
            dtype=float)

    def _print_ranking(self, heading, scores):
        # print the top class, then every class with its score, best first;
        # labels go through str() so non-string class labels also print
        order = np.argsort(scores)[::-1]
        print(f'QDA Predicted Class: {self.class_names[order[0]]!s}')
        print(f'QDA Class {heading}:')
        for i in order:
            print(f'{self.class_names[i]!s}: {scores[i]!s}')

    def compute_likelihoods(self, x):
        # compute and output the likelihood of each class and the maximum likelihood class
        # x: sequence with one value per data column
        # returns: array of likelihoods ordered like self.class_names,
        #          or -1 (after printing a message) if x has the wrong length

        # check that the input data x has the correct number of values
        if len(x) != self.num_cols:
            print('Data vector has wrong number of values.')
            return -1

        # reformat x as a numpy array, in case the user input a list
        x = np.asarray(x)

        likelihoods = self._class_likelihoods(x)
        self._print_ranking('Likelihoods', likelihoods)

        # return the likelihoods
        return likelihoods

    def compute_probabilities(self, x, priors):
        # compute and output the posterior probability of each class and the
        # maximum probability class
        # x: sequence with one value per data column
        # priors: mapping class label -> prior probability; must contain
        #         every label in self.class_names (KeyError otherwise)
        # returns: array of posteriors ordered like self.class_names,
        #          or -1 (after printing a message) if x has the wrong length

        # check that the input data x has the correct number of values
        if len(x) != self.num_cols:
            print('Data vector has wrong number of values.')
            return -1

        # reformat x as a numpy array, in case the user input a list
        x = np.asarray(x)

        # Bayes rule: posterior = likelihood * prior / sum(likelihood * prior)
        likelihoods = self._class_likelihoods(x)
        prior_values = np.array(
            [priors[name] for name in self.class_names], dtype=float)
        weighted = likelihoods * prior_values
        # if every weighted likelihood is zero this division yields NaN
        probabilities = weighted / weighted.sum()

        self._print_ranking('Probabilities', probabilities)

        # return the probabilities
        return probabilities

In [8]:
# Fit the QDA model on the iris csv.
model_qda = QDA('Exercise3.2_iris_data.csv')

# Inputs for the demo: one observation and equal priors for all three classes.
Iris_setosa_observation = [5.1, 3.5, 1.4, 0.2]
uninformative_priors = dict.fromkeys(
    ("Iris-setosa", "Iris-versicolor", "Iris-virginica"), 1 / 3)

# Report class likelihoods, then posteriors, then the model object itself,
# with a separator line before each section.
print("=========================================")
model_qda.compute_likelihoods(Iris_setosa_observation)
print("=========================================")
model_qda.compute_probabilities(Iris_setosa_observation, uninformative_priors)
print("=========================================")
print(model_qda)

QDA Predicted Class: Iris-setosa
QDA Class Likelihoods:
Iris-setosa: 13.725594445123008
Iris-versicolor: 6.846866360095621e-25
Iris-virginica: 4.150482018069454e-40
QDA Predicted Class: Iris-setosa
QDA Class Probabilities:
Iris-setosa: 1.0
Iris-versicolor: 4.988393316930952e-26
Iris-virginica: 3.023899645777606e-41
<__main__.QDA object at 0x00000191E7EE6888>


## Exercise Q4
Your friend Carl calls you and says he has measured two iris flowers and wants your help determining the species. The first flower has dimensions
[SepalLength; SepalWidth; PetalLength; PetalWidth] = [5.5, 2.4, 3.8, 1.1] and
the second has dimensions [5.5, 3.1, 5, 1.5]. Assuming noninformative priors,
what are the most probable species according to LDA and QDA, and
what are the probabilities for each species?

### First Flower

In [9]:
# First flower, in the order
# [SepalLength, SepalWidth, PetalLength, PetalWidth]
first = [5.5, 2.4, 3.8, 1.1]

# LDA Model: posterior class probabilities under equal priors
model_lda.compute_probabilities(first, uninformative_priors)
print("=========================================")
# QDA Model: same observation and priors; as the last expression of the
# cell, its returned probability array is what the notebook echoes
model_qda.compute_probabilities(first, uninformative_priors)

LDA Predicted Class: Iris-versicolor
LDA Class Probabilities:
Iris-versicolor: 0.9999970568617268
Iris-virginica: 2.9431382732285257e-06
Iris-setosa: 1.0221808316493793e-17
QDA Predicted Class: Iris-versicolor
QDA Class Probabilities:
Iris-versicolor: 0.9999701265523081
Iris-virginica: 2.9873447691956488e-05
Iris-setosa: 2.711263447644604e-52


array([2.71126345e-52, 9.99970127e-01, 2.98734477e-05])

According to the LDA model, the most probable species for the first flower is `Iris-versicolor` with a probability of `0.9999970568617268`. According to the QDA model, the most probable species for the first flower is also `Iris-versicolor` with a probability of `0.9999701265523081`. 

In [10]:
# Second flower, in the order
# [SepalLength, SepalWidth, PetalLength, PetalWidth]
second = [5.5, 3.1, 5, 1.5]

# LDA Model: posterior class probabilities under equal priors
model_lda.compute_probabilities(second, uninformative_priors)
print("=========================================")
# QDA Model: same observation and priors; as the last expression of the
# cell, its returned probability array is what the notebook echoes
model_qda.compute_probabilities(second, uninformative_priors)

LDA Predicted Class: Iris-versicolor
LDA Class Probabilities:
Iris-versicolor: 0.5602348418832768
Iris-virginica: 0.43976515811672334
Iris-setosa: 2.4269307963451542e-28
QDA Predicted Class: Iris-virginica
QDA Class Probabilities:
Iris-virginica: 0.5648248061588742
Iris-versicolor: 0.43517519384112596
Iris-setosa: 2.726593926101136e-103


array([2.72659393e-103, 4.35175194e-001, 5.64824806e-001])

According to the LDA model, the most probable species for the second flower is `Iris-versicolor` with a probability of `0.5602348418832768`. According to the QDA model, the most probable species for the second flower is `Iris-virginica` with a probability of `0.5648248061588742`. 

## Exercise Q5
You realize Carl is working in the country of Bagend, and in this country
70% of the irises are Iris-virginica, 20% are Iris-versicolor, and 10% are
Iris-setosa. Use this information to create informative priors, and use your
Python code to provide an updated answer to the previous question.

In [11]:
# Priors from the stated species frequencies in Bagend:
# 10% Iris-setosa, 20% Iris-versicolor, 70% Iris-virginica
informative_priors = {
    "Iris-setosa": 0.1,
    "Iris-versicolor": 0.2,
    "Iris-virginica": 0.7
}

# First flower, in the order
# [SepalLength, SepalWidth, PetalLength, PetalWidth]
first = [5.5, 2.4, 3.8, 1.1]

# LDA Model: posterior class probabilities under the informative priors
model_lda.compute_probabilities(first, informative_priors)
print("=========================================")
# QDA Model: same observation and priors; as the last expression of the
# cell, its returned probability array is what the notebook echoes
model_qda.compute_probabilities(first, informative_priors)


LDA Predicted Class: Iris-versicolor
LDA Class Probabilities:
Iris-versicolor: 0.9999896990918361
Iris-virginica: 1.0300908163807175e-05
Iris-setosa: 5.1108665532794896e-18
QDA Predicted Class: Iris-versicolor
QDA Class Probabilities:
Iris-versicolor: 0.9998954507411951
Iris-virginica: 0.00010454925880481234
Iris-setosa: 1.355530487899489e-52


array([1.35553049e-52, 9.99895451e-01, 1.04549259e-04])

According to the LDA model, the most probable species for the first flower is `Iris-versicolor` with a probability of `0.9999896990918361`. According to the QDA model, the most probable species for the first flower is also `Iris-versicolor` with a probability of `0.9998954507411951`.

In [12]:
# Second flower, in the order
# [SepalLength, SepalWidth, PetalLength, PetalWidth]
second = [5.5, 3.1, 5, 1.5]

# LDA Model: posterior class probabilities under the informative priors
model_lda.compute_probabilities(second, informative_priors)
print("=========================================")
# QDA Model: same observation and priors; as the last expression of the
# cell, its returned probability array is what the notebook echoes
model_qda.compute_probabilities(second, informative_priors)

LDA Predicted Class: Iris-virginica
LDA Class Probabilities:
Iris-virginica: 0.7331468987640914
Iris-versicolor: 0.26685310123590855
Iris-setosa: 5.780022600098925e-29
QDA Predicted Class: Iris-virginica
QDA Class Probabilities:
Iris-virginica: 0.819583745748151
Iris-versicolor: 0.18041625425184904
Iris-setosa: 5.651997976619515e-104


array([5.65199798e-104, 1.80416254e-001, 8.19583746e-001])

According to the LDA model, the most probable species for the second flower is `Iris-virginica` with a probability of `0.7331468987640914`. According to the QDA model, the most probable species for the second flower is also `Iris-virginica` with a probability of `0.819583745748151`. 