In [56]:
import numpy as np
from scipy.stats import multivariate_normal

In [85]:
# Load the wine data set
# Characteristics: 11 features, 4898 samples, 11 classes (0-10)

# Load the data set
wine = np.loadtxt('datasets/winequality/winequality-white.csv', delimiter=';', skiprows=1)

# extract the data and labels
wine_data = []
wine_labels = []

for row in wine:
    wine_data.append(row[:-1])
    wine_labels.append(row[-1])

# convert to numpy arrays
wine_data = np.array(wine_data)
wine_labels = np.array(wine_labels)

wine_possible_labels = np.array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10])

In [86]:
# Load the HAR data set
# Characteristics: 561 features, 10299 samples, 6 classes (1-6)

# Load the files
X_test = np.loadtxt('datasets/UCI HAR Dataset/test/X_test.txt')
y_test = np.loadtxt('datasets/UCI HAR Dataset/test/y_test.txt')
X_train = np.loadtxt('datasets/UCI HAR Dataset/train/X_train.txt')
y_train = np.loadtxt('datasets/UCI HAR Dataset/train/y_train.txt')

# format the data set so that all of this data is in one data set due to the given charactersitics
har_data = np.concatenate((X_test, X_train), axis=0)
har_labels = np.concatenate((y_test, y_train), axis=0)

har_possible_labels = np.array([1, 2, 3, 4, 5, 6])


In [87]:
# implement minimum-probability-of-error classifier, assuming the class conditional pdfs are Gaussian
# using all available samples from a class, with sample averages, estimate mean vectors and covariance matrices
# using sample counts, also estimate class priors

# calculate mean vectors, covariance matrices, and priors for each class
# also return the unique labels
def calculate_parameters(data: np.array, labels: np.array) -> tuple[np.array, np.array, np.array, np.array]:
    """
    Calculates the mean vectors, covariance matrices, and priors for each class.

    Args:
        data (np.array): The data set.
        labels (np.array): The labels for the data set.

    Returns:
        tuple[np.array, np.array, np.array, np.array]: The unique labels, mean vectors, covariance matrices, and priors.
    """
    # get the unique labels
    unique_labels = np.unique(labels)

    # calculate the mean vectors
    mean_vectors = []
    for label in unique_labels:
        mean_vectors.append(np.mean(data[labels == label], axis=0))

    # calculate the covariance matrices
    covariance_matrices = []
    for label in unique_labels:
        covariance_matrices.append(np.cov(data[labels == label].T))

    # calculate the priors
    priors = []
    for label in unique_labels:
        priors.append(np.sum(labels == label) / len(labels))

    # convert to numpy arrays
    mean_vectors = np.array(mean_vectors)
    covariance_matrices = np.array(covariance_matrices)
    priors = np.array(priors)

    return unique_labels, mean_vectors, covariance_matrices, priors

In [88]:
# mean vector and covariance matrix for wine data set
wine_unique_labels, wine_mean_vectors, wine_covariance_matrices, wine_priors = calculate_parameters(wine_data, wine_labels)

# mean vector and covariance matrix for HAR data set
har_unique_labels, har_mean_vectors, har_covariance_matrices, har_priors = calculate_parameters(har_data, har_labels)

In [89]:
# add a regularization term to the covariance matrices to ensure the regularized covaraince matrix has all eigenvalues larger than 0
# this is done by adding a small value to the diagonal of the covariance matrix
# for now, we'll use a value on the order of arithmetic average of sample covariance matrices
def regularize_covariance_matrices(covariance_matrices: np.array) -> np.array:
    """
    Regularizes the covariance matrices.

    Args:
        covariance_matrices (np.array): The covariance matrices.

    Returns:
        np.array: The regularized covariance matrices.
    """
    # calculate the average covariance matrix
    average_covariance_matrix = np.mean(covariance_matrices, axis=0)

    # calculate the regularization term
    regularization_term = np.mean(np.diag(average_covariance_matrix))

    print("The regularization term is: ", regularization_term)

    # add the regularization term to the covariance matrices
    for i in range(len(covariance_matrices)):
        covariance_matrices[i] += regularization_term * np.eye(covariance_matrices[i].shape[0])

    return covariance_matrices

In [90]:
# add the regularization term to the covariance matrices
wine_covariance_matrices = regularize_covariance_matrices(wine_covariance_matrices)
har_covariance_matrices = regularize_covariance_matrices(har_covariance_matrices)

The regularization term is:  353.1966224105839
The regularization term is:  0.03549356441569865


In [103]:
# implement the minimum-probability-of-error classifier
def minimum_probability_of_error_classifier(x: np.array, unique_labels: np.array, mean_vectors: np.array, covariance_matrices: np.array, priors: np.array) -> int:
    """
    Implements the minimum-probability-of-error classifier.

    Args:
        x (np.array): The data point to classify.
        unique_labels (np.array): The unique, used labels for the data set.
        mean_vectors (np.array): The mean vectors for each class.
        covariance_matrices (np.array): The covariance matrices for each class.
        priors (np.array): The priors for each class.

    Returns:
        int: The predicted label.
    """
    # initialize the probabilities
    probabilities = []

    # iterate through the classes
    for i in range(len(unique_labels)):
        # calculate the probability
        probability = multivariate_normal.pdf(x, mean_vectors[i], covariance_matrices[i]) * priors[i]

        # add the probability to the list
        probabilities.append(probability)

    # find the maximum probability
    max_probability = max(probabilities)

    # find the index of the maximum probability
    max_probability_index = probabilities.index(max_probability)

    # use the max_probability_index to find the predicted label
    predicted_label = unique_labels[max_probability_index]

    return predicted_label


# implement a function that will classify a data set using the minimum-probability-of-error classifier
def classify_data_set(data: np.array, unique_labels: np.array, mean_vectors: np.array, covariance_matrices: np.array, priors: np.array) -> np.array:
    """
    Classifies a data set using the minimum-probability-of-error classifier.

    Args:
        data (np.array): The data set to classify.
        unique_labels (np.array): The unique, used labels for the data set.
        mean_vectors (np.array): The mean vectors for each class.
        covariance_matrices (np.array): The covariance matrices for each class.
        priors (np.array): The priors for each class.

    Returns:
        np.array: The predicted labels.
    """
    # initialize the predicted labels
    predicted_labels = []

    # iterate through the data set
    for i in range(len(data)):
        # classify the data point
        predicted_label = minimum_probability_of_error_classifier(data[i], unique_labels, mean_vectors, covariance_matrices, priors)

        # add the predicted label to the list
        predicted_labels.append(predicted_label)

    # convert to a numpy array
    predicted_labels = np.array(predicted_labels)

    return predicted_labels


# implement a function that will count the errors, the error probability estimate, and the confusion matrix
def calculate_classification_metrics(predicted_labels: np.array, actual_labels: np.array, possible_labels: np.array) -> tuple[int, float, np.array]:
    """
    Calculates the number of errors, the error probability estimate, and the confusion matrix.

    Args:
        predicted_labels (np.array): The predicted labels.
        actual_labels (np.array): The actual labels.
        possible_labels (np.array): All the possible labels for the data set.

    Returns:
        tuple[int, float, np.array]: The number of errors, the error probability estimate, and the confusion matrix.
    """
    # initialize the number of errors
    number_of_errors = 0

    # initialize the confusion matrix
    confusion_matrix = np.zeros((len(possible_labels), len(possible_labels)))

    # iterate through the predicted labels
    for i in range(len(predicted_labels)):
        # check if the predicted label is correct
        if predicted_labels[i] != actual_labels[i]:
            # increment the number of errors
            number_of_errors += 1

        # increment the confusion matrix
        actual_index = np.where(possible_labels == actual_labels[i])[0][0]
        predicted_index = np.where(possible_labels == predicted_labels[i])[0][0]
        confusion_matrix[actual_index, predicted_index] += 1

    # calculate the error probability estimate
    error_probability_estimate = number_of_errors / len(predicted_labels)

    return number_of_errors, error_probability_estimate, confusion_matrix

In [104]:
# classify the wine data set
wine_predicted_labels = classify_data_set(wine_data, wine_unique_labels, wine_mean_vectors, wine_covariance_matrices, wine_priors)

In [105]:
# calculate the classification metrics for the wine data set
wine_number_of_errors, wine_error_probability_estimate, wine_confusion_matrix = calculate_classification_metrics(wine_predicted_labels, wine_labels, wine_possible_labels)

# print the classification metrics for the wine data set
print("The number of errors for the wine data set is: ", wine_number_of_errors)
print("The error probability estimate for the wine data set is: ", wine_error_probability_estimate)
print("The confusion matrix for the wine data set is:")
print(wine_confusion_matrix)

The number of errors for the wine data set is:  2683
The error probability estimate for the wine data set is:  0.5477746018783177
The confusion matrix for the wine data set is:
[[0.000e+00 0.000e+00 0.000e+00 0.000e+00 0.000e+00 0.000e+00 0.000e+00
  0.000e+00 0.000e+00 0.000e+00 0.000e+00]
 [0.000e+00 0.000e+00 0.000e+00 0.000e+00 0.000e+00 0.000e+00 0.000e+00
  0.000e+00 0.000e+00 0.000e+00 0.000e+00]
 [0.000e+00 0.000e+00 0.000e+00 0.000e+00 0.000e+00 0.000e+00 0.000e+00
  0.000e+00 0.000e+00 0.000e+00 0.000e+00]
 [0.000e+00 0.000e+00 0.000e+00 3.000e+00 0.000e+00 3.000e+00 1.400e+01
  0.000e+00 0.000e+00 0.000e+00 0.000e+00]
 [0.000e+00 0.000e+00 0.000e+00 1.000e+00 0.000e+00 7.000e+00 1.550e+02
  0.000e+00 0.000e+00 0.000e+00 0.000e+00]
 [0.000e+00 0.000e+00 0.000e+00 2.000e+00 0.000e+00 1.620e+02 1.293e+03
  0.000e+00 0.000e+00 0.000e+00 0.000e+00]
 [0.000e+00 0.000e+00 0.000e+00 0.000e+00 0.000e+00 1.480e+02 2.050e+03
  0.000e+00 0.000e+00 0.000e+00 0.000e+00]
 [0.000e+00 0.000e

In [106]:
# classify the HAR data set
har_predicted_labels = classify_data_set(har_data, har_unique_labels, har_mean_vectors, har_covariance_matrices, har_priors)

KeyboardInterrupt: 

In [None]:
# calculate the classification metrics for the HAR data set
har_number_of_errors, har_error_probability_estimate, har_confusion_matrix = calculate_classification_metrics(har_predicted_labels, har_labels, har_possible_labels)

# print the classification metrics for the HAR data set
print("The number of errors for the HAR data set is: ", har_number_of_errors)
print("The error probability estimate for the HAR data set is: ", har_error_probability_estimate)
print("The confusion matrix for the HAR data set is:")
print(har_confusion_matrix)

# TODO: add some sort of percentage completion for classification
# TODO: pretty print the confusion matrices
# TODO: visualize the data sets in various 2 or 3 dimensional projections
# TODO: discuss if gaussian class conditional densities are appropriate for the data sets
# TODO: discuss how model choice influences confusion matrix and probability of error
# TODO: explain modeling assumptions, how estimated/selected parameters