# Question 1: Bayes Classifier
Implement Bayes classifier for three datasets under four covariance assumptions.

In [1]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score, f1_score

# Utility: load class data from text file
def load_class_data(path, label):
    data = np.loadtxt(path)  # two columns
    labels = np.full((data.shape[0],), label)
    return data, labels

# Utility: split each class into train and test sets (70/30)
def split_data(X, y, test_size=0.3, random_state=42):
    return train_test_split(X, y, test_size=test_size, stratify=y, random_state=random_state)

# Estimate Gaussian parameters for each class
def estimate_parameters(X, y):
    classes = np.unique(y)
    params = {}
    for c in classes:
        Xc = X[y==c]
        mu = Xc.mean(axis=0)
        cov = np.cov(Xc, rowvar=False)
        params[c] = {'mu': mu, 'cov': cov}
    return params

# Gaussian density
def gaussian_pdf(x, mu, cov):
    d = len(mu)
    det = np.linalg.det(cov)
    inv = np.linalg.inv(cov)
    norm = 1 / np.sqrt((2*np.pi)**d * det)
    diff = x - mu
    return norm * np.exp(-0.5 * diff @ inv @ diff.T)

# Bayes classifier general function
def predict_bayes(X, params, priors, cov_type='class_full', sigma2=None, shared_cov=None):
    # cov_type: 'spherical', 'shared_full', 'class_diag', 'class_full'
    classes = list(params.keys())
    y_pred = []
    for x in X:
        post = []
        for c in classes:
            mu = params[c]['mu']
            if cov_type=='spherical': cov = sigma2 * np.eye(len(mu))
            elif cov_type=='shared_full': cov = shared_cov
            elif cov_type=='class_diag': cov = np.diag(np.diag(params[c]['cov']))
            else: cov = params[c]['cov']
            post.append(priors[c] * gaussian_pdf(x, mu, cov))
        y_pred.append(classes[np.argmax(post)])
    return np.array(y_pred)

# Common function to evaluate and plot Bayes classifier for a dataset
def run_dataset(name, X, y):
    # split data
    X_train, X_test, y_train, y_test = split_data(X, y)
    # estimate parameters
    params = estimate_parameters(X_train, y_train)
    # compute class priors
    priors = {c: np.mean(y_train==c) for c in params}
    # spherical variance
    sigmas = [np.mean(np.diag(params[c]['cov'])) for c in params]
    sigma2 = np.mean(sigmas)
    # shared full covariance
    shared_cov = sum(params[c]['cov'] for c in params) / len(params)
    types = [
        ('spherical', {'sigma2': sigma2}),
        ('shared_full', {'shared_cov': shared_cov}),
        ('class_diag', {}),
        ('class_full', {})
    ]
    print(f'\n=== {name} ===')
    for cov_type, kwargs in types:
        y_pred = predict_bayes(X_test, params, priors, cov_type=cov_type, **kwargs)
        print(f'-- {cov_type} --')
        print('Accuracy:', accuracy_score(y_test, y_pred))
        print('Precision:', precision_score(y_test, y_pred, average=None))
        print('Recall:', recall_score(y_test, y_pred, average=None))
        print('F1:', f1_score(y_test, y_pred, average=None))
        print('Confusion matrix:\n', confusion_matrix(y_test, y_pred))

In [2]:
# Dataset 1: Linearly separable classes
files = ['LS_Group12/Class1.txt', 'LS_Group12/Class2.txt', 'LS_Group12/Class3.txt']
X1 = np.vstack([load_class_data(f, i)[0] for i,f in enumerate(files)])
y1 = np.concatenate([load_class_data(f, i)[1] for i,f in enumerate(files)])
run_dataset('Linearly separable', X1, y1)


=== Linearly separable ===
-- spherical --
Accuracy: 1.0
Precision: [1. 1. 1.]
Recall: [1. 1. 1.]
F1: [1. 1. 1.]
Confusion matrix:
 [[150   0   0]
 [  0 150   0]
 [  0   0 150]]
-- shared_full --
Accuracy: 1.0
Precision: [1. 1. 1.]
Recall: [1. 1. 1.]
F1: [1. 1. 1.]
Confusion matrix:
 [[150   0   0]
 [  0 150   0]
 [  0   0 150]]
-- class_diag --
Accuracy: 1.0
Precision: [1. 1. 1.]
Recall: [1. 1. 1.]
F1: [1. 1. 1.]
Confusion matrix:
 [[150   0   0]
 [  0 150   0]
 [  0   0 150]]
-- class_full --
Accuracy: 1.0
Precision: [1. 1. 1.]
Recall: [1. 1. 1.]
F1: [1. 1. 1.]
Confusion matrix:
 [[150   0   0]
 [  0 150   0]
 [  0   0 150]]


In [3]:
# Dataset 2: Nonlinearly separable classes
def load_nls_data(path):
    with open(path) as f: counts = list(map(int, f.readline().split()))
    data = np.loadtxt(path, skiprows=1)
    X = data[:, :2]
    y = np.concatenate([np.full(c, i) for i, c in enumerate(counts)])
    return X, y
X2, y2 = load_nls_data('NLS_Group12.txt')
run_dataset('Nonlinearly separable', X2, y2)

ValueError: invalid literal for int() with base 10: 'First'

In [None]:
# Dataset 3: Real-world vowel data
files = ['rd_group12/class1.txt', 'rd_group12/class2.txt', 'rd_group12/class3.txt']
X3 = np.vstack([load_class_data(f, i)[0] for i,f in enumerate(files)])
y3 = np.concatenate([load_class_data(f, i)[1] for i,f in enumerate(files)])
run_dataset('Vowel data', X3, y3)