## Erik Bayerlein - 537606

In [169]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

### Algorithms

#### Auxiliary Algorithms

In [173]:
def show_results(title, results):
    print(title)
    print(f"Accuracy: {results[0]:.2f} ± {results[1]:.2f}")
    print(f"Accuracy per class: {results[2]}")
    print(f"Std per class: {results[3]}")

In [171]:
def normalize(X):
    means = X.mean(axis=0)
    stds = X.std(axis=0)
    return (X - means) / stds

#### K Fold Cross Validation

In [174]:
def k_fold_cross_validation(X, y, model, k=10):
    fold_size = len(X) // k
    indices = np.arange(len(X))
    np.random.shuffle(indices)
    
    accuracies = []
    accuracies_per_class = {c: [] for c in np.unique(y)}
    
    for i in range(k):
        start = i * fold_size
        end = (i + 1) * fold_size if i < k - 1 else len(X)
        
        test_indices = indices[start:end]
        train_indices = np.concatenate([indices[:start], indices[end:]])
        
        X_train, X_test = X[train_indices], X[test_indices]
        y_train, y_test = y[train_indices], y[test_indices]
        
        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)
        
        accuracy = np.mean(y_pred == y_test)
        accuracies.append(accuracy)
        
        for cls in np.unique(y):
            cls_indices = np.where(y_test == cls)
            cls_accuracy = np.mean(y_pred[cls_indices] == y_test[cls_indices])
            accuracies_per_class[cls].append(cls_accuracy)
    
    avg_accuracy = np.mean(accuracies)
    std_accuracy = np.std(accuracies)
    avg_accuracy_per_class = {cls: np.mean(scores) for cls, scores in accuracies_per_class.items()}
    std_accuracy_per_class = {cls: np.std(scores) for cls, scores in accuracies_per_class.items()}
    
    return avg_accuracy, std_accuracy, avg_accuracy_per_class, std_accuracy_per_class

#### Gaussian Discriminant Analysis

In [175]:
class GaussianDiscriminantAnalysis:
    def fit(self, X, y):
        self.classes = np.unique(y)
        self.means = np.array([X[y == c].mean(axis=0) for c in self.classes])
        self.covariance = np.cov(X.T)
        self.priors = np.array([np.mean(y == c) for c in self.classes])

    def predict(self, X):
        probabilities = np.array([self._pdf(X, mean, self.covariance) for mean in self.means])
        return self.classes[np.argmax(probabilities, axis=0)]

    def _pdf(self, X, mean, covariance):
        cov_inv = np.linalg.inv(covariance)
        det_cov = np.linalg.det(covariance)
        norm_const = 1.0 / (np.sqrt((2 * np.pi) ** X.shape[1] * det_cov))
        X_mean = X - mean
        exponent = np.einsum('ij,ji->i', X_mean.dot(cov_inv), X_mean.T)
        return norm_const * np.exp(-0.5 * exponent)

#### Gaussian Naive Bayes

In [176]:
class GaussianNaiveBayes:
    def fit(self, X, y):
        self.classes = np.unique(y)
        self.means = np.array([X[y == c].mean(axis=0) for c in self.classes])
        self.vars = np.array([X[y == c].var(axis=0) for c in self.classes])
        self.priors = np.array([np.mean(y == c) for c in self.classes])

    def predict(self, X):
        likelihoods = np.array([self._pdf(X, mean, var) for mean, var in zip(self.means, self.vars)])
        posteriors = likelihoods * self.priors[:, np.newaxis]
        return self.classes[np.argmax(posteriors, axis=0)]

    def _pdf(self, X, mean, var):
        exponent = np.exp(- (X - mean) ** 2 / (2 * var))
        return exponent.prod(axis=1) / np.sqrt(2 * np.pi * var).prod()

#### Gradient Descent (GD) for Logistic Regression

In [177]:
class LogisticRegressionGD:
    def __init__(self, learning_rate=0.01, epochs=1000):
        self.learning_rate = learning_rate
        self.epochs = epochs

    def sigmoid(self, z):
        return 1 / (1 + np.exp(-z))

    def fit(self, X, y):
        self.theta = np.zeros(X.shape[1] + 1)
        X = np.hstack([np.ones((X.shape[0], 1)), X])
        for _ in range(self.epochs):
            predictions = self.sigmoid(X.dot(self.theta))
            gradient = X.T.dot(predictions - y) / y.size
            self.theta -= self.learning_rate * gradient

    def predict(self, X):
        X = np.hstack([np.ones((X.shape[0], 1)), X])
        return self.sigmoid(X.dot(self.theta)) >= 0.5

#### Gradient Descent for Softmax Regression

In [178]:
class SoftmaxRegressionGD:
    def __init__(self, learning_rate=0.01, epochs=1000):
        self.learning_rate = learning_rate
        self.epochs = epochs

    def fit(self, X, y):
        self.classes = np.unique(y)
        self.num_classes = len(self.classes)
        y = y.astype(int)

        self.theta = np.zeros((X.shape[1] + 1, self.num_classes))
        X = np.hstack([np.ones((X.shape[0], 1)), X])
        y_one_hot = np.eye(self.num_classes)[y]
        
        for _ in range(self.epochs):
            scores = X.dot(self.theta)
            probs = self._softmax(scores)
            gradient = X.T.dot(probs - y_one_hot) / y.size
            self.theta -= self.learning_rate * gradient

    def _softmax(self, scores):
        exp_scores = np.exp(scores - np.max(scores, axis=1, keepdims=True))
        return exp_scores / np.sum(exp_scores, axis=1, keepdims=True)

    def predict(self, X):
        X = np.hstack([np.ones((X.shape[0], 1)), X])
        scores = X.dot(self.theta)
        probs = self._softmax(scores)
        return np.argmax(probs, axis=1)

------------

### Q1 - Logistic Regression, Gaussiam Discriminant Analysis and Gaussian Naive Bayes with Cross Validation in 10 folds

In [179]:
data = pd.read_csv('../../datasets/breastcancer.csv')

X = data.iloc[:, :-1].values
y = data.iloc[:, -1].values

X = normalize(X)

In [180]:
naive_bayes_model = GaussianNaiveBayes()
gda_model = GaussianDiscriminantAnalysis()
logistic_model = LogisticRegressionGD()

In [181]:
results_logistic = k_fold_cross_validation(X, y, logistic_model, k=10)
results_gda = k_fold_cross_validation(X, y, gda_model, k=10)
results_nb = k_fold_cross_validation(X, y, naive_bayes_model, k=10)

In [182]:
show_results(title="Logistic Regression GD", results=results_logistic)
show_results(title="\nGaussian Discriminant Analysis:", results=results_gda)
show_results(title="\nNaive Bayes Gaussian:", results=results_nb)

Logistic Regression GD
Accuracy: 0.98 ± 0.01
Accuracy per class: {np.float64(0.0): np.float64(0.9922972972972973), np.float64(1.0): np.float64(0.9534490740740742)}
Std per class: {np.float64(0.0): np.float64(0.016239285392705603), np.float64(1.0): np.float64(0.04041569220538391)}

Gaussian Discriminant Analysis:
Accuracy: 0.96 ± 0.03
Accuracy per class: {np.float64(0.0): np.float64(0.9944444444444445), np.float64(1.0): np.float64(0.9030725755725756)}
Std per class: {np.float64(0.0): np.float64(0.016666666666666673), np.float64(1.0): np.float64(0.06382831867292928)}

Naive Bayes Gaussian:
Accuracy: 0.93 ± 0.04
Accuracy per class: {np.float64(0.0): np.float64(0.9509229225859548), np.float64(1.0): np.float64(0.8997144249512671)}
Std per class: {np.float64(0.0): np.float64(0.040545230193575005), np.float64(1.0): np.float64(0.0844156662980928)}


### Q2 - Softmax Regression, Gaussiam Discriminant Analysis and Gaussian Naive Bayes with Cross Validation in 10 folds

In [183]:
data = pd.read_csv('../../datasets/vehicle.csv')

X = data.iloc[:, :-1].values
y = data.iloc[:, -1].values

X = normalize(X)

In [184]:
softmax_model = SoftmaxRegressionGD()
gda_model = GaussianDiscriminantAnalysis()
naive_bayes_model = GaussianNaiveBayes()

In [185]:
results_softmax = k_fold_cross_validation(X, y, softmax_model, k=10)
results_gda = k_fold_cross_validation(X, y, gda_model, k=10)
results_nb = k_fold_cross_validation(X, y, naive_bayes_model, k=10)

In [186]:
show_results(title="Softmax Regression GD", results=results_softmax)
show_results(title="\nGaussian Discriminant Analysis", results=results_gda)
show_results(title="\nNaive Bayes Gaussian", results=results_nb)

Softmax Regression GD
Accuracy: 0.71 ± 0.06
Accuracy per class: {np.float64(0.0): np.float64(0.8874297249491757), np.float64(1.0): np.float64(0.453252402568192), np.float64(2.0): np.float64(0.5553872977323017), np.float64(3.0): np.float64(0.9638200894392843)}
Std per class: {np.float64(0.0): np.float64(0.080555832027033), np.float64(1.0): np.float64(0.1150952110598815), np.float64(2.0): np.float64(0.12033386796419603), np.float64(3.0): np.float64(0.03865456318548034)}

Gaussian Discriminant Analysis
Accuracy: 0.77 ± 0.04
Accuracy per class: {np.float64(0.0): np.float64(0.9473471805953648), np.float64(1.0): np.float64(0.6141621811274813), np.float64(2.0): np.float64(0.5771690247205121), np.float64(3.0): np.float64(0.9596969696969697)}
Std per class: {np.float64(0.0): np.float64(0.0322853971193488), np.float64(1.0): np.float64(0.12023529096767378), np.float64(2.0): np.float64(0.1121957036597623), np.float64(3.0): np.float64(0.03653285542574816)}

Naive Bayes Gaussian
Accuracy: 0.44 ± 0.0