In [1]:
import numpy as np
import random
import pandas as pd
from scipy.stats import multivariate_normal
from sklearn.decomposition import PCA
from IPython.display import display

In [2]:
def distance(x1, x2):
    return np.linalg.norm(x2 - x1)

def classes_dict(X, Y):
    classes_dict = {}
    for x, y in zip(X, Y):
        if y not in classes_dict:
            classes_dict[y] = []
        classes_dict[y].append(x)
    return {y : np.array(classes_dict[y]) for y in classes_dict} 


In [3]:
class Parkinsons:
    def __init__(self):
        self.data = np.loadtxt('parkinsons.csv', delimiter=',', skiprows=1, usecols=range(1, 24))
        self.X = np.delete(self.data, 16, axis=1)
        self.Y = self.data[:, 16].astype(int)
        self.cd = classes_dict(self.X, self.Y)
        
    def get_random_train_test(self, train_percentage=0.8):
        shuffled_cd = {y : np.random.permutation(self.cd[y]) for y in self.cd}
        class_splits = {y: int(train_percentage * len(self.cd[y])) for y in shuffled_cd}
        train_cd = {y: shuffled_cd[y][:class_splits[y]] for y in shuffled_cd}
        test_cd = {y: shuffled_cd[y][class_splits[y]:] for y in shuffled_cd}
        train = np.random.permutation([[x, y] for y in train_cd for x in train_cd[y]])
        test = np.random.permutation([[x, y] for y in test_cd for x in test_cd[y]])
        X_train = np.array([k[0] for k in train])
        Y_train = np.array([k[1] for k in train])
        X_test = np.array([k[0] for k in test])
        Y_test = np.array([k[1] for k in test])
        return X_train, Y_train, X_test, Y_test
    

In [4]:
class Tester:

    def __init__(self, model, X_test, Y_test):
        self.model = model
        self.X_test = X_test
        self.Y_test = Y_test
        self.total = len(X_test)
        
        self.__test_model()
        
    def __test_model(self):
        self.confusion = np.zeros((2, 2))
        
        for x, y in zip(self.X_test, self.Y_test):
            pred = self.model.predict(x)
            self.confusion[y, pred] += 1
    
    def confusion_matrix(self):
        return self.confusion
        
        
class Statistics:
    def __init__(self, matrices):
        self.m = np.array(matrices)
        self.m_sum = self.m.sum(axis=0)
    
    def mean_accuracy(self):
        return self.m_sum.trace() / self.m_sum.sum()
    
    def specificity(self):
        return self.m_sum[0, 0] / self.m_sum.sum(axis=0)[0]
    
    def sensibility(self):
        return self.m_sum[1, 1] / self.m_sum.sum(axis=0)[1]
    
    def print_all(self):
        print(self.m_sum)
        print('Mean accuracy: {}'.format(self.mean_accuracy()))
        print('Specificity: {}'.format(self.specificity()))
        print('Sensibility: {}'.format(self.sensibility()))


In [5]:
class KNN:
    
    def __init__(self, k=1):
        self.k = k
        self.values = None
    
    def train(self, X_train, Y_train):
        self.values = list(zip(X_train, Y_train))
        
    def predict(self, x):
        if self.values:
            sorted_values = sorted(self.values, key= lambda val : distance(val[0], x))
            k_nearest = sorted_values[:self.k]
            classes = np.array([k[1] for k in k_nearest])
            
            return np.bincount(classes).argmax()
    
        else:
            print('Not trained')
     

In [6]:
class DMC:
    
    def __init__(self):
        self.clusters = {}
        self.centroids = None
    
    def train(self, X_train, Y_train):
        for i in range(len(X_train)):
            if Y_train[i] not in self.clusters:
                self.clusters[Y_train[i]] = []
            self.clusters[Y_train[i]].append(X_train[i])
            
        self.clusters = {k : np.array(self.clusters[k]) for k in self.clusters}
        
        self.centroids = {k : np.mean(self.clusters[k], axis=0) for k in self.clusters}
        
    def predict(self, x):
        if self.centroids:
            return min(self.centroids, key=lambda c : distance(self.centroids[c], x))
    
        else:
            print('Not trained')

In [64]:
class CQG:
    
    def __init__(self):
        self.classes = None
        self.centroids = None
        self.covariances = None
        self.cov_invs = None
        self.cov_dets = None
        self.a_priori = None
        
    def __friedman_for_class(self, i, pooled, alpha):
        total = sum([len(c) for c in self.classes])
        c_len = len(self.classes[i])
        return ((1 - alpha) * c_len * self.covariances[i] + alpha * total * pooled) / ((1-alpha) * c_len + alpha * total)
        
    def regularize_friedman(self, alpha):
        total = sum([len(c) for c in self.classes])
        pooled = np.sum(np.array([(len(self.classes[i]) / total) * self.covariances[i] for i in range(len(self.classes))]), axis=0)
        self.covariances = [self.__friedman_for_class(i, pooled, alpha) for i in range(len(self.classes))]

    def __log_gaussian(self, class_n, x):
        cov_inv = self.cov_invs[class_n]
        cov_det = self.cov_dets[class_n]
        z = x - self.centroids[class_n]
        return - 0.5 * (np.dot(z, np.dot(cov_inv, z)) + np.log(cov_det))

    def a_posteriori(self, x):
        return np.array([self.__log_gaussian(i, x) for i in range(len(self.classes))])
    
    def train(self, X_train, Y_train):
        cd = classes_dict(X_train, Y_train)
        self.classes = [cd[y] for y in sorted(cd)]
        
        self.centroids = [np.mean(c, axis=0) for c in self.classes]
        self.covariances = [np.cov(c, rowvar=False).reshape((len(c[0]), -1)) for c in self.classes]
        
        if any([np.linalg.matrix_rank(cov_mat) != cov_mat.shape[0] for cov_mat in self.covariances]):
            self.regularize_friedman(0.3)
            
        self.cov_invs = [np.linalg.inv(cov) for cov in self.covariances]
        self.cov_dets = [np.linalg.det(cov) for cov in self.covariances]
        self.a_priori = np.array([ np.log(len(c)) for c in self.classes ])
        
    def predict(self, x):
        if self.centroids and self.covariances:
            probabilities = self.a_priori + self.a_posteriori(x)
            return probabilities.argmax()
        else:
            print('Not trained')

In [99]:
def getPCA(X, min_var):
    eigs = np.linalg.eigvals(np.dot(X.T, X))
    cs = np.cumsum(eigs / eigs.sum())
    n_components = len([x for x in cs if x < min_var]) + 1
    pca = PCA(n_components=n_components)
    pca.fit(X)
    return pca

def getLDA(X, Y, min_var):
    cd = classes_dict(X, Y)
    classes = [cd[y] for y in sorted(cd)]
    
    m = np.mean(np.array([x for c in classes for x in c]), axis=0)
    centroids = [np.mean(c, axis=0) for c in classes]
    covariances = [np.cov(c, rowvar=False) for c in classes]
    
    Sw = sum([len(classes[i]) * covariances[i] for i in range(len(classes))])
    Sw_inv = np.linalg.inv(Sw)
    n_m = [(c - m).reshape(1, -1) for c in centroids]
    Sb = sum([np.dot(n_m[i].T, n_m[i]) for i in range(len(classes))])
    
    eig = np.linalg.eig(np.dot(Sw_inv, Sb))
    eig_vals = eig[0]
    cs = np.cumsum(eig_vals / eig_vals.sum())
    n_components = len([x for x in cs if x < min_var]) + 1
    T = (eig[1][:, n_components]).reshape(-1, 1)
    return np.real(T)
    

In [102]:
parkinsons = Parkinsons()
matrices = []
for i in range(100):
    X_train, Y_train, X_test, Y_test = parkinsons.get_random_train_test()
    #pca = getPCA(X_train, 0.999)
    #X_train = pca.transform(X_train)
    #X_test = pca.transform(X_test)
    T = getLDA(X_train, Y_train, 0.999)
    X_train = X_train.dot(T)
    X_test = X_test.dot(T)
    
    #print(X_train, X_test)
    model = CQG()
    model.train(X_train, Y_train)
    tester = Tester(model, X_test, Y_test)
    matrices.append(tester.confusion_matrix())

In [103]:
statistics = Statistics(matrices)
statistics.print_all()

[[   0. 1000.]
 [   0. 3000.]]
Mean accuracy: 0.75
Specificity: nan
Sensibility: 0.75


