# Gaussian Discriminant Analysis
    Darren Lund
    Math 404
    February 9, 2018

In [27]:
import numpy as np
from math import sqrt
from scipy.linalg import inv
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
from time import time

# Problem 1
Code up the Gaussian Discriminant Analysis algorithm.  Your code should have a .fit method that accepts a dataset X,y where y only takes on a finite number of values (classes), the .fit method should train the model (learn the parameters $\pi_c$, $\mu_c$, and  $\Sigma_c$ for each class c, using the standard Gaussian MLE for each $\mu_c$, and  $\Sigma_c$ and using the estimate $\pi_c$ = (#y=c)/N.  Your code should also have a `.predict_proba` method that accepts a data set X' and returns $p(y=c | x)$ for each x in X',  and it should have a `.predict` method that accepts data X' and returns the class prediction $\hat{y}$ for each x in X' 

In [49]:
class GDA(object) :
    def __init__(self) :
        '''
        Initialization of a Gaussian Discriminant Analysis classifier
        '''
        pass
    
    def fit(self,X,y) :
        '''
        A function for training the classifier.
        
        Input :
            X - Training data
            y - Classifications from data
                (finite number of classes, counting representation)
            
        Output :
            Instance of self with new data, namely pi, mu, and sigma for each class
        '''
        distinct_ys = set(y)
        self.num_classes = len(distinct_ys)
        self.pis = []
        self.mus = []
        self.sigs = []
        self.sigs_inv = []
        joined = np.hstack((X,np.array(y).reshape(-1,1)))
        for i in range(self.num_classes) :
            self.pis.append(list(y).count(i)/len(list(y)))
            filt = X[joined[:,-1] == i]
            self.mus.append(np.average(filt,axis=0))
            self.sigs.append(np.cov(filt.T))
            self.sigs_inv.append(inv(self.sigs[i]))
    
    def predict_proba(self,X) :
        '''
        Returns the probability of each class for the given data points in X
        
        Input :
            X - Data to predict
            
        Output :
            p(y=c|x) for each data point in X
        '''
        n = X.shape[0]
        probs = np.zeros((n,self.num_classes))
        for j in range(n) :
            prob = [self.pis[i]/sqrt(2*np.pi*np.linalg.det(self.sigs[i]))*np.exp(-0.5*(np.dot((X[j,:].T-self.mus[i]).T,np.dot(self.sigs_inv[i],X[j,:].T-self.mus[i])))) for i in range(self.num_classes)]
            probs[j,:] = prob/sum(prob)
        return probs
    
    def predict(self,X) :
        '''
        Predicts the labels for a given dataset X
        
        Input :
            X - Data to predict
            
        Output :
            y_hat - The prediction of each data point in X
        '''
        probs = self.predict_proba(X)
        y_hat = np.argmax(probs,axis=1)
        return y_hat

# Problem 2
Apply your GDA code to the cancer dataset with an appropriate train-test split and compare the results (train and test speed and test accuracy) to logistic regression and Naive Bayes.  Is one of these much better than the others?  Explain. 

In [50]:
cancer = load_breast_cancer()
x = cancer.data
y = cancer.target
tr_x, ts_x, tr_y, ts_y = train_test_split(x,y,test_size=0.3)

In [51]:
gda = GDA()
start = time()
gda.fit(tr_x,tr_y)
y_hat = gda.predict(ts_x)
g_score = sum([y_hat[i] == ts_y[i] for i in range(len(y_hat))])/len(y_hat)
end = time()
g_time = end - start

lgr = LogisticRegression()
start = time()
lgr.fit(tr_x,tr_y)
y_hat = lgr.predict(ts_x)
l_score = sum([y_hat[i] == ts_y[i] for i in range(len(y_hat))])/len(y_hat)
end = time()
l_time = end - start

nb = GaussianNB()
start = time()
nb.fit(tr_x,tr_y)
y_hat = nb.predict(ts_x)
n_score = sum([y_hat[i] == ts_y[i] for i in range(len(y_hat))])/len(y_hat)
end = time()
n_time = end - start

In [62]:
print('\tTime\t\t\tScore')
print('GDA :\t'+str(g_time)+'\t'+str(g_score))
print('LGR :\t'+str(l_time)+'\t'+str(l_score))
print('NB  :\t'+str(n_time)+'\t'+str(n_score))

	Time			Score
GDA :	0.19111394882202148	0.941520467836
LGR :	0.13195395469665527	0.93567251462
NB  :	0.004067182540893555	0.923976608187


In terms of speed, the Naive Bayes approach is by far the best.  However, it is also the least accurate, though not by much.  This is only true because of the size of the data, but for this data set, a small increase in time you can get almost an additional 2% accuracy increase by using the GDA.  Basically, I would say that no, there isn't one that's obviously better in all cases.  Assuming that you want quick results and are okay with a slgihtly lower accuracy, Naive Bayes is the way to go.  If accuracy is so important that you need that increase, however, and you have time on your hands, pick GDA.

# Problem 3
Compare your train and test speed and your test accuracy to the `discriminant_analysis.QuadraticDiscriminantAnalysis` method in scikit learn. 

In [57]:
sgda = QuadraticDiscriminantAnalysis()
start = time()
sgda.fit(tr_x,tr_y)
y_hat = sgda.predict(ts_x)
sg_score = sum([y_hat[i] == ts_y[i] for i in range(len(y_hat))])/len(y_hat)
end = time()
sg_time = end - start

In [63]:
print('\t Time\t\t\tScore')
print('My GDA : '+str(g_time)+'\t'+str(g_score))
print('SK GDA : '+str(sg_time)+'\t'+str(sg_score))

	 Time			Score
My GDA : 0.19111394882202148	0.941520467836
SK GDA : 1.3962910175323486	0.941520467836
