In [1]:
import numpy as np
import pandas as pd

In [2]:
#1. Write a Python code to calculate the linear discriminant function (LDF) for several classes.
#   Your code should be able to predict the Y class based on the input X0 values.
#X1~X3: Species 1~3
#Assume equal priors, equal misclassification costs
#Classify to the class with largest LDF value

def my_lda(x,y):
    """Fit a three-class linear discriminant analysis and classify the training rows.

    Equal priors (1/3 each) and equal misclassification costs are assumed, so
    every observation is assigned to the class whose linear discriminant
    function (LDF) value is largest.  Class membership is taken from the labels
    in ``y`` (not from row positions), and classes are ordered by sorted label.

    Parameters
    ----------
    x : pandas.DataFrame
        Feature matrix, one row per observation and one column per variable.
    y : pandas.Series
        Class label of every row of ``x``.  Must contain exactly three
        distinct labels, each occurring at least twice.

    Returns
    -------
    coeff1, coeff2, coeff3 : pandas.DataFrame
        One-column frames ('Coefficients') holding the LDF of the first,
        second and third class (in sorted label order): the row 'Constant'
        followed by one slope per column of ``x``.
    result : pandas.DataFrame
        ``y`` side by side with the column 'Classified into TYPE' containing
        the predicted label of every training row.
    accuracy_lda : float
        Fraction of training rows classified correctly (1 - APER).

    Raises
    ------
    ValueError
        If ``x`` and ``y`` differ in length, if ``y`` does not contain exactly
        three classes, or if a class has fewer than two observations.
    """
    var = x.columns
    y = y if isinstance(y, pd.Series) else pd.Series(y)
    X = np.asarray(x, dtype=float)
    labels = y.to_numpy()
    n_obs, n_var = X.shape
    if len(labels) != n_obs:
        raise ValueError("x and y must have the same number of rows")
    classes = np.unique(labels)
    if len(classes) != 3:
        raise ValueError("my_lda expects exactly three classes in y")

    p = 1/3 #Assume equal priors

    #Calculate sample means and the pooled covariance matrix
    means = np.empty((len(classes), n_var))
    Sp = np.zeros((n_var, n_var))
    for k, label in enumerate(classes):
        Xk = X[labels == label]
        if len(Xk) < 2:
            raise ValueError("every class needs at least two observations")
        means[k] = Xk.mean(axis=0)
        # np.cov uses the unbiased (n_k - 1) divisor, like DataFrame.cov()
        Sp += (len(Xk) - 1) * np.atleast_2d(np.cov(Xk, rowvar=False))
    Sp /= n_obs - len(classes)

    #Calculate LDFs (the pooled covariance is inverted only once)
    #slopes[k]    = mean_k' Sp^-1
    #constants[k] = -1/2 mean_k' Sp^-1 mean_k + log(prior)
    Sp_inv = np.linalg.inv(Sp)
    slopes = means @ Sp_inv
    constants = -0.5 * np.sum(slopes * means, axis=1) + np.log(p)

    #Coefficients of LDFs
    row_labels = np.append('Constant', var)
    coeff1, coeff2, coeff3 = (
        pd.DataFrame(np.append(constants[k], slopes[k]), index=row_labels, columns=['Coefficients'])
        for k in range(len(classes))
    )

    #Evaluate every LDF on every observation and pick the largest one
    ldf_values = X @ slopes.T + constants
    predicted = classes[np.argmax(ldf_values, axis=1)]

    #Classification result
    classified_lda = pd.DataFrame({'Classified into TYPE': predicted}, index=y.index)
    result = pd.concat([y, classified_lda], axis=1)

    #Calculate accuracy of LDA (APER method)
    # A plain mean is well defined even when every row is right or most are wrong
    accuracy_lda = float(np.mean(predicted == labels))

    return coeff1, coeff2, coeff3, result, accuracy_lda

In [3]:
#2. Write a Python code to calculate the quadratic discriminant function (QDF) for several classes.
#   Your code should be able to predict the Y class based on the input X0 values.
#Assume equal priors, equal misclassification costs
#Classify to the class with largest QDF value

def my_qda(x,y,x_new):
    """Fit a three-class quadratic discriminant analysis and classify rows and a new point.

    Equal priors (1/3 each) and equal misclassification costs are assumed.
    For class k the quadratic discriminant score of an observation x0 is

        -1/2 log|S_k| - 1/2 (x0 - mean_k)' S_k^-1 (x0 - mean_k) + log(prior)

    and an observation is assigned to the class with the largest score.
    Class membership is taken from the labels in ``y`` (not from row
    positions), and classes are ordered by sorted label.

    Parameters
    ----------
    x : pandas.DataFrame
        Feature matrix, one row per observation and one column per variable.
    y : pandas.Series
        Class label of every row of ``x``.  Must contain exactly three
        distinct labels, each occurring at least twice.
    x_new : array-like
        A single new observation with one value per column of ``x`` (for
        example a one-column DataFrame, a Series or a list).

    Returns
    -------
    result : pandas.DataFrame
        ``y`` side by side with 'Classified into TYPE' (predicted label) and
        'QD score' (winning score rounded to 4 decimals) for each training row.
    result_new : pandas.DataFrame
        One row labelled 'x_new' with the same two columns for ``x_new``.
    accuracy_qda : float
        Fraction of training rows classified correctly (1 - APER).

    Raises
    ------
    ValueError
        If ``x`` and ``y`` differ in length, if ``y`` does not contain exactly
        three classes, if a class has fewer than two observations, or if
        ``x_new`` does not have one value per column of ``x``.
    """
    y = y if isinstance(y, pd.Series) else pd.Series(y)
    X = np.asarray(x, dtype=float)
    labels = y.to_numpy()
    n_obs, n_var = X.shape
    if len(labels) != n_obs:
        raise ValueError("x and y must have the same number of rows")
    classes = np.unique(labels)
    if len(classes) != 3:
        raise ValueError("my_qda expects exactly three classes in y")
    new_point = np.asarray(x_new, dtype=float).reshape(-1)
    if new_point.size != n_var:
        raise ValueError("x_new must contain one value per column of x")

    log_prior = np.log(1/3) #Assume equal priors

    #Calculate sample means and covariances (inverse and log-determinant once per class)
    class_params = []
    for label in classes:
        Xk = X[labels == label]
        if len(Xk) < 2:
            raise ValueError("every class needs at least two observations")
        # np.cov uses the unbiased (n_k - 1) divisor, like DataFrame.cov()
        S_k = np.atleast_2d(np.cov(Xk, rowvar=False))
        class_params.append((Xk.mean(axis=0), np.linalg.inv(S_k), np.log(np.linalg.det(S_k))))

    def _qd_scores(points):
        # Rows of `points` are observations; returns one column of scores per class
        columns = []
        for mean_k, S_inv, log_det in class_params:
            diff = points - mean_k
            mahalanobis = np.sum((diff @ S_inv) * diff, axis=1)
            columns.append(-0.5 * log_det - 0.5 * mahalanobis + log_prior)
        return np.column_stack(columns)

    #Calculate QDFs for the training rows
    train_scores = _qd_scores(X)
    best = np.argmax(train_scores, axis=1)
    predicted = classes[best]
    classified_qda = pd.DataFrame({'Classified into TYPE': predicted}, index=y.index)
    scores = pd.DataFrame(
        {'QD score': np.round(train_scores[np.arange(n_obs), best], 4)},
        index=y.index,
    )

    #Classification result
    result = pd.concat([y, classified_qda, scores], axis=1)

    #Classification for new observation
    new_scores = _qd_scores(new_point[np.newaxis, :])[0]
    best_new = int(np.argmax(new_scores))
    result_new = pd.DataFrame(
        {
            'Classified into TYPE': [classes[best_new]],
            'QD score': [np.round(new_scores[best_new], 4)],
        },
        index=['x_new'],
    )

    #Calculate accuracy of QDA (APER method)
    # A plain mean is well defined even when every row is right or most are wrong
    accuracy_qda = float(np.mean(predicted == labels))

    return result, result_new, accuracy_qda

In [4]:
#3. Write a Python code to perform the 'leave-one-out' method to calculate the accuracy of the LDA & QDA model you wrote in #1 & #2.
#Assume equal priors, equal misclassification costs

def loo(x,y):
    """Leave-one-out accuracy of three-class LDA and QDA.

    Each observation is held out in turn; class means, per-class covariances
    and the pooled covariance are re-estimated from the remaining rows, and
    the held-out row is classified with both the linear and the quadratic
    discriminant rule.  Equal priors (1/3 each) and equal misclassification
    costs are assumed.  Class membership is taken from the labels in ``y``
    (not from row positions).

    Parameters
    ----------
    x : pandas.DataFrame
        Feature matrix, one row per observation and one column per variable.
    y : pandas.Series
        Class label of every row of ``x``.  Must contain exactly three
        distinct labels.

    Returns
    -------
    acc_lda_loo : float
        Fraction of held-out rows classified correctly by LDA.
    acc_qda_loo : float
        Fraction of held-out rows classified correctly by QDA.

    Raises
    ------
    ValueError
        If ``x`` and ``y`` differ in length, if ``y`` does not contain exactly
        three classes, or if holding out a row leaves a class with fewer than
        two observations.
    """
    X = np.asarray(x, dtype=float)
    labels = np.asarray(y)
    n_obs, n_var = X.shape
    if len(labels) != n_obs:
        raise ValueError("x and y must have the same number of rows")
    classes = np.unique(labels)
    if len(classes) != 3:
        raise ValueError("loo expects exactly three classes in y")

    log_prior = np.log(1/3) #Assume equal priors
    hits_lda = 0
    hits_qda = 0
    keep = np.ones(n_obs, dtype=bool)

    for i in range(n_obs):
        # Training set = every row except i (boolean indexing returns a copy)
        keep[i] = False
        X_train = X[keep]
        y_train = labels[keep]
        keep[i] = True
        x0 = X[i]

        #Calculate sample means and covariances without observation i
        means = []
        covs = []
        Sp = np.zeros((n_var, n_var))
        for label in classes:
            Xk = X_train[y_train == label]
            if len(Xk) < 2:
                raise ValueError("every class needs at least two observations after holding one out")
            # np.cov uses the unbiased (n_k - 1) divisor, like DataFrame.cov()
            S_k = np.atleast_2d(np.cov(Xk, rowvar=False))
            means.append(Xk.mean(axis=0))
            covs.append(S_k)
            Sp += (len(Xk) - 1) * S_k
        Sp /= (n_obs - 1) - len(classes)

        #Calculate LDFs for multiple classes
        Sp_inv = np.linalg.inv(Sp)
        ldf_values = [
            m @ Sp_inv @ x0 - 0.5 * (m @ Sp_inv @ m) + log_prior
            for m in means
        ]
        hits_lda += int(classes[int(np.argmax(ldf_values))] == labels[i])

        #Calculate QDFs for multiple classes
        qdf_values = [
            -0.5 * np.log(np.linalg.det(S_k))
            - 0.5 * ((x0 - m) @ np.linalg.inv(S_k) @ (x0 - m))
            + log_prior
            for m, S_k in zip(means, covs)
        ]
        hits_qda += int(classes[int(np.argmax(qdf_values))] == labels[i])

    #Calculate accuracy of LDA and QDA (LOO method)
    # Counting hits directly is well defined even when every row is right or most are wrong
    acc_lda_loo = hits_lda / n_obs
    acc_qda_loo = hits_qda / n_obs

    return acc_lda_loo, acc_qda_loo

In [5]:
#4. Using Fisher's Iris data (Table 11.5), answer the following questions.
# header=None: the first line of the file is read as data; `names` supplies the column labels.
# NOTE(review): delim_whitespace is a read_csv-style option - confirm it is needed with read_fwf.
iris = pd.read_fwf(
    'iris.dat',
    names=[
        'Sepal length(x1)',
        'Sepal width(x2)',
        'Petal length(x3)',
        'Petal width(x4)',
        'Species',
    ],
    header=None,
    delim_whitespace=True,
)
iris

Unnamed: 0,Sepal length(x1),Sepal width(x2),Petal length(x3),Petal width(x4),Species
0,5.1,3.5,1.4,0.2,1
1,4.9,3.0,1.4,0.2,1
2,4.7,3.2,1.3,0.2,1
3,4.6,3.1,1.5,0.2,1
4,5.0,3.6,1.4,0.2,1
...,...,...,...,...,...
145,6.7,3.0,5.2,2.3,3
146,6.3,2.5,5.0,1.9,3
147,6.5,3.0,5.2,2.0,3
148,6.2,3.4,5.4,2.3,3


In [6]:
# Separate the class-label column from the four measurement columns
y_iris = iris['Species']
x_iris = iris.drop(columns='Species')

In [7]:
#(a) Is the assumption of a common covariance matrix reasonable in this case?
#    (Use Python's 'statsmodels.stats.multivariate' module for this question.)
from statsmodels.stats import multivariate as mv

# Relabel the feature columns as 0, 1, 2, ... (transpose, drop the index, transpose back)
x_test = x_iris.T.reset_index(drop=True).T

# Row blocks 0-49, 50-99 and 100 onward (assumes the rows are ordered by species)
X1_test, X2_test, X3_test = x_test.iloc[:50], x_test.iloc[50:100], x_test.iloc[100:]

# Sample covariance matrix of each block
cov1, cov2, cov3 = (block.cov() for block in (X1_test, X2_test, X3_test))

In [8]:
# Test of equal covariance matrices across the three blocks:
# first argument is the list of covariance matrices, second the matching sample sizes
test = mv.test_cov_oneway(
    [cov1, cov2, cov3],
    [len(block) for block in (X1_test, X2_test, X3_test)],
)
print("Chi-Square Test statistic:",test.statistic_chi2, " , Pr > ChiSq:",test.pvalue_chi2)

#Since the p-value is far below any conventional significance level, we reject H0 (equal covariance matrices), so QDA is more appropriate than LDA.

Chi-Square Test statistic: 140.94304992349774  , Pr > ChiSq: 3.352034178317213e-20


In [9]:
#(b) Assuming that the populations are multivariate normal, calculate the quadratic discriminant scores with equal prior and equal misclassification cost.
#    Classify the new observation x_new = [5.0, 3.5, 1.75, 0.21]' into population 1, 2, 3.
# New observation as a single-column frame (column label 0, one row per feature)
x_new = pd.DataFrame({0: [5.0, 3.5, 1.75, 0.21]})
# Fit QDA on the iris data and score both the training rows and x_new
result_qda, result_new, acc_qda_aper = my_qda(x=x_iris, y=y_iris, x_new=x_new)

In [10]:
# Per-row QDA output for the iris data: true species, predicted class and the
# winning quadratic discriminant score (rounded to 4 decimals inside my_qda)
result_qda

Unnamed: 0,Species,Classified into TYPE,QD score
0,1,1,5.2105
1,1,1,4.3945
2,1,1,4.7929
3,1,1,4.582
4,1,1,5.0542
...,...,...,...
145,3,3,1.0922
146,3,3,1.3611
147,3,3,2.809
148,3,3,1.394


In [11]:
# Predicted class and winning quadratic discriminant score for the new observation x_new
result_new

Unnamed: 0,Classified into TYPE,QD score
x_new,1,3.491


In [12]:
#(c) Assuming equal covariance matrices and multivariate normal populations, calculate the linear discriminant function using your code in #1 above
#    and compare its coefficients with those of Python's 'sklearn.discriminant_analysis' module.
# Fit LDA on the iris data: one coefficient table per class, the per-row
# classification table and the resubstitution (APER-based) accuracy
coeff1, coeff2, coeff3, result_lda, acc_lda_aper = my_lda(x=x_iris, y=y_iris)

In [13]:
# LDF of class 1 from my_lda: constant term followed by one slope per feature
coeff1

Unnamed: 0,Coefficients
Constant,-86.30847
Sepal length(x1),23.544167
Sepal width(x2),23.58787
Petal length(x3),-16.430639
Petal width(x4),-17.398411


In [14]:
# LDF of class 2 from my_lda: constant term followed by one slope per feature
coeff2

Unnamed: 0,Coefficients
Constant,-72.852607
Sepal length(x1),15.698209
Sepal width(x2),7.07251
Petal length(x3),5.211451
Petal width(x4),6.434229


In [15]:
# LDF of class 3 from my_lda: constant term followed by one slope per feature
coeff3

Unnamed: 0,Coefficients
Constant,-104.36832
Sepal length(x1),12.445849
Sepal width(x2),3.68528
Petal length(x3),12.766545
Petal width(x4),21.079113


In [16]:
# Per-row LDA output for the iris data: true species next to the predicted class
result_lda

Unnamed: 0,Species,Classified into TYPE
0,1,1
1,1,1
2,1,1
3,1,1
4,1,1
...,...,...
145,3,3
146,3,3
147,3,3
148,3,3


In [17]:
#Compare with Python package
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis as LDA
# Reference model from scikit-learn: 'eigen' solver with equal priors for the three classes
lda = LDA(solver='eigen', priors=[1/3, 1/3, 1/3])
lda.fit(x_iris, y_iris)

LinearDiscriminantAnalysis(priors=[0.3333333333333333, 0.3333333333333333,
                                   0.3333333333333333],
                           solver='eigen')

In [18]:
#Intercepts of LDFs (one per class) as fitted by scikit-learn;
#compare with the 'Constant' rows of coeff1, coeff2 and coeff3
lda.intercept_

array([ -88.04744666,  -74.31697465, -106.47586504])

In [19]:
#Coefficients of LDFs (one row per class, one column per feature) as fitted by scikit-learn
# NOTE(review): the values shown below are about 1.0204 (= 150/147) times the slopes in
# coeff1-coeff3, which would be consistent with scikit-learn dividing the within-class
# covariance by n instead of n - g as my_lda does - confirm against the scikit-learn docs.
lda.coef_

array([[ 24.02465992,  24.06925561, -16.76595819, -17.75348039],
       [ 16.01858069,   7.21684677,   5.31780708,   6.56554   ],
       [ 12.69984591,   3.7604894 ,  13.02708671,  21.50929899]])

In [20]:
#(d) Calculate and compare the APER and the leave-one-out error rates for linear discriminant analysis (LDA) and quadratic discriminant analysis (QDA)
#    using your code in #1,2,3. (Assume equal prior and equal misclassification cost.)
#error=1-accuracy
#Leave-one-out (LOO)
# Leave-one-out accuracies of LDA and QDA (error rate = 1 - accuracy)
acc_lda_loo, acc_qda_loo = loo(x=x_iris, y=y_iris)

In [21]:
#LDA accuracy (LOO); the leave-one-out error rate is 1 - this value
acc_lda_loo

0.98

In [22]:
#QDA accuracy (LOO); the leave-one-out error rate is 1 - this value
acc_qda_loo

0.9733333333333334

In [23]:
#Apparent error rates (APER)
#LDA accuracy on the training rows themselves (resubstitution); APER = 1 - this value
acc_lda_aper

0.98

In [24]:
#QDA accuracy on the training rows themselves (resubstitution); APER = 1 - this value
acc_qda_aper

0.98