In [1]:
import numpy as np
import pandas as pd

In [2]:
#1
def mean_vector(data):
    """Return the sample mean vector of ``data`` as a one-row DataFrame.

    The mean is computed with the matrix formula ``(1/n) * 1'X``, where ``1``
    is a vector of ones with one entry per row of ``data``.

    Parameters
    ----------
    data : pandas.DataFrame
        Observations in rows, variables in columns.

    Returns
    -------
    pandas.DataFrame
        A 1 x p frame whose single row is labelled ``'ones'`` and whose
        columns are ``data.columns``.
    """
    n_obs = len(data)
    # Multiply raw values instead of label-aligned frames: a ones frame
    # indexed 0..n-1 only lines up with ``data`` when ``data`` has exactly
    # that index, so filtered or re-labelled frames used to raise
    # "matrices are not aligned".
    ones = np.ones((1, n_obs))
    column_sums = np.dot(ones, data.to_numpy())
    return pd.DataFrame(column_sums / n_obs, index=['ones'], columns=data.columns)

def covariance_matrix(data):
    """Return the unbiased sample covariance matrix of ``data``.

    Computes ``S = (X - 1 xbar')' (X - 1 xbar') / (n - 1)``, where ``xbar``
    is the mean vector returned by ``mean_vector``.

    Parameters
    ----------
    data : pandas.DataFrame
        Observations in rows, variables in columns.

    Returns
    -------
    pandas.DataFrame
        A p x p frame whose index and columns are both ``data.columns``.
    """
    n_obs = len(data)
    mean = mean_vector(data).to_numpy()  # shape (1, p)
    # Centre by position (NumPy broadcasting) rather than by label: the
    # label-aligned subtraction against a frame re-indexed 0..n-1 only
    # matched rows when ``data`` itself carried that exact index.
    centered = data.to_numpy() - mean
    scatter = np.dot(centered.T, centered)
    return pd.DataFrame(scatter / (n_obs - 1), index=data.columns, columns=data.columns)

def sample_corr_matrix(data):
    """Return the sample correlation matrix of ``data``.

    Every covariance ``S[i, j]`` from ``covariance_matrix`` is divided by
    ``sqrt(S[i, i])`` and then by ``sqrt(S[j, j])``.

    Parameters
    ----------
    data : pandas.DataFrame
        Observations in rows, variables in columns.

    Returns
    -------
    pandas.DataFrame
        Correlation matrix with the same labels as the covariance matrix.
    """
    cov = covariance_matrix(data)
    std = np.sqrt(np.diag(cov))
    # Scale row i by 1/std[i] first, then column j by 1/std[j].
    return cov.div(std, axis=0).div(std, axis=1)

def classifier(data, class1, class2):
    """Translate a boolean frame into class labels.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame of flags; a cell counts as "class 1" when it compares equal
        to ``True``.
    class1 : object
        Label written where the cell equals ``True``.
    class2 : object
        Label written everywhere else.

    Returns
    -------
    pandas.DataFrame
        A new frame of labels with the index and columns of ``data``.
        ``data`` itself is left unmodified: writing labels cell by cell into
        the caller's boolean frame changed the caller's object and stored
        values of a different type in a bool-dtype frame.
    """
    # ``== True`` (not truthiness) is kept on purpose so that exactly the
    # cells the element-wise comparison accepts are mapped to class1.
    is_class1 = (data == True).to_numpy(dtype=bool)
    labels = np.empty(is_class1.shape, dtype=object)
    labels[is_class1] = class1
    labels[~is_class1] = class2
    return pd.DataFrame(labels, index=data.index, columns=data.columns)

In [3]:
#LDA: Linear Discriminant Analysis
#X1: domestic, X2: wild
#priors: p1, p2
def lda(x,y,p1,p2):
    """Fit a two-group linear discriminant rule and classify the training rows.

    Group 1 consists of the rows labelled ``'DOMESTIC'`` in ``y`` and group 2
    of the rows labelled ``'WILD'``. Rows of ``x`` and ``y`` are matched by
    position. Group membership and group sizes are taken from the labels
    themselves rather than from a fixed row position or from the frequency
    ordering of ``y.value_counts()``.

    Parameters
    ----------
    x : pandas.DataFrame
        Numeric predictors, one row per observation.
    y : pandas.Series
        Class label of each row; only ``'DOMESTIC'`` and ``'WILD'`` are
        accepted.
    p1, p2 : float
        Prior probabilities of the DOMESTIC and WILD groups. Equal
        misclassification costs are assumed.

    Returns
    -------
    coeff : pandas.DataFrame
        Column ``'Coefficients'``. The first row, ``'Constant'``, holds the
        midpoint ``m = (a'xbar1 + a'xbar2) / 2``; the remaining rows (named
        after the columns of ``x``) hold ``a = (xbar1 - xbar2)' Sp^-1``.
    result : pandas.DataFrame
        One row per observation with columns ``'From TYPE'``,
        ``'Classified into Type'`` and the posterior probabilities
        ``'DOMESTIC'`` and ``'WILD'`` rounded to 4 decimals.
    ldf : pandas.DataFrame
        1 x n frame (row ``'ldf'``) with the discriminant score ``a'x - m``
        of every observation, columns being the row positions in ``x``.

    Raises
    ------
    ValueError
        If ``x`` and ``y`` differ in length, ``y`` contains another label,
        or a group has fewer than two rows.
    """
    var = x.columns
    labels = np.asarray(y)
    if len(labels) != len(x):
        raise ValueError('x and y must have the same number of rows')
    is_domestic = labels == 'DOMESTIC'
    is_wild = labels == 'WILD'
    if not np.all(is_domestic | is_wild):
        raise ValueError("y may only contain the labels 'DOMESTIC' and 'WILD'")
    if np.count_nonzero(is_domestic) < 2 or np.count_nonzero(is_wild) < 2:
        raise ValueError('each group needs at least two rows to estimate a covariance')

    # Work positionally from here on so row labels cannot cause misalignment.
    x = x.reset_index(drop=True)
    values = x.to_numpy()
    X1 = x.loc[is_domestic].reset_index(drop=True)
    X2 = x.loc[is_wild].reset_index(drop=True)
    n1, n2 = len(X1), len(X2)

    X1_mean = mean_vector(X1).to_numpy().ravel()
    X2_mean = mean_vector(X2).to_numpy().ravel()
    S1 = covariance_matrix(X1).to_numpy()
    S2 = covariance_matrix(X2).to_numpy()
    # Pooled covariance, weighted by each group's degrees of freedom.
    Sp = (n1-1)/(n1+n2-2)*S1+(n2-1)/(n1+n2-2)*S2
    Sp_inv = np.linalg.inv(Sp)  # inverted once and reused below

    #Calculate LDF for binary class
    a = np.dot(X1_mean - X2_mean, Sp_inv)
    Y1_mean = np.dot(a, X1_mean)
    Y2_mean = np.dot(a, X2_mean)
    m = 1/2*(Y1_mean + Y2_mean)
    coeff = pd.DataFrame(np.append(m, a), index=np.append('Constant', var),
                         columns=['Coefficients'])

    #Predict Y class
    scores = np.dot(values, a) - m
    ldf = pd.DataFrame(scores.reshape(1, -1), index=['ldf'], columns=x.index)
    log = np.log(p2/p1) #assume equal misclassification cost
    classified = classifier(ldf > log, 'DOMESTIC', 'WILD')

    #Calculate posterior probabilities
    def _kernel(group_mean):
        # exp(-d^2 / 2) with d^2 the squared Mahalanobis distance of every
        # row to ``group_mean``; computed row by row instead of building an
        # n x n matrix whose diagonal is the only part that is needed.
        dev = values - group_mean
        dist_sq = np.sum(np.dot(dev, Sp_inv) * dev, axis=1)
        return np.exp(-1/2*dist_sq)

    f1 = _kernel(X1_mean)
    f2 = _kernel(X2_mean)
    post1 = p1*f1/(p1*f1+p2*f2)
    post2 = p2*f2/(p1*f1+p2*f2)

    #Classification result
    result = pd.concat([y.reset_index(drop=True),
                        classified.T,
                        pd.DataFrame(np.round(post1, 4)),
                        pd.DataFrame(np.round(post2, 4))], axis=1)
    result.columns=['From TYPE','Classified into Type','DOMESTIC','WILD']

    return coeff, result, ldf

In [4]:
#2
#LOO: Leave-One-Out Method
def loo(x,y,p1,p2):
    """Leave-one-out accuracy of the two-group linear discriminant rule.

    For every row, the rule is re-estimated from all other rows and then
    used to classify the held-out row. Group 1 is the rows labelled
    ``'DOMESTIC'`` in ``y`` and group 2 the rows labelled ``'WILD'``; rows of
    ``x`` and ``y`` are matched by position. The group sizes used to weight
    the pooled covariance are the sizes of the groups actually fitted in
    each fold.

    Parameters
    ----------
    x : pandas.DataFrame
        Numeric predictors, one row per observation.
    y : pandas.Series
        Class label of each row; only ``'DOMESTIC'`` and ``'WILD'`` are
        accepted.
    p1, p2 : float
        Prior probabilities of the DOMESTIC and WILD groups. Equal
        misclassification costs are assumed.

    Returns
    -------
    float
        Share of held-out rows whose predicted label equals the label in
        ``y``. It is computed as a mean of matches, so it is also defined
        when every prediction is right or every prediction is wrong.

    Raises
    ------
    ValueError
        If ``x`` and ``y`` differ in length, ``y`` contains another label,
        or a group has fewer than three rows (two must remain after one
        row is held out).
    """
    labels = np.asarray(y)
    if len(labels) != len(x):
        raise ValueError('x and y must have the same number of rows')
    is_domestic = labels == 'DOMESTIC'
    is_wild = labels == 'WILD'
    if not np.all(is_domestic | is_wild):
        raise ValueError("y may only contain the labels 'DOMESTIC' and 'WILD'")
    if np.count_nonzero(is_domestic) < 3 or np.count_nonzero(is_wild) < 3:
        raise ValueError('each group needs at least three rows for leave-one-out')

    # Work positionally so row labels cannot cause misalignment.
    x = x.reset_index(drop=True)
    values = x.to_numpy()
    n_obs = len(x)
    log = np.log(p2/p1) #assume equal misclassification cost
    predicted = []

    for i in range(n_obs):
        keep = np.ones(n_obs, dtype=bool)
        keep[i] = False
        X1 = x.loc[keep & is_domestic].reset_index(drop=True)
        X2 = x.loc[keep & is_wild].reset_index(drop=True)
        # Sizes of the groups that are really being fitted in this fold.
        n1, n2 = len(X1), len(X2)

        X1_mean = mean_vector(X1).to_numpy().ravel()
        X2_mean = mean_vector(X2).to_numpy().ravel()
        S1 = covariance_matrix(X1).to_numpy()
        S2 = covariance_matrix(X2).to_numpy()
        Sp = (n1-1)/(n1+n2-2)*S1+(n2-1)/(n1+n2-2)*S2

        #Calculate LDF for binary class
        a = np.dot(X1_mean - X2_mean, np.linalg.inv(Sp))
        m = 1/2*(np.dot(a, X1_mean) + np.dot(a, X2_mean))

        #Predict Y class of the held-out row
        ldf = np.dot(a, values[i]) - m
        predicted.append('DOMESTIC' if ldf > log else 'WILD')

    #calculate accuracy
    accuracy = np.mean(np.asarray(predicted, dtype=object) == labels)

    return accuracy

In [5]:
#3
# Whitespace-delimited text file. sep=r"\s+" is the documented equivalent of
# delim_whitespace=True, which pandas has deprecated.
turkey = pd.read_csv("turkey.dat", sep=r"\s+")
turkey

Unnamed: 0,ID,SEX,TYPE,WGT,HUM,RAD,ULN,FEMUR,TIB,TIN,CAR,D3P,STL,STB,COR,PEL,MAX,MIN,SCA
0,K766,MALE,WILD,.,.,.,.,.,.,.,.,.,.,.,.,.,142,107,.
1,N399,MALE,WILD,.,153,138,153,139,246,162,810,307,196,74,.,.,145,104,.
2,NEX1,MALE,WILD,.,.,.,.,.,.,.,.,.,224,72,.,.,.,.,.
3,NEX2,MALE,WILD,.,.,.,.,.,.,.,.,.,220,74,.,.,.,.,.
4,NEX3,MALE,WILD,.,.,.,.,.,.,.,.,.,228,78,.,.,.,.,.
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
153,DO28,FEMALE,DOMESTIC,.,.,.,.,.,181,118,.,.,.,.,.,.,147,111,.
154,DO29,FEMALE,DOMESTIC,.,.,.,.,.,183,115,.,.,.,.,.,.,129,102,.
155,DO30,FEMALE,DOMESTIC,.,.,.,.,.,177,113,.,.,.,.,.,.,135,101,.
156,DO31,FEMALE,DOMESTIC,.,.,.,.,.,182,117,.,.,.,.,.,.,148,109,.


In [6]:
# turkey2: male birds only, restricted to ID, TYPE and nine measurement
# columns. '.' marks a missing value, and any row containing one is dropped.
turkey2 = (
    turkey.loc[
        turkey['SEX'] == 'MALE',
        ['ID','TYPE','HUM','RAD','ULN','FEMUR','TIN','CAR','D3P','COR','SCA'],
    ]
    .replace('.', np.nan)
    .dropna()
    .reset_index(drop=True)
)
turkey2

Unnamed: 0,ID,TYPE,HUM,RAD,ULN,FEMUR,TIN,CAR,D3P,COR,SCA
0,B710,WILD,153,140,147,142,151,817,305,102,128
1,B790,WILD,156,137,151,146,155,814,305,111,137
2,B819,WILD,158,135,151,146,152,790,289,111,125
3,B085,WILD,148,129,146,139,147,767,287,106,123
4,B089,WILD,157,140,154,140,159,818,301,116,136
5,B090,WILD,153,138,153,141,151,822,312,115,133
6,B091,WILD,156,138,156,145,150,835,310,118,133
7,B097,WILD,153,135,150,144,158,772,276,102,123
8,B099,WILD,152,140,151,144,158,792,303,111,122
9,B106,WILD,147,130,144,136,145,765,289,108,131


In [7]:
# Predictors: every column except the two identifier columns, cast to int.
x = turkey2.drop(columns=['ID', 'TYPE']).astype(int)
# Response: the TYPE label of each row.
y = turkey2['TYPE']

In [8]:
# Fit the LDA rule defined above with priors p1 = 0.6 and p2 = 0.4.
coeff1, result1, ldf1 = lda(x, y, p1=0.6, p2=0.4)

In [9]:
# Show the discriminant constant and the coefficient of each variable returned by lda.
coeff1

Unnamed: 0,Coefficients
Constant,-14.570875
HUM,0.104816
RAD,0.079992
ULN,0.61948
FEMUR,0.207355
TIN,-0.912619
CAR,0.032513
D3P,0.103164
COR,-0.821637
SCA,0.012002


In [10]:
#(a) Which turkeys in this data set were misclassified by the discriminant rule when the rule was applied to the training data?
#(b) What are the posterior probabilities for both domestic and wild classifications for those turkeys that were misclassified in (a)?
# A row is misclassified when 'From TYPE' differs from 'Classified into Type'.
result1

Unnamed: 0,From TYPE,Classified into Type,DOMESTIC,WILD
0,WILD,DOMESTIC,0.662,0.338
1,WILD,WILD,0.0009,0.9991
2,WILD,WILD,0.0011,0.9989
3,WILD,WILD,0.0058,0.9942
4,WILD,WILD,0.0,1.0
5,WILD,WILD,0.0033,0.9967
6,WILD,WILD,0.0171,0.9829
7,WILD,WILD,0.0002,0.9998
8,WILD,WILD,0.0,1.0
9,WILD,WILD,0.0013,0.9987


In [11]:
#(c) Determine the value of the linear discriminant function for the turkeys whose IDs are B710 and L674.
#How do you classify these two turkeys?
ldf1

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,23,24,25,26,27,28,29,30,31,32
ldf,0.266544,-7.386383,-7.173841,-5.557687,-14.480549,-6.120478,-4.45412,-8.763461,-11.819885,-7.024255,...,4.986669,-1.428094,11.039951,9.077645,3.385169,5.415778,8.488731,6.988009,11.871528,4.585818


In [12]:
#LDF values for 'B710' and 'L674'
# The columns of ldf1 are row positions of x; position 0 is B710 in turkey2.
# NOTE(review): position 14 is assumed to be L674 - confirm against turkey2['ID'].
ldf1.iloc[0,0], ldf1.iloc[0,14]

(0.2665441199626031, 8.85941578165395)

In [13]:
# (d) Leave-one-out accuracy of the LDA model, with priors p1 = 0.6 and p2 = 0.4.
acc = loo(x, y, p1=0.6, p2=0.4)
acc

0.8484848484848485