# LDA & QDA Classifier

In [1]:
import numpy as np
import math

### Read in data and format as floats

In [2]:
with open('iris.data', "r") as data:
    data = data.read().splitlines()
data = np.array([tuple(line.split(',')) for line in data])

nums = np.zeros((150, 4))

for row in range (150):
    for col in range (4):
        nums[row,col] = float(data[row, col])

nums = nums.T

### Split data into categories and split again into training and test data

In [3]:
setosaTrain = nums[0:4, 0:40]
setosaTest = nums[0:4, 40:50]

versiTrain = nums[0:4, 50:90]
versiTest = nums[0:4, 90:100]

virgTrain = nums[0:4, 100:140]
virgTest = nums[0:4, 140:150]

### Calculate u and E for each category and avgE for LDA

In [4]:
# Calculate sum 
u1 = np.array([np.sum(setosaTrain, axis = 1)]).T / len(setosaTrain[0])
u2 = np.array([np.sum(versiTrain, axis = 1)]).T / len(versiTrain[0])
u3 = np.array([np.sum(virgTrain, axis = 1)]).T / len(virgTrain[0])

# Calculate E        
E1 = np.matmul((setosaTrain - u1), (setosaTrain - u1).T) / len(setosaTrain[0])
E2 = np.matmul((versiTrain - u2), (versiTrain - u2).T) / len(versiTrain[0])
E3 = np.matmul((virgTrain - u3), (virgTrain - u3).T) / len(virgTrain[0])

#Calculate average E
avgE = (E1 + E2 + E3)/3

## Gaussian function

In [5]:
# Create a Gaussian function
def gaussian (E, test, u, i):
    "This returns the probability of the test parameters in a category"
    return (1 / (2 * math.pi * np.linalg.det(E))**(1 / 2)) * math.exp((-1/2) * np.matmul(np.matmul(test[0:4, i] - u.T, np.linalg.inv(E)), (test[0:4, i] - u.T).T))

## Classification function

In [6]:
# Create a classification function that calculates probabilities of the test data and returns errors
def classify_all_errors (E1, E2, E3, test1, test2, test3, u1, u2, u3):
    "This returns the number of errors for the entire test set"
    errorCount = 0

    # Run setosaTest and check if correctly classifies as Setosa
    for i in range (len(test1[0])):
        prob1 = gaussian (E1, test1, u1, i)
        prob2 = gaussian (E2, test1, u2, i)
        prob3 = gaussian (E3, test1, u3, i)
        if max (prob1, prob2, prob3) != prob1:
            errorCount += 1

    # Run versiTest and check if correctly classifies as Versicolor  
    for i in range (len(test2[0])):
        prob1 = gaussian (E1, test2, u1, i)
        prob2 = gaussian (E2, test2, u2, i)
        prob3 = gaussian (E3, test2, u3, i)
        if max (prob1, prob2, prob3) != prob2:
            errorCount += 1

    # Run virgTest and check if correctly classifies as Virginica
    for i in range (len(test3[0])):
        prob1 = gaussian (E1, test3, u1, i)
        prob2 = gaussian (E2, test3, u2, i)
        prob3 = gaussian (E3, test3, u3, i)
        if max (prob1, prob2, prob3) != prob3:
            errorCount += 1
    
    return errorCount

## LDA Classifier

### Enter test data into Gaussian and calculate training accuracy and error of LDA classifier 

In [7]:
error = classify_all_errors (avgE, avgE, avgE, setosaTrain, versiTrain, virgTrain, u1, u2, u3)/30
accuracy = 1 - error
print ("LDA Classifier (Train) - Accuracy:", format(accuracy*100, '.2f'), "%, Error:", format(error*100, '.2f'), "%")

LDA Classifier (Train) - Accuracy: 90.00 %, Error: 10.00 %


### Enter test data into Gaussian and calculate testing accuracy and error of LDA classifier

In [8]:
error = classify_all_errors (avgE, avgE, avgE, setosaTest, versiTest, virgTest, u1, u2, u3)/30
accuracy = 1 - error
print ("LDA Classifier (Test) - Accuracy:", format(accuracy*100, '.2f'), "%, Error:", format(error*100, '.2f'), "%")

LDA Classifier (Test) - Accuracy: 100.00 %, Error: 0.00 %


## QDA Classifier

### Enter test data into Gaussian and calculate training accuracy and error of QDA classifier 

In [9]:
error = classify_all_errors (E1, E2, E3, setosaTrain, versiTrain, virgTrain, u1, u2, u3)/30
accuracy = 1 - error
print ("QDA Classifier (Train) - Accuracy:", format(accuracy*100, '.2f'), "%, Error:", format(error*100, '.2f'), "%")

QDA Classifier (Train) - Accuracy: 93.33 %, Error: 6.67 %


### Enter test data into Gaussian and calculate testing accuracy and error of QDA classifier

In [10]:
error = classify_all_errors (E1, E2, E3, setosaTest, versiTest, virgTest, u1, u2, u3)/30
accuracy = 1 - error
print ("QDA Classifier (Test) - Accuracy:", format(accuracy*100, '.2f'), "%, Error:", format(error*100, '.2f'), "%")

QDA Classifier (Test) - Accuracy: 100.00 %, Error: 0.00 %


# Testing if any variables are unimportant

## Removing sepal length

In [11]:
setosaTrain_noSL = setosaTrain[1:4]
setosaTest_noSL = setosaTest[1:4]

versiTrain_noSL = versiTrain[1:4]
versiTest_noSL = versiTest[1:4]

virgTrain_noSL = virgTrain[1:4]
virgTest_noSL = virgTest[1:4]

In [12]:
# Calculate sum 
u1_noSL = np.array([np.sum(setosaTrain_noSL, axis = 1)]).T/40
u2_noSL = np.array([np.sum(versiTrain_noSL, axis = 1)]).T/40
u3_noSL = np.array([np.sum(virgTrain_noSL, axis = 1)]).T/40

# Calculate E        
E1_noSL = np.matmul((setosaTrain_noSL - u1_noSL), (setosaTrain_noSL - u1_noSL).T) / 40
E2_noSL = np.matmul((versiTrain_noSL - u2_noSL), (versiTrain_noSL - u2_noSL).T) / 40
E3_noSL = np.matmul((virgTrain_noSL - u3_noSL), (virgTrain_noSL - u3_noSL).T) / 40

#Calculate average E
avgE_noSL = (E1_noSL + E2_noSL + E3_noSL)/3

### LDA without sepal length

In [13]:
error = classify_all_errors (avgE_noSL, avgE_noSL, avgE_noSL, setosaTest_noSL, versiTest_noSL, virgTest_noSL, u1_noSL, u2_noSL, u3_noSL)/30
accuracy = 1 - error
print ("No Sepal Length (LDA) - Accuracy:", format(accuracy*100, '.2f'), "%, Error:", format(error*100, '.2f'), "%")

No Sepal Length (LDA) - Accuracy: 100.00 %, Error: 0.00 %


### QDA without sepal length

In [14]:
error = classify_all_errors (E1_noSL, E2_noSL, E3_noSL, setosaTest_noSL, versiTest_noSL, virgTest_noSL, u1_noSL, u2_noSL, u3_noSL)/30
accuracy = 1 - error
print ("No Sepal Length (QDA) - Accuracy:", format(accuracy*100, '.2f'), "%, Error:", format(error*100, '.2f'), "%")

No Sepal Length (QDA) - Accuracy: 100.00 %, Error: 0.00 %


## Removing sepal width

In [15]:
setosaTrain_noSW = np.concatenate(([setosaTrain[0]], setosaTrain[2:4]), axis = 0)
setosaTest_noSW = np.concatenate(([setosaTest[0]], setosaTest[2:4]), axis = 0)

versiTrain_noSW = np.concatenate(([versiTrain[0]], versiTrain[2:4]), axis = 0)
versiTest_noSW = np.concatenate(([versiTest[0]], versiTest[2:4]), axis = 0)

virgTrain_noSW = np.concatenate(([virgTrain[0]], virgTrain[2:4]), axis = 0)
virgTest_noSW = np.concatenate(([virgTest[0]], virgTest[2:4]), axis = 0)

In [16]:
# Calculate sum 
u1_noSW = np.array([np.sum(setosaTrain_noSW, axis = 1)]).T/40
u2_noSW = np.array([np.sum(versiTrain_noSW, axis = 1)]).T/40
u3_noSW = np.array([np.sum(virgTrain_noSW, axis = 1)]).T/40

# Calculate E        
E1_noSW = np.matmul((setosaTrain_noSW - u1_noSW), (setosaTrain_noSW - u1_noSW).T) / 40
E2_noSW = np.matmul((versiTrain_noSW - u2_noSW), (versiTrain_noSW - u2_noSW).T) / 40
E3_noSW = np.matmul((virgTrain_noSW - u3_noSW), (virgTrain_noSW - u3_noSW).T) / 40

#Calculate average E
avgE_noSW = (E1_noSW + E2_noSW + E3_noSW)/3

### LDA without sepal width

In [17]:
error = classify_all_errors (avgE_noSW, avgE_noSW, avgE_noSW, setosaTest_noSW, versiTest_noSW, virgTest_noSW, u1_noSW, u2_noSW, u3_noSW)/30
accuracy = 1 - error
print ("No Sepal Width (LDA) - Accuracy:", format(accuracy*100, '.2f'), "%, Error:", format(error*100, '.2f'), "%")

No Sepal Width (LDA) - Accuracy: 100.00 %, Error: 0.00 %


### QDA without sepal width

In [18]:
error = classify_all_errors (E1_noSW, E2_noSW, E3_noSW, setosaTest_noSW, versiTest_noSW, virgTest_noSW, u1_noSW, u2_noSW, u3_noSW)/30
accuracy = 1 - error
print ("No Sepal Width (QDA) - Accuracy:", format(accuracy*100, '.2f'), "%, Error:", format(error*100, '.2f'), "%")

No Sepal Width (QDA) - Accuracy: 100.00 %, Error: 0.00 %


## Removing petal length

In [19]:
setosaTrain_noPL = np.concatenate((setosaTrain[0:2], [setosaTrain[3]]), axis = 0)
setosaTest_noPL = np.concatenate((setosaTest[0:2], [setosaTest[3]]), axis = 0)

versiTrain_noPL = np.concatenate((versiTrain[0:2], [versiTrain[3]]), axis = 0)
versiTest_noPL = np.concatenate((versiTest[0:2], [versiTest[3]]), axis = 0)

virgTrain_noPL = np.concatenate((virgTrain[0:2], [virgTrain[3]]), axis = 0)
virgTest_noPL = np.concatenate((virgTest[0:2], [virgTest[3]]), axis = 0)

In [20]:
# Calculate sum 
u1_noPL = np.array([np.sum(setosaTrain_noPL, axis = 1)]).T/40
u2_noPL = np.array([np.sum(versiTrain_noPL, axis = 1)]).T/40
u3_noPL = np.array([np.sum(virgTrain_noPL, axis = 1)]).T/40

# Calculate E        
E1_noPL = np.matmul((setosaTrain_noPL - u1_noPL), (setosaTrain_noPL - u1_noPL).T) / 40
E2_noPL = np.matmul((versiTrain_noPL - u2_noPL), (versiTrain_noPL - u2_noPL).T) / 40
E3_noPL = np.matmul((virgTrain_noPL - u3_noPL), (virgTrain_noPL - u3_noPL).T) / 40

#Calculate average E
avgE_noPL = (E1_noPL + E2_noPL + E3_noPL)/3

### LDA without petal length

In [21]:
error = classify_all_errors (avgE_noPL, avgE_noPL, avgE_noPL, setosaTest_noPL, versiTest_noPL, virgTest_noPL, u1_noPL, u2_noPL, u3_noPL)/30
accuracy = 1 - error
print ("No Petal Length (LDA) - Accuracy:", format(accuracy*100, '.2f'), "%, Error:", format(error*100, '.2f'), "%")

No Petal Length (LDA) - Accuracy: 100.00 %, Error: 0.00 %


### QDA without petal length

In [22]:
error = classify_all_errors (E1_noPL, E2_noPL, E3_noPL, setosaTest_noPL, versiTest_noPL, virgTest_noPL, u1_noPL, u2_noPL, u3_noPL)/30
accuracy = 1 - error
print ("No Petal Length (QDA) - Accuracy:", format(accuracy*100, '.2f'), "%, Error:", format(error*100, '.2f'), "%")

No Petal Length (QDA) - Accuracy: 100.00 %, Error: 0.00 %


## Removing petal width

In [23]:
setosaTrain_noPW = setosaTrain[0:3]
setosaTest_noPW = setosaTest[0:3]

versiTrain_noPW = versiTrain[0:3]
versiTest_noPW = versiTest[0:3]

virgTrain_noPW = virgTrain[0:3]
virgTest_noPW = virgTest[0:3]

In [24]:
# Calculate sum 
u1_noPW = np.array([np.sum(setosaTrain_noPW, axis = 1)]).T/40
u2_noPW = np.array([np.sum(versiTrain_noPW, axis = 1)]).T/40
u3_noPW = np.array([np.sum(virgTrain_noPW, axis = 1)]).T/40

# Calculate E        
E1_noPW = np.matmul((setosaTrain_noPW - u1_noPW), (setosaTrain_noPW - u1_noPW).T) / 40
E2_noPW = np.matmul((versiTrain_noPW - u2_noPW), (versiTrain_noPW - u2_noPW).T) / 40
E3_noPW = np.matmul((virgTrain_noPW - u3_noPW), (virgTrain_noPW - u3_noPW).T) / 40

#Calculate average E
avgE_noPW = (E1_noPW + E2_noPW + E3_noPW)/3

### LDA without petal width

In [25]:
error = classify_all_errors (avgE_noPW, avgE_noPW, avgE_noPW, setosaTest_noPW, versiTest_noPW, virgTest_noPW, u1_noPW, u2_noPW, u3_noPW)/30
accuracy = 1 - error
print ("Petal Width (LDA) - Accuracy:", format(accuracy*100, '.2f'), "%, Error:", format(error*100, '.2f'), "%")

Petal Width (LDA) - Accuracy: 96.67 %, Error: 3.33 %


### QDA without petal width

In [26]:
error = classify_all_errors (E1_noPW, E2_noPW, E3_noPW, setosaTest_noPW, versiTest_noPW, virgTest_noPW, u1_noPW, u2_noPW, u3_noPW)/30
accuracy = 1 - error
print ("Petal Width (QDA) - Accuracy:", format(accuracy*100, '.2f'), "%, Error:", format(error*100, '.2f'), "%")

Petal Width (QDA) - Accuracy: 96.67 %, Error: 3.33 %


# Independent features

In [27]:
# Create diagonal matrices
E1_ind = np.zeros((4, 4))
E2_ind = np.zeros((4, 4))
E3_ind = np.zeros((4, 4))

# Insert diagonal values
for i in range(4):
    E1_ind[i, i] = E1[i, i]
    E2_ind[i, i] = E2[i, i]
    E3_ind[i, i] = E3[i, i]

#Calculate average E   
avgE_ind = (E1_ind + E2_ind + E3_ind) / 3

## LDA (Independent)

In [28]:
error = classify_all_errors (avgE_ind, avgE_ind, avgE_ind, setosaTest, versiTest, virgTest, u1, u2, u3)/30
accuracy = 1 - error
print ("LDA Classifer (Independent) - Accuracy:", format(accuracy*100, '.2f'), "%, Error:", format(error*100, '.2f'), "%")

LDA Classifer (Independent) - Accuracy: 100.00 %, Error: 0.00 %


## QDA (Independent)

In [29]:
error = classify_all_errors (E1_ind, E2_ind, E3_ind, setosaTest, versiTest, virgTest, u1, u2, u3)/30
accuracy = 1 - error
print ("QDA Classifier (Independent) - Accuracy:", format(accuracy*100, '.2f'), "%, Error:", format(error*100, '.2f'), "%")

QDA Classifier (Independent) - Accuracy: 100.00 %, Error: 0.00 %
