In [3]:
# Class: COEN 140 Lab Machine Learning/Data Mining
# Name: Matthew Davenport
# Date: 9/25/2022
# Title: Lab 3 – Iris classifier
# Description: This program will perform classifications
#           on iris datasets based on the sepal
#           lengths/widths and petal lengths/widths and 
#           classify them as Setosa, Versicolour, or Virginica



# Expected dataset dimensions: 150 rows, each with 4 measurements plus a class label
ROWS = 150
COLUMNS = 5
# Number of classes (Setosa, Versicolour, Virginica)
CLASSIFICATIONS = 3
# Display names of the measurement columns, indexed by feature column position
#   (used to report which column was removed in the feature-importance loop)
FEATURES = ['sepal length', 'sepal width', 'petal length', 'petal width']


import numpy as np
import matplotlib.pyplot as plt
import matplotlib as mpl
from matplotlib import colors

# for testing of proper reading of data (provided)
def test_dataset(data):
    if len(data) != 150:
        return False
    for row in data:
        if len(row) != 5:
            return False
        for column in row[:-1]:
            if type(column) != np.float64:
                return False
            
        if type(row[-1]) != str:
            return False
    
    return True

# read the file and convert the contents to the proper types and return list of lists
def read_file(path='C:/Users/daven/COEN140/Lab3/test_dataset.txt', columns=None):
    """Parse a comma-separated dataset file into a list of rows.

    Each non-blank line is split on whitespace and only its first token is
    parsed; that token is split on ",". The first ``columns - 1`` fields are
    converted to ``np.float64`` and the ``columns``-th field is kept as a
    ``str``. Any further fields on the line are ignored.

    Parameters:
        path: location of the data file. Defaults to the path the lab
            originally hard-coded, so ``read_file()`` keeps working.
        columns: number of fields per row, label included. When None, the
            module-level ``COLUMNS`` constant is used.

    Returns:
        A list with one inner list per non-blank line of the file.

    Raises:
        OSError: if the file cannot be opened.
        ValueError: if a measurement field cannot be converted to a float.
    """
    if columns is None:
        columns = COLUMNS

    data = []
    # the context manager closes the handle even if parsing raises
    with open(path, 'r') as file:
        for line in file:
            tokens = line.split()
            if not tokens:
                # blank line (e.g. trailing newline at end of file): nothing to parse
                continue
            fields = tokens[0].split(",")           # using , as delimiter
            # leading fields are the numeric measurements
            row = [np.float64(s) for s in fields[:columns - 1]]
            # the field at position columns - 1 is the label and stays a string
            if len(fields) >= columns:
                row.append(fields[columns - 1])
            data.append(row)
    return data

# test the actual classifications compared to our predictions
# returns the percent error of predictions
def classification_compare(predicted, actuals):
    """Compare predicted labels with the actual labels and return the percent error.

    Every position in ``actuals`` is compared with the same position in
    ``predicted``. Each mismatch is reported on stdout with its index, the
    predicted label and the actual label.

    Parameters:
        predicted: indexable sequence of predicted labels, at least as long
            as ``actuals``.
        actuals: sized, indexable sequence of true labels.

    Returns:
        The percentage (0-100) of positions where the labels differ, or 0.0
        when ``actuals`` is empty (there is nothing to get wrong).
    """
    length = len(actuals)
    if length == 0:
        # avoid dividing by zero below
        return 0.0

    correct_predictions = 0
    for i in range(length):
        if predicted[i] == actuals[i]:
            correct_predictions += 1
        else:
            print("\nIncorrect prediction at index " + str(i))
            # str() so labels that are not strings (e.g. integer class ids) can be reported too
            print("Predicted: " + str(predicted[i]))
            print("Actual: ", actuals[i])

    return 100 - correct_predictions/length * 100
    
            
    

     
    





###################################################### TESTING ############################################################
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis as LDA
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis as QDA

# Check that the contents of the file were converted successfully
data = read_file()
print(test_dataset(data))

# split into training and testing lists: rows 0-39, 50-89 and 100-139 are used
#   for training, rows 40-49, 90-99 and 140-149 for testing
#   (assumes each class occupies a contiguous block of 50 rows - TODO confirm)
training = data[:40] + data[50:90] + data[100:140]
testing = data[40:50] + data[90:100] + data[140:150]

# ensure proper length for training and testing items
print(len(training))
print(len(testing))

# using np array to better manipulate columns of data
training_array = np.array(training)
testing_array = np.array(testing)

# separation of X and y and testing/training
# convert X from strings to floats using numpy
x_training = training_array[:, 0:4].astype(np.float64)
x_testing = testing_array[:, 0:4].astype(np.float64)
y_testing = testing_array[:, 4]
y_training = training_array[:, 4]

# lda classifier: fit once on the training split, then predict both splits
lda = LDA().fit(x_training, y_training)
lda_ytraining = lda.predict(x_training)
lda_ytesting = lda.predict(x_testing)

# qda classifier: fit once on the training split, then predict both splits
qda = QDA().fit(x_training, y_training)
qda_ytraining = qda.predict(x_training)
qda_ytesting = qda.predict(x_testing)

# lda and qda training and testing error
# testing error should be 0%
print("LDA training and testing errors: ")
lda_training_error = classification_compare(lda_ytraining, y_training)
lda_testing_error = classification_compare(lda_ytesting, y_testing)
print("\nTraining error: " + str(lda_training_error) + "%, Testing error: " + str(lda_testing_error) + "%\n")
print("QDA training and testing errors: ")
qda_training_error = classification_compare(qda_ytraining, y_training)
qda_testing_error = classification_compare(qda_ytesting, y_testing)
print("\nTraining error: " + str(qda_training_error) + "%, Testing error: " + str(qda_testing_error) + "%\n\n")

# now testing data with 1 feature missing from the dataset. The result with the highest percent
#   error will indicate the feature that is most important and vice versa for the result with
#   lowest percent error
# NOTE(review): the ranking below is built from the TRAINING errors; the testing errors are
#   computed (and their mismatches printed) but never recorded - confirm this is intended
lda_errors = []
qda_errors = []
for i in range(x_training.shape[1]):
    # drop feature column i from both splits
    x_training_importance = np.delete(x_training, i, axis=1)
    x_testing_importance = np.delete(x_testing, i, axis=1)

    # lda classifier on the reduced feature set
    lda = LDA().fit(x_training_importance, y_training)
    lda_ytraining = lda.predict(x_training_importance)
    lda_ytesting = lda.predict(x_testing_importance)

    # qda classifier on the reduced feature set
    qda = QDA().fit(x_training_importance, y_training)
    qda_ytraining = qda.predict(x_training_importance)
    qda_ytesting = qda.predict(x_testing_importance)

    # lda and qda training and testing error
    lda_training_error = classification_compare(lda_ytraining, y_training)
    lda_testing_error = classification_compare(lda_ytesting, y_testing)
    qda_training_error = classification_compare(qda_ytraining, y_training)
    qda_testing_error = classification_compare(qda_ytesting, y_testing)

    # position i in each list is the error obtained without feature i
    lda_errors.append(lda_training_error)
    qda_errors.append(qda_training_error)
    print(lda_errors)
    print(qda_errors)

# find max and min errors for lda and qda
lda_max_error = np.max(lda_errors)
lda_max_error_index = np.argmax(lda_errors)
lda_min_error = np.min(lda_errors)
lda_min_error_index = np.argmin(lda_errors)
qda_max_error = np.max(qda_errors)
qda_max_error_index = np.argmax(qda_errors)
qda_min_error = np.min(qda_errors)
qda_min_error_index = np.argmin(qda_errors)

print("\n\nLDA: ")
print("Most important feature was " + FEATURES[lda_max_error_index] + " with a percent error of " + str(lda_max_error) + "%")
print("Least important feature was " + FEATURES[lda_min_error_index] + " with a percent error of " + str(lda_min_error) + "%")
print("\nQDA: ")
print("Most important feature was " + FEATURES[qda_max_error_index] + " with a percent error of " + str(qda_max_error) + "%")
print("Least important feature was " + FEATURES[qda_min_error_index] + " with a percent error of " + str(qda_min_error) + "%")




True
120
30
LDA training and testing errors: 

Incorrect prediction at index 60
Predicted: Iris-virginica
Actual:  Iris-versicolor

Incorrect prediction at index 73
Predicted: Iris-virginica
Actual:  Iris-versicolor

Incorrect prediction at index 113
Predicted: Iris-versicolor
Actual:  Iris-virginica

Training error: 2.5%, Testing error: 0.0%

QDA training and testing errors: 

Incorrect prediction at index 60
Predicted: Iris-virginica
Actual:  Iris-versicolor

Incorrect prediction at index 73
Predicted: Iris-virginica
Actual:  Iris-versicolor

Training error: 1.6666666666666714%, Testing error: 0.0%



Incorrect prediction at index 73
Predicted: Iris-virginica
Actual:  Iris-versicolor

Incorrect prediction at index 113
Predicted: Iris-versicolor
Actual:  Iris-virginica

Incorrect prediction at index 58
Predicted: Iris-virginica
Actual:  Iris-versicolor

Incorrect prediction at index 73
Predicted: Iris-virginica
Actual:  Iris-versicolor

Incorrect prediction at index 113
Predicted: Iri

In [1]:
# NOTE: lda_ytraining is assigned in the classifier cell of this notebook; that cell must
#   have been run in the current kernel session, otherwise this line raises NameError
print(lda_ytraining)

NameError: name 'lda_ytraining' is not defined