In [22]:
## Frank Tranghese
## 10 AUG 2018
## Iris Classification Dataset

# Based on example found at https://machinelearningmastery.com/machine-learning-in-python-step-by-step/
# The majority of my experience with machine learning is in MATLAB. The intent of this code is to learn more
# about using Python for machine learning. The goal isn't necessarily to find the best methodology but
# to practice using Python for machine learning.

import pandas
import matplotlib.pyplot as plt
from sklearn import model_selection
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn.neighbors import KNeighborsClassifier

In [3]:
## Load iris data

# The UCI file ships without a header row, so the column labels are supplied here.
url = "https://archive.ics.uci.edu/ml/machine-learning-databases/iris/iris.data"
names = [
    'sepal-length',
    'sepal-width',
    'petal-length',
    'petal-width',
    'class',
]

# header=None is what pandas already assumes when `names` is given; spelled out for clarity.
dataset = pandas.read_csv(url, header=None, names=names)

In [5]:
## Inspect Data

# Two summaries, printed one after the other:
#   1. number of examples per class
#   2. descriptive statistics of the numeric columns
print(
    dataset.groupby('class').size(),
    dataset.describe(),
    sep='\n',
)

class
Iris-setosa        50
Iris-versicolor    50
Iris-virginica     50
dtype: int64
       sepal-length  sepal-width  petal-length  petal-width
count    150.000000   150.000000    150.000000   150.000000
mean       5.843333     3.054000      3.758667     1.198667
std        0.828066     0.433594      1.764420     0.763161
min        4.300000     2.000000      1.000000     0.100000
25%        5.100000     2.800000      1.600000     0.300000
50%        5.800000     3.000000      4.350000     1.300000
75%        6.400000     3.300000      5.100000     1.800000
max        7.900000     4.400000      6.900000     2.500000


In [6]:
## Create Train and Test sets

validSize = 0.1  # fraction held out for testing (10%); the remaining 90% is used for training

entireSet = dataset.values    # whole table as a single array
features = entireSet[:, :4]   # first four columns: the measurements
labels = entireSet[:, 4]      # fifth column: the class label

# A fixed random_state keeps the split identical from run to run.
X_train, X_test, Y_train, Y_test = model_selection.train_test_split(
    features,
    labels,
    test_size=validSize,
    random_state=7,
)

In [10]:
## Test KNN models over different distance functions

# Candidate classifiers: identical apart from the distance metric.
models = [
    ('euclidKNN', KNeighborsClassifier(metric='euclidean')),
    ('chebyKNN', KNeighborsClassifier(metric='chebyshev')),
    ('manhatKNN', KNeighborsClassifier(metric='manhattan')),
]

# 10-fold cross-validation on the training set, folds taken in order (no shuffling).
# random_state is intentionally NOT passed: it only has an effect when shuffle=True,
# and newer scikit-learn releases raise a ValueError if it is set while shuffle is False.
# The splitter does not depend on the model, so it is built once outside the loop.
kfold = model_selection.KFold(n_splits=10)

results = []  # per-model arrays of fold accuracies
names = []    # model labels, in the same order as `results`
for name, model in models:
    cv_results = model_selection.cross_val_score(model, X_train, Y_train, cv=kfold, scoring='accuracy')
    results.append(cv_results)
    names.append(name)
    # Report as "<label>: <mean accuracy> (<std of fold accuracies>)".
    msg = "%s: %f (%f)" % (name, cv_results.mean(), cv_results.std())
    print(msg)

euclidKNN: 0.977473 (0.034441)
chebyKNN: 0.976923 (0.049255)
manhatKNN: 0.962637 (0.037411)


In [19]:
## Test KNN using different nearest neighbor sizes
# The Euclidean distance gave the highest mean accuracy in the previous comparison, so keep
# that metric and sweep the neighborhood size over the odd values 3, 5, ..., 19.

models = [
    ('KNN%d' % k, KNeighborsClassifier(metric='euclidean', n_neighbors=k))
    for k in range(3, 20, 2)
]

# 10-fold cross-validation on the training set, folds taken in order (no shuffling).
# random_state is intentionally NOT passed: it only has an effect when shuffle=True,
# and newer scikit-learn releases raise a ValueError if it is set while shuffle is False.
# The splitter does not depend on the model, so it is built once outside the loop.
kfold = model_selection.KFold(n_splits=10)

results = []  # per-model arrays of fold accuracies
names = []    # model labels, in the same order as `results`
for name, model in models:
    cv_results = model_selection.cross_val_score(model, X_train, Y_train, cv=kfold, scoring='accuracy')
    results.append(cv_results)
    names.append(name)
    # Report as "<label>: <mean accuracy> (<std of fold accuracies>)".
    msg = "%s: %f (%f)" % (name, cv_results.mean(), cv_results.std())
    print(msg)

KNN3: 0.962637 (0.049193)
KNN5: 0.977473 (0.034441)
KNN7: 0.970330 (0.048414)
KNN9: 0.977473 (0.034441)
KNN11: 0.970330 (0.036380)
KNN13: 0.978022 (0.033602)
KNN15: 0.978022 (0.033602)
KNN17: 0.985165 (0.029696)
KNN19: 0.985165 (0.029696)


In [20]:
# From the sweep above, 17 neighbors (tied with 19) yields the highest cross-validated
# accuracy, so the Euclidean distance metric with 17 neighbors is used on the held-out test set.

# Fit on the training split and predict the test split.
# fit() returns the estimator itself, so the two calls can be chained.
KNNClassifier = KNeighborsClassifier(n_neighbors=17, metric='euclidean')
predictions = KNNClassifier.fit(X_train, Y_train).predict(X_test)

# Overall accuracy, then the confusion matrix, then the per-class report, one after another.
print(
    accuracy_score(Y_test, predictions),
    confusion_matrix(Y_test, predictions),
    classification_report(Y_test, predictions),
    sep='\n',
)

0.8666666666666667
[[5 0 0]
 [0 6 1]
 [0 1 2]]
                 precision    recall  f1-score   support

    Iris-setosa       1.00      1.00      1.00         5
Iris-versicolor       0.86      0.86      0.86         7
 Iris-virginica       0.67      0.67      0.67         3

    avg / total       0.87      0.87      0.87        15



In [30]:
# We can see above some confusion between class 2 (Iris-versicolor) and class 3 (Iris-virginica), but overall we achieved 86.667% accuracy.
# Looking further into what best separates classes 2 and 3 would be a good next step. Adding an additional feature
# might improve our results.