In [46]:
#importing the iris dataset and helpful packages 
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
import numpy as np
import pandas as pd

# Exploring the data

In [47]:
# Load the iris classification dataset and show its basic metadata.
iris_dataset = load_iris()
print(
    "Target names: {}".format(iris_dataset.target_names),      # the three possible output classes
    "Feature names: {}".format(iris_dataset.feature_names),
    "Type of data: {}".format(type(iris_dataset.data)),
    sep="\n",
)

Target names: ['setosa' 'versicolor' 'virginica']
Feature names: ['sepal length (cm)', 'sepal width (cm)', 'petal length (cm)', 'petal width (cm)']
Type of data: <class 'numpy.ndarray'>


In [48]:
# The goal is to predict the iris species; first check how many samples
# (rows) and features (columns) the dataset contains.
print("Shape of data: {}".format(iris_dataset.data.shape))

Shape of data: (150, 4)


In [49]:
# The target array contains the species of each of the flowers.
print(
    "Type of target: {}".format(type(iris_dataset.target)),
    "Shape of target: {}".format(iris_dataset.target.shape),
    "Target:\n{}".format(iris_dataset.target),
    sep="\n",
)

Type of target: <class 'numpy.ndarray'>
Shape of target: (150,)
Target:
[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 2 2 2 2 2 2 2 2 2 2 2
 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
 2 2]


The three classes are encoded as the integers 0, 1 and 2:
setosa (0), versicolor (1), virginica (2).

# Models 

In [50]:
#model select
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC, LinearSVC
from sklearn.naive_bayes import GaussianNB

#metrics
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score,precision_score

In [51]:
# Hold out 30% of the samples as the test set; random_state=0 pins the split.
X_train, X_test, y_train, y_test = train_test_split(
    iris_dataset.data,
    iris_dataset.target,
    test_size=0.3,
    random_state=0,
)

print(
    'There are {} samples in the training set and {} samples in the test set'.format(
        len(X_train), len(X_test)
    )
)


There are 105 samples in the training set and 45 samples in the test set


`train_test_split` puts 70% of the rows in the data, together with their corresponding labels, into the training set.

The remaining 30% of the rows and their labels are used as the test set.

In [52]:
# Sanity-check the training split: features and labels must have the same row count.
print(
    "X_train shape: {}".format(X_train.shape),
    "y_train shape: {}".format(y_train.shape),
    sep="\n",
)

X_train shape: (105, 4)
y_train shape: (105,)


In [53]:
# Sanity-check the test split: features and labels must have the same row count.
print(
    "X_test shape: {}".format(X_test.shape),
    "y_test shape: {}".format(y_test.shape),
    sep="\n",
)

X_test shape: (45, 4)
y_test shape: (45,)


# K Nearest Neighbor:
The KNN algorithm measures the similarity between a new data point and the available (training) cases and
puts the new point into the category that is most common among its most similar neighbours.

In [54]:
# Fit a 3-nearest-neighbours classifier on the training split.
# Using different values of n_neighbors affects the accuracy of KNN models.
knn = KNeighborsClassifier(n_neighbors=3)
knn.fit(X_train, y_train)
Y_pred = knn.predict(X_test)

# Test-set metrics, each computed once and reused below.
cm = confusion_matrix(y_test, Y_pred)
accuracy = accuracy_score(y_test, Y_pred)
# NOTE: for a single-label multiclass problem, micro-averaged precision is
# numerically equal to accuracy, so this line repeats the accuracy value.
precision = precision_score(y_test, Y_pred, average='micro')

# Percentages used by the model-comparison table at the end of the notebook.
accuracy_knn = round(accuracy * 100, 2)                  # accuracy on the test split
acc_knn = round(knn.score(X_train, y_train) * 100, 2)    # accuracy on the training split

print('Confusion matrix for KNN\n', cm)
print('accuracy_KNN : %.3f' % accuracy)
print('precision_KNN : %.3f' % precision)

Confusion matrix for KNN
 [[16  0  0]
 [ 0 17  1]
 [ 0  0 11]]
accuracy_KNN : 0.978
precision_KNN : 0.978


In [55]:
# Wrap the confusion matrix from the KNN cell in a labelled DataFrame.
# Rows are the true classes and columns the predicted ones (scikit-learn convention).
classes = ['setosa (0)', 'versicolor (1)', 'virginica (2)']
cm_df = pd.DataFrame(data=cm, index=classes, columns=classes)
cm_df

Unnamed: 0,setosa (0),versicolor (1),virginica (2)
setosa (0),16,0,0
versicolor (1),0,17,1
virginica (2),0,0,11


# Gaussian Naive Bayes:
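Naive Bayes is a classification algorithm for binary (two-class) and multi-class classification problems. The technique is easiest to understand when described using binary or categorical input values; the Gaussian variant used here handles continuous features by assuming each feature is normally distributed within a class.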

In [56]:
# Fit a Gaussian Naive Bayes classifier on the training split.
gaussian = GaussianNB()
gaussian.fit(X_train, y_train)
Y_pred = gaussian.predict(X_test)

# Test-set metrics, each computed once and reused below.
cm = confusion_matrix(y_test, Y_pred)
accuracy = accuracy_score(y_test, Y_pred)
# NOTE: for a single-label multiclass problem, micro-averaged precision is
# numerically equal to accuracy, so this line repeats the accuracy value.
precision = precision_score(y_test, Y_pred, average='micro')

# Percentages used by the model-comparison table at the end of the notebook.
accuracy_nb = round(accuracy * 100, 2)                             # accuracy on the test split
acc_gaussian = round(gaussian.score(X_train, y_train) * 100, 2)    # accuracy on the training split

print('Confusion matrix for Naive Bayes\n', cm)
print('accuracy_Naive Bayes: %.3f' % accuracy)
print('precision_Naive Bayes: %.3f' % precision)

Confusion matrix for Naive Bayes
 [[16  0  0]
 [ 0 18  0]
 [ 0  0 11]]
accuracy_Naive Bayes: 1.000
precision_Naive Bayes: 1.000


In [57]:
# Wrap the confusion matrix from the Naive Bayes cell in a labelled DataFrame.
# Rows are the true classes and columns the predicted ones (scikit-learn convention).
classes = ['setosa (0)', 'versicolor (1)', 'virginica (2)']
cm_df = pd.DataFrame(data=cm, index=classes, columns=classes)
cm_df

Unnamed: 0,setosa (0),versicolor (1),virginica (2)
setosa (0),16,0,0
versicolor (1),0,18,0
virginica (2),0,0,11


# Linear Support Vector Machine:
A "Support Vector Machine" (SVM) is a supervised machine learning algorithm that can be used for both classification and regression problems, although it is mostly used for classification. In the SVM algorithm, we plot each data item as a point in n-dimensional space (where n is the number of features), with the value of each feature being the value of a particular coordinate. Then, we perform classification by finding the hyperplane that best separates the two classes.

In [40]:
# Fit a linear support vector classifier on the training split.
# random_state is pinned (same seed as the train/test split) so that any
# randomness in the solver cannot change the result from one run to the next.
# max_iter=4000 was chosen by the original author — presumably to let the
# solver converge on the unscaled features; confirm before lowering it.
linear_svc = LinearSVC(max_iter=4000, random_state=0)
linear_svc.fit(X_train, y_train)
Y_pred = linear_svc.predict(X_test)

# Test-set metrics, each computed once and reused below.
cm = confusion_matrix(y_test, Y_pred)
accuracy = accuracy_score(y_test, Y_pred)
# NOTE: for a single-label multiclass problem, micro-averaged precision is
# numerically equal to accuracy, so this line repeats the accuracy value.
precision = precision_score(y_test, Y_pred, average='micro')

# Percentages used by the model-comparison table at the end of the notebook.
accuracy_svc = round(accuracy * 100, 2)                                # accuracy on the test split
acc_linear_svc = round(linear_svc.score(X_train, y_train) * 100, 2)    # accuracy on the training split

print('Confusion matrix for SVC\n', cm)
print('accuracy_SVC: %.3f' % accuracy)
print('precision_SVC: %.3f' % precision)

Confusion matrix for SVC
 [[16  0  0]
 [ 0 15  3]
 [ 0  0 11]]
accuracy_SVC: 0.933
precision_SVC: 0.933


In [41]:
# Wrap the confusion matrix from the linear SVC cell in a labelled DataFrame.
# Rows are the true classes and columns the predicted ones (scikit-learn convention).
classes = ['setosa (0)', 'versicolor (1)', 'virginica (2)']
cm_df = pd.DataFrame(data=cm, index=classes, columns=classes)
cm_df

Unnamed: 0,setosa (0),versicolor (1),virginica (2)
setosa (0),16,0,0
versicolor (1),0,15,3
virginica (2),0,0,11


# Comparing models 
Which is the best model?


In [63]:
# Side-by-side comparison of the three classifiers (all values in percent).
#   Score          -> accuracy on the training split (estimator.score on X_train)
#   Accuracy_score -> accuracy on the held-out test split
# Model labels carry no surrounding whitespace so they can be matched exactly.
results = pd.DataFrame({
    'Model': ['KNN', 'Naive Bayes', 'Support Vector Machine'],
    'Score': [acc_knn, acc_gaussian, acc_linear_svc],
    'Accuracy_score': [accuracy_knn, accuracy_nb, accuracy_svc],
})

# Rank by test accuracy (best first) and renumber the rows from 0.
result_df = (
    results
    .sort_values(by='Accuracy_score', ascending=False)
    .reset_index(drop=True)
)
result_df

Unnamed: 0,Model,Score,Accuracy_score
0,Naive Bayes,94.29,100.0
1,KNN,96.19,97.78
2,Support Vector Machine,98.1,93.33
