# Iris Flower Classification

Classify different species of the Iris flower

**Iris** is a genus of flowering plants that contains several species, such as setosa, versicolor, and virginica.


### Data Exploration

In [1]:
from sklearn import datasets
from sklearn.model_selection import train_test_split
from sklearn import svm
from sklearn import tree
from sklearn import neighbors
from sklearn.metrics import accuracy_score

import pandas as pd
import numpy as np


In [2]:
# fetch the Iris dataset object that scikit-learn provides
iris_dataset = datasets.load_iris()

# one row per sample, one named column per feature
df = pd.DataFrame(data=iris_dataset.data, columns=iris_dataset.feature_names)

# preview the first 5 rows (head() defaults to 5)
df.head()

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm)
0,5.1,3.5,1.4,0.2
1,4.9,3.0,1.4,0.2
2,4.7,3.2,1.3,0.2
3,4.6,3.1,1.5,0.2
4,5.0,3.6,1.4,0.2


In [3]:
# (number of samples, number of features)
df.shape

(150, 4)

In [4]:
# attach the integer class label of every sample as a 'target' column
df['target'] = pd.Series(iris_dataset.target)

# attach the matching species name, built from the label codes and their names
df['species'] = pd.Categorical.from_codes(
    codes=iris_dataset.target,
    categories=iris_dataset.target_names,
)

# preview the first 5 rows (head() defaults to 5)
df.head()


Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm),target,species
0,5.1,3.5,1.4,0.2,0,setosa
1,4.9,3.0,1.4,0.2,0,setosa
2,4.7,3.2,1.3,0.2,0,setosa
3,4.6,3.1,1.5,0.2,0,setosa
4,5.0,3.6,1.4,0.2,0,setosa


Clean up the code by wrapping the data loading in a reusable function

In [5]:
def download_data():
    """Load the Iris dataset and return its features and labels.

    Returns:
        A ``(X, y)`` tuple where ``X`` holds the iris features
        (sepal length, sepal width, petal length, petal width) and ``y``
        holds the labels (0 for Setosa, 1 for Versicolor, 2 for Virginica).
    """
    bunch = datasets.load_iris()

    # features first, labels second -- callers unpack as ``X, y``
    features, labels = bunch.data, bunch.target
    return features, labels


In [6]:
X, y = download_data()


### Split up the dataset

In [7]:
# the result is not assigned here, only displayed: a list [X_train, X_test, y_train, y_test]
train_test_split(X, y, test_size = 0.25)


[array([[5.8, 2.7, 3.9, 1.2],
        [5.7, 2.8, 4.1, 1.3],
        [6.1, 3. , 4.6, 1.4],
        [6.2, 2.9, 4.3, 1.3],
        [4.9, 2.5, 4.5, 1.7],
        [5.6, 3. , 4.5, 1.5],
        [6.7, 3.1, 4.7, 1.5],
        [4.8, 3.1, 1.6, 0.2],
        [6.5, 2.8, 4.6, 1.5],
        [5.7, 3.8, 1.7, 0.3],
        [6.7, 3.3, 5.7, 2.1],
        [5.6, 2.5, 3.9, 1.1],
        [5.6, 3. , 4.1, 1.3],
        [5.8, 2.7, 5.1, 1.9],
        [6.7, 3. , 5.2, 2.3],
        [5.3, 3.7, 1.5, 0.2],
        [4.4, 2.9, 1.4, 0.2],
        [7.3, 2.9, 6.3, 1.8],
        [5.7, 2.8, 4.5, 1.3],
        [7.4, 2.8, 6.1, 1.9],
        [4.9, 3. , 1.4, 0.2],
        [7.2, 3. , 5.8, 1.6],
        [5.5, 2.6, 4.4, 1.2],
        [6.8, 2.8, 4.8, 1.4],
        [6.4, 3.1, 5.5, 1.8],
        [6. , 2.2, 4. , 1. ],
        [4.6, 3.4, 1.4, 0.3],
        [5. , 3.5, 1.3, 0.3],
        [5.1, 3.5, 1.4, 0.3],
        [5.4, 3.9, 1.3, 0.4],
        [6.4, 2.9, 4.3, 1.3],
        [5. , 3.4, 1.5, 0.2],
        [5.1, 3.5, 1.4, 0.2],
        [7

- **X_train** contains the training features (Sepal length, Sepal width, Petal length, Petal width)
- **X_test** contains the testing features (Sepal length, Sepal width, Petal length, Petal width)

- **y_train** contains training labels (0 for Setosa, 1 for Versicolor, or 2 for Virginica)
- **y_test** contains the testing labels (0 for Setosa, 1 for Versicolor, or 2 for Virginica)

In [8]:
def split_data(X, y, test_size=0.25, random_state=None):
    """Split features and labels into random train and test subsets.

    Args:
        X: Feature matrix, one row per sample.
        y: Labels aligned with the rows of ``X``.
        test_size: Forwarded unchanged to ``train_test_split``; the default
            of 0.25 holds out a quarter of the samples for testing.
        random_state: Optional seed forwarded to ``train_test_split``.
            The default ``None`` keeps the previous behaviour (a different
            shuffle on every call); pass an int to get a reproducible split.

    Returns:
        The list ``[X_train, X_test, y_train, y_test]`` produced by
        ``train_test_split``.
    """
    return train_test_split(X, y, test_size=test_size, random_state=random_state)


In [9]:
# split with the default test_size of split_data (0.25)
X_train, X_test, y_train, y_test = split_data(X, y)

# shape[0] is the number of rows, i.e. the number of samples in each subset
print(
    'There are {} samples in the training set and {} samples in the test set'.format(
        X_train.shape[0],
        X_test.shape[0],
    )
)


There are 112 samples in the training set and 38 samples in the test set


### Build a Model using Classification Algorithm

1. Decision Tree
1. K Nearest Neighbors
1. Support Vector Machine

In [10]:
classifier = tree.DecisionTreeClassifier()
classifier.fit(X_train, y_train)

DecisionTreeClassifier()

In [11]:
classifier = neighbors.KNeighborsClassifier()
classifier.fit(X_train, y_train)

KNeighborsClassifier()

In [12]:
classifier = svm.SVC()
classifier.fit(X_train, y_train)

SVC()

In [13]:
def build_model(model=""):
    """Train one classifier on the Iris data, print its accuracy, and return it.

    Every call loads the data again and draws its own train/test split via
    ``split_data``. That split is unseeded, so accuracies printed by
    separate calls are not measured on the same test samples.

    Args:
        model: ``"knn"`` for K Nearest Neighbors, ``"svm"`` for a Support
            Vector Machine. Any other value (including the default ``""``)
            selects a decision tree and is reported as ``"tree"``.

    Returns:
        The fitted classifier.
    """
    features, labels = download_data()
    train_X, test_X, train_y, test_y = split_data(features, labels)

    # Anything that is not an explicitly supported name falls back to the tree.
    if model not in ("knn", "svm"):
        model = "tree"
        estimator = tree.DecisionTreeClassifier()
    elif model == "knn":
        estimator = neighbors.KNeighborsClassifier()
    else:
        estimator = svm.SVC()

    estimator.fit(train_X, train_y)

    # Score on the held-out samples only
    predicted = estimator.predict(test_X)
    score = accuracy_score(test_y, predicted)
    print("Accuracy for {}: {}".format(model, score))

    return estimator

In [14]:
model_tree = build_model("tree")

Accuracy for tree: 0.8947368421052632


In [15]:
model_knn = build_model("knn")

Accuracy for knn: 0.9736842105263158


In [16]:
model_svm = build_model("svm")

Accuracy for svm: 0.9736842105263158


### Test with our own data

An iris with the following features:
- Sepal length of 1 cm
- Sepal width of 2 cm
- Petal length of 3 cm
- Petal width of 4 cm


In [17]:
# one hand-made sample: a single row with the four features in training order
X_new = np.array([[1, 2, 3, 4]])

# ask each trained model for its predicted class label
prediction_tree = model_tree.predict(X_new)
prediction_knn = model_knn.predict(X_new)
prediction_svm = model_svm.predict(X_new)

# map the numeric labels back to species names and report them
print("Predicted Iris species using Decision Tree: {}".format(iris_dataset.target_names[prediction_tree]))
print("Predicted Iris species using K Nearest Neighbors: {}".format(iris_dataset.target_names[prediction_knn]))
print("Predicted Iris species using Support Vector Machine: {}".format(iris_dataset.target_names[prediction_svm]))


Predicted Iris species using Decision Tree: ['virginica']
Predicted Iris species using K Nearest Neighbors: ['versicolor']
Predicted Iris species using Support Vector Machine: ['virginica']
