In [16]:
# Importing the libraries 
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
%matplotlib notebook
import mglearn

In [2]:
# Load the Iris dataset that ships with scikit-learn
from sklearn.datasets import load_iris

# load_iris() returns a dictionary-like container (reported as sklearn.utils.Bunch
# in the recorded output of the next cell); its keys are inspected in the cells below.
iris = load_iris()

In [3]:
# Inspect the loaded container: its Python type and the keys it exposes.
# A single print with a newline separator emits the same two lines as two prints.
print(
    "Type of the data is {}".format(type(iris)),
    "Keys of the iris dataset {}".format(iris.keys()),
    sep="\n",
)

Type of the data is <class 'sklearn.utils.Bunch'>
Keys of the iris dataset dict_keys(['data', 'target', 'target_names', 'DESCR', 'feature_names', 'filename'])


In [4]:
# Human-readable description bundled with the dataset (followed by one extra blank line)
print(iris.DESCR + "\n")

.. _iris_dataset:

Iris plants dataset
--------------------

**Data Set Characteristics:**

    :Number of Instances: 150 (50 in each of three classes)
    :Number of Attributes: 4 numeric, predictive attributes and the class
    :Attribute Information:
        - sepal length in cm
        - sepal width in cm
        - petal length in cm
        - petal width in cm
        - class:
                - Iris-Setosa
                - Iris-Versicolour
                - Iris-Virginica
                
    :Summary Statistics:

                    Min  Max   Mean    SD   Class Correlation
    sepal length:   4.3  7.9   5.84   0.83    0.7826
    sepal width:    2.0  4.4   3.05   0.43   -0.4194
    petal length:   1.0  6.9   3.76   1.76    0.9490  (high!)
    petal width:    0.1  2.5   1.20   0.76    0.9565  (high!)

    :Missing Attribute Values: None
    :Class Distribution: 33.3% for each of 3 classes.
    :Creator: R.A. Fisher
    :Donor: Michael Marshall (MARSHALL%PLU@io.arc.nasa.gov)
    :

In [5]:
# Names of the classes (species) the samples are labelled with
print(iris.target_names)

['setosa' 'versicolor' 'virginica']


In [6]:
# Names of the measured features, one per column of the data matrix
print(iris.feature_names)

['sepal length (cm)', 'sepal width (cm)', 'petal length (cm)', 'petal width (cm)']


In [7]:
# The samples live under the "data" key and the labels under the "target" key;
# report the Python type of each, in that order.
print(
    "Type of data {} \nType of target {}".format(
        *(type(iris[key]) for key in ("data", "target"))
    )
)

Type of data <class 'numpy.ndarray'> 
Type of target <class 'numpy.ndarray'>


In [8]:
# Dimensions of the feature matrix, followed by a preview of its first five rows
print(iris["data"].shape)
print("head of the data {}".format(iris["data"][0:5]))

(150, 4)
head of the data [[5.1 3.5 1.4 0.2]
 [4.9 3.  1.4 0.2]
 [4.7 3.2 1.3 0.2]
 [4.6 3.1 1.5 0.2]
 [5.  3.6 1.4 0.2]]


In [9]:
# 0 - setosa, 1 - versicolor, 2 - virginica
print(iris.target.shape)

(150,)


In [10]:
# Splitting the dataset into a training part and a held-out test part
from sklearn.model_selection import train_test_split

# test_size=0.25 reserves a quarter of the samples for testing;
# random_state pins the shuffle so the split is reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    iris.data,
    iris.target,
    test_size=0.25,
    random_state=42,
)

In [11]:
# Shapes of the split data: features and labels for the train and test parts.
# One print call with a newline separator produces the same four lines.
print(
    "X_train shape: {}".format(X_train.shape),
    "y_train shape: {}".format(y_train.shape),
    "X_test shape: {}".format(X_test.shape),
    "y_test shape: {}".format(y_test.shape),
    sep="\n",
)

X_train shape: (112, 4)
y_train shape: (112,)
X_test shape: (38, 4)
y_test shape: (38,)


In [20]:
# Wrap the training features in a DataFrame whose columns carry the feature names
iris_df = pd.DataFrame(data=X_train, columns=iris.feature_names)

# Pairwise scatter matrix of the training features; each point is coloured by its
# label from y_train, and the diagonal shows 20-bin histograms.
scatter = pd.plotting.scatter_matrix(
    iris_df,
    c=y_train,
    figsize=(15, 15),
    marker='o',
    hist_kwds=dict(bins=20),
    s=60,
    alpha=0.8,
    cmap=mglearn.cm3,
)

<IPython.core.display.Javascript object>

In [19]:
# Classifier: k-Nearest Neighbors with k = 1
from sklearn.neighbors import KNeighborsClassifier

# Build the classifier and fit it on the training split in one step;
# fit() returns the estimator itself, so `knn` is the fitted model.
knn = KNeighborsClassifier(n_neighbors=1).fit(X_train, y_train)
# Leave the estimator as the cell's last expression so its repr is displayed
knn

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
                     metric_params=None, n_jobs=None, n_neighbors=1, p=2,
                     weights='uniform')

In [21]:
# A single new sample with four feature values, shaped as one row
# (samples x features) so it can be passed to predict().
X_new = np.array([5, 2.9, 1, 0.2]).reshape(1, -1)
print('Shape {}'.format(X_new.shape))

Shape (1, 4)


In [23]:
# Classify the new sample, then translate the predicted label index into its class name
pred = knn.predict(X_new)
print(
    "Prediction {}".format(pred),
    "Name of the predicted flower {}".format(iris["target_names"][pred]),
    sep="\n",
)

Prediction [0]
Name of the predicted flower ['setosa']


In [30]:
# Evaluate the fitted model on the held-out test split
y_pred = knn.predict(X_test)
print("Test Predictions {}".format(y_pred))
# Accuracy computed two ways: the fraction of matching predictions by hand,
# then the classifier's own score() method. Both lines should agree.
print("Test set score: {:.2f}".format((y_pred == y_test).mean()))
print("Test set score: {:.2f}".format(knn.score(X_test, y_test)))

Test Predictions [1 0 2 1 1 0 1 2 1 1 2 0 0 0 0 1 2 1 1 2 0 2 0 2 2 2 2 2 0 0 0 0 1 0 0 2 1
 0]
Test set score: 1.00
Test set score: 1.00


In [31]:
# Complete KNN workflow in one self-contained snippet: load, split, fit, score.
# The imports and the dataset load are repeated here on purpose so that this cell
# runs on a fresh kernel without relying on state left behind by earlier cells.
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier

iris = load_iris()

# Hold out 25% of the samples for testing; the fixed seed keeps the split reproducible
X_train, X_test, y_train, y_test = train_test_split(iris.data, iris.target, test_size=0.25, random_state=42)

# 1-nearest-neighbour classifier
knn = KNeighborsClassifier(n_neighbors=1)

# Fit on the training split only
knn.fit(X_train, y_train)

# Accuracy on the held-out test split
print("Test set score: {:.2f}".format(knn.score(X_test, y_test)))

Test set score: 1.00
