In [4]:
import numpy as np
import pandas as pd
# To import the dataset
from sklearn.datasets import load_iris
# To be used for splitting the dataset into training and test sets
from sklearn.model_selection import train_test_split
# To be used for min-max normalization
from sklearn.preprocessing import MinMaxScaler
# To be used for Z-normalization (standardization)
from sklearn.preprocessing import StandardScaler

# Load the iris dataset from Scikit-learn package
iris = load_iris()

# This prints a summary of the characteristics, statistics of the dataset
print(iris.DESCR)
 
# Divide the data into features (X) and target (Y)
# Data is converted to a panda’s dataframe
X = pd.DataFrame(iris.data)
 
# Separate the target attribute from rest of the data columns
Y = iris.target
 
# Take a look at the dataframe
X.head()
 
# This prints the shape of the dataframe (150 rows and 4 columns)
X.shape


.. _iris_dataset:

Iris plants dataset
--------------------

**Data Set Characteristics:**

:Number of Instances: 150 (50 in each of three classes)
:Number of Attributes: 4 numeric, predictive attributes and the class
:Attribute Information:
    - sepal length in cm
    - sepal width in cm
    - petal length in cm
    - petal width in cm
    - class:
            - Iris-Setosa
            - Iris-Versicolour
            - Iris-Virginica

:Summary Statistics:

                Min  Max   Mean    SD   Class Correlation
sepal length:   4.3  7.9   5.84   0.83    0.7826
sepal width:    2.0  4.4   3.05   0.43   -0.4194
petal length:   1.0  6.9   3.76   1.76    0.9490  (high!)
petal width:    0.1  2.5   1.20   0.76    0.9565  (high!)

:Missing Attribute Values: None
:Class Distribution: 33.3% for each of 3 classes.
:Creator: R.A. Fisher
:Donor: Michael Marshall (MARSHALL%PLU@io.arc.nasa.gov)
:Date: July, 1988

The famous Iris database, first used by Sir R.A. Fisher. The dataset is taken
from Fis

(150, 4)

In [5]:
# To divide the dataset into training and test sets

X_train, X_test, y_train, y_test = train_test_split(X, Y ,test_size=0.2)

In [9]:
# Good practice to keep original dataframes untouched for reusability
X_train_n = X_train.copy()
X_test_n = X_test.copy()
 
# Fit min-max scaler on training data
norm = MinMaxScaler().fit(X_train_n)
 
new = MinMaxScaler().fit_transform(X_train_n) 
# Transform the training data
X_train_norm = norm.transform(X_train_n)
 
# Use the same scaler to transform the testing set
X_test_norm = norm.transform(X_test_n)
new

array([[0.33333333, 0.125     , 0.50847458, 0.5       ],
       [0.86111111, 0.33333333, 0.86440678, 0.75      ],
       [0.16666667, 0.45833333, 0.08474576, 0.        ],
       [0.19444444, 0.54166667, 0.06779661, 0.04166667],
       [0.66666667, 0.41666667, 0.6779661 , 0.66666667],
       [0.22222222, 0.625     , 0.06779661, 0.08333333],
       [1.        , 0.75      , 0.91525424, 0.79166667],
       [0.55555556, 0.54166667, 0.62711864, 0.625     ],
       [0.38888889, 0.33333333, 0.59322034, 0.5       ],
       [0.41666667, 0.29166667, 0.49152542, 0.45833333],
       [0.69444444, 0.41666667, 0.76271186, 0.83333333],
       [0.36111111, 0.33333333, 0.66101695, 0.79166667],
       [0.36111111, 0.41666667, 0.59322034, 0.58333333],
       [0.19444444, 0.        , 0.42372881, 0.375     ],
       [0.58333333, 0.5       , 0.59322034, 0.58333333],
       [0.38888889, 0.25      , 0.42372881, 0.375     ],
       [0.41666667, 0.25      , 0.50847458, 0.45833333],
       [0.47222222, 0.375     ,

In [8]:
X_train_norm_df = pd.DataFrame(X_train_norm)
 
     	# Assigning original feature names for ease of read
X_train_norm_df.columns = iris.feature_names
 
X_train_norm_df.describe()


Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm)
count,120.0,120.0,120.0,120.0
mean,0.4375,0.448611,0.474435,0.463889
std,0.235269,0.180915,0.303064,0.322339
min,0.0,0.0,0.0,0.0
25%,0.222222,0.333333,0.101695,0.083333
50%,0.416667,0.416667,0.576271,0.5
75%,0.611111,0.583333,0.699153,0.71875
max,1.0,1.0,1.0,1.0


In [12]:
pd.DataFrame(StandardScaler().fit_transform(X_train_n),columns=iris.feature_names).describe()

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm)
count,120.0,120.0,120.0,120.0
mean,3.3306690000000003e-17,2.9605950000000004e-17,1.813364e-16,-3.552714e-16
std,1.004193,1.004193,1.004193,1.004193
min,-1.867374,-2.490073,-1.572024,-1.44517
25%,-0.9188665,-0.639864,-1.235062,-1.185559
50%,-0.08892256,-0.1773117,0.3374306,0.1124983
75%,0.7410213,0.7477928,0.7445938,0.7939781
max,2.400909,3.060554,1.741441,1.670166
