In [1]:
import pandas as pd
import numpy as np

In [2]:
# Location of the raw Iris CSV on the UCI repository; the column labels are
# supplied separately when the file is read.
url = (
    "https://archive.ics.uci.edu/ml/machine-learning-databases/iris/iris.data"
)

In [3]:
# Read the CSV at `url` into a DataFrame. header=None spells out that the
# column labels come from `names`, not from the first row of the file.
df = pd.read_csv(
    url,
    header=None,
    names=['sepal_length', 'sepal_width', 'petal_length', 'petal_width', 'target'],
)

In [4]:
# Preview the first rows to check that the file parsed into the five named columns.
df.head()

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,target
0,5.1,3.5,1.4,0.2,Iris-setosa
1,4.9,3.0,1.4,0.2,Iris-setosa
2,4.7,3.2,1.3,0.2,Iris-setosa
3,4.6,3.1,1.5,0.2,Iris-setosa
4,5.0,3.6,1.4,0.2,Iris-setosa


In [5]:
# List the column labels as a plain Python list.
df.columns.tolist()

['sepal_length', 'sepal_width', 'petal_length', 'petal_width', 'target']

In [6]:
# Summarize row count, per-column non-null counts and dtypes.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 150 entries, 0 to 149
Data columns (total 5 columns):
sepal_length    150 non-null float64
sepal_width     150 non-null float64
petal_length    150 non-null float64
petal_width     150 non-null float64
target          150 non-null object
dtypes: float64(4), object(1)
memory usage: 5.9+ KB


In [7]:
# Class-label column as a Series; joined back onto the scaled features later on.
target = df['target']

In [8]:
# Labels of the four measurement columns (everything except the class label).
features = [
    'sepal_length',
    'sepal_width',
    'petal_length',
    'petal_width',
]

In [9]:
# Separating out the features. Returns an array of values for the columns.
# The column labels are written out here instead of being read from
# `features`: this cell rebinds `features` to the resulting array, so
# indexing with `features` itself stops working as soon as the cell is
# executed a second time.
features = df.loc[:, ['sepal_length', 'sepal_width', 'petal_length', 'petal_width']].values

In [10]:
# Separating out the target: selecting with a one-element list keeps the
# result two-dimensional (a single-column array).
targets = df[['target']].values

In [11]:
from sklearn.preprocessing import StandardScaler

In [12]:
# Scaler with default settings; it is fitted on the feature array in the next step.
scaler = StandardScaler()

In [13]:
# Learn the scaling parameters from `features`, then apply them to the same data.
x = scaler.fit(features).transform(features)

In [14]:
# Wrap the scaled array in a DataFrame, restoring the original column labels.
df1 = pd.DataFrame(
    data=x,
    columns=['sepal_length', 'sepal_width', 'petal_length', 'petal_width'],
)

In [15]:
# Scaled features with the class label re-attached; only the first five rows are kept.
data = df1.join(target).head(n=5)

In [16]:
# Pair the first rows of the scaled data with the first rows of the raw data
# so both can be displayed together as one tuple.
compared_data = (
    df1.join(target).head(),
    df.head(),
)
compared_data

(   sepal_length  sepal_width  petal_length  petal_width       target
 0     -0.900681     1.032057     -1.341272    -1.312977  Iris-setosa
 1     -1.143017    -0.124958     -1.341272    -1.312977  Iris-setosa
 2     -1.385353     0.337848     -1.398138    -1.312977  Iris-setosa
 3     -1.506521     0.106445     -1.284407    -1.312977  Iris-setosa
 4     -1.021849     1.263460     -1.341272    -1.312977  Iris-setosa,
    sepal_length  sepal_width  petal_length  petal_width       target
 0           5.1          3.5           1.4          0.2  Iris-setosa
 1           4.9          3.0           1.4          0.2  Iris-setosa
 2           4.7          3.2           1.3          0.2  Iris-setosa
 3           4.6          3.1           1.5          0.2  Iris-setosa
 4           5.0          3.6           1.4          0.2  Iris-setosa)

In [17]:
from sklearn.decomposition import PCA

In [18]:
# Keep two principal components so the projection can be drawn as a 2-D scatter plot.
pca = PCA(n_components=2)

In [19]:
# Fit the PCA on the standardized features and project them in a single call.
# fit_transform is used because its return value is the projected data that the
# DataFrame in the next cell is built from; fit alone would not provide that
# array (per scikit-learn's estimator API, fit returns the fitted estimator).
principalComponents=pca.fit_transform(x)

In [20]:
# Give the two projected columns readable labels.
principalDf = pd.DataFrame(
    principalComponents,
    columns=['component1', 'component2'],
)

In [21]:
# Show the projected components side by side with the original columns (first rows only).
principalDf.join(df).head()

Unnamed: 0,component1,component2,sepal_length,sepal_width,petal_length,petal_width,target
0,-2.264542,0.505704,5.1,3.5,1.4,0.2,Iris-setosa
1,-2.086426,-0.655405,4.9,3.0,1.4,0.2,Iris-setosa
2,-2.36795,-0.318477,4.7,3.2,1.3,0.2,Iris-setosa
3,-2.304197,-0.575368,4.6,3.1,1.5,0.2,Iris-setosa
4,-2.388777,0.674767,5.0,3.6,1.4,0.2,Iris-setosa


In [22]:
# Frame used for plotting: the two components plus the class label, placed side by side.
finalDf = pd.concat(
    [principalDf, df[['target']]],
    axis='columns',
)

In [23]:
# Preview the plotting frame: two component columns and the class label.
finalDf.head()

Unnamed: 0,component1,component2,target
0,-2.264542,0.505704,Iris-setosa
1,-2.086426,-0.655405,Iris-setosa
2,-2.36795,-0.318477,Iris-setosa
3,-2.304197,-0.575368,Iris-setosa
4,-2.388777,0.674767,Iris-setosa


#### Visualize 2D Projection

In [34]:
import matplotlib.pyplot as plt
%matplotlib inline

In [35]:
# Create the 8x8-inch figure; the axes are attached to it in a later cell.
plot = plt.figure(figsize=(8, 8))

<Figure size 576x576 with 0 Axes>

In [36]:
# Single axes filling the figure (1 row, 1 column, first slot).
ax = plot.add_subplot(111)

In [37]:
# Label both component axes, then set the title. The title call stays last
# so its return value remains the cell's displayed output.
ax.set_ylabel('component2', fontsize=15)
ax.set_xlabel('component1', fontsize=15)
ax.set_title('2 component PCA', fontsize=20)

Text(0.5, 1.0, '2 component PCA')

In [38]:
# Class labels in plotting order. This rebinds `targets`, which earlier held
# the target column as an array.
targets = [
    'Iris-setosa',
    'Iris-versicolor',
    'Iris-virginica',
]

In [39]:
# One matplotlib colour code per class label, in the same order as the labels.
colors = list('rgb')

In [40]:
# Draw one scatter series per class label on the shared axes.
# The loop variable is deliberately not named `target`: that name is bound to
# the label Series earlier in the notebook (used by `df1.join(target)`), and
# rebinding it to a string would break those cells when they are re-run.
for species, color in zip(targets, colors):
    indicesToKeep = finalDf['target'] == species  # boolean mask selecting this class's rows
    ax.scatter(finalDf.loc[indicesToKeep, 'component1']
               , finalDf.loc[indicesToKeep, 'component2']
               , c = color
               , s = 50
               , label = species)
# Legend entries come from the per-series labels, so they cannot drift out of
# step with the order in which the series were drawn.
ax.legend()
ax.grid()
# The figure was created in an earlier cell and the recorded outputs show it
# was only ever displayed empty; ending this cell with the Figure object makes
# the notebook render the finished plot.
ax.figure

In [41]:
# NOTE(review): the recorded output of this cell is an empty 432x288 figure,
# not the 576x576 figure created earlier, so this call does not appear to
# display the scatter plot — confirm whether the cell is still needed.
plt.draw()

<Figure size 432x288 with 0 Axes>

In [42]:
# Ask pyplot to show any open figures.
# NOTE(review): no output is recorded for this cell — confirm it is still needed.
plt.show()

In [33]:
# NOTE(review): this cell's execution count (33) is lower than that of the
# cells above it, so it was run out of order. It repeats a show() call through
# pylab and records no output — likely a leftover that can be removed.
import pylab as p
p.show()

In [49]:
# Share of the total variance captured by each principal component, in percent.
# Per the recorded output, the first component explains about 72.8% and the
# second about 23.0%.
# The result gets its own name: `x` already holds the standardized feature
# matrix used by the scaler, DataFrame and PCA cells, and overwriting it would
# make those cells fail or give wrong results when re-run.
explained_variance_pct = pca.explained_variance_ratio_ * 100
explained_variance_pct

array([72.77045209, 23.03052327])

#### PCA to Speed Up Machine Learning Algorithms
One of the most important applications of PCA is for speeding up machine learning algorithms. Using the IRIS dataset would be impractical here as the dataset only has 150 rows and only 4 feature columns. The MNIST database of handwritten digits is more suitable as it has 784 feature columns (784 dimensions), a training set of 60,000 examples, and a test set of 10,000 examples.

In [82]:
from sklearn.datasets import fetch_openml

In [83]:
# Download the MNIST digits ('mnist_784') from OpenML.
# The dataset version is pinned so that a re-run fetches exactly the same data;
# without it the "active" version is resolved at download time.
# NOTE(review): the recorded outputs below show `mnist.data` as a NumPy array;
# newer scikit-learn releases may return a DataFrame by default — confirm
# against the installed version.
mnist = fetch_openml('mnist_784', version=1)

In [97]:
# Display the feature matrix; the recorded output is an abbreviated array of floats.
# NOTE(review): `mnist.data.shape` would confirm the dimensions more directly.
mnist.data

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [98]:
# Label of the first sample; the recorded output ('5') shows labels are stored as strings.
mnist.target[0]

'5'

In [99]:
# Name of the first feature column; the recorded output is 'pixel1'.
mnist.feature_names[0]

'pixel1'