# PCA-1 - Intro

Dimension reduction of data

## Step 1: Load the Data

In [None]:
import os
import urllib.request

data_url = 'https://elephantscale-public.s3.amazonaws.com/data/cars/mtcars_header.csv'

# Two places the CSV may live: the shared data directory, or a file of the
# same name in the current working directory.
shared_copy = "../data/cars/mtcars_header.csv"
local_copy = os.path.basename(shared_copy)

if os.path.exists(shared_copy):
    data_location = shared_copy
else:
    # Fall back to the working-directory copy, fetching it only when absent.
    data_location = local_copy
    if not os.path.exists(local_copy):
        print("Downloading : ", data_url)
        urllib.request.urlretrieve(data_url, local_copy)

print('data_location:', data_location)

In [None]:
import pandas as pd
# Render floats with a thousands separator and two decimal places.
pd.options.display.float_format = '{:,.2f}'.format

# Read the CSV resolved by the loading step, report its dimensions,
# and preview ten randomly chosen rows.
dataset = pd.read_csv(data_location)
print('dataset.shape : ', dataset.shape)
dataset.sample(n=10)

## Step 2 - Explore Data

As we can see, this data has 12 columns. We want to plot it in 2D (X and Y axes), so we need to reduce the number of dimensions to 2.

In [None]:
# Drop the 'model' column so it is not passed on to scaling and PCA.
dataset2 = dataset.drop(columns=['model'])
dataset2

## Step 3 - Scale Data

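We need to scale the data before PCA: PCA looks for directions of maximum variance, so without scaling the features with the largest numeric ranges would dominate the components.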

There are 2 options
1. Using Standard scaler
2. Using built-in pandas functions

In [None]:
## Option 1

from sklearn.preprocessing import StandardScaler

# Learn the per-column statistics first, then apply them to the same frame.
scaler = StandardScaler()
scaler.fit(dataset2)
scaled = scaler.transform(dataset2)

# The scaler returns a bare array, so re-attach the original column names.
scaled_data = pd.DataFrame(scaled, columns=dataset2.columns)
scaled_data.head(10)

In [None]:
## Option 2

# Standardize each column with plain pandas arithmetic: subtract the column
# mean and divide by the column standard deviation.
# ddof=0 selects the population standard deviation, which is what sklearn's
# StandardScaler uses. pandas defaults to ddof=1 (sample standard deviation),
# which would make this option disagree with the StandardScaler option by a
# constant factor.
scaled_data = (dataset2 - dataset2.mean()) / dataset2.std(ddof=0)
scaled_data.head(10)

## Step 4 - PCA

In [None]:
from sklearn.decomposition import PCA

# Keep only the first two principal components.
pca = PCA(n_components=2)

# Fit on the scaled features and project them in a single step.
pca_data = pca.fit_transform(scaled_data)

# Wrap the projected array in a DataFrame with one named column per component.
pca_df = pd.DataFrame(data=pca_data, columns=['pc1', 'pc2'])
pca_df.head(10)

## Step 5 - Plot

Now let's plot the reduced data

In [None]:
import matplotlib.pyplot as plt

# Scatter the observations in the plane of the first two principal components.
# The explicit Figure/Axes interface is used so the title and axis labels are
# attached to this specific plot and the figure can be read on its own.
fig, ax = plt.subplots()
ax.scatter(pca_df['pc1'], pca_df['pc2'], marker='o')
ax.set(
    title='PCA projection (first two principal components)',
    xlabel='pc1',
    ylabel='pc2',
)
plt.show()

## Step 6 - Compare Correlation Matrices

Now let's compare correlation matrix of original data and PCA data.

TODO: Can you explain the difference?

In [None]:
# Pairwise Pearson correlations between the scaled original features.
scaled_data.corr(method='pearson')

In [None]:
# Pairwise Pearson correlations between the principal-component columns.
pca_df.corr(method='pearson')

## Step 7 - Understanding PCA

[SKLearn PCA reference](https://scikit-learn.org/stable/modules/generated/sklearn.decomposition.PCA.html?highlight=pca#sklearn.decomposition.PCA)

* `pca.components_`  : Principal axes in feature space, representing the directions of maximum variance in the data. The components are sorted by explained_variance_
* `pca.explained_variance_ratio_` : Percentage of variance explained by each of the selected components.


In [None]:
# Print the width of the input next to the length of one principal axis,
# followed by the axes themselves and the explained-variance ratios.
for label, value in [
    ('\n number of columns in original data : ', scaled_data.shape[1]),
    ('\n each pca component length : ', len(pca.components_[0])),
    ('\n pca.components_ :\n', pca.components_),
    ('\n pca.explained_variance_ratio_ : ', pca.explained_variance_ratio_),
]:
    print(label, value)

## Step 8 - Find 3 Principal Components 

In [None]:
## TODO - your code goes here
## Adjust your code from step-4