# Chapter 16: Demo PCA 

In [None]:
from sklearn.datasets import fetch_openml
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from sklearn import metrics
from sklearn.model_selection import train_test_split
import pandas as pd
import numpy as np

In [None]:
# Download the MNIST digits from OpenML (cache=True reuses a local copy on later runs).
# Each 28x28 image is stored flattened as one row of 784 values.
mnist = fetch_openml(name='mnist_784', version=1, cache=True)

In [None]:
# mnist

In [None]:
# Dimensions of the feature matrix: (number of images, 784 pixel values per image).
mnist.data.shape

In [None]:
# Dimensions of the label vector; it is split alongside mnist.data below,
# so it holds one label per image.
mnist.target.shape

In [None]:
# Hold out 1/7 of the samples as the test set (test_size is the held-out
# proportion); random_state is fixed so the split is reproducible.
train_img, test_img, train_lbl, test_lbl = train_test_split(
    mnist.data,
    mnist.target,
    random_state=0,
    test_size=1/7.0,
)

In [None]:
# Shape of the training features (the 6/7 of the data not held out for testing).
print(train_img.shape)

In [None]:
# Shape of the training labels; its length should match the number of rows in train_img.
print(train_lbl.shape)

In [None]:
# Shape of the held-out test features (1/7 of the data).
print(test_img.shape)

In [None]:
# Shape of the held-out test labels; its length should match the number of rows in test_img.
print(test_lbl.shape)

### Standardizing the Data
Since PCA yields a feature subspace that maximizes the variance along the axes, it makes sense to standardize the data, especially if the features were measured on different scales.

Standardization of a dataset is a common requirement for many machine learning estimators: they might behave badly if the individual features do not more or less look like standard normally distributed data (e.g. Gaussian with zero mean and unit variance).

Example illustrating the importance of feature scaling: http://scikit-learn.org/stable/auto_examples/preprocessing/plot_scaling_importance.html#sphx-glr-auto-examples-preprocessing-plot-scaling-importance-py

In [None]:
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()

# Learn the per-feature mean and standard deviation from the training set only,
# and standardize the training set in the same step.
train_img = scaler.fit_transform(train_img)

# Reuse the training-set statistics for the test set so that no information
# from the test data leaks into the preprocessing.
test_img = scaler.transform(test_img)

### PCA to Speed up Machine Learning Algorithms (SVM)
Step 0: Import and use PCA. After PCA, you will apply a machine learning algorithm of your choice (here, an SVM) to the transformed data.

In [None]:
from sklearn.decomposition import PCA

In [None]:
# Create the PCA model. A fractional n_components asks PCA to keep as many
# components as are needed to explain that share (95%) of the variance.
pca = PCA(n_components=0.95)

In [None]:
# Fit PCA on the training set only, so the principal components are learned
# without using any information from the test set.
pca.fit(train_img)

In [None]:
# Number of components PCA kept in order to reach the requested 95% explained variance.
pca.n_components_

In [None]:
# Project both splits onto the principal components learned from the training set.
train_img, test_img = pca.transform(train_img), pca.transform(test_img)

#### Step 1: Import the model you want to use
#### In sklearn, all machine learning models are implemented as Python classes

In [None]:
import datetime

# Record the start time. A later cell subtracts it from a second timestamp
# to report how long model creation, training and prediction took.
x1 = datetime.datetime.now()
print(x1)

from sklearn import svm

# Hyperparameters chosen so that the model performs better.
clf = svm.SVC(C=100, gamma=0.001)

#### Step 2: Training the model on the data, storing the information learned from the data
##### Model is learning the relationship between x (digits) and y (labels)

In [None]:
# Train the SVM on the PCA-reduced training images and their labels.
clf.fit(X=train_img, y=train_lbl)

In [None]:
# Predict a label for every PCA-reduced test image, then display the predictions.
y_pred = clf.predict(X=test_img)
y_pred

In [None]:
# Stop the timer that was started (x1) before the classifier was created.
x2 = datetime.datetime.now()
print(x2)
# Elapsed wall-clock time as a timedelta. It spans model creation, training and
# test-set prediction, plus any idle time between running the cells.
d = x2 - x1
print(d)

#### Measuring Model Performance 
#### Basically, how the model performs on new data (test set)

In [None]:
from sklearn.metrics import accuracy_score

# Share of test labels predicted correctly, shown as a percentage.
print("Accuracy is ", accuracy_score(y_true=test_lbl, y_pred=y_pred) * 100, "%")

In [None]:
# Mean accuracy on the test set as a fraction, computed by the classifier itself
# (it re-runs prediction on test_img internally).
score = clf.score(X=test_img, y=test_lbl)
print(score)

#### Step 3: Predict the labels of new data (new images)
#### Uses the information the model learned during the model training process

In [None]:
# Predict the label of a single image. The slice keeps the 2-D
# (1, n_features) shape that predict() expects for one sample.
new = clf.predict(test_img[:1])
new

In [None]:
import matplotlib.pyplot as plt

In [None]:
# Cumulative share of variance explained by the first k principal components.
cumulative_variance = np.cumsum(pca.explained_variance_ratio_)

# Explicit x values 1..n: plt.plot(y) alone would place the points at 0..n-1,
# which makes the "Number of components" axis off by one.
component_counts = np.arange(1, len(cumulative_variance) + 1)

plt.figure(figsize=(8, 6))
plt.plot(component_counts, cumulative_variance)
plt.xlabel('Number of components')
plt.ylabel('Cumulative explained variance')