# Chapter 8. Dimesionality Reduction

In [1]:
# Python ≥3.5 is required
import sys
assert sys.version_info >= (3, 5)

# Scikit-Learn ≥0.20 is required
import sklearn
assert sklearn.__version__ >= "0.20"

# Common imports
import numpy as np
import os

# to make this notebook's output stable across runs
np.random.seed(42)

# To plot pretty figures
%matplotlib inline
import matplotlib as mpl
import matplotlib.pyplot as plt
mpl.rc('axes', labelsize=14)
mpl.rc('xtick', labelsize=12)
mpl.rc('ytick', labelsize=12)

# Where to save the figures
PROJECT_ROOT_DIR = "."
CHAPTER_ID = "dim_reduction"
IMAGES_PATH = os.path.join(PROJECT_ROOT_DIR, "images", CHAPTER_ID)
os.makedirs(IMAGES_PATH, exist_ok=True)

def save_fig(fig_id, tight_layout=True, fig_extension="png", resolution=300):
    path = os.path.join(IMAGES_PATH, fig_id + "." + fig_extension)
    print("Saving figure", fig_id)
    if tight_layout:
        plt.tight_layout()
    plt.savefig(path, format=fig_extension, dpi=resolution)

## PCA

In [2]:
np.random.seed(4)
m = 60
w1, w2 = 0.1, 0.3
noise = 0.1

angles = np.random.rand(m) * 3 * np.pi / 2 - 0.5
X = np.empty((m, 3))
X[:, 0] = np.cos(angles) + np.sin(angles)/2 + noise * np.random.randn(m) / 2
X[:, 1] = np.sin(angles) * 0.7 + noise * np.random.randn(m) / 2
X[:, 2] = X[:, 0] * w1 + X[:, 1] * w2 + noise * np.random.randn(m)

## Principal Components

In [3]:
X_centered = X - X.mean(axis=0)
U, s, Vt = np.linalg.svd(X_centered)
c1 = Vt.T[:, 0]
c2 = Vt.T[:, 1]

## Projecting Down to d Dimensions

In [4]:
W2 = Vt.T[:, :2]
X2D = X_centered.dot(W2)

In [5]:
X2D_using_svd = X2D

## Using Scikit-Learn

In [6]:
from sklearn.decomposition import PCA

pca = PCA(n_components=2)
X2D = pca.fit_transform(X)

In [7]:
X2D[:5]

array([[ 1.26203346,  0.42067648],
       [-0.08001485, -0.35272239],
       [ 1.17545763,  0.36085729],
       [ 0.89305601, -0.30862856],
       [ 0.73016287, -0.25404049]])

## Explained Varaince Ratio

In [12]:
pca.explained_variance_ratio_

array([0.84248607, 0.14631839])

##  Choosing the Right Number of Dimensions

In [15]:
from sklearn.datasets import fetch_openml

mnist = fetch_openml('mnist_784', version = 1, as_frame=False)
mnist.target= mnist.target.astype(np.uint8)

In [16]:
from sklearn.model_selection import train_test_split

X = mnist["data"]
y = mnist["target"]

X_train, X_test, y_train, y_test = train_test_split(X, y)

In [18]:
pca = PCA()
pca.fit(X_train)
cumsum = np.cumsum(pca.explained_variance_ratio_)
d = np.argmax(cumsum >= 0.95) + 1

In [19]:
d

154

In [21]:
pca = PCA(n_components=0.95)
X_reduced =pca.fit(X_train)

In [22]:
pca.n_components_

154

In [23]:
np.sum(pca.explained_variance_ratio_)

0.9503684424557436

## PCA for Compression

In [24]:
pca = PCA(n_components=154)
X_reduced = pca.fit_transform(X_train)
X_recovered = pca.inverse_transform(X_reduced)

## Randomized PCA

In [25]:
rnd_pca = PCA(n_components=154, svd_solver="randomized")
X_reduced = rnd_pca.fit_transform(X_train)

## Incremental PCA

In [27]:
from sklearn.decomposition import IncrementalPCA

n_batches = 100
inc_pca =IncrementalPCA(n_components=154)
for X_batch in np.array_split(X_train, n_batches):
    print(".", end="")
    inc_pca.partial_fit(X_batch)

X_reduced = inc_pca.transform(X_train)

....................................................................................................

In [30]:
filename = "my_mnist.data"
m, n = X_train.shape
X_mm = np.memmap(filename, dtype="float32", mode="readonly", shape=(m,n))

batch_size = m // n_batches
inc_pca = IncrementalPCA(n_components,batch_size=batch_size)
inc_pca.fit(X_mm)

FileNotFoundError: [Errno 2] No such file or directory: 'my_mnist.data'

## Exercise solutions

### 9.

In [14]:
from sklearn.datasets import fetch_openml
from sklearn.model_selection import train_test_split

mnist = fetch_openml('mnist_784',version=1,as_frame=False)
X = mnist["data"]
y = mnist["target"]
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size = 60000, test_size=10000)

In [18]:
from sklearn.ensemble import RandomForestClassifier
import time

rfc = RandomForestClassifier( random_state=42)
start = time.time()
rfc.fit(X_train, y_train)
end = time.time()
print(end-start)

32.24467468261719


In [36]:
from sklearn.metrics import accuracy_score
y_pred =rfc.predict(X_test)
accuracy_score(y_test, y_pred )

0.9685

In [25]:
from sklearn.decomposition import PCA

pca = PCA(n_components=0.95)
reduced_X = pca.fit_transform(X_train)

In [26]:
pca.n_components_

153

In [38]:
rfc2 = RandomForestClassifier(random_state=42)
reduced_X_test = pca.transform(X_test)
start2 = time.time()
rfc2.fit(reduced_X, y_train)
end2 = time.time()
print(end2-start2)

77.7782654762268


In [39]:
y_pred2 = rfc2.predict(reduced_X_test)
accuracy_score(y_test, y_pred2)

0.6365

### 10.