In [4]:
#PCA
import os
import pandas as pd
import numpy as np
from sklearn.datasets import fetch_openml

def sort_by_target(mnist):
    reorder_train = np.array(sorted([(target, i) for i, target in enumerate(mnist.target[:60000])]))[:, 1]
    reorder_test = np.array(sorted([(target, i) for i, target in enumerate(mnist.target[60000:])]))[:, 1]
    mnist.data[:60000] = mnist.data[reorder_train]
    mnist.target[:60000] = mnist.target[reorder_train]
    mnist.data[60000:] = mnist.data[reorder_test + 60000]
    mnist.target[60000:] = mnist.target[reorder_test + 60000]

try:
    from sklearn.datasets import fetch_openml
    mnist = fetch_openml('mnist_784', version=1, cache=True)
    mnist.target = mnist.target.astype(np.int8) # fetch_openml() returns targets as strings
    sort_by_target(mnist) # fetch_openml() returns an unsorted dataset
except ImportError:
    from sklearn.datasets import fetch_mldata
    mnist = fetch_mldata('MNIST original')
    
mnist = fetch_openml('mnist_784', version=1, cache=True)

X, y = mnist["data"], mnist["target"]

X_centered = X - X.mean(axis=0)
U, s, V = np.linalg.svd(X_centered)
c1 = V.T[:, 0]
c2 = V.T[:, 1]

W2 = V.T[:, :2] #this is the projection part
X2D = X_centered.dot(W2)

MemoryError: Unable to allocate 36.5 GiB for an array with shape (70000, 70000) and data type float64

In [9]:
#Or simply use the Scikit-Learn library
from sklearn.decomposition import PCA
from sklearn.datasets import fetch_openml

def sort_by_target(mnist):
    reorder_train = np.array(sorted([(target, i) for i, target in enumerate(mnist.target[:60000])]))[:, 1]
    reorder_test = np.array(sorted([(target, i) for i, target in enumerate(mnist.target[60000:])]))[:, 1]
    mnist.data[:60000] = mnist.data[reorder_train]
    mnist.target[:60000] = mnist.target[reorder_train]
    mnist.data[60000:] = mnist.data[reorder_test + 60000]
    mnist.target[60000:] = mnist.target[reorder_test + 60000]

try:
    from sklearn.datasets import fetch_openml
    mnist = fetch_openml('mnist_784', version=1, cache=True)
    mnist.target = mnist.target.astype(np.int8) # fetch_openml() returns targets as strings
    sort_by_target(mnist) # fetch_openml() returns an unsorted dataset
except ImportError:
    from sklearn.datasets import fetch_mldata
    mnist = fetch_mldata('MNIST original')
    
mnist = fetch_openml('mnist_784', version=1, cache=True)

X, y = mnist["data"], mnist["target"]

pca = PCA(n_components = 2) #reducing dimensionality to two
'''pca = PCA() for no dimensionality reduction'''
X2D = pca.fit_transform(X)

# the variance ratio is visible this way
print(pca.explained_variance_ratio_)

#You can use this way to reduce dimensionality and keep 95% variance
pca = PCA()
pca.fit(X)
cumsum = np.cumsum(pca.explained_variance_ratio_)
d = np.argmax(cumsum >= 0.95) + 1

pca = PCA(n_components=0.95)
X_reduced = pca.fit_transform(X)

# with n_components you can decide the number of dimension you want to keep
pca = PCA(n_components = 154)
X_mnist_reduced = pca.fit_transform(X)

# this reconstitutes the original data with losses
X_mnist_recovered = pca.inverse_transform(X_mnist_reduced)

[0.09746116 0.07155445]


In [None]:
X_reduced

In [11]:
#Not enough memory on my computer to run this code...cool...

from sklearn.decomposition import IncrementalPCA

n_batches = 100

inc_pca = IncrementalPCA(n_components=154)
for X_batch in np.array_split(X, n_batches):
    inc_pca.partial_fit(X_batch)

X_mnist_reduced = inc_pca.transform(X)

In [13]:
# this seems to be a faster PCA method as well
from sklearn.decomposition import PCA

rnd_pca = PCA(n_components=154, svd_solver="randomized")
X_reduced = rnd_pca.fit_transform(X)

# Kernel PCA...for when I can test it
from sklearn.decomposition import KernelPCA

rbf_pca = KernelPCA(n_components = 2, kernel="rbf", gamma=0.04)
X_reduced = rbf_pca.fit_transform(X)

MemoryError: Unable to allocate 36.5 GiB for an array with shape (70000, 70000) and data type float64

In [14]:
# unsupervised learning
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline

clf = Pipeline([
    ("kpca", KernelPCA(n_components=2)),
    ("log_reg", LogisticRegression())
])

param_grid = [{
    "kpca__gamma": np.linspace(0.03, 0.05, 10),
    "kpca__kernel": ["rbf", "sigmoid"]
}]
grid_search = GridSearchCV(clf, param_grid, cv=3)
grid_search.fit(X, y)

Traceback (most recent call last):
  File "c:\python37\lib\site-packages\sklearn\model_selection\_validation.py", line 531, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "c:\python37\lib\site-packages\sklearn\pipeline.py", line 330, in fit
    Xt = self._fit(X, y, **fit_params_steps)
  File "c:\python37\lib\site-packages\sklearn\pipeline.py", line 296, in _fit
    **fit_params_steps[name])
  File "c:\python37\lib\site-packages\joblib\memory.py", line 329, in __call__
    return self.func(*args, **kwargs)
  File "c:\python37\lib\site-packages\sklearn\pipeline.py", line 740, in _fit_transform_one
    res = transformer.fit_transform(X, y, **fit_params)
  File "c:\python37\lib\site-packages\sklearn\decomposition\_kernel_pca.py", line 305, in fit_transform
    self.fit(X, **params)
  File "c:\python37\lib\site-packages\sklearn\decomposition\_kernel_pca.py", line 280, in fit
    K = self._get_kernel(X)
  File "c:\python37\lib\site-packages\sklearn\decomposition\_

Traceback (most recent call last):
  File "c:\python37\lib\site-packages\sklearn\model_selection\_validation.py", line 531, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "c:\python37\lib\site-packages\sklearn\pipeline.py", line 330, in fit
    Xt = self._fit(X, y, **fit_params_steps)
  File "c:\python37\lib\site-packages\sklearn\pipeline.py", line 296, in _fit
    **fit_params_steps[name])
  File "c:\python37\lib\site-packages\joblib\memory.py", line 329, in __call__
    return self.func(*args, **kwargs)
  File "c:\python37\lib\site-packages\sklearn\pipeline.py", line 740, in _fit_transform_one
    res = transformer.fit_transform(X, y, **fit_params)
  File "c:\python37\lib\site-packages\sklearn\decomposition\_kernel_pca.py", line 305, in fit_transform
    self.fit(X, **params)
  File "c:\python37\lib\site-packages\sklearn\decomposition\_kernel_pca.py", line 280, in fit
    K = self._get_kernel(X)
  File "c:\python37\lib\site-packages\sklearn\decomposition\_

Traceback (most recent call last):
  File "c:\python37\lib\site-packages\sklearn\model_selection\_validation.py", line 531, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "c:\python37\lib\site-packages\sklearn\pipeline.py", line 330, in fit
    Xt = self._fit(X, y, **fit_params_steps)
  File "c:\python37\lib\site-packages\sklearn\pipeline.py", line 296, in _fit
    **fit_params_steps[name])
  File "c:\python37\lib\site-packages\joblib\memory.py", line 329, in __call__
    return self.func(*args, **kwargs)
  File "c:\python37\lib\site-packages\sklearn\pipeline.py", line 740, in _fit_transform_one
    res = transformer.fit_transform(X, y, **fit_params)
  File "c:\python37\lib\site-packages\sklearn\decomposition\_kernel_pca.py", line 305, in fit_transform
    self.fit(X, **params)
  File "c:\python37\lib\site-packages\sklearn\decomposition\_kernel_pca.py", line 280, in fit
    K = self._get_kernel(X)
  File "c:\python37\lib\site-packages\sklearn\decomposition\_

Traceback (most recent call last):
  File "c:\python37\lib\site-packages\sklearn\model_selection\_validation.py", line 531, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "c:\python37\lib\site-packages\sklearn\pipeline.py", line 330, in fit
    Xt = self._fit(X, y, **fit_params_steps)
  File "c:\python37\lib\site-packages\sklearn\pipeline.py", line 296, in _fit
    **fit_params_steps[name])
  File "c:\python37\lib\site-packages\joblib\memory.py", line 329, in __call__
    return self.func(*args, **kwargs)
  File "c:\python37\lib\site-packages\sklearn\pipeline.py", line 740, in _fit_transform_one
    res = transformer.fit_transform(X, y, **fit_params)
  File "c:\python37\lib\site-packages\sklearn\decomposition\_kernel_pca.py", line 305, in fit_transform
    self.fit(X, **params)
  File "c:\python37\lib\site-packages\sklearn\decomposition\_kernel_pca.py", line 280, in fit
    K = self._get_kernel(X)
  File "c:\python37\lib\site-packages\sklearn\decomposition\_

Traceback (most recent call last):
  File "c:\python37\lib\site-packages\sklearn\model_selection\_validation.py", line 531, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "c:\python37\lib\site-packages\sklearn\pipeline.py", line 330, in fit
    Xt = self._fit(X, y, **fit_params_steps)
  File "c:\python37\lib\site-packages\sklearn\pipeline.py", line 296, in _fit
    **fit_params_steps[name])
  File "c:\python37\lib\site-packages\joblib\memory.py", line 329, in __call__
    return self.func(*args, **kwargs)
  File "c:\python37\lib\site-packages\sklearn\pipeline.py", line 740, in _fit_transform_one
    res = transformer.fit_transform(X, y, **fit_params)
  File "c:\python37\lib\site-packages\sklearn\decomposition\_kernel_pca.py", line 305, in fit_transform
    self.fit(X, **params)
  File "c:\python37\lib\site-packages\sklearn\decomposition\_kernel_pca.py", line 280, in fit
    K = self._get_kernel(X)
  File "c:\python37\lib\site-packages\sklearn\decomposition\_

Traceback (most recent call last):
  File "c:\python37\lib\site-packages\sklearn\model_selection\_validation.py", line 531, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "c:\python37\lib\site-packages\sklearn\pipeline.py", line 330, in fit
    Xt = self._fit(X, y, **fit_params_steps)
  File "c:\python37\lib\site-packages\sklearn\pipeline.py", line 296, in _fit
    **fit_params_steps[name])
  File "c:\python37\lib\site-packages\joblib\memory.py", line 329, in __call__
    return self.func(*args, **kwargs)
  File "c:\python37\lib\site-packages\sklearn\pipeline.py", line 740, in _fit_transform_one
    res = transformer.fit_transform(X, y, **fit_params)
  File "c:\python37\lib\site-packages\sklearn\decomposition\_kernel_pca.py", line 305, in fit_transform
    self.fit(X, **params)
  File "c:\python37\lib\site-packages\sklearn\decomposition\_kernel_pca.py", line 280, in fit
    K = self._get_kernel(X)
  File "c:\python37\lib\site-packages\sklearn\decomposition\_

Traceback (most recent call last):
  File "c:\python37\lib\site-packages\sklearn\model_selection\_validation.py", line 531, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "c:\python37\lib\site-packages\sklearn\pipeline.py", line 330, in fit
    Xt = self._fit(X, y, **fit_params_steps)
  File "c:\python37\lib\site-packages\sklearn\pipeline.py", line 296, in _fit
    **fit_params_steps[name])
  File "c:\python37\lib\site-packages\joblib\memory.py", line 329, in __call__
    return self.func(*args, **kwargs)
  File "c:\python37\lib\site-packages\sklearn\pipeline.py", line 740, in _fit_transform_one
    res = transformer.fit_transform(X, y, **fit_params)
  File "c:\python37\lib\site-packages\sklearn\decomposition\_kernel_pca.py", line 305, in fit_transform
    self.fit(X, **params)
  File "c:\python37\lib\site-packages\sklearn\decomposition\_kernel_pca.py", line 280, in fit
    K = self._get_kernel(X)
  File "c:\python37\lib\site-packages\sklearn\decomposition\_

Traceback (most recent call last):
  File "c:\python37\lib\site-packages\sklearn\model_selection\_validation.py", line 531, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "c:\python37\lib\site-packages\sklearn\pipeline.py", line 330, in fit
    Xt = self._fit(X, y, **fit_params_steps)
  File "c:\python37\lib\site-packages\sklearn\pipeline.py", line 296, in _fit
    **fit_params_steps[name])
  File "c:\python37\lib\site-packages\joblib\memory.py", line 329, in __call__
    return self.func(*args, **kwargs)
  File "c:\python37\lib\site-packages\sklearn\pipeline.py", line 740, in _fit_transform_one
    res = transformer.fit_transform(X, y, **fit_params)
  File "c:\python37\lib\site-packages\sklearn\decomposition\_kernel_pca.py", line 305, in fit_transform
    self.fit(X, **params)
  File "c:\python37\lib\site-packages\sklearn\decomposition\_kernel_pca.py", line 280, in fit
    K = self._get_kernel(X)
  File "c:\python37\lib\site-packages\sklearn\decomposition\_

KeyboardInterrupt: 

In [None]:
from sklearn.manifold import LocallyLinearEmbedding

lle = LocallyLinearEmbedding(n_components=2, n_neighbors=10)
X_reduced = lle.fit_transform(X)
