In [1]:
import numpy as np

In [2]:
# Seed NumPy's global RNG so the synthetic dataset built below is reproducible.
np.random.seed(4)

# Parameters of the synthetic 3-D dataset.
m = 60       # number of samples (rows of X)
w1 = 0.1     # weight of feature 0 when building feature 2
w2 = 0.3     # weight of feature 1 when building feature 2
noise = 0.1  # scale applied to the Gaussian noise terms

In [3]:
# Draw m angles in [-0.5, 3*pi/2 - 0.5) and place the points on a noisy 2-D
# curve; the third feature is a noisy linear blend of the first two.
# The RNG calls are kept in their original order so the seeded draws match.
angles = np.random.rand(m) * 3 * np.pi / 2 - 0.5
cos_a = np.cos(angles)
sin_a = np.sin(angles)

X = np.empty((m, 3))
X[:, 0] = cos_a + sin_a / 2 + noise * np.random.randn(m) / 2
X[:, 1] = sin_a * 0.7 + noise * np.random.randn(m) / 2
X[:, 2] = X[:, 0] * w1 + X[:, 1] * w2 + noise * np.random.randn(m)

In [10]:
# Peek at the first row of X (one sample, three features).
X[0]

array([-1.01570027, -0.55091331, -0.26132626])

In [11]:
# PCA by hand: centre each feature, then decompose with SVD.
# c1 and c2 are the first two columns of Vt.T, i.e. the first two rows of Vt.
X_centered = X - X.mean(axis=0)
U, s, Vt = np.linalg.svd(X_centered)
c1, c2 = Vt[0], Vt[1]

In [12]:
# Confirm the dataset dimensions: (rows, columns).
X.shape

(60, 3)

In [13]:
# Rectangular singular-value matrix with the same shape as X_centered:
# all zeros except for the singular values on the diagonal of the top
# n x n block.
S = np.zeros(X_centered.shape)
m, n = X.shape
S[:n, :n] = np.diag(s)

In [49]:
from sklearn.decomposition import PCA
# scikit-learn PCA configured to keep two components.
pca = PCA(n_components=2)

In [50]:
# Fit the PCA on X and project X onto the retained components in one call.
X2 = pca.fit_transform(X)

In [51]:
# Show only the first five projected samples rather than the whole array.
X2[:5]

array([[-1.26203346, -0.42067648],
       [ 0.08001485,  0.35272239],
       [-1.17545763, -0.36085729],
       [-0.89305601,  0.30862856],
       [-0.73016287,  0.25404049]])

In [53]:
from sklearn.datasets import fetch_openml
# Fetch version 1 of the 'mnist_784' dataset from OpenML.
# as_frame=False asks for array output instead of pandas objects.
# NOTE(review): this downloads on first use (presumably cached afterwards) -- confirm.
mnist = fetch_openml('mnist_784', version=1, as_frame=False)

In [76]:
# Cast the labels to integers, replacing the dataset's target in place.
# NOTE(review): presumably the labels arrive as strings -- confirm against the loader.
mnist.target = mnist.target.astype(int)

In [77]:
from sklearn.model_selection import train_test_split

In [78]:
# Bind features and labels. Note: this rebinds X, which previously held the
# small synthetic 3-D dataset.
X, y = mnist["data"], mnist["target"]

In [79]:
# Hold out 20% of the samples for testing. random_state pins the shuffle so
# the training set -- and every number derived from it below (explained
# variance, the component count d) -- is the same on every Restart & Run All.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

In [80]:
# PCA with no n_components given; rebinds `pca`, replacing the 2-component
# model used on the synthetic data.
pca = PCA()

In [81]:
# Fit on the training split only, so the test split stays unseen.
pca.fit(X_train)

In [82]:
# Running total of the per-component explained-variance ratios: entry k is
# the fraction of variance captured by the first k + 1 components.
cumsum = pca.explained_variance_ratio_.cumsum()
cumsum

array([0.09738955, 0.16872029, 0.23017733, 0.28429328, 0.33337337,
       0.3764545 , 0.40936979, 0.43812581, 0.46554021, 0.48892567,
       0.51006524, 0.53044965, 0.54757132, 0.56451571, 0.58034569,
       0.59520727, 0.60848088, 0.62125615, 0.63316134, 0.64465466,
       0.65531355, 0.66540032, 0.6749692 , 0.68404024, 0.69286801,
       0.70123137, 0.70934695, 0.71724344, 0.72468345, 0.73160283,
       0.73817836, 0.74462198, 0.75062654, 0.75647103, 0.76213463,
       0.76756089, 0.77262611, 0.77751627, 0.78230714, 0.78696358,
       0.79150672, 0.79595178, 0.80011982, 0.80407497, 0.80791103,
       0.81165745, 0.81528586, 0.8187898 , 0.82217041, 0.82536693,
       0.82854193, 0.83164165, 0.83460188, 0.83747187, 0.84031522,
       0.84302771, 0.84572177, 0.84828888, 0.85083093, 0.85327773,
       0.85568837, 0.85807118, 0.86036364, 0.86257135, 0.86470322,
       0.86677035, 0.86879506, 0.87074448, 0.87265969, 0.87454179,
       0.87640434, 0.87821341, 0.87998379, 0.88172213, 0.88337

In [83]:
# Smallest number of components whose cumulative explained variance reaches
# 95%. argmax on the boolean mask gives the first True index (0-based),
# hence the +1 to turn it into a count.
d = (cumsum >= 0.95).argmax() + 1

In [84]:
# Display the number of components selected by the 95% threshold.
d

np.int64(154)

In [85]:
# Randomized-SVD PCA keeping the d components selected by the 95% variance
# threshold. Using d instead of a literal keeps this cell in sync with the
# threshold computation if the data or split ever changes.
# random_state fixes the randomized solver so results are repeatable.
rnd_pca = PCA(n_components=d, svd_solver="randomized", random_state=42)

In [86]:
# Fit the randomized PCA on the training split and project it in one call.
X_reduced = rnd_pca.fit_transform(X_train)

In [95]:
from sklearn.decomposition import IncrementalPCA

# Incremental PCA: feed the training set to the model in n_batches chunks
# instead of decomposing it all at once.
n_batches = 100

# Keep the d components selected by the 95% variance threshold rather than
# repeating the number as a literal, so the two cannot drift apart.
inc_pca = IncrementalPCA(n_components=d)

for X_batch in np.array_split(X_train, n_batches):
    inc_pca.partial_fit(X_batch)

# Note: this rebinds X_reduced, replacing the randomized-PCA projection.
X_reduced = inc_pca.transform(X_train)

In [99]:
# Dump the training set to a float32 memory-mapped file on disk.
filename = "my_mnist.data"
# Note: rebinds m and n (previously the synthetic dataset's shape); the
# read-back cell relies on these values.
m, n = X_train.shape

X_mm = np.memmap(filename, dtype='float32', mode='write', shape=(m, n))
X_mm[:] = X_train
# Flush explicitly so the data is on disk even if a later `del X_mm` does not
# finalize the object (e.g. because another reference to it is still alive).
X_mm.flush()

In [100]:
# Drop the writable memmap reference before reopening the file read-only.
# Per the NumPy docs, deleting a memmap flushes pending changes to disk.
del X_mm

In [101]:
# Reopen the dumped training set read-only as a memory-mapped array and let
# IncrementalPCA iterate over it in batches of batch_size rows.
X_mm = np.memmap(filename, dtype="float32", mode="readonly", shape=(m, n))

# Same chunking as the manual partial_fit loop: m rows over n_batches batches.
batch_size = m // n_batches
# Keep the d components selected by the 95% variance threshold instead of a
# hard-coded literal, so this cell follows the threshold computation.
inc_pca = IncrementalPCA(n_components=d, batch_size=batch_size)
inc_pca.fit(X_mm)