In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.datasets import fetch_openml, load_iris
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split

## MNIST

In [None]:
# Download MNIST (OpenML data set id 554) as plain arrays rather than a
# DataFrame: X holds one flattened image per row, y the digit labels.
X, y = fetch_openml(
    data_id=554,
    parser='auto',
    as_frame=False,
    return_X_y=True,
)

In [None]:
# Standardise every pixel column with StandardScaler's default settings,
# using statistics computed over the full data set.
scaler = StandardScaler()
X_scaled = scaler.fit(X).transform(X)

# Alternative preprocessing kept for reference (not used): rescale pixels
# by 255 and only subtract the column means.
# X2 = X/255
# mean = X2.mean(axis=0)
# X_centered = (X2 - mean)

In [None]:
# Show the first raw image: a row of X is viewed as a 28x28 grid.
# The trailing ';' hides the AxesImage repr in the cell output.
plt.imshow(np.reshape(X[0], (28, 28)), cmap='Blues');

In [None]:
# 10x10 gallery of the first 100 raw images, with axis ticks hidden.
fig, ax = plt.subplots(
    nrows=10,
    ncols=10,
    figsize=(8, 8),
    subplot_kw={'xticks': [], 'yticks': []},
)
fig.subplots_adjust(hspace=0.05, wspace=0.05)
for i in range(ax.size):
    # Panel i (row-major order) shows image i.
    axi = ax.flat[i]
    im = axi.imshow(X[i].reshape(28, 28), cmap='Blues')


In [None]:
# Reduce to a two-class problem: keep only the images labelled '0' or '1'
# (the labels are compared as strings).
keep = pd.Series(y).isin(['0', '1'])
Xsub, ysub = X_scaled[keep].copy(), y[keep].copy()

In [None]:
# 5x5 gallery of the first 25 standardised 0/1 images.
# ("+mean" only applies to the commented-out centring alternative, which
# is not in use, so the scaled data is plotted as is.)
Xplot = Xsub
fig, ax = plt.subplots(
    nrows=5,
    ncols=5,
    figsize=(8, 8),
    subplot_kw={'xticks': [], 'yticks': []},
)
fig.subplots_adjust(hspace=0.05, wspace=0.05)
for i in range(ax.size):
    axi = ax.flat[i]
    im = axi.imshow(Xplot[i].reshape(28, 28), cmap='Blues')

## PCA with eigenvalues and eigenvectors

In [None]:
# Feature-by-feature covariance matrix: rowvar=False tells np.cov that
# the columns of Xsub are the variables and the rows are observations.
sigma = np.cov(Xsub, rowvar=False)

In [None]:
# Shape check: one row and one column per feature (column) of Xsub.
sigma.shape

In [None]:
# Optional inspection (disabled): uncomment to view a 10x10 patch of the
# covariance matrix.
#sigma[350:360,350:360]

In [None]:
# Eigendecomposition of the covariance matrix. eigh is the routine for
# symmetric matrices; eigenvector i is the column eig_vec[:, i].
eig_val, eig_vec = np.linalg.eigh(sigma)

# Reorder so the largest eigenvalue (direction of greatest variance)
# comes first, keeping each eigenvector aligned with its eigenvalue.
sorted_indices = np.flip(np.argsort(eig_val))
eig_val, eig_vec = eig_val[sorted_indices], eig_vec[:, sorted_indices]

In [None]:
# Extract the first k eigenvectors
k = 30
V2 = eig_vec[:, :k]

# Get the PC projections
pc2 = Xsub.dot(V2)
c1 = pc2[:, 0]
c2 = pc2[:, 1]

In [None]:
# get the pc scores (the projections of the data on the PCs)
pc_scores = Xsub @ V2

In [None]:
# First two hand-computed principal-component scores, coloured by the
# digit label converted to an integer.
plt.scatter(
    c1,
    c2,
    s=40,
    c=ysub.astype(int),
    cmap='viridis',
    edgecolor='k',
);

## PCA with sklearn tools

In [None]:
# A fractional n_components asks PCA to keep as many components as are
# needed to explain that share of the variance (here 80%); an integer
# such as n_components=2 would fix the number of components instead.
pca = PCA(n_components=0.8)

In [None]:
# Fit the PCA on the 0/1 subset. fit() hands back the fitted estimator,
# so X_pca is the PCA object itself (its .transform is called later),
# not a transformed data matrix.
X_pca = pca.fit(X=Xsub)

In [None]:
# Number of components the fitted PCA actually kept for the requested
# variance share.
pca.n_components_

In [None]:
# Project the subset onto the fitted principal components (PC scores).
# transform is called on `pca` directly: the X_pca name is only an alias
# for this same fitted estimator and reads as if it were data.
X_transform = pca.transform(Xsub)

In [None]:
# Shape check: (number of images, number of retained components).
X_transform.shape

In [None]:
# Baseline: mean cross-validated score of a random forest on all
# standardised pixel features. random_state is fixed so the score is
# reproducible from run to run.
rf = RandomForestClassifier(random_state=0)
cross_val_score(rf, Xsub, ysub).mean()

In [None]:
# Same classifier on the PCA scores only; random_state is fixed so the
# score is reproducible from run to run.
# NOTE(review): the PCA was fitted on all of Xsub before cross-validation,
# so every fold's held-out rows influenced the projection. For a strict
# estimate, fit the PCA inside each fold (e.g. via a Pipeline).
rf = RandomForestClassifier(random_state=0)
cross_val_score(rf, X_transform, ysub).mean()

In [None]:
# Plot the first two sklearn principal-component scores, coloured by digit.

pc1 = X_transform[:, 0]
pc2 = X_transform[:, 1]
labels = ysub.astype(int)
fig, ax = plt.subplots()
scatter = ax.scatter(pc1, pc2, c=labels)
# Label the axes so the figure can be read on its own.
ax.set(xlabel="PC 1", ylabel="PC 2", title="First two principal components")

# One legend entry per distinct label value, built from the scatter's
# colour mapping.
legend = ax.legend(*scatter.legend_elements(),
                    loc="best", title="Labels")
ax.add_artist(legend);


In [None]:
# Loadings: each principal axis scaled by the square root of its
# explained variance, indicating how strongly every original feature
# contributes to each component.
# Transposing components_ puts features on the rows and components on
# the columns.
eigenvectors, eigenvalues = pca.components_, pca.explained_variance_
loadings = np.multiply(eigenvectors.T, np.sqrt(eigenvalues))

In [None]:
# Share of the total variance explained by the first principal component.
pca.explained_variance_ratio_[0]

In [None]:
# Show the loadings of the leading components as 28x28 images (up to 20).
fig, ax = plt.subplots(5, 4, figsize=(8, 8), subplot_kw=dict(xticks=[], yticks=[]))
fig.subplots_adjust(hspace=0.05, wspace=0.05)
# The number of components is chosen by the variance threshold, so it may
# be smaller than the 20 panels; zip stops at the shorter of the two
# instead of indexing past the last component.
for axi, loading in zip(ax.flat, loadings.T):
    im = axi.imshow(loading.reshape(28, 28), cmap='Blues')

In [None]:
# Gaussian-noise control matrix with the same shape as the digit subset.
# It is stored under its own name so the real data in Xsub is not
# overwritten (re-running earlier cells would otherwise silently operate
# on noise), and it is seeded so repeated runs give the same values.
X_noise = np.random.default_rng(0).normal(size=Xsub.shape)

In [None]:
# Shape check on Xsub: (rows, columns).
Xsub.shape

In [None]:
# Map the reduced PC scores back into the original pixel space. Only the
# retained components are used, so this is a lossy reconstruction of the
# standardised images.
digits_new = pca.inverse_transform(X=X_transform)

In [None]:
# Shape check: the reconstruction has one row per image and one column
# per original feature.
digits_new.shape

In [None]:
# Alias used for plotting. The trailing "+mean" note belongs to the
# commented-out centring alternative (where a `mean` vector would be
# added back); with StandardScaler in use nothing is added here.
digits_new_plot = digits_new#+mean

In [None]:
# 5x5 gallery of the first 25 reconstructed images, for visual comparison
# with the gallery of the original 0/1 subset.
fig, ax = plt.subplots(
    nrows=5,
    ncols=5,
    figsize=(8, 8),
    subplot_kw={'xticks': [], 'yticks': []},
)
fig.subplots_adjust(hspace=0.05, wspace=0.05)
for i in range(ax.size):
    axi = ax.flat[i]
    im = axi.imshow(digits_new_plot[i].reshape(28, 28), cmap='Blues')