# Digit classification with PCA

In [None]:
import math

from sklearn.datasets import fetch_openml, load_digits
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split, KFold
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.metrics import confusion_matrix

## Images with size 28x28

In [None]:
# Fetch the 'mnist_784' dataset (version 1) from OpenML.
mnist = fetch_openml('mnist_784', version=1)

# Features converted to a NumPy array; targets cast to 32-bit integers.
X, y = mnist.data.to_numpy(), mnist.target.astype(np.int32)

for caption, value in (
    ("Data shape:", X.shape),
    ("Labels shape:", y.shape),
    ("Unique labels:", np.unique(y)),
):
    print(caption, value)

# Cell output: pixel min/max/mean, square root of the feature count
# (the side length when each row is a square image) and the label dtype.
X.min(), X.max(), X.mean(), math.sqrt(X.shape[1]), y.dtype

In [None]:
# Display the first sample as a 28x28 image, titled with its label.
first_image = X[0].reshape(28, 28)
plt.imshow(first_image)
plt.title(f"Label: {y[0]}")
plt.show()

## Images with size 8x8

### Load, scale, and center the data

In [None]:
# Load the 8x8 digits dataset that ships with scikit-learn.
digits = load_digits()
X8 = digits.data
y8 = digits.target

print("Data shape:", X8.shape)
print("Labels shape:", y8.shape)
# np.unique gives a sorted array with a clean repr (a set of NumPy scalars
# prints as np.int64(...) entries on NumPy 2) and matches the MNIST cell.
print("Unique labels:", np.unique(y8))
print("Grayscale range:", X8.max(), X8.min())

# Center every pixel on its mean over the dataset, then divide by 16
# (the upper end of the intensity scale used for the "Original" panel below).
center = X8.mean(axis=0)
X8_sc = (X8 - center) / 16

# Compare one sample before and after centering/scaling.
index = 0
original_image = X8[index].reshape(8, 8)
centered_image = X8_sc[index].reshape(8, 8)

fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(10, 4))
plt.suptitle(f"Centered and scaled images, label: {y8[index]}")

# 1) Original image on the fixed 0..16 intensity scale, with colorbar
im1 = ax1.imshow(original_image, cmap='gray', vmin=0, vmax=16)
cbar1 = fig.colorbar(im1, ax=ax1)
cbar1.set_label('Intensity')
ax1.set_title("Original")
ax1.axis('off')

# 2) Centered/scaled image on an automatically chosen color scale, with colorbar
im2 = ax2.imshow(centered_image, cmap='gray')
cbar2 = fig.colorbar(im2, ax=ax2)
cbar2.set_label('Centered intensity')
ax2.set_title("Centered and scaled")
ax2.axis('off')

plt.tight_layout()
plt.savefig("sample_digit.pdf")
plt.show()


In [None]:
# Sanity check of the centered and scaled data: maximum, minimum and overall mean.
X8_sc.max(), X8_sc.min(), X8_sc.mean()

### Show the center

In [None]:
# Render the per-pixel mean (the vector subtracted during centering) as an 8x8 image.
avg_img = center.reshape(8, 8)

fig, ax = plt.subplots(figsize=(4, 4))
im = ax.imshow(avg_img, cmap='gray')
ax.set_title("Average digit image")
fig.colorbar(im, ax=ax)

plt.savefig("average_digit.pdf")
plt.show()

### Calculate the covariance matrix, eigenvalues and eigenvectors

In [None]:
# Covariance between the columns of X8_sc: np.cov treats rows as variables
# by default, so hand it the transpose.
cov_matrix = np.cov(X8_sc.T)
cov_matrix.shape

In [None]:
# Eigen-decomposition of the covariance matrix.  The matrix is symmetric, so
# use eigh: it is the dedicated symmetric solver and always returns real
# eigenvalues, which the general-purpose eig does not guarantee.
eigenvalues, eigenvectors = np.linalg.eigh(cov_matrix)

# Order the spectrum from the largest to the smallest eigenvalue.
idx = np.argsort(eigenvalues)[::-1]
eigenvalues_sorted = eigenvalues[idx]

fig, ax1 = plt.subplots()

# linear scale (left y-axis)
line1 = ax1.plot(eigenvalues_sorted, marker='o', label='Linear scale')
ax1.set_xlabel("Component index")
ax1.set_ylabel("Eigenvalue (linear scale)")
ax1.set_ylim(0, eigenvalues_sorted.max()*1.1)

# log scale (right y-axis, sharing the x-axis)
ax2 = ax1.twinx()
line2 = ax2.plot(eigenvalues_sorted, marker='x', label='Log scale', color='red')
ax2.set_ylabel("Eigenvalue (log scale)")
ax2.set_yscale('log')
# Fixed lower bound keeps the log axis readable; values below it are cut off.
ax2.set_ylim(0.3e-3, eigenvalues_sorted.max()*1.2)

# Combine the handles of both axes into a single legend
lines = line1 + line2
labels = [line.get_label() for line in lines]
ax1.legend(lines, labels, loc='best')

plt.title("Eigenvalues in linear and log scale")
plt.savefig("eigenvalues.pdf")
plt.show()


#### Show the eigenvectors

In [None]:
# Eigen-decomposition of the symmetric covariance matrix.  eigh returns real
# eigenvalues and orthonormal eigenvectors, which the later projection and
# reconstruction cells rely on; the general-purpose eig guarantees neither.
eigenvalues, eigenvectors = np.linalg.eigh(cov_matrix)

# Sort eigenpairs by decreasing eigenvalue; eigenvectors are stored as columns.
idx = np.argsort(eigenvalues)[::-1]
eigenvalues_sorted = eigenvalues[idx]
eigenvectors_sorted = eigenvectors[:, idx]

# Leading principal directions to display.
num_components = 16
pcs = eigenvectors_sorted[:, :num_components]

fig, axes = plt.subplots(nrows=4, ncols=4, figsize=(8, 8))

# Fill the grid row by row; each eigenvector is shown as an 8x8 image.
for i, pc_ax in zip(range(num_components), axes.flat):
    pc_image = pcs[:, i].reshape(8, 8)

    pc_ax.imshow(pc_image, cmap='gray')
    pc_ax.axis('off')
    pc_ax.set_title(f"PC {i+1}")

plt.tight_layout()
plt.savefig("principal_components.pdf")
plt.show()


### Calculate the PCA for the whole dataset

In [None]:
# Numbers of leading principal components to evaluate.
K_values = [4, 8, 16, 32, 48, 64]

# Map each K to the coordinates of every sample in the basis formed by
# the first K sorted eigenvectors.
coords_dict = {
    K: X8_sc @ eigenvectors_sorted[:, :K]
    for K in K_values
}

### Plot the reconstructed images for the first few images, for different K

In [None]:
# Number of leading samples to reconstruct (one figure row each).
num_samples = 10

# One narrow column for the true label, then one column per K.
fig, axes = plt.subplots(
    nrows=num_samples,
    ncols=len(K_values) + 1,
    figsize=(8, 14),
    gridspec_kw={'width_ratios': [0.1] + [1]*len(K_values)}
)

# Left-most column: the label of each sample as plain text.
for i in range(num_samples):
    label_ax = axes[i, 0]
    label_ax.text(0.5, 0.5, f"{y8[i]}",
                  ha='center', va='center', fontsize=10)
    label_ax.axis('off')

# Remaining columns: reconstructions from the first K components.
for col_idx, K in enumerate(K_values, start=1):
    V_k = eigenvectors_sorted[:, :K]
    # Map the K coordinates back to pixel space for all shown samples at once,
    # then undo the scaling (x16) and the centering.
    recon = 16 * (coords_dict[K][:num_samples] @ V_k.T) + center

    for i in range(num_samples):
        ax = axes[i, col_idx]
        ax.imshow(recon[i].reshape(8, 8), cmap='gray', vmin=0, vmax=16)
        ax.axis('off')
        if i == 0:
            # Column header only on the first row.
            ax.set_title(f"K={K}", fontsize=9)

plt.suptitle(f"Reconstruction of first {num_samples} samples \nwith increasing number of principal components (K)", y=0.93)
plt.savefig("reconstruction_with_pcs.pdf")
plt.show()

## Train LDA with different Ks

In [None]:
# Train and evaluate LDA on the first K PCA coordinates with one 80/20 split.
K = 4
X_pca = coords_dict[K]
X_train, X_test, y_train, y_test = train_test_split(X_pca, y8,
                                                    test_size=0.2,
                                                    random_state=42)

lda = LinearDiscriminantAnalysis()
lda.fit(X_train, y_train)
y_pred = lda.predict(X_test)
# Pin the label set so the matrix is always 10x10, as the annotation loop
# below assumes, even if a digit is absent from the test split.
cm = confusion_matrix(y_test, y_pred, labels=range(10))

plt.figure()
plt.imshow(cm, cmap='Blues')
plt.colorbar(label='Count')
plt.title(f"Confusion matrix for LDA with K={K}")
plt.xlabel("Predicted label")
plt.ylabel("True label")

plt.xticks(range(10))
plt.yticks(range(10))

# Write the count into every cell (x = predicted label, y = true label).
for i in range(10):
    for j in range(10):
        plt.text(j, i, cm[i, j],
                 ha='center', va='center', color='red', fontsize=8)

plt.tight_layout()
# Derive the file name from K so it stays in sync with the title.
plt.savefig(f"confusion_matrix_{K}.pdf")
plt.show()

In [None]:
# 5-fold cross-validated LDA on the first 4 PCA coordinates; the per-fold
# confusion matrices are summed into one matrix.
X_pca4 = coords_dict[4]
kf = KFold(n_splits=5, shuffle=True, random_state=42)
accumulated_cm = np.zeros((10, 10), dtype=int)
for train_index, test_index in kf.split(X_pca4):
    X_train, X_test = X_pca4[train_index], X_pca4[test_index]
    y_train, y_test = y8[train_index], y8[test_index]

    lda = LinearDiscriminantAnalysis()
    lda.fit(X_train, y_train)

    y_pred = lda.predict(X_test)
    # Pin the label set so every fold yields a 10x10 matrix that can be
    # added to the accumulator, even if a digit is missing from a fold.
    fold_cm = confusion_matrix(y_test, y_pred, labels=range(10))
    accumulated_cm += fold_cm

# The matrix shown is the sum over the folds (it is never divided by the
# number of folds), so the figure text says "accumulated", not "average".
plt.figure()
plt.imshow(accumulated_cm, cmap='Blues')
plt.colorbar(label='Count (summed over folds)')
plt.title("5-Fold accumulated confusion matrix (K=4)")
plt.xlabel("Predicted label")
plt.ylabel("True label")

plt.xticks(range(10))
plt.yticks(range(10))

# Write the count into every cell (x = predicted label, y = true label).
for i in range(10):
    for j in range(10):
        plt.text(j, i, accumulated_cm[i, j],
                 ha='center', va='center',
                 color='red', fontsize=8)

plt.tight_layout()
plt.savefig("average_confusion_matrix_4_KFold.pdf")
plt.show()

In [None]:
# For every K, run 5-fold cross-validated LDA on the first K PCA coordinates
# and record, per digit, how many held-out samples were classified correctly.
kf = KFold(n_splits=5, shuffle=True, random_state=42)
digit_counts = np.zeros((len(K_values), 10))

for row, K in enumerate(K_values):
    features = coords_dict[K]
    pooled_cm = np.zeros((10, 10), dtype=int)

    for train_idx, test_idx in kf.split(features):
        model = LinearDiscriminantAnalysis()
        model.fit(features[train_idx], y8[train_idx])
        pooled_cm += confusion_matrix(
            y8[test_idx],
            model.predict(features[test_idx]),
            labels=range(10),
        )

    # The diagonal holds the correctly classified count of each digit.
    digit_counts[row, :] = np.diag(pooled_cm)

# One distinct marker / line style combination per digit.
markers = ['o', 's', '^', 'v', '<', '>', 'd', 'p', 'x', '*']
linestyles = ['-', '--', '-.', ':', '-', '--', '-.', ':', '-', '--']

plt.figure()
ax = plt.gca()
# Fix the y-range before plotting.
ax.set_ylim(130, 180)

for digit, (marker, linestyle) in enumerate(zip(markers, linestyles)):
    plt.plot(
        K_values,
        digit_counts[:, digit],
        marker=marker,
        linestyle=linestyle,
        label=f"Digit {digit}",
    )

# Mean of the ten per-digit counts for each K.
avg_diagonal = digit_counts.mean(axis=1)

plt.plot(
    K_values,
    avg_diagonal,
    marker='D',
    linestyle='-',
    linewidth=2,
    color='black',
    label='mean',
)

plt.title("Average correct-class counts (diagonal) vs. K")
plt.xlabel("K (Number of PCA components)")
plt.ylabel("Correct-class count (test set)")
plt.legend(ncol=2)
plt.grid(True)
plt.savefig("average_confusion_matrix_vsK_KFold.pdf")
plt.show()
print("diagonal average: ", avg_diagonal)