### Imports

In [None]:
import numpy as np
import matplotlib.pyplot as plt

### Preprocessing

In [None]:
def read_image_file(path):
    """
    Read a binary (P5) PGM image into a 2-D NumPy array.

    The header (magic number, width, height, maximum grey value) is parsed
    token by token, skipping whitespace and ``#`` comments, so the function
    does not depend on the header having one particular byte length.

    :param path: Path of the ``.pgm`` file to read.
    :return: Array of shape (height, width); ``uint8`` when the maximum grey
             value is below 256, big-endian ``uint16`` otherwise.
    :raises ValueError: If the file is not a P5 PGM, or its header or pixel
                        data is malformed or truncated.
    """
    with open(path, "rb") as f:
        data = f.read()

    if data[:2] != b"P5":
        raise ValueError(f"{path}: not a binary (P5) PGM file")

    pos = 2
    header = []  # collects width, height, max grey value (in that order)
    while len(header) < 3:
        # Skip any whitespace and '#' comments preceding the next field.
        while pos < len(data):
            ch = data[pos:pos + 1]
            if ch.isspace():
                pos += 1
            elif ch == b"#":
                while pos < len(data) and data[pos:pos + 1] not in (b"\n", b"\r"):
                    pos += 1
            else:
                break

        start = pos
        while data[pos:pos + 1].isdigit():
            pos += 1
        if pos == start:
            raise ValueError(f"{path}: malformed PGM header")
        header.append(int(data[start:pos]))

    num_cols, num_rows, max_val = header
    if num_cols == 0 or num_rows == 0:
        raise ValueError(f"{path}: image dimensions must be positive")

    # Exactly one whitespace byte separates the header from the raster.
    if not data[pos:pos + 1].isspace():
        raise ValueError(f"{path}: malformed PGM header")
    pos += 1

    # Samples above 255 are stored as two bytes, most significant first.
    dtype = np.dtype(np.uint8) if max_val < 256 else np.dtype(">u2")
    count = num_rows * num_cols
    if len(data) - pos < count * dtype.itemsize:
        raise ValueError(f"{path}: pixel data is truncated")

    parsed = np.frombuffer(data, dtype=dtype, count=count, offset=pos)
    # np.array() copies, so the caller gets a writable array.
    return np.array(parsed).reshape(num_rows, num_cols)

In [None]:
# Visualize all 10 images of the 8th individual.
# (Named person_id so the builtin id() is not shadowed.)
person_id = 8
plt.figure(figsize=(8, 8))
for j in range(1, 11):
    img = read_image_file(f"faces/s{person_id}/{j}.pgm")
    # A 4x5 grid is allocated; the 10 images fill its first two rows.
    plt.subplot(4, 5, j)
    plt.axis('off')
    plt.imshow(img, cmap='gray')

In [None]:
# Visualize the 1st image of every individual (subjects 1..40).
plt.figure(figsize=(40, 40))
for person_id in range(1, 41):  # person_id avoids shadowing the builtin id()
    img = read_image_file(f"faces/s{person_id}/1.pgm")
    # A 10x10 grid is allocated; the 40 images fill its first four rows.
    plt.subplot(10, 10, person_id)
    plt.axis('off')
    plt.imshow(img, cmap='gray')
plt.tight_layout()
plt.show()

In [None]:
# Individuals 1-40, 10 images each, each image is 112x92 pixels.
num_individuals = 40
images_per_individual = 10

# Data matrix: number of images [rows] x number of pixels [columns].
# Preallocated once; growing it with np.vstack inside the loop would re-copy
# every previously loaded image on each iteration.
D = np.empty((num_individuals * images_per_individual, 112 * 92), dtype=float)
# Label vector: the individual each row of D belongs to.
y = np.empty(num_individuals * images_per_individual, dtype=int)

row = 0
for person_id in range(1, num_individuals + 1):
    for j in range(1, images_per_individual + 1):
        img = read_image_file(f"faces/s{person_id}/{j}.pgm")
        # Flatten the image into one row; assignment converts it to float.
        D[row] = img.reshape(-1)
        y[row] = person_id
        row += 1

# Shape of the data matrix and label vector.
print(D.shape, y.shape)

In [None]:
# Split the dataset into training and testing sets by row parity.
# The mask is derived from D itself rather than a hard-coded row count.
odd_rows = np.arange(D.shape[0]) % 2 != 0

# Odd rows form the training set.
D_train = D[odd_rows]
y_train = y[odd_rows]

# Even rows form the testing set.
D_test = D[~odd_rows]
y_test = y[~odd_rows]

# Shapes of the resulting datasets.
print(D_train.shape, y_train.shape)
print(D_test.shape, y_test.shape)

### PCA Implementation

In [None]:
class PCA:
    """
    Principal component analysis via eigen-decomposition of the covariance matrix.

    Attributes set by the constructor:
        mean          -- per-feature mean of the data passed to the constructor
        CenteredData  -- that data with the mean subtracted
        eigen_values  -- covariance eigenvalues, largest first
        eigen_vectors -- matching eigenvectors, one per column
    """

    def __init__(self, Data):
        """
        Center the data and compute the eigen-decomposition of its covariance.

        :param Data: 2-D array-like, one sample per row and one feature per column.
        :raises ValueError: If Data is not 2-dimensional.
        """
        Data = np.asarray(Data, dtype=float)
        if Data.ndim != 2:
            raise ValueError("Data must be a 2-D array of shape (samples, features)")

        # Keep the mean so that unseen data can be centered the same way.
        self.mean = np.mean(Data, axis=0)
        self.CenteredData = Data - self.mean
        self.calc_eig()

    def project(self, alpha=0.85, data=None):
        """
        Project data into the PCA space.

        :param alpha: The fraction of variance to be explained (between 0 and 1).
        :param data: Optional samples to project; they are centered with the
                     mean of the data given to the constructor. When omitted,
                     that original data is projected.
        :return: The projected data.
        :raises ValueError: If alpha is not between 0 and 1.
        """
        # Written as a chained comparison so that NaN is rejected as well.
        if not 0 <= alpha <= 1:
            raise ValueError("alpha must be between 0 and 1")

        # count PCs from alpha
        num_components = self.count_num_components(alpha)

        # get projection matrix with num_components
        projection_mat = self.eigen_vectors[:, :num_components]

        if data is None:
            centered = self.CenteredData
        else:
            centered = np.asarray(data, dtype=float) - self.mean

        # compute projected data
        return centered @ projection_mat

    def calc_eig(self):
        """
        Calculate the eigenvalues and eigenvectors of the covariance matrix.

        The results are stored, sorted by decreasing eigenvalue, in
        ``self.eigen_values`` and ``self.eigen_vectors``; nothing is returned.
        """
        # covariance matrix (features are columns)
        cov_mat = np.cov(self.CenteredData, rowvar=False)

        # eigh is used because the covariance matrix is symmetric
        self.eigen_values, self.eigen_vectors = np.linalg.eigh(cov_mat)

        # eigh returns ascending eigenvalues; reorder to descending
        sort_idx = np.argsort(self.eigen_values)[::-1]
        self.eigen_values = self.eigen_values[sort_idx]
        self.eigen_vectors = self.eigen_vectors[:, sort_idx]

    def count_num_components(self, alpha):
        """
        Count the number of components needed to explain a certain fraction of variance.

        :param alpha: The fraction of variance to be explained.
        :return: The number of components needed (at least 1, and never more
                 than the number of eigenvalues when there are any).
        """
        # Round-off can make near-zero eigenvalues slightly negative, which
        # would make the cumulative sum non-monotonic; treat them as zero.
        variances = np.clip(self.eigen_values, 0, None)
        total_var = np.sum(variances)
        if total_var == 0:
            # No variance at all: a single component already explains it.
            return 1

        cumvar = np.cumsum(variances) / total_var

        num_components = np.searchsorted(cumvar, alpha) + 1
        # Round-off may leave cumvar[-1] just below 1, pushing the index past the end.
        return int(min(num_components, len(variances)))

In [None]:
# Fit PCA on the training set and keep handles on its eigenvalues
# and eigenvectors for the cells below.
pca = PCA(D_train)
eigen_values, eigen_vectors = pca.eigen_values, pca.eigen_vectors

print(
    f"Eigen values shape: {eigen_values.shape}",
    f"Eigen vectors shape: {eigen_vectors.shape}",
    sep="\n",
)

In [None]:
# Project the training data for several explained-variance thresholds (alphas).
alphas = [.8, .85, .9, .95]

# D_projections maps each alpha to its projected data.
D_projections = {alpha: pca.project(alpha) for alpha in alphas}

for alpha in D_projections:
    print(f"Dimensions at alpha= {alpha} : {D_projections[alpha].shape}")

In [None]:
# Reconstruct some sample training faces from different numbers of components.
sample_faces = [0, 5]
num_components = [10, 50, 100, 150, 200]
# reconstruct_images maps (sample index, number of components) to the
# reconstructed, flattened image.
reconstruct_images = {}

# The eigenvectors describe mean-centered data, so each face is centered
# before projection and the mean is added back afterwards.
mean_face = np.mean(D_train, axis=0)
for k in sample_faces:
    centered_face = D_train[k] - mean_face
    for i in num_components:
        basis = eigen_vectors[:, :i]
        # (x @ V) @ V.T avoids materialising the pixels-by-pixels matrix V @ V.T.
        reconstruct_images[(k, i)] = (centered_face @ basis) @ basis.T + mean_face

In [None]:
# Visualize the faces reconstructed from the PCA space, one subplot per
# (sample, number of components) pair.
plt.figure(figsize=(10,10))
for j, (key, img) in enumerate(reconstruct_images.items(), start=1):
    plt.subplot(5,5,j)
    plt.imshow(img.reshape(112,92), cmap='gray')
    plt.title(f"sample face {key[0]+1}\n{key[1]} components",fontsize=10)
    plt.axis('off')
plt.tight_layout()

In [None]:
# Display the eigenfaces belonging to the 10 largest eigenvalues.
plt.figure(figsize=(20, 20))
for i in range(10):
    # Each eigenvector column is one flattened 112x92 image.
    eigenface = eigen_vectors[:, i].reshape(112, 92)
    plt.subplot(4, 5, i + 1)
    plt.imshow(eigenface, cmap='gray')
    plt.title(f"Eigenface {i + 1} ({eigen_values[i]:.2f})")
    plt.axis('off')
plt.tight_layout()
plt.show()