### Imports

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

### Preprocessing

In [None]:
def read_image_file(path):
    """Load a binary (P5) PGM image into a 2-D uint8 array.

    The header is tokenised instead of being read from fixed byte positions,
    so any width/height (and headers containing ``#`` comment lines) are
    handled, not only the ``P5\\n92 112\\n255\\n`` layout.

    Args:
        path: Location of the ``.pgm`` file.

    Returns:
        A writable ``numpy.ndarray`` of dtype ``uint8`` with shape
        ``(num_rows, num_cols)``.

    Raises:
        ValueError: If the header is truncated or malformed, the file is not
            an 8-bit P5 PGM, or the raster is shorter than the header
            promises.
    """
    with open(path, "rb") as f:
        data = f.read()

    # The header holds four whitespace-separated tokens:
    # magic number, width, height, maximum gray value.
    tokens = []
    pos = 0
    end = len(data)
    while len(tokens) < 4:
        # Skip whitespace and '#' comments (which run to the end of the line).
        while pos < end:
            ch = data[pos:pos + 1]
            if ch == b"#":
                newline = data.find(b"\n", pos)
                pos = end if newline == -1 else newline + 1
            elif ch.isspace():
                pos += 1
            else:
                break
        start = pos
        while pos < end and not data[pos:pos + 1].isspace():
            pos += 1
        if pos == start:
            raise ValueError(f"{path}: truncated PGM header")
        tokens.append(data[start:pos])

    if tokens[0] != b"P5":
        raise ValueError(f"{path}: not a binary (P5) PGM file")

    num_cols = int(tokens[1])
    num_rows = int(tokens[2])
    max_val = int(tokens[3])
    if not 0 < max_val < 256:
        raise ValueError(f"{path}: only 8-bit PGM images are supported")

    # Exactly one whitespace byte separates the header from the raster.
    pos += 1
    parsed = np.frombuffer(
        data, dtype=np.uint8, count=num_rows * num_cols, offset=pos
    )
    # np.frombuffer yields a read-only view over `data`; copy to make it writable.
    return np.array(parsed).reshape(num_rows, num_cols)

In [None]:
# Visualize the ten images of subject 6 (directory faces/s6).
# The index is zero-based and is deliberately not called `id`, which would
# shadow the builtin id() for every later cell of the notebook.
subject_idx = 5
plt.figure(figsize=(8, 8))
for j in range(10):
    img = read_image_file(f"faces/s{subject_idx+1}/{j+1}.pgm")
    # Subplot positions are 1-based; the images fill the first ten slots of a 5x5 grid.
    plt.subplot(5, 5, j + 1)
    plt.axis('off')
    plt.imshow(img, cmap='gray')


In [None]:
# Visualize image 1 of each of the 40 subjects.
# The loop variable is deliberately not called `id`, which would shadow the
# builtin id() for every later cell of the notebook.
j = 0
plt.figure(figsize=(40, 40))
for subject_idx in range(40):
    img = read_image_file(f"faces/s{subject_idx+1}/{j+1}.pgm")
    # Subplot positions are 1-based; the images fill the first 40 slots of a 10x10 grid.
    plt.subplot(10, 10, subject_idx + 1)
    plt.axis('off')
    plt.imshow(img, cmap='gray')

In [None]:
# Generate the data matrix D (one flattened image per row) and the label
# vector y (subject number 1..40 for each row).
# Rows are gathered in Python lists and converted once at the end: growing the
# arrays with np.vstack / np.append inside the loop re-copies everything on
# each iteration and needed a dummy first row hard-coded to 92*112 pixels.
rows = []
labels = []

for i in range(40):
    for j in range(10):
        img = read_image_file(f"faces/s{i+1}/{j+1}.pgm")
        img = img.reshape(-1)
        rows.append(img)
        labels.append(i + 1)

# dtype=int keeps the element type the matrix had when it was grown from an
# int-typed seed row (the uint8 pixels are widened).
D = np.array(rows, dtype=int)
y = np.array(labels, dtype=int)

# shape of Data Matrix and label vector
print(D.shape, y.shape)

In [None]:
# Split the dataset into training and testing sets by row parity.
# The mask length follows D instead of a hard-coded 400, so the split still
# works if the number of loaded images changes.
odd_rows = np.arange(len(D)) % 2 != 0

# odd-indexed rows form the training set
D_train = D[odd_rows]
y_train = y[odd_rows]

# even-indexed rows form the testing set
D_test = D[~odd_rows]
y_test = y[~odd_rows]

# shapes of the resulting datasets
print(D_train.shape, y_train.shape)
print(D_test.shape, y_test.shape)


### PCA Implementation

In [None]:
class PCA:
    """Principal component analysis driven by an explained-variance threshold.

    Attributes:
        alpha: Fraction of the total variance that the retained components
            must explain (0.85 by default).
    """

    def __init__(self, alpha=0.85):
        self.alpha = alpha

    def set_alpha(self, alpha):
        """Replace the explained-variance threshold."""
        self.alpha = alpha

    def project(self, D, eigen_values=None, eigen_vectors=None):
        """Project the rows of ``D`` onto the leading principal components.

        Args:
            D: Data matrix with one sample per row.
            eigen_values: Eigenvalues sorted in descending order. Computed
                from ``D`` when this or ``eigen_vectors`` is ``None``.
            eigen_vectors: Matrix whose columns are the eigenvectors matching
                ``eigen_values``.

        Returns:
            Array with one row per sample and one column per retained
            component.
        """
        # Center the data matrix.
        # NOTE: D is centered with its own column means even when the
        # eigenvectors passed in were computed from a different data set.
        D = D - np.mean(D, axis=0)

        if eigen_values is None or eigen_vectors is None:
            # Get sorted eigenvalues / eigenvectors of cov_mat(D).
            eigen_values, eigen_vectors = self.get_sorted_eig(D)

        # Number of principal components required by alpha.
        num_components = self.count_num_components(eigen_values)

        # Projection matrix made of the first num_components eigenvectors.
        projection_mat = eigen_vectors[:, :num_components]
        return np.dot(D, projection_mat)

    def get_sorted_eig(self, D):
        """Eigendecompose the covariance matrix of ``D``.

        Args:
            D: Data matrix with one sample per row.

        Returns:
            Tuple ``(eigen_values, eigen_vectors)`` with the eigenvalues in
            descending order and the eigenvector columns reordered to match.
        """
        # Biased (divide-by-N) covariance between the columns of D.
        cov_mat = np.cov(np.transpose(D), bias=True)

        # eigh is appropriate because a covariance matrix is symmetric.
        eigen_values, eigen_vectors = np.linalg.eigh(cov_mat)

        # One permutation is applied to both outputs so they stay paired.
        order = np.argsort(eigen_values)[::-1]
        return eigen_values[order], eigen_vectors[:, order]

    def count_num_components(self, eigen_values):
        """Count the leading components needed to explain ``alpha`` variance.

        Args:
            eigen_values: Eigenvalues sorted in descending order.

        Returns:
            The smallest ``n`` such that the first ``n`` eigenvalues sum to at
            least ``alpha`` times the total. If the threshold is never
            reached, the number of eigenvalues is returned (never more), and
            ``0`` is returned for an empty input.
        """
        eigen_values = np.asarray(eigen_values)
        if eigen_values.size == 0:
            return 0

        # Running fraction of the total variance explained by the first k values.
        explained = np.cumsum(eigen_values) / np.sum(eigen_values)
        reached = explained >= self.alpha
        if not reached.any():
            # Threshold unreachable (e.g. alpha > 1): keep every component.
            return int(eigen_values.size)
        # argmax on a boolean array gives the index of the first True.
        return int(np.argmax(reached)) + 1


In [None]:
# Eigendecompose the covariance matrix of the training set.
# PCA() is constructed with its default alpha; get_sorted_eig returns the
# eigenvalues in descending order with the eigenvector columns ordered to match.
pca = PCA()
eigen_values, eigen_vectors = pca.get_sorted_eig(D_train)

# Save the eigenvalues and eigenvectors to disk so they can be reloaded
# without recomputing the decomposition. np.save appends ".npy" to each name.
np.save("train_eigen_val", eigen_values)
np.save("train_eigen_vec", eigen_vectors)

In [None]:
# Load the eigenvalues and eigenvectors of the training data.
eigen_values = np.load("train_eigen_val.npy")
eigen_vectors = np.load("train_eigen_vec.npy")

# Run PCA with different alphas, reusing the same decomposition each time.
alphas = [.8, .85, .9, .95]
projected_Ds = {}

for alpha in alphas:
    pca.set_alpha(alpha)
    # project() derives the number of components from alpha itself, so no
    # separate count_num_components call is needed here.
    projected_Ds[alpha] = pca.project(D_train, eigen_values, eigen_vectors)
    print(f"Dimensions at alpha= {alpha} : {projected_Ds[alpha].shape}")

In [None]:
# Eigenfaces: draw the first ten columns of eigen_vectors, each reshaped to a
# 112x92 grayscale image, in the first ten slots of a 5x5 subplot grid.
plt.figure(figsize=(10, 10))
for i in range(10):
    img = eigen_vectors[:, i]
    plt.subplot(5, 5, i + 1)
    plt.title(f"Eigen value {i+1}")
    plt.axis('off')
    plt.imshow(img.reshape(112, 92), cmap='gray')