### Imports

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

### Preprocessing

In [None]:
def read_image_file(path):
    """Read a binary (P5) PGM image into a 2-D ``uint8`` array.

    The header is tokenised rather than sliced at fixed byte positions, so
    any number of digits in the width/height, arbitrary whitespace between
    fields, and ``#`` comment lines are accepted.

    Args:
        path: Path of the ``.pgm`` file to read.

    Returns:
        A writable ``numpy.ndarray`` of dtype ``uint8`` and shape
        ``(num_rows, num_cols)``.

    Raises:
        ValueError: If the file is not an 8-bit P5 PGM, the header is
            truncated or malformed, or the pixel data is shorter than the
            header announces.
    """
    with open(path, "rb") as f:
        data = f.read()

    # Header fields in order: magic, width, height, maxval.
    tokens = []
    pos = 0
    end = len(data)
    while len(tokens) < 4:
        # Skip whitespace and "#" comments preceding the next token.
        while pos < end:
            ch = data[pos:pos + 1]
            if ch == b"#":
                while pos < end and data[pos:pos + 1] not in (b"\n", b"\r"):
                    pos += 1
            elif ch.isspace():
                pos += 1
            else:
                break
        if pos >= end:
            raise ValueError(f"truncated PGM header in {path!r}")
        start = pos
        while pos < end and not data[pos:pos + 1].isspace():
            pos += 1
        tokens.append(data[start:pos])

    magic, width, height, maxval = tokens
    if magic != b"P5":
        raise ValueError(f"{path!r} is not a binary (P5) PGM file")

    num_cols = int(width)
    num_rows = int(height)
    if num_cols <= 0 or num_rows <= 0:
        raise ValueError(f"invalid image size {num_cols}x{num_rows} in {path!r}")
    if not 0 < int(maxval) < 256:
        # Larger maxval means two bytes per pixel, which is not handled here.
        raise ValueError(f"only 8-bit PGM files are supported: {path!r}")

    # Exactly one whitespace byte separates maxval from the pixel raster.
    parsed = np.frombuffer(
        data, dtype=np.uint8, count=num_rows * num_cols, offset=pos + 1
    )
    # np.array copies, so the result is writable (frombuffer views are not).
    return np.array(parsed).reshape(num_rows, num_cols)

In [None]:
# Visualize the 10 images of subject 6 (directory "faces/s6").
# The index is zero-based; it is deliberately not called `id`, which would
# shadow the builtin of the same name for the rest of the session.
subject_index = 5
plt.figure(figsize=(8, 8))
for j in range(10):
    img = read_image_file(f"faces/s{subject_index + 1}/{j + 1}.pgm")
    plt.subplot(5, 5, j + 1)
    plt.axis('off')
    plt.imshow(img, cmap='gray')


In [None]:
# Visualize image 1 of each of the 40 subjects on a 10x10 grid.
# The loop variable is not called `id` so the builtin `id` is not shadowed.
j = 0
plt.figure(figsize=(40, 40))
for subject_index in range(40):
    img = read_image_file(f"faces/s{subject_index + 1}/{j + 1}.pgm")
    plt.subplot(10, 10, subject_index + 1)
    plt.axis('off')
    plt.imshow(img, cmap='gray')

In [None]:
# Generate the data matrix D (one flattened image per row) and the label
# vector y (subject number, 1..40, for each row).
image_rows = []
image_labels = []

for i in range(40):
    for j in range(10):
        img = read_image_file(f"faces/s{i+1}/{j+1}.pgm")
        img = img.reshape(-1)
        image_rows.append(img)
        image_labels.append(i + 1)

# Stack once at the end: growing D with np.vstack / y with np.append inside
# the loop copies the whole array on every iteration and needed a dummy
# zero row that had to be deleted afterwards.
D = np.array(image_rows, dtype=int)
y = np.array(image_labels, dtype=int)

# shape of Data Matrix and label vector
print(D.shape, y.shape)

In [None]:
# Split the dataset into training and testing sets by row parity.
# The mask is True at odd zero-based row indices (1, 3, 5, ...). Its length is
# taken from D so it always matches the number of rows actually loaded.
odd_rows = np.arange(D.shape[0]) % 2 != 0

# odd rows form the training set
D_train = D[odd_rows]
y_train = y[odd_rows]

# even rows form the testing set
D_test = D[~odd_rows]
y_test = y[~odd_rows]

# shapes of the resulting datasets
print(D_train.shape, y_train.shape)
print(D_test.shape, y_test.shape)


### PCA Implementation

In [None]:
class PCA():
    """Principal component analysis driven by an explained-variance threshold.

    ``alpha`` is the fraction of total variance the retained principal
    components must explain; the number of components is derived from it.
    """

    def __init__(self, alpha=0.85):
        """Store the explained-variance threshold ``alpha``."""
        self.alpha = alpha

    def set_alpha(self, alpha):
        """Replace the explained-variance threshold."""
        self.alpha = alpha

    def project(self, D, eigen_values=None, eigen_vectors=None, mean=None):
        """Project ``D`` onto the leading principal components.

        Args:
            D: Data matrix with one sample per row.
            eigen_values: Eigenvalues sorted in descending order. When this or
                ``eigen_vectors`` is ``None`` both are computed from ``D``.
            eigen_vectors: Matrix whose columns are the eigenvectors matching
                ``eigen_values``.
            mean: Feature-wise mean subtracted from ``D`` before projecting.
                Defaults to the mean of ``D`` itself; pass the training mean
                when projecting held-out data with training eigenvectors.

        Returns:
            Array of shape ``(n_samples, num_components)`` where
            ``num_components`` is chosen by ``count_num_components``.
        """
        if mean is None:
            mean = np.mean(D, axis=0)
        # center the data matrix
        D = D - mean

        if eigen_values is None or eigen_vectors is None:
            # get sorted eigen values, vectors of cov_mat(D)
            eigen_values, eigen_vectors = self.get_sorted_eig(D)

        # count PCs from alpha and keep that many leading eigenvectors
        num_components = self.count_num_components(eigen_values)
        projection_mat = eigen_vectors[:, :num_components]
        return D @ projection_mat

    def get_sorted_eig(self, D):
        """Eigen-decompose the covariance matrix of ``D``.

        Args:
            D: Data matrix with one sample per row, one feature per column.

        Returns:
            ``(eigen_values, eigen_vectors)`` with eigenvalues in descending
            order and eigenvectors as the correspondingly ordered columns.
        """
        # center the data matrix and get the (biased) covariance matrix
        D = D - np.mean(D, axis=0)
        cov_mat = np.cov(D, bias=True, rowvar=False)

        # eigh: the covariance matrix is symmetric
        eigen_values, eigen_vectors = np.linalg.eigh(cov_mat)

        # one permutation applied to both keeps values and vectors paired
        order = np.argsort(eigen_values)[::-1]
        return eigen_values[order], eigen_vectors[:, order]

    def count_num_components(self, eigen_values, alpha=None):
        """Return how many leading components explain at least ``alpha``.

        Args:
            eigen_values: Eigenvalues sorted in descending order.
            alpha: Required fraction of total variance; defaults to
                ``self.alpha``.

        Returns:
            The smallest ``n`` such that the first ``n`` eigenvalues sum to at
            least ``alpha`` of the total. If the threshold is never reached
            (e.g. ``alpha`` above 1, or rounding at ``alpha == 1``) the result
            is ``len(eigen_values)`` rather than one past the end.
        """
        if alpha is None:
            alpha = self.alpha

        eigen_values = np.asarray(eigen_values)
        with np.errstate(divide="ignore", invalid="ignore"):
            explained = np.cumsum(eigen_values) / np.sum(eigen_values)

        reached = np.flatnonzero(explained >= alpha)
        if reached.size == 0:
            return len(eigen_values)
        return int(reached[0]) + 1


In [None]:
# Eigen-decompose the covariance of the training data once, then cache both
# arrays on disk so later cells can reload them instead of recomputing.
pca = PCA()
train_eig = pca.get_sorted_eig(D_train)
eigen_values, eigen_vectors = train_eig

# np.save writes <stem>.npy for each of the two arrays
for file_stem, array in zip(("train_eigen_val", "train_eigen_vec"), train_eig):
    np.save(file_stem, array)

In [None]:
# load eigen values and eigen vectors of the training data
eigen_values = np.load("train_eigen_val.npy")
eigen_vectors = np.load("train_eigen_vec.npy")

# PCA with different alphas
alphas = [.8, .85, .9, .95]
# D_projections {alpha: projected_data}
D_projections = {}

for alpha in alphas:
    # project() derives the component count from pca.alpha itself, so no
    # separate count_num_components() call is needed here.
    pca.set_alpha(alpha)
    D_projections[alpha] = pca.project(D_train, eigen_values, eigen_vectors)
    print(f"Dimensions at alpha= {alpha} : {D_projections[alpha].shape}")


In [None]:
# Reconstruct some training samples from different numbers of components.
sample_faces = [0, 5]
num_components = [pca.count_num_components(eigen_values, alpha) for alpha in alphas]

# The mean does not depend on the sample or the component count: compute once.
train_mean = np.mean(D_train, axis=0)

# reconstruct_images {(sample index, number of components): flattened image}
reconstruct_images = {}
for k in sample_faces:
    # The eigenvectors describe centered data, so the sample must be centered
    # before projecting; the mean is added back after reconstruction.
    centered = D_train[k] - train_mean
    for i in num_components:
        basis = eigen_vectors[:, :i]
        # (x @ V) @ V.T avoids materialising the features x features matrix
        # V @ V.T on every iteration.
        img = (centered @ basis) @ basis.T + train_mean
        reconstruct_images[(k, i)] = img


In [None]:
# Visualize the reconstructed faces: one 4x4-grid panel per
# (sample, component count) entry, in dictionary order.
plt.figure(figsize=(8, 8))
for j, (key, img) in enumerate(reconstruct_images.items(), start=1):
    plt.subplot(4, 4, j)
    plt.axis('off')
    plt.title(f"sample face {key[0]+1}\n{key[1]} components", fontsize=10)
    # flattened vectors are reshaped to 112 rows x 92 columns for display
    face_2d = img.reshape(112, 92)
    plt.imshow(face_2d, cmap='gray')
plt.tight_layout()

In [None]:
# Visualize eigenfaces: the first ten columns of eigen_vectors, each shown
# as a 112x92 image on a 5x5 grid.
# NOTE(review): the panel title reads "Eigen value N" although what is drawn
# is the N-th eigenvector; wording left unchanged here.
plt.figure(figsize=(10, 10))
for i in range(10):
    panel = i + 1
    img = eigen_vectors[:, i]
    plt.subplot(5, 5, panel)
    plt.axis('off')
    plt.title(f"Eigen value {i+1}")
    eigenface = img.reshape(112, 92)
    plt.imshow(eigenface, cmap='gray')

### K-Means Clustering

In [None]:
# Seed NumPy's global RNG so the random centroid initialisation in
# KMeans.fit is reproducible.
np.random.seed(42)


class KMeans():
    """K-means clustering (Lloyd iterations) with random restarts.

    Attributes set by ``fit``:
        centers_: Centroids of the best run, shape ``(K, n_features)``.
        inertia_: Inertia of the best run (see ``inertia``).
    """

    def __init__(self, K, rand_start=20, max_iter=300):
        """Configure the clusterer.

        Args:
            K: Number of clusters.
            rand_start: Number of random restarts; the run with the lowest
                inertia wins.
            max_iter: Upper bound on assignment/update rounds per run, as a
                safety net against non-terminating runs.
        """
        self.K = K
        self.rand_start = rand_start
        self.max_iter = max_iter

    def setK(self, K):
        """Replace the number of clusters."""
        self.K = K

    def fit(self, D, y=None):
        """Cluster the rows of ``D``.

        Args:
            D: Data matrix with one sample per row.
            y: Unused; accepted for call compatibility.

        Returns:
            Boolean one-hot responsibility matrix of shape ``(n_samples, K)``
            for the restart with the lowest inertia (an empty array when
            ``rand_start`` is 0).

        Raises:
            ValueError: If ``K`` is not in ``1..n_samples``.
        """
        D = np.asarray(D)
        n_samples = D.shape[0]
        if not 0 < self.K <= n_samples:
            raise ValueError(
                f"K must be between 1 and the number of samples "
                f"({n_samples}), got {self.K}"
            )

        r_best = np.array([])
        min_inertia = np.inf

        # apply kmeans (rand_start) times
        for _ in range(self.rand_start):
            # Start from K distinct samples. Uniformly random points in a
            # hyper-cube tend to leave clusters empty, which used to produce
            # NaN centroids. The float cast keeps later mean updates from
            # being truncated when D has an integer dtype.
            seeds = np.random.choice(n_samples, size=self.K, replace=False)
            centers = D[seeds].astype(float)

            # responsibility matrix (1-hot encoding): r (# samples, K)
            r = np.zeros((n_samples, self.K), dtype=bool)

            r, centers = self.kmeans_loop(D, r, centers)

            # keep the clustering with the lowest inertia
            err = self.inertia(D, r, centers)
            if err < min_inertia:
                r_best = r
                min_inertia = err
                self.centers_ = centers
                self.inertia_ = err

        return r_best

    def kmeans_loop(self, D, r, centers):
        """Alternate assignment and centroid update until assignments settle.

        Args:
            D: Data matrix with one sample per row.
            r: Boolean responsibility matrix ``(n_samples, K)``; overwritten
                in place and also used as the initial "previous" assignment.
            centers: Initial centroids ``(K, n_features)``.

        Returns:
            ``(r, centers)`` after convergence or ``max_iter`` rounds.
        """
        D = np.asarray(D)
        centers = np.asarray(centers, dtype=float)
        cluster_ids = np.arange(self.K)
        distances = np.empty((len(D), self.K))
        r_prev = r.copy()

        for _ in range(self.max_iter):
            # distance of every sample to every center, one center at a time
            # so no (n_samples, K, n_features) temporary is allocated
            for j in range(self.K):
                distances[:, j] = np.linalg.norm(D - centers[j], axis=1)

            # assign each sample to its nearest center (first one on ties)
            r[:] = np.argmin(distances, axis=1)[:, None] == cluster_ids

            # test for convergence
            if np.array_equal(r_prev, r):
                break
            r_prev = r.copy()

            # refit the center of each cluster
            for j in range(self.K):
                members = D[r[:, j]]
                # An empty cluster keeps its previous center: the mean of no
                # points is NaN, and np.argmin would then send every sample
                # to the NaN center, so the loop might never converge.
                if len(members):
                    centers[j] = members.mean(axis=0)

        return r, centers

    def inertia(self, D, r, centers):
        """Return the sum of Euclidean distances of samples to their center.

        Args:
            D: Data matrix with one sample per row.
            r: Boolean responsibility matrix ``(n_samples, K)``.
            centers: Centroids ``(K, n_features)``.
        """
        total = 0
        for j in range(self.K):
            cluster_dis = np.linalg.norm(D[r[:, j]] - centers[j], axis=1)
            total = total + np.sum(cluster_dis)

        return total
            