## **Data Load**

In [1]:
# import all library

import scipy.io as sio
import numpy as np
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA
from sklearn.decomposition import IncrementalPCA
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score
import timeit
import tracemalloc

In [2]:
mat_content = sio.loadmat('assets/face.mat')
# mat_content # Let's see the content...

face_data = mat_content['X']
face_labels = mat_content['l']

x_train, x_test, y_train, y_test = [], [], [], []

n_classes = 52
images_per_class = 10

for i in range(n_classes):
    start_idx = i * images_per_class
    end_idx = start_idx + images_per_class

    x_train.append(face_data[:, start_idx:start_idx+8])
    x_test.append(face_data[:, start_idx+8:end_idx])

    y_train.append(face_labels[:, start_idx:start_idx+8])
    y_test.append(face_labels[:, start_idx+8:end_idx])

x_train, x_test = np.hstack(x_train), np.hstack(x_test)
y_train, y_test = np.hstack(y_train), np.hstack(y_test)

x_train = x_train.astype(np.int64)
x_test = x_test.astype(np.int64)

# Output the shapes of the training and testing sets
print(f"Training set shape: {x_train.shape}")
print(f"Test set shape: {x_test.shape}")

Training set shape: (2576, 416)
Test set shape: (2576, 104)


## **Q2. Incremental PCA**

creating subdatasets

In [3]:
x_sub1, x_sub2, x_sub3, x_sub4 = [], [], [], []
y_sub1, y_sub2, y_sub3, y_sub4 = [], [], [], []

n_classes = 52
images_per_class = 8

for i in range(n_classes):
  x_sub1.append(x_train[:,i*images_per_class :i*images_per_class + 2])
  x_sub2.append(x_train[:,i*images_per_class+2 :i*images_per_class + 4])
  x_sub3.append(x_train[:,i*images_per_class+4 :i*images_per_class + 6])
  x_sub4.append(x_train[:,i*images_per_class+6 :i*images_per_class + 8])

  y_sub1.append(y_train[:,i*images_per_class :i*images_per_class + 2])
  y_sub2.append(y_train[:,i*images_per_class+2 :i*images_per_class + 4])
  y_sub3.append(y_train[:,i*images_per_class+4 :i*images_per_class + 6])
  y_sub4.append(y_train[:,i*images_per_class+6 :i*images_per_class + 8])

x_sub1, x_sub2, x_sub3, x_sub4 = np.hstack(x_sub1), np.hstack(x_sub2), np.hstack(x_sub3), np.hstack(x_sub4)
y_sub1, y_sub2, y_sub3, y_sub4 = np.hstack(y_sub1), np.hstack(y_sub2), np.hstack(y_sub3), np.hstack(y_sub4)

x_subs = [x_sub1, x_sub2, x_sub3, x_sub4]
y_subs = [y_sub1, y_sub2, y_sub3, y_sub4]

1. Incremental learning

In [4]:
def run_ipca():
  ipca = IncrementalPCA(n_components = 5)
  
  for subset in x_subs:
    ipca.partial_fit(subset.T)
  
  return ipca 

def ipca_nn(ipca):
  # Transform training subsets
  W_train_sub1 = ipca.transform(x_sub1.T).T
  W_train_sub2 = ipca.transform(x_sub2.T).T
  W_train_sub3 = ipca.transform(x_sub3.T).T
  W_train_sub4 = ipca.transform(x_sub4.T).T

  # Optionally combine transformed training data if needed
  W_train = np.vstack((W_train_sub1.T, W_train_sub2.T, W_train_sub3.T, W_train_sub4.T))

  # Fit a classifier (e.g., k-NN) on the transformed training data
  nn = KNeighborsClassifier(n_neighbors=1, metric='manhattan')
  # You may need to combine y_train corresponding to each subset for training
  y_train_combined = np.concatenate([y_sub1.T, y_sub2.T, y_sub3.T, y_sub4.T])  # Combine true labels

  nn.fit(W_train, y_train_combined.ravel())

  # Transform the test data
  W_test = ipca.transform(x_test.T).T

  # Predict using the classifier
  y_pred = nn.predict(W_test.T)

  # Calculate accuracy
  accuracy = accuracy_score(y_test.T, y_pred)
  print(f"Accuracy: {accuracy * 100:.2f}%")


tracemalloc.start() 
start_time = timeit.default_timer()  # Start timing

run_ipca()

current, peak = tracemalloc.get_traced_memory()
tracemalloc.stop()
end_time = timeit.default_timer()  # Stop timing

print(f"Current memory usage: {current / 10**6} MB")
print(f"Peak memory usage: {peak / 10**6} MB")
print(f"Execution Time: {end_time - start_time:.8f} seconds")
ipca = run_ipca()
ipca_nn(ipca)

Current memory usage: 0.035434 MB
Peak memory usage: 11.59286 MB
Execution Time: 0.13093221 seconds
Accuracy: 38.46%


2. Batch PCA

In [5]:
def batch_pca_nn():
  # Step 1: Get PCA  result from training data & project all training data
  batch_pca = PCA(n_components = 5)
  batch_pca.fit(x_train.T)
  return batch_pca

def batch_nn(batch_pca):
  W_train = batch_pca.transform(x_train.T).T

  # Step 2: Project test data
  W_test = batch_pca.transform(x_test.T).T

  # Step 3: Get result using NN classifier
  nn = KNeighborsClassifier(n_neighbors=1, metric='manhattan')
  nn.fit(W_train.T, y_train.reshape(-1))
  y_pred = nn.predict(W_test.T)

  # Step 4: Calculate accuracy
  accuracy = accuracy_score(y_test.T, y_pred)
  print(f"Accuracy: {accuracy * 100:.2f}%")


tracemalloc.start() 
start_time = timeit.default_timer()  # Start timing

batch_pca_nn()

current, peak = tracemalloc.get_traced_memory()
tracemalloc.stop()
end_time = timeit.default_timer()  # Stop timing

print(f"Current memory usage: {current / 10**6} MB")
print(f"Peak memory usage: {peak / 10**6} MB")
print(f"Execution Time: {end_time - start_time:.8f} seconds")

batch_pca = batch_pca_nn()
batch_nn(batch_pca)

Current memory usage: 0.016898 MB
Peak memory usage: 9.616852 MB
Execution Time: 0.03387150 seconds
Accuracy: 39.42%


3. train with only first subset

In [6]:
def sub1_pca_nn():
  # Step 1: Get PCA  result from training data & project all training data
  sub1_pca = PCA(n_components = 5)
  sub1_pca.fit(x_sub1.T)
  return sub1_pca

def sub1_nn(sub1_pca):
  W_train = sub1_pca.transform(x_sub1.T).T

  # Step 2: Project test data
  W_test = sub1_pca.transform(x_test.T).T

  # Step 3: Get result using NN classifier
  nn = KNeighborsClassifier(n_neighbors=1, metric='manhattan')
  nn.fit(W_train.T, y_sub1.reshape(-1))
  y_pred = nn.predict(W_test.T)

  # Step 4: Calculate accuracy
  accuracy = accuracy_score(y_test.T, y_pred)
  print(f"Accuracy: {accuracy * 100:.2f}%")


tracemalloc.start() 
start_time = timeit.default_timer()  # Start timing

sub1_pca_nn()

current, peak = tracemalloc.get_traced_memory()
tracemalloc.stop()
end_time = timeit.default_timer()  # Stop timing

print(f"Current memory usage: {current / 10**6} MB")
print(f"Peak memory usage: {peak / 10**6} MB")
print(f"Execution Time: {end_time - start_time:.8f} seconds")

sub1_pca = sub1_pca_nn()
sub1_nn(sub1_pca)

Current memory usage: 0.003042 MB
Peak memory usage: 3.136444 MB
Execution Time: 0.01315092 seconds
Accuracy: 17.31%
