# Face Alignment - Data Preprocessing

- Add the project's root directory (two levels up) to the Python path so the project's modules can be imported, even though they aren't in the current working directory:

In [None]:
import sys
import os

# Make the directory two levels above the working directory importable, so
# that the project's own packages can be imported from this notebook.
project_root = os.path.abspath(os.path.join(os.pardir, os.pardir))
if sys.path.count(project_root) == 0:
    # Only append once, so re-running the cell does not grow sys.path.
    sys.path.append(project_root)

- Import the required libraries and modules, as well as our utility functions:

In [None]:
import cv2
import numpy as np
import matplotlib.pyplot as plt

from src.utils import load_config, get_project_root, confirm_checksum, save_as_npz

- Load the config using the utility function. Get paths to relevant folders/files needed to save and retrieve files:

In [None]:
# Read the project configuration; every path and checksum below comes from
# its data -> task2 section.
config = load_config()
task2_cfg = config['data']['task2']

# Expected checksums for the raw test/train archives.
raw_test_data_checksum = task2_cfg['raw']['test_checksum']
raw_train_data_checksum = task2_cfg['raw']['train_checksum']

# Raw archive locations, anchored at the project root.
train_data_path = os.path.join(get_project_root(), task2_cfg['raw']['train'])
test_data_path = os.path.join(get_project_root(), task2_cfg['raw']['test'])

# Output locations for the processed data, exactly as stored in the config
# (they are joined to the project root later, when the files are saved).
processed_train_data_path = task2_cfg['processed']['train']
processed_test_data_path = task2_cfg['processed']['test']

# Same raw paths with '/' swapped for the platform separator; these are the
# paths handed to the checksum check.
# NOTE(review): the paths above already start with the project root, so the
# extra join is presumably a no-op (os.path.join drops earlier parts when a
# later part is absolute) - confirm get_project_root() returns an absolute path.
raw_test_data = os.path.join(get_project_root(), test_data_path.replace('/', os.sep))
raw_train_data = os.path.join(get_project_root(), train_data_path.replace('/', os.sep))

- Use the provided function to check whether the contents of the files loaded are valid or not, by checking them against a provided checksum value:

In [None]:
# Verify both raw archives against their expected checksums: test first,
# then train (the train check is skipped if the test check fails). This cell
# itself prints nothing when a check does not pass.
checksum_pairs = (
    (raw_test_data, raw_test_data_checksum),
    (raw_train_data, raw_train_data_checksum),
)
if all(confirm_checksum(path, expected) for path, expected in checksum_pairs):
    print("Training and Testing Data Loaded Correctly!")

- Load the images and the landmark data. We can load the "npz" file by using numpy's load function:

In [None]:
# Open the raw train/test archives with NumPy.
# NOTE(review): allow_pickle=True lets np.load unpickle arbitrary objects,
# which is unsafe for untrusted files - only run this on archives whose
# checksum has been verified.
npz_load_options = {'allow_pickle': True}
train_data = np.load(train_data_path, **npz_load_options)
test_data = np.load(test_data_path, **npz_load_options)

In [None]:
# Pull the image array and its matching landmark array out of the training
# archive (stored under the 'images' and 'points' keys).
train_images, train_points = train_data['images'], train_data['points']

- Print some key information about the shape of the training data:

In [None]:
# Report the dimensions of the training images and landmarks, one per line.
print(
    f"Train Images: {train_images.shape}",
    f"Train Points: {train_points.shape}",
    sep="\n",
)

In [None]:
# Pull the image array out of the test archive (stored under the 'images'
# key); no landmark array is read for the test set here.
test_images = test_data['images']

- Print some key information about the shape of the test data:

In [None]:
# Report the dimensions of the test image array.
print(f"Test Images: {test_images.shape}")

- Plot a sample of 3 images with their landmarks. This helps to visualise the training images and where the landmarks are located on each face:

In [None]:
# Show the first three training images side by side, each overlaid with its
# landmarks drawn as red dots.
plt.figure(figsize=(12, 4))
for panel in (1, 2, 3):
    sample = panel - 1  # subplot positions are 1-based, array indices 0-based
    plt.subplot(1, 3, panel)
    plt.imshow(train_images[sample], cmap='gray')
    for landmark in train_points[sample]:
        # Each landmark is plotted as (first value, second value) = (x, y).
        plt.plot(landmark[0], landmark[1], 'ro')
    plt.axis('off')

plt.suptitle('Sample Images with Landmarks')
plt.tight_layout()
plt.show()

- This function passes each image through the pre-processing steps: it converts 3-channel images to grayscale, resizes them to the target size and normalises the pixel values by dividing by 255:

In [None]:
def pre_process_images(images, target_size=(96, 96)):
    """Convert images to grayscale, resize them and scale pixel values.

    Args:
        images: Iterable of image arrays. Arrays with three dimensions and
            three channels are converted to grayscale using OpenCV's
            RGB-to-gray conversion; all other arrays are passed to the
            resize step unchanged.
        target_size: Output size as ``(height, width)``, i.e. the same
            ordering as ``ndarray.shape`` and as the ``target_shape``
            argument of ``resize_points``.

    Returns:
        A NumPy array holding one processed image per input image, with
        pixel values divided by 255.0 (assumes 8-bit input so that the
        result lies in [0, 1] - TODO confirm against the dataset).
    """
    # cv2.resize expects its size argument as (width, height), whereas
    # target_size is (height, width). Swap the two so that non-square
    # targets produce images that agree with the rescaled landmarks.
    dsize = (target_size[1], target_size[0])

    processed = []
    for image in images:
        if image.ndim == 3 and image.shape[2] == 3:
            image = cv2.cvtColor(image, cv2.COLOR_RGB2GRAY)

        resized = cv2.resize(image, dsize)
        processed.append(resized / 255.0)

    return np.array(processed)

- This function will resize the landmark coordinates to match the size of the new image:

In [None]:
def resize_points(points, original_shape, target_shape):
    """Rescale landmark coordinates to match a resized image.

    Args:
        points: Array-like of landmarks whose last axis holds ``(x, y)``,
            e.g. ``(num_images, num_points, 2)`` for a batch or
            ``(num_points, 2)`` for a single image.
        original_shape: ``(height, width)`` of the images the landmarks
            were annotated on.
        target_shape: ``(height, width)`` of the resized images.

    Returns:
        A new array of rescaled landmarks; ``points`` is left unmodified.
        Floating-point input keeps its dtype, any other dtype is converted
        to float64 so the scaled values are not truncated or rejected.
    """
    scale_x = target_shape[1] / original_shape[1]
    scale_y = target_shape[0] / original_shape[0]

    scaled_points = np.array(points, copy=True)
    if not np.issubdtype(scaled_points.dtype, np.floating):
        # In-place multiplication of an integer array by a float raises a
        # casting error, so switch to a float array first.
        scaled_points = scaled_points.astype(np.float64)

    # Index the last axis with `...` so both batched and single-image
    # landmark arrays are supported.
    scaled_points[..., 0] *= scale_x
    scaled_points[..., 1] *= scale_y

    return scaled_points

- Preprocess the training data and test data, making sure it is consistent throughout both datasets.

In [None]:
# Size every image is resized to.
target_shape = (96, 96)
# (height, width) of the first training image, used as the original size when
# rescaling the landmarks.
# NOTE(review): assumes every image shares the first image's size - confirm.
shape = train_images[0].shape[:2]

In [None]:
# Run the same image pipeline over both datasets, then rescale the training
# landmarks from the original image size to the new one.
processed_train = pre_process_images(train_images, target_size=target_shape)
processed_test = pre_process_images(test_images, target_size=target_shape)
resized_train_points = resize_points(
    train_points,
    original_shape=shape,
    target_shape=target_shape,
)

- Visualise how the new preprocessed images look with the resized landmarks:

In [None]:
# Show the first three preprocessed training images side by side, each
# overlaid with its rescaled landmarks drawn as red dots.
plt.figure(figsize=(12, 4))
for panel in (1, 2, 3):
    sample = panel - 1  # subplot positions are 1-based, array indices 0-based
    plt.subplot(1, 3, panel)
    plt.imshow(processed_train[sample], cmap='gray')

    for landmark in resized_train_points[sample]:
        # Each landmark is plotted as (first value, second value) = (x, y).
        plt.plot(landmark[0], landmark[1], 'ro')
    plt.axis('off')

plt.suptitle('Preprocessed Images with Landmarks')
plt.tight_layout()
plt.show()

- Save the pre-processed data so it can be loaded to extract features. We are saving it as a "npz" file type to keep it consistent throughout the pipeline:

In [None]:
# Build the output file paths: project root + configured processed-data
# directory (with '/' swapped for the platform separator) + file name.
processed_train_data = os.path.join(
    get_project_root(),
    processed_train_data_path.replace('/', os.sep),
    "processed_face_alignment_train_images.npz",
)
processed_test_data = os.path.join(
    get_project_root(),
    processed_test_data_path.replace('/', os.sep),
    "processed_face_alignment_test_images.npz",
)

# The training file stores images together with their rescaled landmarks;
# the test file stores images only.
train_arrays = {'images': processed_train, 'points': resized_train_points}
test_arrays = {'images': processed_test}
save_as_npz(processed_train_data, **train_arrays)
save_as_npz(processed_test_data, **test_arrays)