In [1]:
import os
import numpy as np
import pandas as pd
import cv2
from sklearn.model_selection import train_test_split
import sklearn.linear_model as lm
from skimage import filters, feature
import matplotlib.pyplot as plt
%matplotlib inline
from skimage.io import imread, imshow

In [2]:
# Config
celeba_directory = "/Users/macbookpro/UCL - MSc Integrated Machine Learning Systems/Y1/Applied Machine Learning I/Final Assignment/AMLS_20-21_SN17024244/Datasets/dataset_AMLS_20-21/celeba/"
celeba_images_subdirectory = celeba_directory + "img/"
extension = ".jpg"

In [3]:
# Count number of .jpg files available
def count_n_images(path, extension):
    
    list_dir = []
    list_dir = os.listdir(path)
    count = 0
    for file in list_dir:
        if file.endswith(extension):
            count += 1
    
    return count

In [4]:
# Load labels of our dataset
def load_images_label_csv():
    
    FILE_NAME = "labels.csv"
    SEPARATOR = "\t"
    USECOLS = ["img_name", "gender", "smiling"]
    
    labels_file = pd.read_csv(celeba_directory + FILE_NAME, sep=SEPARATOR, usecols=USECOLS)
    
    return labels_file

In [5]:
# Load array of with n_rows = n_images and each row is array of pixels
def load_images_array(image_size=(218, 178, 3)):
    
    n = count_n_images(celeba_images_subdirectory, extension)
    m,r,q = image_size
    
    X = np.empty((n,m,r,q), dtype=np.uint8) # OpenCV works with this data format
    files = os.listdir(celeba_images_subdirectory)
    for x, jpg in enumerate(files):
        image = cv2.imread(os.path.join(celeba_images_subdirectory, jpg))
        X[x] = image
        
    return X

In [14]:
def load_images_flattened_vectors(image_size=(218, 178, 3), image_processing=None):
    
    X = load_images_array(image_size)
    
    if image_processing == "CannyEdgeAlgorithm":
        gray_converter = lambda t: cv2.cvtColor(t, cv2.COLOR_BGR2GRAY)
        X = np.array([gray_converter(img) for img in X])
        X = np.array([feature.canny(gray_img) for gray_img in X])
        Xt = X.reshape((5000, 218*178))
    elif image_processing == "MeanPixels":
        X = np.mean(X, axis=3)
        Xt = X.reshape((5000, 218*178))
    else:
        Xt = X.reshape((5000, 218*178*3))
    
    return Xt

In [7]:
load_images_array().shape

(5000, 218, 178, 3)

In [8]:
# In task A1 we are interesting in guessing Male vs Female
y = load_images_label_csv()["gender"].values

### Method 1 (no preprocessing)

In [9]:
Xt_1 = load_images_flattened_vectors(image_size=(218, 178, 3))
print(Xt_1.shape)

(5000, 116412)


In [10]:
# Splitting the training and test sets
xtrain1, xtest1, ytrain1, ytest1 = train_test_split(Xt_1, y, test_size=0.4, shuffle=True, random_state=42)

In [11]:
# Using linear model with Stochastic Gradient Descent
# Our loss function could also be set as "log" for logistic regression. Here we apply Support Vector Machine
classifier = lm.SGDClassifier(loss="hinge")
classifier.fit(xtrain1, ytrain1)

SGDClassifier()

In [12]:
# We can see SVM performs poorly on flattened vector of pixels
classifier.score(xtest1, ytest1)

0.5015

### Method 2 (Canny Edge Detection)
Now we are going to compare this performance with SVM applied on a preprocessed (Canny Edge Detection) vector.

In [15]:
Xt_2 = load_images_flattened_vectors(image_size=(218, 178, 3), image_processing="CannyEdgeAlgorithm")
print(Xt_2.shape)

(5000, 38804)


In [16]:
# Splitting the training and test sets
xtrain2, xtest2, ytrain2, ytest2 = train_test_split(Xt_2, y, test_size=0.4, shuffle=True, random_state=42)

In [17]:
# Using linear model with Stochastic Gradient Descent
# Our loss function could also be set as "log" for logistic regression. Here we apply Support Vector Machine
classifier = lm.SGDClassifier(loss="hinge")
classifier.fit(xtrain2, ytrain2)

SGDClassifier()

In [18]:
# We can see SVM performs poorly on "Canny-edited" flattened vector of pixels
classifier.score(xtest2, ytest2)

0.497

### Method 3 (Mean of pixel values)
Now we are going to compare this performance with SVM applied on a preprocessed (Mean Pixels) vector.

In [19]:
Xt_3 = load_images_flattened_vectors(image_size=(218, 178, 3), image_processing="CannyEdgeAlgorithm")
print(Xt_3.shape)

(5000, 38804)


In [22]:
# Splitting the training and test sets
xtrain3, xtest3, ytrain3, ytest3 = train_test_split(Xt_3, y, test_size=0.4, shuffle=True, random_state=42)

In [23]:
# Using linear model with Stochastic Gradient Descent
# Our loss function could also be set as "log" for logistic regression. Here we apply Support Vector Machine
classifier = lm.SGDClassifier(loss="hinge")
classifier.fit(xtrain3, ytrain3)

SGDClassifier()

In [24]:
# We can see SVM performs poorly on "Mean-of-RGB" flattened vector of pixels
classifier.score(xtest3, ytest3)

0.503