In [1]:
# Imports
# Please refer to requirements.txt for a full list of all libraries and their versions used in this project.

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import cv2
from skimage.filters.rank import entropy
from skimage.morphology import disk
from skimage.filters import gaussian, sobel
from skimage.feature import canny

import os
from zipfile import ZipFile
import time
from datetime import datetime

from sklearn.utils import shuffle
from sklearn.model_selection import train_test_split

In [2]:
# Directory the notebook is running from.
ROOT_DIR = os.getcwd()
# NOTE: despite its name, `child_dir` is the PARENT directory of ROOT_DIR (one level up).
child_dir = os.path.abspath(os.path.join(ROOT_DIR, os.pardir))
print(ROOT_DIR)

D:\Code\deep-age-detec\Age_Classification_with_Faces\code


## Dataset Preparation

In [7]:
# Unzipping the dataset file combined_faces.zip

# Building the path with os.path.join so that the separators are correct on every operating system.
combined_faces_zip_path = os.path.join(ROOT_DIR, "ZIPPED_DATASETS", "combined_faces.zip")

# Extracting explicitly into ROOT_DIR (rather than whatever the current working directory happens to be),
# because the cells below look for the combined_faces folder inside ROOT_DIR.
with ZipFile(combined_faces_zip_path, 'r') as myzip:
    myzip.extractall(ROOT_DIR)

print('Done unzipping combined_faces.zip')

Done unzipping combined_faces.zip


In [16]:
# Accessing all image file names.

combined_faces_path = os.path.join(ROOT_DIR, "combined_faces")

# os.listdir returns entries in arbitrary order; sorting them keeps the seeded shuffle and
# train/test split further below reproducible across machines and file systems.
combined_faces_image_names = sorted(os.listdir(combined_faces_path))

['100_1.jpg',
 '100_10.jpg',
 '100_11.jpg',
 '100_12.jpg',
 '100_13.jpg',
 '100_2.jpg',
 '100_3.jpg',
 '100_4.jpg',
 '100_5.jpg',
 '100_6.jpg',
 '100_7.jpg',
 '100_8.jpg',
 '100_9.jpg',
 '101_1.jpg',
 '101_2.jpg',
 '101_3.jpg',
 '103_1.jpg',
 '105_1.jpg',
 '105_2.jpg',
 '105_3.jpg',
 '105_4.jpg',
 '105_5.jpg',
 '10_1.jpg',
 '10_10.jpg',
 '10_100.jpg',
 '10_101.jpg',
 '10_102.jpg',
 '10_103.jpg',
 '10_104.jpg',
 '10_105.jpg',
 '10_106.jpg',
 '10_107.jpg',
 '10_108.jpg',
 '10_109.jpg',
 '10_11.jpg',
 '10_110.jpg',
 '10_111.jpg',
 '10_112.jpg',
 '10_113.jpg',
 '10_114.jpg',
 '10_115.jpg',
 '10_116.jpg',
 '10_117.jpg',
 '10_118.jpg',
 '10_119.jpg',
 '10_12.jpg',
 '10_120.jpg',
 '10_121.jpg',
 '10_122.jpg',
 '10_123.jpg',
 '10_124.jpg',
 '10_125.jpg',
 '10_126.jpg',
 '10_127.jpg',
 '10_128.jpg',
 '10_129.jpg',
 '10_13.jpg',
 '10_130.jpg',
 '10_131.jpg',
 '10_132.jpg',
 '10_133.jpg',
 '10_134.jpg',
 '10_135.jpg',
 '10_136.jpg',
 '10_137.jpg',
 '10_138.jpg',
 '10_139.jpg',
 '10_14.jpg',
 '10_

In [17]:
# Number of entries found in the combined_faces folder.
len(combined_faces_image_names)

33486

### Train and Test Splitting

Before performing any form of classification or feature extraction on the images, it is necessary to **split the *combined_faces* dataset into training and testing datasets**. Feature extraction will then be performed separately on the training and testing datasets.

In [18]:
# Loading the summary of class labels and their age-ranges from the input_output folder.
combined_classes = pd.read_csv(f"{child_dir}/input_output/combined_faces_classes_summary.csv")
combined_classes

Unnamed: 0,Class label,Age-ranges (classes),No. of images,Class balance (%)
0,0,1 - 2,3192,9.53
1,1,3 - 9,2816,8.41
2,2,10 - 20,3136,9.37
3,3,21 - 25,3474,10.37
4,4,26 - 27,3217,9.61
5,5,28 - 31,3063,9.15
6,6,32 - 36,3086,9.22
7,7,37 - 45,3207,9.58
8,8,46 - 54,2802,8.37
9,9,55 - 65,2796,8.35


In [19]:
# Defining a function to return the class labels corresponding to the age-ranges shown above.

def class_labels(age):
    """Return the class label (0-9) of the age-range containing `age`, or 10 if none does.

    The inclusive age-ranges, in label order, are:
    1-2, 3-9, 10-20, 21-25, 26-27, 28-31, 32-36, 37-45, 46-54 and 55-65.
    """
    # Inclusive (lowest, highest) bounds; the position in the tuple is the class label.
    age_ranges = (
        (1, 2),
        (3, 9),
        (10, 20),
        (21, 25),
        (26, 27),
        (28, 31),
        (32, 36),
        (37, 45),
        (46, 54),
        (55, 65),
    )

    for label, (lowest, highest) in enumerate(age_ranges):
        if lowest <= age <= highest:
            return label

    # Anything outside all of the ranges above (e.g. 0 or ages over 65) gets the catch-all label.
    return 10

In [20]:
# Building the master dataframe: one row per image with its filename, age and class label.

# The age is the part of the filename before the first underscore (e.g. '100_1.jpg' -> 100).
master_df = pd.DataFrame({'filename': combined_faces_image_names})
master_df['age'] = master_df['filename'].map(lambda name: np.uint8(name.split("_")[0]))
master_df['target'] = master_df['age'].map(class_labels)

master_df.head()

Unnamed: 0,filename,age,target
0,100_1.jpg,100,10
1,100_10.jpg,100,10
2,100_11.jpg,100,10
3,100_12.jpg,100,10
4,100_13.jpg,100,10


In [21]:
# Shuffling the rows of master_df (with a fixed random_state) so that they are no longer ordered by filename,
# then renumbering the index from 0.

master_df = shuffle(master_df, random_state=42)
master_df = master_df.reset_index(drop=True)
master_df.head()

Unnamed: 0,filename,age,target
0,26_223.jpg,26,4
1,47_170.jpg,47,8
2,85_250.jpg,85,10
3,61_196.jpg,61,9
4,25_393.jpg,25,3


In [22]:
# X holds the filename and age columns of master_df and y holds the target class labels;
# both are split into train and test datasets below.

X = master_df.loc[:, ['filename', 'age']]
y = master_df.loc[:, 'target']

In [23]:
# Splitting into training (70%) and testing (30%) datasets, stratified on y so that
# both datasets keep the same class proportions.

X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    stratify=y,
    random_state=42,
)

In [24]:
# Rows and columns of the training dataset.
X_train.shape

(23440, 2)

In [25]:
# Previewing the first rows of the training dataset.
X_train.head()

Unnamed: 0,filename,age
25853,10_24.jpg,10
29314,4_115.jpg,4
12790,6_58.jpg,6
28224,40_23.jpg,40
31700,22_506.jpg,22


In [26]:
# Rows and columns of the testing dataset.
X_test.shape

(10046, 2)

In [27]:
# Previewing the first rows of the testing dataset.
X_test.head()

Unnamed: 0,filename,age
6553,3_328.jpg,3
10831,43_37.jpg,43
14174,38_207.jpg,38
5438,32_312.jpg,32
27785,7_244.jpg,7


In [28]:
# Proportion of each class label in the training targets.
y_train.value_counts(normalize=True)

3     0.103754
4     0.096075
7     0.095776
0     0.095307
2     0.093643
6     0.092150
5     0.091468
1     0.084087
8     0.083703
9     0.083490
10    0.080546
Name: target, dtype: float64

In [29]:
# Proportion of each class label in the testing targets (for comparison with the training targets).
y_test.value_counts(normalize=True)

3     0.103723
4     0.096058
7     0.095760
0     0.095361
2     0.093669
6     0.092176
5     0.091479
1     0.084113
8     0.083615
9     0.083516
10    0.080530
Name: target, dtype: float64

In [30]:
# Creating copies of X (both train and test) with the matching target labels added as a column.
# After the filenames are turned into filepaths below, these dataframes will be used in the deep learning
# models later to create dataset input pipelines using the TensorFlow.data.Dataset API.

# DataFrame.assign returns a new dataframe, so X_train and X_test themselves are left untouched.
temp_X_train = X_train.assign(target=y_train)
temp_X_test = X_test.assign(target=y_test)

In [31]:
# Defining a function to append the filepath to each image name as a string.

# Folder holding all images; built with os.path.join so that the separators are consistent on every OS.
combined_faces_path = os.path.join(ROOT_DIR, "combined_faces")

def append_path_to_filename(filename):
    """Return the full path to `filename` inside the combined_faces folder.

    Parameters
    ----------
    filename : str
        Name of an image file, e.g. '10_24.jpg'.

    Returns
    -------
    str
        `combined_faces_path` joined with `filename`.
    """
    return os.path.join(combined_faces_path, filename)

In [32]:
# Replacing each image name with its full filepath in both dataframes created above.

for labelled_df in (temp_X_train, temp_X_test):
    labelled_df['filename'] = labelled_df['filename'].map(append_path_to_filename)

In [33]:
# Exporting both dataframes (filepath, age, target) as CSV files without the index column.

temp_X_train.to_csv(f"{child_dir}/input_output/images_filenames_labels_train.csv", index=False)
temp_X_test.to_csv(f"{child_dir}/input_output/images_filenames_labels_test.csv", index=False)

## Traditional ML: Feature Extraction using Image Filters

In [34]:
def give_col_image(path):
    """Read the image at `path` and return it as a colour image in RGB channel order.

    Parameters
    ----------
    path : str
        Path to the image file.

    Returns
    -------
    numpy.ndarray
        The image with its channels in RGB order.

    Raises
    ------
    FileNotFoundError
        If the image could not be read (cv2.imread returns None instead of raising).
    """
    img = cv2.imread(path)
    if img is None:
        raise FileNotFoundError(f"Could not read image: {path}")

    # cv2.imread returns the channels in BGR order, so the conversion needed here is BGR -> RGB.
    # (COLOR_RGB2BGR performs the same channel swap, but its name describes the opposite direction.)
    col_img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)

    return col_img

In [35]:
def give_gray_image(col_img):
    """Convert a colour image to a single-channel grayscale image.

    Parameters
    ----------
    col_img : numpy.ndarray
        Colour image in RGB channel order, as returned by give_col_image.
        NOTE(review): if any caller passes a BGR image straight from cv2.imread instead,
        it should use cv2.COLOR_BGR2GRAY -- confirm against the callers.

    Returns
    -------
    numpy.ndarray
        The grayscale image.
    """
    # give_col_image returns RGB, so the RGB -> GRAY conversion must be used; COLOR_BGR2GRAY would
    # apply the red weight to the blue channel and vice versa.
    gray_img = cv2.cvtColor(col_img, cv2.COLOR_RGB2GRAY)

    return gray_img

In [36]:
def give_gaussian_image(gray_img):
    """Return `gray_img` smoothed by skimage's Gaussian filter with sigma=4."""
    return gaussian(gray_img, sigma=4)

In [37]:
def give_entropy_image(gray_img):
    """Return the local entropy of `gray_img`, computed over a disk-shaped neighbourhood of radius 5."""
    neighbourhood = disk(5)
    return entropy(gray_img, neighbourhood)

In [38]:
def give_gabor_image(gray_img):
    """Return `gray_img` filtered with a single, fixed 5x5 Gabor kernel."""
    # Fixed parameters of the Gabor kernel.
    kernel_size = (5, 5)
    sigma = 0.5
    theta = np.pi / 4
    wavelength = np.pi / 4
    gamma = 0.8
    phase_offset = 0

    kernel = cv2.getGaborKernel(
        kernel_size, sigma, theta, wavelength, gamma, phase_offset, ktype=cv2.CV_32F
    )

    return cv2.filter2D(gray_img, cv2.CV_8UC3, kernel)

In [39]:
def give_sobel_image(gray_img):
    """Return the result of skimage's Sobel edge filter applied to `gray_img`."""
    return sobel(gray_img)

In [40]:
def give_canny_image(gray_img):
    """Return the edges skimage's Canny detector (sigma=0.9) finds in `gray_img`."""
    # An OpenCV alternative that was tried here: cv2.Canny(gray_img, 80, 180)
    return canny(gray_img, sigma=0.9)

### Canny Edges Feature Extraction

In [42]:
# Defining a function to break-down an image of 200x200 pixels into sections of 10x10 pixels each,
# and calculate the mean and stdev of the section.
# Function INPUT: An image of 200x200 pixel size.
# Function OUTPUT: Features array comprising of mean and stdev of 400 sections (10x10 pixels).

def features_grid(img, section_size=10):
    """Break an image into square sections and return the mean and stdev of every section.

    The sections are visited row by row (top to bottom, left to right within a row). If the image
    size is not a multiple of `section_size`, the sections on the right/bottom edge are smaller.

    Parameters
    ----------
    img : numpy.ndarray
        2-D image, e.g. a 200x200 canny edges image.
    section_size : int, optional
        Side length of each section in pixels (default 10, giving 400 sections for a 200x200 image).

    Returns
    -------
    numpy.ndarray
        1-D float64 array laid out as [mean_1, std_1, mean_2, std_2, ...].

    Raises
    ------
    ValueError
        If `section_size` is smaller than 1.
    """
    if section_size < 1:
        raise ValueError("section_size must be at least 1")

    # Collecting into a list and converting once at the end avoids re-copying the whole
    # array on every np.append call.
    features = []

    for y in range(0, img.shape[0], section_size):
        for x in range(0, img.shape[1], section_size):

            # Cropping the image into a section.
            section_img = img[y:y + section_size, x:x + section_size]

            # Calculating the mean and stdev of the sectioned image.
            features.append(np.mean(section_img))
            features.append(np.std(section_img))

    # Returning the features array.
    return np.array(features, dtype='float64')

In [43]:
# Defining a function to loop through images in the dataset and extract the canny edges mean and stdev values from 10x10 pixel sections of each image.

def extract_canny_edges(filename_series):
    """Extract canny edges features (plus the age) for every image named in `filename_series`.

    Each image is read from `combined_faces_path`, converted to grayscale, passed through the
    canny edges filter and summarised with `features_grid`. The age, taken from the part of the
    filename before the first underscore, is added as the last value of the row.

    Parameters
    ----------
    filename_series : iterable of str
        Image file names such as '10_24.jpg'.

    Returns
    -------
    numpy.ndarray
        2-D array with one row per image: the section features followed by the age.
        For 200x200 images that is 801 columns (400 means, 400 stdevs, 1 age). The values are
        floats, because the section means and stdevs are floats.

    Raises
    ------
    FileNotFoundError
        If an image could not be read (cv2.imread returns None instead of raising).
    """
    total_images = len(filename_series)

    # Collecting one row per image and stacking once at the end avoids re-copying the whole
    # features array for every image.
    rows = []

    for progress_counter, img_name in enumerate(filename_series, start=1):

        # Defining a path to the image and reading in the coloured image.
        img_path = os.path.join(combined_faces_path, img_name)
        img = cv2.imread(img_path)
        if img is None:
            raise FileNotFoundError(f"Could not read image: {img_path}")

        # cv2.imread returns the channels in BGR order, so the image is converted straight from
        # BGR to grayscale. Swapping the channels first would apply the red and blue luminance
        # weights to the wrong channels.
        img = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)

        # Converting the grayscale image to a canny edges filtered image.
        img = canny(img, sigma=0.9)

        # Extracting the features (mean and stdev values of all 10x10 pixel sections).
        img_features = features_grid(img)

        # Adding the actual age value (from the image name) as the last value of the row.
        age = np.uint8(img_name.split("_")[0])
        rows.append(np.append(img_features, age))

        # Keeping track of progress and printing relevant statements for the user.
        if progress_counter % 1000 == 0:
            print(f"Images processed for features extraction: {progress_counter} of {total_images}")

    if not rows:
        # No images given: an empty array with the 801 columns expected for 200x200 images.
        return np.zeros((0, 801), dtype='uint8')

    return np.vstack(rows)

In [44]:
# Extracting the canny edge features from images in the training dataset.
# Every filename in X_train is processed; a progress line is printed after every 1000 images.

train_imgs = extract_canny_edges(X_train['filename'])

Images processed for features extraction: 1000 of 23440
Images processed for features extraction: 2000 of 23440
Images processed for features extraction: 3000 of 23440
Images processed for features extraction: 4000 of 23440
Images processed for features extraction: 5000 of 23440
Images processed for features extraction: 6000 of 23440
Images processed for features extraction: 7000 of 23440
Images processed for features extraction: 8000 of 23440
Images processed for features extraction: 9000 of 23440
Images processed for features extraction: 10000 of 23440
Images processed for features extraction: 11000 of 23440
Images processed for features extraction: 12000 of 23440
Images processed for features extraction: 13000 of 23440
Images processed for features extraction: 14000 of 23440
Images processed for features extraction: 15000 of 23440
Images processed for features extraction: 16000 of 23440
Images processed for features extraction: 17000 of 23440
Images processed for features extraction

In [45]:
# Shape of the training features array: one row per image, with the age as the last column.
train_imgs.shape

(23440, 801)

In [46]:
# Saving the training features array as a .npy file in the input_output folder for use in the model later.

np.save(child_dir + r"/input_output/canny_features_age_train.npy", train_imgs, allow_pickle=True)

In [47]:
# Extracting the canny edge features from images in the testing dataset.
# Every filename in X_test is processed; a progress line is printed after every 1000 images.

test_imgs = extract_canny_edges(X_test['filename'])

Images processed for features extraction: 1000 of 10046
Images processed for features extraction: 2000 of 10046
Images processed for features extraction: 3000 of 10046
Images processed for features extraction: 4000 of 10046
Images processed for features extraction: 5000 of 10046
Images processed for features extraction: 6000 of 10046
Images processed for features extraction: 7000 of 10046
Images processed for features extraction: 8000 of 10046
Images processed for features extraction: 9000 of 10046
Images processed for features extraction: 10000 of 10046


In [48]:
# Shape of the testing features array: one row per image, with the age as the last column.
test_imgs.shape

(10046, 801)

In [49]:
# Saving the testing features array as a .npy file in the input_output folder for use in the model later.

np.save(child_dir + r"/input_output/canny_features_age_test.npy", test_imgs, allow_pickle=True)

In [50]:
# Creating a list of columns names for the features arrays defined above.
# The column names correspond to the sectioned image's mean and stdev values.
# Last column is the age to be converted to target class label in the model later.

# A 200x200 image cut into 10x10 sections gives 20 * 20 = 400 sections.
n_sections = (200 // 10) * (200 // 10)

# Using a comprehension keeps the loop variables local. Looping with variables named `x` and `y`
# at notebook level would overwrite the `y` targets defined for the train/test split.
feature_names = [
    f"sec{section}_{stat}"
    for section in range(1, n_sections + 1)
    for stat in ("mean", "std")
]

feature_names.append('age')

In [51]:
# First ten column names.
feature_names[:10]

['sec1_mean',
 'sec1_std',
 'sec2_mean',
 'sec2_std',
 'sec3_mean',
 'sec3_std',
 'sec4_mean',
 'sec4_std',
 'sec5_mean',
 'sec5_std']

In [52]:
# Last ten column names, ending with 'age'.
feature_names[-10:]

['sec396_std',
 'sec397_mean',
 'sec397_std',
 'sec398_mean',
 'sec398_std',
 'sec399_mean',
 'sec399_std',
 'sec400_mean',
 'sec400_std',
 'age']

In [53]:
# Total number of column names (section features plus 'age').
len(feature_names)

801

In [54]:
# Saving the feature names as a one-column CSV file (header 'canny_edge_features') for use in the model later.

feature_names_series = pd.Series(feature_names)
feature_names_series.to_csv(
    f"{child_dir}/input_output/canny_features_names.csv",
    index=False,
    header=['canny_edge_features'],
)