In [None]:
# Imports
# Please refer to requirements.txt for a full list of all libraries and their versions used in this project.

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.metrics import confusion_matrix

import os
# from zipfile import ZipFile
import time
from datetime import datetime
import itertools

import pickle

In [None]:
# ROOT_DIR is the notebook's working directory; `child_dir` (despite its name)
# is the absolute path of the directory one level ABOVE it.
ROOT_DIR = os.getcwd()
child_dir = os.path.abspath(os.path.join(ROOT_DIR, os.pardir))

print(ROOT_DIR)

## Traditional ML: Data Importing

In [None]:
# Load the summary table that breaks the age-ranges down into classes.

classes_summary_csv = child_dir + r"/input_output/combined_faces_classes_summary.csv"
combined_classes = pd.read_csv(classes_summary_csv)
combined_classes

In [None]:
# Defining a function to return the class labels corresponding to the age-ranges shown above.

def class_labels(age):
    """Return the class label (1-11) for the given age.

    Labels 1-10 correspond to the inclusive age bands listed in the lookup
    table below. Any age that falls in none of those bands (for example an
    age above 65) is given label 11.
    """
    # (lowest age, highest age, class label) -- both bounds are inclusive.
    age_bands = (
        (1, 2, 1),
        (3, 9, 2),
        (10, 20, 3),
        (21, 25, 4),
        (26, 27, 5),
        (28, 31, 6),
        (32, 36, 7),
        (37, 45, 8),
        (46, 54, 9),
        (55, 65, 10),
    )

    for lowest, highest, label in age_bands:
        if lowest <= age <= highest:
            return label

    # Fallback class for every age not covered by a band above.
    return 11

In [None]:
# Load the table holding the names of the Canny edge feature columns.

feature_names_csv = child_dir + r"/input_output/canny_features_names.csv"
feature_names = pd.read_csv(feature_names_csv)
feature_names

In [None]:
# Load the saved numpy arrays holding the train and test datasets.

train_npy_path = child_dir + r"/input_output/canny_features_age_train.npy"
test_npy_path = child_dir + r"/input_output/canny_features_age_test.npy"

train = np.load(train_npy_path)
test = np.load(test_npy_path)

In [None]:
# Wrap both numpy arrays in pandas dataframes that share the same column names.

canny_columns = feature_names["canny_edge_features"]

train_df = pd.DataFrame(train, columns=canny_columns)
test_df = pd.DataFrame(test, columns=canny_columns)

In [None]:
train_df.head()

In [None]:
test_df.head()

In [None]:
train_df.shape

In [None]:
test_df.shape

In [None]:
train_df.dtypes.unique()

In [None]:
test_df.dtypes.unique()

In [None]:
# Cast the 'age' column of both dataframes to unsigned 8-bit integers.
for frame in (train_df, test_df):
    frame['age'] = frame['age'].astype(np.uint8)

In [None]:
# Add a 'target' column to both dataframes by mapping each age through class_labels().

for frame in (train_df, test_df):
    frame['target'] = frame['age'].map(class_labels)

In [None]:
train_df.head()

In [None]:
test_df.head()

## Traditional ML: Model Preparation

In [None]:
# Separate each dataframe into its features (X) and its target (y).
# 'age' is dropped from the features together with 'target', since the
# target is derived directly from it.

non_feature_columns = ['age', 'target']

X_train, y_train = train_df.drop(columns=non_feature_columns), train_df['target']
X_test, y_test = test_df.drop(columns=non_feature_columns), test_df['target']

In [None]:
X_train.head()

In [None]:
X_train.shape

In [None]:
X_test.head()

In [None]:
X_test.shape

In [None]:
# Checking the distribution of classes in y_train.

y_train.value_counts()

In [None]:
# Checking the normalized class distribution of y_train, to compare it against that of y_test.

y_train.value_counts(normalize=True)

In [None]:
# Checking the normalized class distribution of y_test, to compare it against that of y_train.

y_test.value_counts(normalize=True)

In [None]:
# Fit the standard scaler on X_train, then use it to scale X_train.

ss = StandardScaler().fit(X_train)
X_train_sc = ss.transform(X_train)

In [None]:
# Transforming X_test to the same scale.

X_test_sc = ss.transform(X_test)

## Traditional ML: Classification Modelling

### *GridSearchCV* with *RandomForestClassifier*

In [None]:
# Base RandomForestClassifier. Only the options listed here are fixed;
# random_state is pinned so that repeated runs give the same forest.

rfc = RandomForestClassifier(
    random_state=42,
    min_samples_split=2,
    min_samples_leaf=1,
    ccp_alpha=0,
)

In [None]:
# Hyperparameter grid searched by GridSearchCV for the RandomForestClassifier:
# every n_estimators value is combined with every max_depth value.

rfc_params = dict(
    n_estimators=[50, 100, 200],
    max_depth=[5, 7, 9],
)

In [None]:
# Grid search over rfc_params for the RandomForestClassifier defined above,
# with 5-fold cross-validation and all available CPU cores (n_jobs=-1).

rfc_gs = GridSearchCV(
    estimator=rfc,
    param_grid=rfc_params,
    cv=5,
    n_jobs=-1,
)

In [None]:
# Fitting X_train_sc and y_train on GridSearchCV object with RandomForestClassifier defined above.

rfc_gs.fit(X_train_sc, y_train)

In [None]:
# Best combination of hyperparameters suggested by GridSearchCV.

rfc_gs.best_params_

In [None]:
# Best accuracy score obtained by the above combination of hyperparameters.

rfc_gs.best_score_

In [None]:
# Scoring the model on training dataset.
# Training Accuracy

rfc_train_acc = rfc_gs.score(X_train_sc, y_train)
rfc_train_acc

In [None]:
# Estimated Testing Accuracy

# rfc_est_test_acc = cross_val_score(rfc_gs, X_train_sc, y_train, cv=5).mean()
# rfc_est_test_acc

In [None]:
# Actual Testing Accuracy

rfc_test_acc = rfc_gs.score(X_test_sc, y_test)
rfc_test_acc

In [None]:
# Accuracy summary for GridSearchCV with RandomForestClassifier:
# best cross-validated score, then training and testing accuracy.

rfc_summary_lines = [
    "RandomForestClassifier summary of accuracy scores:",
    f"GridSearchCV best accuracy (cv=5) = {round(rfc_gs.best_score_, 3)}",
    "\nUsing GridSearchCV best params suggested,",
    f"Training accuracy = {round(rfc_train_acc, 3)}",
    f"Testing accuracy = {round(rfc_test_acc, 3)}",
]
print("\n".join(rfc_summary_lines))

In [None]:
# Generating predictions on testing dataset using the model above.

rfc_pred = rfc_gs.predict(X_test_sc)

In [None]:
len(rfc_pred)

In [None]:
len(y_test)

In [None]:
# Generating a confusion matrix based on above predictions.

conf_mat_rfc = confusion_matrix(y_test, rfc_pred)
conf_mat_rfc

In [None]:
# Defining a function to plot the confusion matrix in a grid for easier visualization.

def plot_confusion_matrix(cm, classes, normalize=False, title='Confusion Matrix', export_as='confusion_matrix', cmap=plt.cm.Blues):
    """
    Draw a confusion matrix on the current matplotlib figure and export it as a PNG.

    Parameters
    ----------
    cm : numpy array
        Confusion matrix; rows are plotted as true labels and columns as
        predicted labels.
    classes : sequence
        Tick labels for both axes, one per class.
    normalize : bool, default False
        If True, every row is divided by its row total so that each cell shows
        the fraction of that true class. Rows whose total is zero are shown as
        all zeros instead of NaN.
    title : str
        Title placed above the plot.
    export_as : str
        File name (without extension) of the PNG written to
        `<child_dir>/plot_images/`.
    cmap : matplotlib colormap
        Colormap used to shade the cells.

    The function prints one line saying whether normalization was applied.
    The caller is expected to create the figure beforehand and to call
    `plt.show()` afterwards.
    """
    if normalize:
        row_totals = cm.sum(axis=1)[:, np.newaxis]
        # Divide only where the row total is non-zero; a class without any true
        # samples would otherwise produce NaN cells and a NaN colour threshold.
        cm = np.divide(cm.astype('float'), row_totals,
                       out=np.zeros(cm.shape, dtype='float'),
                       where=row_totals != 0)
        print("Normalized confusion matrix")
    else:
        print('Confusion matrix, without normalization')

    plt.imshow(cm, interpolation='nearest', cmap=cmap)
    plt.title(title, fontsize=16)
    plt.colorbar()
    tick_marks = np.arange(len(classes))
    plt.xticks(tick_marks, classes, rotation=45)
    plt.yticks(tick_marks, classes)

    # Two decimals for fractions, plain integers for raw counts.
    fmt = '.2f' if normalize else 'd'
    # Cells darker than half of the maximum get white text so they stay readable.
    thresh = cm.max() / 2.
    for i, j in itertools.product(range(cm.shape[0]), range(cm.shape[1])):
        plt.text(j, i, format(cm[i, j], fmt), horizontalalignment="center", color="white" if cm[i, j] > thresh else "black")

    plt.tight_layout()
    plt.ylabel('True labels', fontsize=14)
    plt.xlabel('Predicted labels', fontsize=14)

    # Exporting plot image in PNG format.
    # The path must be an f-string: a plain/raw string would write every plot
    # to the same literal file name "{export_as}.png".
    plt.savefig(child_dir + f'/plot_images/{export_as}.png', bbox_inches='tight')

In [None]:
# Normalized confusion matrix of the RandomForestClassifier predictions,
# drawn with plot_confusion_matrix() defined above.

cm_plot_labels = combined_classes['Age-ranges (classes)']
rfc_cm_title = "Confusion Matrix based on predictions from\nRandomForestClassifier model using Canny Edge features"

plt.figure(figsize=(16,8))
plot_confusion_matrix(
    cm=conf_mat_rfc,
    classes=cm_plot_labels,
    normalize=True,
    title=rfc_cm_title,
    export_as="rfc_canny_conf_mat_norm",
)

plt.show()

In [None]:
# Confusion matrix (raw counts) of the RandomForestClassifier predictions,
# drawn with plot_confusion_matrix() defined above.

cm_plot_labels = combined_classes['Age-ranges (classes)']
rfc_cm_title = "Confusion Matrix based on predictions from\nRandomForestClassifier model using Canny Edge features"

plt.figure(figsize=(16,8))
plot_confusion_matrix(
    cm=conf_mat_rfc,
    classes=cm_plot_labels,
    normalize=False,
    title=rfc_cm_title,
    export_as="rfc_canny_conf_mat",
)

plt.show()

The ***RandomForestClassifier* model** above, with a training accuracy of 66.8% and a testing accuracy of 39.8%, clearly shows that it is **over-fitting and not generalizing well on unseen testing data**.

The **normalized confusion matrix** above also shows this clearly — even though the accuracy values are somewhat high for the younger age-ranges (of 1–2, 3–9, 10–20 and 21–25) and for the oldest age-range (of 66–116), there is **significant misclassification in the middle age-ranges of 26–65**.

In [None]:
# Saving the RandomForestClassifier model from above in a pickle file for possible use later.
# The path is an f-string so the rounded test accuracy actually ends up in the
# file name, and the `with` block closes the file once the dump is complete.
# NOTE: only ever unpickle files you created yourself; loading an untrusted pickle can run arbitrary code.

rfc_pickle = child_dir + f"/input_output/rfc_canny_model_acc_{round(rfc_test_acc, 3)}"
with open(rfc_pickle, 'wb') as rfc_model_file:
    pickle.dump(rfc_gs, rfc_model_file)

### *GridSearchCV* with *SVC*

In [None]:
# Base SVC with only random_state fixed; every other option keeps its default.

svc = SVC(random_state=42)

In [None]:
# Establishing ranges of hyperparameters of SVC for GridSearchCV.
#
# `degree` is only used by the polynomial kernel, so it is searched for
# 'poly' alone. Crossing it with 'rbf' and 'linear' would fit each of those
# models twice with identical results (12 candidates instead of 8).
# Consequence: best_params_ contains 'degree' only when the 'poly' kernel wins.

svc_params = [
    {'C' : [0.001, 1],
     'kernel' : ['rbf', 'linear']
    },
    {'C' : [0.001, 1],
     'kernel' : ['poly'],
     'degree' : [3, 5]
    },
]

In [None]:
# Grid search over svc_params for the SVC defined above,
# with 5-fold cross-validation and all available CPU cores (n_jobs=-1).

svc_gs = GridSearchCV(
    estimator=svc,
    param_grid=svc_params,
    cv=5,
    n_jobs=-1,
)

In [None]:
# Fitting X_train_sc and y_train on GridSearchCV object with SVC defined above.

svc_gs.fit(X_train_sc, y_train)

In [None]:
# Best combination of hyperparameters suggested by GridSearchCV.

svc_gs.best_params_

In [None]:
# Best accuracy score obtained by the above combination of hyperparameters.

svc_gs.best_score_

In [None]:
# Scoring the model on training dataset.
# Training Accuracy

svc_train_acc = svc_gs.score(X_train_sc, y_train)
svc_train_acc

In [None]:
# Estimated Testing Accuracy

# svc_est_test_acc = cross_val_score(svc_gs, X_train_sc, y_train, cv=5).mean()
# svc_est_test_acc

In [None]:
# Actual Testing Accuracy

svc_test_acc = svc_gs.score(X_test_sc, y_test)
svc_test_acc

In [None]:
# Summary scores from GridSearchCV with SVC.
# The best-accuracy line carries the "(cv=5)" label because svc_gs is built
# with cv=5, which also keeps the wording in line with the
# RandomForestClassifier summary.

print("SVC summary of accuracy scores:")
print(f"GridSearchCV best accuracy (cv=5) = {round(svc_gs.best_score_, 3)}")
print("\nUsing GridSearchCV best params suggested,")
print(f"Training accuracy = {round(svc_train_acc, 3)}")
# print(f"Est. Test accuracy (cv=5) = {round(svc_est_test_acc , 3)}")
print(f"Testing accuracy = {round(svc_test_acc, 3)}")

In [None]:
# Generating predictions on testing dataset using the model above.

svc_pred = svc_gs.predict(X_test_sc)

In [None]:
len(svc_pred)

In [None]:
len(y_test)

In [None]:
# Generating a confusion matrix based on above predictions.

conf_mat_svc = confusion_matrix(y_test, svc_pred)
conf_mat_svc

In [None]:
# Normalized confusion matrix of the SVC predictions,
# drawn with plot_confusion_matrix() defined above.

cm_plot_labels = combined_classes['Age-ranges (classes)']
svc_cm_title = "Confusion Matrix based on predictions from\nSVC model using Canny Edge features"

plt.figure(figsize=(16,8))
plot_confusion_matrix(
    cm=conf_mat_svc,
    classes=cm_plot_labels,
    normalize=True,
    title=svc_cm_title,
    export_as="svc_canny_conf_mat_norm",
)

plt.show()

In [None]:
# Confusion matrix (raw counts) of the SVC predictions,
# drawn with plot_confusion_matrix() defined above.

cm_plot_labels = combined_classes['Age-ranges (classes)']
svc_cm_title = "Confusion Matrix based on predictions from\nSVC model using Canny Edge features"

plt.figure(figsize=(16,8))
plot_confusion_matrix(
    cm=conf_mat_svc,
    classes=cm_plot_labels,
    normalize=False,
    title=svc_cm_title,
    export_as="svc_canny_conf_mat",
)

plt.show()

As with the *RandomForestClassifier* model above, the ***SVC* model**, with a training accuracy of 92.9% and a testing accuracy of 53.4%, also shows that it is **over-fitting and not generalizing well on unseen testing data**. Even though the training and testing accuracies are better with *SVC* than with *RandomForestClassifier*, the **degree of over-fit is significantly worse than with *RandomForestClassifier***.

The **normalized confusion matrix** above also shows the same trend — even though the accuracy values are somewhat high for the younger age-ranges (of 1–2, 3–9, 10–20 and 21–25) and for the oldest age-range (of 66–116), there is **significant misclassification in the middle age-ranges of 26–65**.

In [None]:
# Saving the SVC model from above in a pickle file for possible use later.
# The path is an f-string so the rounded test accuracy actually ends up in the
# file name, and the `with` block closes the file once the dump is complete.
# NOTE: only ever unpickle files you created yourself; loading an untrusted pickle can run arbitrary code.

svc_pickle = child_dir + f"/input_output/svc_canny_model_acc_{round(svc_test_acc, 3)}"
with open(svc_pickle, 'wb') as svc_model_file:
    pickle.dump(svc_gs, svc_model_file)

## Traditional ML: Summary and Limitations

![summary_table_traditional_ml.png](https://drive.google.com/uc?export=view&id=1kc7idDm-1QmSTfdmk592NpMXWPU8moGu)

As is clear from the above summary of scores, **modelling using the traditional machine learning methodology may not be the best way to approach this problem**. I will now try modelling with deep learning and see if it provides a better result.

There are, of course, a multitude of methods that could still be utilized to improve the above accuracy scores and reduce the degree of over-fit in the models. For instance, better differentiating features could be extracted from the images using some other more complicated techniques, or other classifiers could be utilized to see whether they perform better in this case.