# Age & Gender Classification on UTK Dataset


The UTKFace dataset is a large-scale face dataset consisting of over 20,000 images. The images cover a wide range of ages and ethnicities and include both males and females.

Using this dataset, I will be attempting to create a model capable of predicting the age and gender of an individual. I will be using part one of the cropped variations of the images, of which there are almost 10,000.

This project constitutes partial fulfilment of my Fall Semester 2024 RA-ship under the supervision of Dr. Zubair Khalid, Associate Professor, Syed Babar Ali School of Science and Engineering at Lahore University of Management Sciences.


## Table of Contents

- Imports
- Loading Dataset
- Visualizing Data
- Train Test Split
- Building the Model
- Training the Model
- Evaluating the Model
- Saving the Models


## 1. Imports


In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import os
from PIL import Image, ImageOps
from sklearn.model_selection import train_test_split

from keras.models import Sequential
from keras.layers import Conv2D, MaxPooling2D, Activation, Dropout, Flatten, Dense
from keras import optimizers
from tensorflow.keras.preprocessing.image import ImageDataGenerator
import tensorflow as tf
import warnings

warnings.filterwarnings('ignore')

: 

## 2. Loading Dataset


The `with` statement ensures each image file is closed as soon as its pixel data has been copied into memory, so file handles are not left open while iterating over thousands of files. This approach is useful for large datasets.


In [14]:
# Directory holding the cropped images. Each file name is split on '_':
# the first field is read as the age and the second as the gender label.
DATASET_DIR = './Dataset/crop_part1/'

images = []
ages = []
genders = []

for filename in os.listdir(DATASET_DIR):
    fields = filename.split('_')
    try:
        age, gender = int(fields[0]), int(fields[1])
    except (IndexError, ValueError):
        # The name does not follow "<age>_<gender>_..." (for example an OS
        # thumbnail cache file), so there is no label to read. Skip it rather
        # than abort the whole load.
        continue

    # Copy the pixel data before the `with` block closes the file handle.
    with Image.open(os.path.join(DATASET_DIR, filename)) as img:
        image = img.copy()

    # Append all three values together, and only once the image has loaded,
    # so the parallel lists can never drift out of alignment.
    images.append(image)
    ages.append(age)
    genders.append(gender)

FileNotFoundError: [WinError 3] The system cannot find the path specified: './Dataset/crop_part1/'

In [15]:
# Wrap each parallel list in a named Series; the name becomes the column label.
images, ages, genders = (
    pd.Series(list(values), name=label)
    for label, values in (('Images', images), ('Ages', ages), ('Genders', genders))
)

# Place the three Series side by side as the columns of one DataFrame.
df = pd.concat([images, ages, genders], axis=1)
df

Unnamed: 0,Images,Ages,Genders


In [16]:
# Show the image stored under index label 33, then print its age and gender.
display(df.at[33, 'Images'])
print(df.at[33, 'Ages'], df.at[33, 'Genders'])

KeyError: 33

In [None]:
# Show the image stored under index label 100, then print its age and gender.
display(df.at[100, 'Images'])
print(df.at[100, 'Ages'], df.at[100, 'Genders'])

So `0` corresponds to male and `1` corresponds to female.


## 3. Visualizing Data


In [None]:
# List the distinct values present in the 'Genders' column.
df['Genders'].unique()

To be on the safe side, I am going to remove any rows where gender equals 3, since only `0` (male) and `1` (female) are meaningful labels here.


In [None]:
# Keep only the rows whose gender label is not 3.
df = df.loc[df['Genders'].ne(3)]

# Plot Gender Distribution: one bar per gender label.
plt.figure(figsize=(8, 6))
sns.countplot(data=df, x='Genders', palette='coolwarm')

# Title and axis captions.
plt.title('Gender Distribution', fontsize=16)
plt.xlabel('Gender (0 = Male, 1 = Female)', fontsize=12)
plt.ylabel('Count', fontsize=12)

# Replace the numeric tick labels with readable names.
plt.xticks([0, 1], ['Male', 'Female'])
plt.show()

In [None]:
import plotly.express as px

# Plot Age Distribution
# `labels` is keyed by DataFrame column name, so the key has to be 'Ages'
# (the plotted column) for the friendlier 'Age' label to take effect.
fig = px.histogram(df, x='Ages', nbins=30, title="Age Distribution",
                   labels={'Ages': 'Age'},
                   color_discrete_sequence=['skyblue'])

# Customize the plot layout
fig.update_layout(
    xaxis_title="Age",
    yaxis_title="Frequency",
    bargap=0.1  # Add a little space between bars
)

fig.show()

As depicted in the plot, there are too many images between the ages of 0-4. The model would fit too well to these ages and not well enough to others. To resolve this, I will keep only a random 30% sample of the images in this age range.

It is also best to remove images of people aged 90 and over, since there aren't many such images, and only have the model predict the ages of people under 90.


In [None]:
# Down-sample the over-represented youngest group and drop the sparse oldest
# ages. Boolean masks replace the row-by-row Python loop.

# Keep a random 30% of the rows aged 4 or under. The fixed seed makes the
# sample, and everything trained on it, reproducible between runs.
under4s = df[df['Ages'] <= 4].sample(frac=0.3, random_state=42)

# Rows older than 4 are kept unchanged, except ages of 90 and above.
df = df[(df['Ages'] > 4) & (df['Ages'] < 90)]

# Filtering before concatenating lets ignore_index produce a gap-free index.
df = pd.concat([df, under4s], ignore_index=True)

In [None]:
# Plot Age Distribution
# `labels` is keyed by DataFrame column name, so the key has to be 'Ages'
# (the plotted column) for the friendlier 'Age' label to take effect.
fig = px.histogram(df, x='Ages', nbins=30, title="Age Distribution",
                   labels={'Ages': 'Age'},
                   color_discrete_sequence=['skyblue'])

# Customize the plot layout
fig.update_layout(
    xaxis_title="Age",
    yaxis_title="Frequency",
    bargap=0.1  # Add a little space between bars
)

fig.show()

## 4. Train Test Split


In [None]:
# Preparing Data
# Resize every image to the 200x200 input size. Image.ANTIALIAS was removed in
# Pillow 10; Image.LANCZOS is the same filter under its current name.
# convert('RGB') guarantees three channels, so each array is (200, 200, 3) and
# the list stacks into one regular 4-D array.
resized = [
    img.convert('RGB').resize((200, 200), Image.LANCZOS)
    for img in df['Images']
]

# Store the resized images with a whole-column assignment. The original
# element-wise `df['Images'].iloc[i] = ...` is chained assignment, which pandas
# does not guarantee will write through to `df`.
df = df.assign(Images=resized)

# x: one pixel array per image, in row order.
x = np.array([np.asarray(img) for img in resized])

# y: [age, gender] pairs in the same row order as x.
y = df[['Ages', 'Genders']].astype(int).values.tolist()

In [None]:
# One target Series per task.
y_age, y_gender = df['Ages'], df['Genders']

# Age task: hold out 20% of the samples, stratified on the age labels.
(x_train_age, x_test_age,
 y_train_age, y_test_age) = train_test_split(
    x, y_age, stratify=y_age, test_size=0.2)

# Gender task: a separate 20% hold-out, stratified on the gender labels.
(x_train_gender, x_test_gender,
 y_train_gender, y_test_gender) = train_test_split(
    x, y_gender, stratify=y_gender, test_size=0.2)

## 5. Building the Model


I will build two models - an age model to predict continuous values which will be rounded to the nearest integer, and the gender model returning a binary classification label.


In [None]:
def _build_cnn(output_activation):
    """Return the CNN shared by both tasks, ending in one output unit.

    The network takes 200x200x3 inputs through three Conv2D + MaxPooling2D
    stages (32, 64 and 128 filters), flattens, then applies a 64-unit dense
    layer with 50% dropout before the single-unit head.

    Args:
        output_activation: Activation name for the final Dense(1) layer.

    Returns:
        An uncompiled ``Sequential`` model.
    """
    return Sequential([
        Conv2D(32, (3, 3), activation='relu', input_shape=(200, 200, 3)),
        MaxPooling2D((2, 2)),
        Conv2D(64, (3, 3), activation='relu'),
        MaxPooling2D((2, 2)),
        Conv2D(128, (3, 3), activation='relu'),
        MaxPooling2D((2, 2)),
        Flatten(),
        Dense(64, activation='relu'),
        Dropout(0.5),
        Dense(1, activation=output_activation),
    ])


# Age model: regression trained with mean squared error. The ReLU head keeps
# the predicted age non-negative.
# `learning_rate` replaces the old `lr` keyword, which current Keras releases
# no longer accept.
agemodel = _build_cnn('relu')
agemodel.compile(loss='mean_squared_error',
                 optimizer=optimizers.Adam(learning_rate=0.0001))

# Gender model: binary classifier whose sigmoid head outputs a value in [0, 1].
genmodel = _build_cnn('sigmoid')
genmodel.compile(loss='binary_crossentropy',
                 optimizer=optimizers.Adam(learning_rate=0.0001),
                 metrics=['accuracy'])

## 6. Training the Model


In [None]:
# Training generator: scale pixels to [0, 1] and augment with small shifts and
# horizontal flips.
datagen = ImageDataGenerator(
    rescale=1./255.,
    width_shift_range=0.1,
    height_shift_range=0.1,
    horizontal_flip=True,
)

# Validation generator: rescaling only, no augmentation.
test_datagen = ImageDataGenerator(rescale=1./255)

# Batches of 32 for both the training and the held-out age data.
train1 = datagen.flow(x_train_age, y_train_age, batch_size=32)
test1 = test_datagen.flow(x_test_age, y_test_age, batch_size=32)

# Fit the age model for 50 epochs, validating on the held-out batches.
history1 = agemodel.fit(
    train1,
    epochs=50,
    shuffle=True,
    validation_data=test1,
)

In [None]:
# Training generator: scale pixels to [0, 1] and augment with small shifts and
# horizontal flips.
datagen = ImageDataGenerator(
    rescale=1./255.,
    width_shift_range=0.1,
    height_shift_range=0.1,
    horizontal_flip=True,
)

# Validation generator: rescaling only, no augmentation.
test_datagen = ImageDataGenerator(rescale=1./255)

# Batches of 64 for both the training and the held-out gender data.
train2 = datagen.flow(x_train_gender, y_train_gender, batch_size=64)
test2 = test_datagen.flow(x_test_gender, y_test_gender, batch_size=64)

# Fit the gender model for 50 epochs, validating on the held-out batches.
history2 = genmodel.fit(
    train2,
    epochs=50,
    shuffle=True,
    validation_data=test2,
)

## 7. Evaluating the Model


Age Prediction Model (agemodel): Since this is a regression task, we will use metrics like Mean Absolute Error (MAE), Mean Squared Error (MSE), and R-squared (coefficient of determination).


In [17]:
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, confusion_matrix
import numpy as np

# Rescale test data for evaluation.
# Cast to float32 before dividing: dividing integer pixel arrays by 255.0
# directly would create float64 copies (twice the memory).
# NOTE(review): assumes x_test_* still hold the raw integer pixel arrays
# produced by the split -- confirm.
x_test_age_rescaled = x_test_age.astype('float32') / 255.0
x_test_gender_rescaled = x_test_gender.astype('float32') / 255.0

# Evaluate Age Prediction Model (Regression)
# Flatten the predictions to 1-D so they line up element-wise with the
# 1-D target values passed to the metric functions.
y_pred_age = agemodel.predict(x_test_age_rescaled).ravel()
mae = mean_absolute_error(y_test_age, y_pred_age)
mse = mean_squared_error(y_test_age, y_pred_age)
r2 = r2_score(y_test_age, y_pred_age)

print("Age Prediction Model Evaluation:")
print(f"Mean Absolute Error (MAE): {mae}")
print(f"Mean Squared Error (MSE): {mse}")
print(f"R-squared (R2): {r2}\n")

NameError: name 'x_test_age' is not defined

Gender Prediction Model (genmodel): Since this is a binary classification task, we will use metrics like Accuracy, Precision, Recall, F1-score, and AUC-ROC.


In [None]:
# Evaluate Gender Prediction Model (Binary Classification)
# Raw model outputs, then hard 0/1 labels using a 0.5 cut-off.
y_pred_gender = genmodel.predict(x_test_gender_rescaled)
y_pred_gender_binary = (y_pred_gender > 0.5).astype(int)

# Label-based metrics compare the hard predictions with the true labels.
accuracy = accuracy_score(y_test_gender, y_pred_gender_binary)
precision = precision_score(y_test_gender, y_pred_gender_binary)
recall = recall_score(y_test_gender, y_pred_gender_binary)
f1 = f1_score(y_test_gender, y_pred_gender_binary)
conf_matrix = confusion_matrix(y_test_gender, y_pred_gender_binary)

# AUC-ROC is computed from the raw scores, not the thresholded labels.
roc_auc = roc_auc_score(y_test_gender, y_pred_gender)

# Emit the whole report in one call, one item per line.
print(
    "Gender Prediction Model Evaluation:",
    f"Accuracy: {accuracy}",
    f"Precision: {precision}",
    f"Recall: {recall}",
    f"F1-score: {f1}",
    f"AUC-ROC: {roc_auc}",
    "Confusion Matrix:",
    conf_matrix,
    sep="\n",
)

Now testing the models on images of celebrities that they have never seen before:


In [None]:
def _square_crop_box(width, height):
    """Return the (left, top, right, bottom) box that crops an image square.

    Landscape images are cropped around their horizontal centre; portrait
    images keep the top square. A square image maps to its full extent.
    """
    if width > height:
        left = (width - height) // 2
        return (left, 0, left + height, height)
    return (0, 0, width, width)


def process_and_predict(file):
    """Print the predicted age and gender for one image and return a preview.

    The image is cropped square, resized to 200x200, scaled to [0, 1] and fed
    to the module-level ``agemodel`` and ``genmodel``.

    Args:
        file: Anything ``PIL.Image.open`` accepts (the notebook passes a path).

    Returns:
        The 200x200 model input image, resized to 300x300.
    """
    # convert() reads the pixel data, so the file can be closed right after.
    # Forcing RGB makes grayscale or RGBA inputs fit the (200, 200, 3) reshape.
    with Image.open(file) as source:
        im = source.convert('RGB')

    im = im.crop(_square_crop_box(*im.size))
    # Image.ANTIALIAS was removed in Pillow 10; LANCZOS is the same filter.
    im = im.resize((200, 200), Image.LANCZOS)

    # Same [0, 1] scaling as the training generators, with a batch axis added.
    ar = np.asarray(im).astype('float32') / 255.0
    ar = ar.reshape(-1, 200, 200, 3)

    # Pull plain Python scalars out of the single-sample prediction arrays;
    # calling int() directly on a multi-dimensional array is deprecated in NumPy.
    age = float(np.ravel(agemodel.predict(ar))[0])
    gender_score = float(np.ravel(genmodel.predict(ar))[0])

    # Scores above 0.5 map to label 1 (female), everything else to 0 (male).
    gender = 'female' if gender_score > 0.5 else 'male'

    # Report the age rounded to the nearest whole year instead of truncating it.
    print('Age:', int(round(age)), '\n Gender:', gender)
    return im.resize((300, 300), Image.LANCZOS)

In [None]:
# Predict age and gender for ./testpictures/11.jpg (prints the result and returns a preview image).
process_and_predict('./testpictures/11.jpg')

In [None]:
# Predict age and gender for ./testpictures/12.jpg (prints the result and returns a preview image).
process_and_predict('./testpictures/12.jpg')

In [None]:
# Predict age and gender for ./testpictures/14.jpg (prints the result and returns a preview image).
process_and_predict('./testpictures/14.jpg')

In [None]:
# Predict age and gender for ./testpictures/15.jpg (prints the result and returns a preview image).
process_and_predict('./testpictures/15.jpg')

In [None]:
# Predict age and gender for ./testpictures/19.jpg (prints the result and returns a preview image).
process_and_predict('./testpictures/19.jpg')

In [None]:
# Predict age and gender for ./testpictures/20.jpg (prints the result and returns a preview image).
process_and_predict('./testpictures/20.jpg')

In [None]:
# Predict age and gender for ./testpictures/16.jpg (prints the result and returns a preview image).
process_and_predict('./testpictures/16.jpg')

In [None]:
# Predict age and gender for ./testpictures/13.jpg (prints the result and returns a preview image).
process_and_predict('./testpictures/13.jpg')

In [None]:
# Predict age and gender for ./testpictures/17.jpg (prints the result and returns a preview image).
process_and_predict('./testpictures/17.jpg')

## 8. Saving the Models


In [None]:
# Write each trained model to its own .h5 file (relative paths), age model
# first and gender model second.
for trained_model, target_file in (
    (agemodel, 'age_prediction_model.h5'),
    (genmodel, 'gender_prediction_model.h5'),
):
    trained_model.save(target_file)