<a href="https://colab.research.google.com/github/ElPapi42/AgeClassifier/blob/master/AgeClassifier.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [0]:
from IPython.display import clear_output
!pip install --upgrade tensorflow-gpu
!pip install --upgrade tqdm
!pip install --upgrade pillow
!pip install git+https://github.com/Jwink3101/parmapper
!pip install deepbay
clear_output()

In [0]:
#Imports
import os
import sys
import pathlib

import PIL
import tensorflow as tf
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from parmapper import parmap
import deepbay

## Data Download

In [0]:
#Downloads and extracts the dataset to local storage
#You can run this on Google Colab to get faster download speeds
import zipfile
import requests
from tqdm import tqdm

folder_path = "./Datasets"

appa_url = "http://158.109.8.102/AppaRealAge/appa-real-release.zip"
appa_path = folder_path + '/appa-real.zip'
#Folder the archive unpacks to (the CSV/image paths used later in the notebook live under it)
release_path = folder_path + '/appa-real-release'

#Create Dataset folder if not exists
os.makedirs(folder_path, exist_ok=True)

#Download chalearn appa-real dataset, only when the archive is not already on disk
needs_download = not os.path.isfile(appa_path)
if needs_download:
  block_size = 16384
  #Stream into a temporary file so an interrupted download is never mistaken for a complete archive
  partial_path = appa_path + '.part'

  with requests.get(appa_url, stream=True, timeout=60) as resp:
    #Fail early on HTTP errors instead of saving an error page as the archive
    resp.raise_for_status()
    total_size = int(resp.headers.get('content-length', 0))

    with open(partial_path, "wb") as f, tqdm(total=total_size, unit='iB', unit_scale=True) as t:
      for data in resp.iter_content(block_size):
        t.update(len(data))
        f.write(data)
      downloaded_size = t.n

  #A size mismatch means the transfer was cut short: discard the partial file and stop here
  if total_size != 0 and downloaded_size != total_size:
    os.remove(partial_path)
    raise IOError("Download Error")

  os.replace(partial_path, appa_path)

#Extract after a fresh download, or when an earlier run fetched the archive but never unpacked it
if needs_download or not os.path.isdir(release_path):
  with zipfile.ZipFile(appa_path, 'r') as archive:
    archive.extractall(folder_path)

## Exploratory Data Analysis

In [0]:
# Load Datasets
# Use the same relative location the download cell extracts to, so the notebook
# also runs when the working directory is not Colab's /content
release_dir = "./Datasets/appa-real-release"
df_train = pd.read_csv(release_dir + "/gt_avg_train.csv")
df_eval = pd.read_csv(release_dir + "/gt_avg_valid.csv")
df_test = pd.read_csv(release_dir + "/gt_avg_test.csv")
# Combined view of the three splits, used for the global statistics below
df_all = pd.concat([df_train, df_eval, df_test])

In [0]:
# Preview the first rows of the combined train/eval/test table
df_all.head()

In [0]:
# Column dtypes and non-null counts of the combined table (a quick check for missing values)
df_all.info()

In [0]:
# Descriptive statistics of the numeric columns of the combined table
df_all.describe()

In [0]:
# Dataset slices sizes
# len() counts rows; count()[0] counted the non-null values of the first column only
# and relied on positional indexing of a label-indexed Series
train_size = len(df_train)
eval_size = len(df_eval)
test_size = len(df_test)

print("Train: " + str(train_size))
print("Evaluation: " + str(eval_size))
print("Test: " + str(test_size))

In [0]:
# Lets see the distribution of the apparent age and the real age
# Both curves share one axes, so each gets a label and the legend tells them apart
sns.distplot(df_all["apparent_age_avg"], bins=100, label="apparent_age_avg")
sns.distplot(df_all["real_age"], bins=100, label="real_age")
plt.xlabel("age")
plt.legend()
plt.show()

The distribution of age does not deviate much between the real and the apparent values, so we can keep it.

In [0]:
# Now we will check the dist between the train, eval and test datasets
fig, axes = plt.subplots(1, 2, figsize=(15,5))

# One panel per age column; every split is overlaid on the panel and named in its legend
panels = (("apparent_age_avg", "Apparent Avg Age"), ("real_age", "Real Age"))
splits = (("train", df_train), ("eval", df_eval), ("test", df_test))

for ax, (column, title) in zip(axes, panels):
  for split_name, split_df in splits:
    sns.distplot(split_df[column], bins=100, label=split_name, ax=ax)
  ax.set_title(title)
  ax.legend()

plt.show()

This one could be better: the distribution of the test set deviates a bit from the train set, but we can keep it for now. These visualizations also raise a question: how much does the average apparent age deviate from the real age? Answering this can help us detect possible outliers, because the average is based on manual reviews from real people who estimate the age of the person they see in the photo. Let's check that.

In [0]:
# Per-sample gap between the real age and the averaged apparent age
diference = df_all["real_age"].sub(df_all["apparent_age_avg"])
# Histogram/density of that gap over the combined splits
sns.distplot(diference, bins=50)

We will ignore this deviation for now, but we should keep it in mind for the future.

## Preprocessing

In [0]:
# Build the path of the cropped-face image for every row of each split
images_root = "./Datasets/appa-real-release/"
for split_df, split_dir in ((df_train, "train/"), (df_eval, "valid/"), (df_test, "test/")):
  split_df["file_url"] = images_root + split_dir + split_df["file_name"] + "_face.jpg"
df_train.head()

In [0]:
#Lets see some examples
plt.figure(figsize=(10,10))

# Take path and age from the same row instead of scanning the whole frame once per image
examples = df_train[["file_url", "apparent_age_avg"]].iloc[:9]
for i, (path, age) in enumerate(examples.itertuples(index=False, name=None)):
  image = tf.io.read_file(path)
  image = tf.image.decode_image(image)

  # 3x3 grid, one face per cell, titled with its averaged apparent age
  plt.subplot(3,3,i+1, title="age: {age}".format(age=age))
  plt.imshow(image)
plt.show()

## Dataset Creation

In [0]:
# Load Image
def load_image(path, label):
  """
  Read one image file and resize it to 256x256 with padding

  args:
    path: path of the image file to read
    label: label of the image, passed through untouched

  returns:
    (image, label) where image is the decoded, resized RGB image
  """
  image = tf.io.read_file(path)
  # Force 3 channels: a grayscale JPEG would otherwise decode to 1 channel and could not be
  # batched together with RGB images (nor fed to the ImageNet backbone). decode_jpeg also gives
  # the tensor a known rank inside Dataset.map, unlike decode_image.
  image = tf.io.decode_jpeg(image, channels=3)
  image = tf.image.resize_with_pad(image, 256, 256, antialias=True)
  return image, label

# Transform Image
def transform_image(image_batch, label_batch):
  """
  Rescale a batch of images by 1/255; labels are returned unchanged

  args:
    image_batch: batch of images to rescale
    label_batch: labels of the batch, passed through untouched
  """
  return image_batch / 255.0, label_batch

In [0]:
# Create Datasets for train, evaluation and testing
def generate_datasets(train_batch_size):
  """
  Build the input pipelines for the train, evaluation and test splits

  args:
    train_batch_size (int): batch size applied to all three pipelines

  returns:
    tuple (train_ds, eval_ds, test_ds) of batched, prefetched tf.data datasets
  """
  # Every split goes through the same pipeline. The original assigned the prefetched train
  # pipeline to a misspelled name (train_df), so the returned train_ds was never prefetched.
  train_ds = _build_dataset(df_train, train_batch_size)
  eval_ds = _build_dataset(df_eval, train_batch_size)
  test_ds = _build_dataset(df_test, train_batch_size)

  return train_ds, eval_ds, test_ds

def _build_dataset(df, batch_size):
  """Turn the file_url / apparent_age_avg columns of df into a batched, prefetched dataset"""
  ds = tf.data.Dataset.from_tensor_slices((df["file_url"], df["apparent_age_avg"]))
  # Per-sample step: read and resize the image
  ds = ds.map(load_image, num_parallel_calls=tf.data.experimental.AUTOTUNE)
  ds = ds.batch(batch_size)
  # Per-batch step: rescale pixel values
  ds = ds.map(transform_image, num_parallel_calls=tf.data.experimental.AUTOTUNE)
  return ds.prefetch(buffer_size=tf.data.experimental.AUTOTUNE)

We could still add `interleave` to the input pipeline of the datasets; let's keep that in mind for later.

## Model Definition
We will use ResNet50V2 pretrained on ImageNet as the model base.

In [0]:
class AgeClassifier(tf.keras.Model):
  """
  Age Classifier

  ResNet50V2 backbone (ImageNet weights, no classification top) followed by a Flatten layer
  and a deepbay.DeepFeedForward head. The backbone starts fully frozen.

  args:
    output_network (list): Array of ints where every position represents the number of unit in that layer
  
  """

  def __init__(self, output_network):
    super(AgeClassifier, self).__init__()

    self.resnet = tf.keras.applications.ResNet50V2(weights="imagenet", include_top=False)
    # Start with every backbone layer frozen; only the head is trained at first
    self.set_trainable_layers(0)

    self.flat = tf.keras.layers.Flatten()
    self.output_network = deepbay.DeepFeedForward(output_network)

  def set_trainable_layers(self, trainable_layers):
    """
    Update the layers that can be trained on the resnet50v2 architecture

    args:
      trainable_layers (int): number of layers that can be trained, counted from the end of the
        backbone. 0 freezes the whole backbone.
      example: trainable_layers=5 means only the last 5 layers of ResNet50V2 are trainable, the remaining have frozen weights.
        A value larger than len(self.resnet.layers) simply unfreezes the whole backbone.

    raises:
      ValueError: if trainable_layers is negative

    NOTE: Keras reads the trainable flags when the model is compiled, so call compile() again
    after changing them on an already compiled model.

    """

    # A negative count would make the reversed slice select "all but the first N" layers
    # and silently unfreeze the wrong part of the backbone
    if trainable_layers < 0:
      raise ValueError("trainable_layers must be >= 0, got {}".format(trainable_layers))

    layers = self.resnet.layers
    first_trainable = max(len(layers) - trainable_layers, 0)

    # Single pass: freeze everything before the cut, unfreeze everything from it on
    for position, layer in enumerate(layers):
      layer.trainable = position >= first_trainable

  def call(self, inputs):
    """Forward pass: backbone features -> flatten -> feed-forward head"""
    X = self.resnet(inputs)
    X = self.flat(X)
    X = self.output_network(X)
    return X

## Training

In [0]:
# Hyper Parameters
# batch_size: samples per batch for every dataset split
# lr: learning rate handed to the optimizer
# output_network: units per layer of the feed-forward head (a wider stack 256, 128, 32, 16 was tried before)
hparams = dict(
    batch_size=32,
    lr=0.0001,
    output_network=[1],
)

In [0]:
# Build the three input pipelines; the same batch size is used for train, eval and test
train_ds, eval_ds, test_ds = generate_datasets(hparams["batch_size"])

In [42]:
age_classifier = AgeClassifier(hparams["output_network"])
age_classifier.compile(
    optimizer=tf.keras.optimizers.Adam(hparams["lr"]),
    loss=tf.keras.losses.MeanSquaredError(),
    metrics=[tf.keras.metrics.RootMeanSquaredError()]
)
# A subclassed model only creates its weights on the first call. Build it with a plain forward
# pass on one batch: train_on_batch expects a batch (not a Dataset) and would also apply a
# gradient update before training has really started.
for images, _labels in train_ds.take(1):
  age_classifier(images)
age_classifier.summary()

Model: "age_classifier_5"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
resnet50v2 (Model)           (None, None, None, 2048)  23564800  
_________________________________________________________________
flatten_5 (Flatten)          multiple                  0         
_________________________________________________________________
deep_feed_forward_6 (DeepFee multiple                  655361    
Total params: 24,220,161
Trainable params: 393,217
Non-trainable params: 23,826,944
_________________________________________________________________


In [44]:
# First training phase: 10 epochs on the train pipeline; validation runs on a single eval batch (take(1))
age_classifier.fit(train_ds, epochs=10, validation_data=eval_ds.take(1))

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<tensorflow.python.keras.callbacks.History at 0x7f72a5e84630>

In [0]:
# Second phase: unfreeze the last 22 backbone layers
age_classifier.set_trainable_layers(22)
# Keras fixes the set of trainable weights at compile time, so the model has to be compiled
# again; otherwise fit() keeps training only what was trainable before
age_classifier.compile(
    optimizer=tf.keras.optimizers.Adam(hparams["lr"]),
    loss=tf.keras.losses.MeanSquaredError(),
    metrics=[tf.keras.metrics.RootMeanSquaredError()]
)
age_classifier.fit(train_ds, epochs=10, validation_data=eval_ds.take(1))

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10

In [36]:
# Debug cell: push one training batch through the model; the output is kept in `r`
for img, label in train_ds.take(1):
  r = age_classifier(img)

tf.Tensor([    16 131072], shape=(2,), dtype=int32)
tf.Tensor([16  1], shape=(2,), dtype=int32)


In [0]:
# Debug cell: inspect the model output for one training batch
for img, label in train_ds.take(1):
  res = age_classifier(img)
  # Largest value of every output row, reshaped into a column vector
  prob = np.reshape(np.amax(res, axis=1), [-1, 1])
  # Positions where the output equals its row maximum (computed but not printed)
  # NOTE(review): with the single-unit head configured in hparams the row maximum is the
  # prediction itself - confirm whether this cell is a leftover from a classification setup
  index = np.where(res == prob)
  print(prob)



In [0]:
# Measure the wall-clock time needed to pull a single batch from the training pipeline
import time
start_time = time.time()

# The loop body is empty on purpose: only fetching the batch is being timed
for img, label in train_ds.take(1):
  pass

print("--- %s seconds ---" % (time.time() - start_time))