# Setup

In [0]:
# Python ≥3.5 is required
import sys
sys.path.append("/Users/etienne/Library/Python/3.7/lib/python/site-packages")
assert sys.version_info >= (3, 5)
#hello
import numpy

# Common imports
import numpy as np
import os

# Scikit-Learn ≥0.20 is required
import sklearn   
assert sklearn.__version__ >= "0.20"

# To plot pretty figures
%matplotlib inline
import matplotlib as mpl
import matplotlib.pyplot as plt
mpl.rc('axes', labelsize=14)
mpl.rc('xtick', labelsize=12)
mpl.rc('ytick', labelsize=12)

# Where to save the figures
PROJECT_ROOT_DIR = "."
CHAPTER_ID = "end_to_end_project"
IMAGES_PATH = os.path.join(PROJECT_ROOT_DIR, "images", CHAPTER_ID)
os.makedirs(IMAGES_PATH, exist_ok=True)

def save_fig(fig_id, tight_layout=True, fig_extension="png", resolution=300):
    """Save the current matplotlib figure inside IMAGES_PATH.

    The file is named ``<fig_id>.<fig_extension>`` and written with
    ``resolution`` dpi. When ``tight_layout`` is true, ``plt.tight_layout()``
    is applied before the figure is written.
    """
    file_name = "".join((fig_id, ".", fig_extension))
    destination = os.path.join(IMAGES_PATH, file_name)
    print("Saving figure", fig_id)
    if tight_layout:
        plt.tight_layout()
    plt.savefig(destination, format=fig_extension, dpi=resolution)

# Ignore useless warnings (see SciPy issue #5998)
import warnings
warnings.filterwarnings(action="ignore", message="^internal gelsd")

import tensorflow as tf
from tensorflow import keras

import IPython.display as display
from PIL import Image

import cv2
from tqdm import tqdm

from scipy.io import loadmat
import scipy.io as spio

from functools import partial

from google.colab import files

import pandas as pd

# Import Data 

### Mount Google Drive (to get access to the images and metadata)

In [3]:
#download tar file
#!wget "https://data.vision.ee.ethz.ch/cvl/rrothe/imdb-wiki/static/imdb_crop.tar"

--2020-04-18 20:02:28--  https://data.vision.ee.ethz.ch/cvl/rrothe/imdb-wiki/static/imdb_crop.tar
Resolving data.vision.ee.ethz.ch (data.vision.ee.ethz.ch)... 129.132.52.162
Connecting to data.vision.ee.ethz.ch (data.vision.ee.ethz.ch)|129.132.52.162|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 7012157440 (6.5G) [application/x-tar]
Saving to: ‘imdb_crop.tar’

imdb_crop.tar         2%[                    ] 148.38M  11.7MB/s    eta 9m 8s  

KeyboardInterrupt: ignored

In [0]:
#!tar -xvf imdb_crop.tar

In [0]:
# Mount Google Drive into the Colab filesystem at /content/drive
from google.colab import drive
drive.mount('/content/drive')

In [0]:
%cd /content/drive/My Drive/Colab Notebooks/AgePredictor/AgePredictor/AgePredictorCNN

### Load metadata from the .mat file

In [0]:
# Read the IMDB metadata (.mat) into Python; squeeze_me=True drops unit dimensions
matfile = spio.loadmat(
    "/content/drive/My Drive/Colab Notebooks/AgePredictor/AgePredictor/AgePredictorCNN/imdb.mat",
    squeeze_me=True,
)

In [0]:
# Pull the date of birth and the year each photo was taken out of the 'imdb' struct
date_of_birth, photo_taken = (
    matfile['imdb'][field][()] for field in ('dob', 'photo_taken')
)

In [0]:
# Convert the dates of birth from day counts to calendar years.
# NOTE(review): 'dob' is assumed to be a MATLAB serial date number (days
# counted from year 0), as in the IMDB-WIKI metadata — confirm.
# Dividing by a flat 365 ignores leap days and yields birth years roughly
# 1.3 years too late (so ages come out too low); go through real calendar
# dates instead.
MATLAB_DATENUM_UNIX_EPOCH = 719529  # serial date number of 1970-01-01
days_since_epoch = np.asarray(date_of_birth).astype('int64') - MATLAB_DATENUM_UNIX_EPOCH
year_of_birth = (
    days_since_epoch
    .astype('datetime64[D]')   # days since 1970-01-01 -> calendar date
    .astype('datetime64[Y]')   # truncate to the calendar year
    .astype('int64')           # years since 1970
    + 1970
)
print(year_of_birth[:10])

In [0]:
# Label = age in years when the photo was taken (photo year minus birth year)
age_in_years = photo_taken - year_of_birth
ages = np.asarray(age_in_years).astype('int32')
print(ages[:6])

In [0]:
# Collect the metadata fields in a plain dict keyed by the future column names.
# 'face_score' is the face detector score (the higher the better); the clean-up
# step further down treats -inf as "no face detected".
mat_dict = {
    column: matfile['imdb'][field][()]
    for column, field in (
        ('date_of_birth',     'dob'),
        ('photo_taken',       'photo_taken'),
        ('img_path',          'full_path'),
        ('gender',            'gender'),
        ('name',              'name'),
        ('face_location',     'face_location'),
        ('face_score',        'face_score'),
        ('second_face_score', 'second_face_score'),
    )
}
mat_dict['age'] = ages  # computed labels go last, after the raw fields

In [0]:
# Turn the 'mat_dict' dictionary into a pandas DataFrame (one column per key)
mat_pd = pd.DataFrame(mat_dict)

### Explore metadata

In [0]:
# Peek at the first rows of the metadata table
mat_pd.head()

In [0]:
# Column dtypes and non-null counts
mat_pd.info()

In [0]:
# Summary statistics of the metadata columns
mat_pd.describe()

### Remove useless data

In [0]:
# How many rows match each of the clean-up criteria applied below
print("images with negative ages: ", mat_pd.loc[mat_pd['age'] < 0].count().max())
print("images with a second face: ", mat_pd.loc[mat_pd['second_face_score'].notnull()].count().max())
print("images without any face: ", mat_pd.loc[mat_pd['face_score'] == -np.inf].count().max())
print("images taken before 2000: ", mat_pd.loc[mat_pd['photo_taken'] < 2000].count().max())

In [0]:
def filter_metadata(data_pd):
    """Return a copy of ``data_pd`` without the rows that are unusable for training.

    Rows are dropped, one rule after the other, when they have
    a negative age, a non-null 'second_face_score', a 'face_score' of -inf,
    or a 'photo_taken' year before 2000. The input frame is left untouched.
    """
    rejection_rules = (
        lambda frame: frame['age'] < 0,                        # negative age
        lambda frame: frame['second_face_score'].notnull(),    # a second face was scored
        lambda frame: frame['face_score'] == -np.inf,          # no face score
        lambda frame: frame['photo_taken'] < 2000,             # photo taken before 2000
    )

    kept = data_pd
    for is_rejected in rejection_rules:
        rejected_index = kept[is_rejected(kept)].index
        kept = kept.drop(rejected_index)

    return kept

In [0]:
# Apply the clean-up rules; the unfiltered table stays available as mat_pd
mat_pd_trim = filter_metadata(mat_pd)

In [0]:
# Sanity check after filtering: each of these counts should now be 0
print("images with negative ages: ",    mat_pd_trim[mat_pd_trim['age'] < 0].count().max())
print("images with a second face: ",    mat_pd_trim[mat_pd_trim['second_face_score'].notnull()].count().max())
print("images without any face: ",      mat_pd_trim[mat_pd_trim['face_score'] == -np.inf].count().max())
# This line previously reused the "without any face" label for the year check
print("images taken before 2000: ",     mat_pd_trim[mat_pd_trim['photo_taken'] < 2000].count().max())

In [0]:
# Row count and column dtypes after filtering
mat_pd_trim.info()

### Get even distribution of data

In [0]:
# Mean age of the remaining samples
mat_pd_trim['age'].mean()

### Load Images from directories with label paths

In [0]:
data_size = 10000
training_data = []
skipped_paths = []  # (path, error) for every image that could not be loaded
mat_pd_numpy = mat_pd_trim['img_path'].to_numpy()
# Iterate over the image paths themselves; this previously read from `test`,
# a name that is only defined much further down and holds a tf dataset.
mat_pd_str = [str(x) for x in mat_pd_numpy]
img_path_small = mat_pd_str[:data_size] # use only subset of data, to speed workflow
DATADIR = "/content/drive/My Drive/Colab Notebooks/AgePredictor/AgePredictor/AgePredictorCNN/imdb_crop"
IMG_SIZE = 224

def create_training_data():
    """Load, colour-convert and resize every image listed in img_path_small.

    Each image is read with OpenCV, converted from BGR to RGB, resized to
    IMG_SIZE x IMG_SIZE and appended to the module-level ``training_data``
    list, wrapped in a one-element list. Images that fail at any step are
    not added; their path and error are recorded in ``skipped_paths``.
    """
    for path in tqdm(img_path_small):  # iterate through each path given in .mat file

        path = os.path.join(DATADIR, path)  # full path below the image root folder

        try:
            img_array = cv2.imread(path) # read image file into an array
            img_array_color = cv2.cvtColor(img_array, cv2.COLOR_BGR2RGB) # convert bgr image to rgb image
            new_array = cv2.resize(img_array_color, (IMG_SIZE, IMG_SIZE))  # resize to normalize data size
        except Exception as e:
            # Keep going (best effort), but remember what was skipped instead
            # of silently dropping it.
            skipped_paths.append((path, e))
            continue

        training_data.append([new_array])   # add image array to our training_data

create_training_data()
print(len(training_data))
if skipped_paths:
    # Labels are matched to images purely by position later on, so every
    # skipped image shifts the labels of all images after it.
    print("WARNING:", len(skipped_paths), "images could not be loaded;",
          "labels taken by position no longer line up with training_data")

In [0]:
a = [[[1, 1, 1]], [[4, 4, 4]]] # illustration of training_data's layout: [1, 1, 1] stands for one image
# Every entry of training_data is a one-element list; keep just the image inside it
training_data_final = [wrapped_img[0] for wrapped_img in training_data]

In [0]:
# Age labels of the filtered metadata, in row order
labels = mat_pd_trim['age'].to_numpy()
# Number of labels available
len(labels)

### Visualize Data

In [0]:
def show_batch(image_batch, label_batch):
    """Plot up to 25 images in a 5x5 grid, each titled with its label.

    Args:
        image_batch: indexable collection of images accepted by ``plt.imshow``.
        label_batch: indexable collection of labels, aligned with ``image_batch``.

    Batches with fewer than 25 entries are plotted in full instead of
    raising an IndexError.
    """
    plt.figure(figsize=(10,10))
    n_shown = min(25, len(image_batch), len(label_batch))
    for n in range(n_shown):
        plt.subplot(5, 5, n + 1)
        plt.imshow(image_batch[n])
        plt.title(label_batch[n])
        plt.axis('off')

In [0]:
# First data_size labels, i.e. as many labels as image paths were selected
labels_small = labels[:data_size] 
# Preview samples starting at index 500 (show_batch plots the first 25 of each slice)
show_batch(training_data_final[500:], labels[500:])

### Split Training and Test Data

In [0]:
# Positional 75% / 15% / 10% split into training, validation and test parts
train_end = int(data_size * 0.75)
valid_end = int(data_size * 0.90)

X_train = training_data_final[:train_end]
X_valid = training_data_final[train_end:valid_end]
X_test  = training_data_final[valid_end:]

y_train = labels_small[:train_end]
y_valid = labels_small[train_end:valid_end]
y_test  = labels_small[valid_end:]

classes = list(map(str, range(1, 102))) # class names "1".."101"; their count = number of output neurons

In [0]:
# Combine images with labels and convert list to tf.datasets
train_dataset = tf.data.Dataset.from_tensor_slices((X_train, y_train))
valid_dataset = tf.data.Dataset.from_tensor_slices((X_valid, y_valid)) 
test_dataset  = tf.data.Dataset.from_tensor_slices((X_test,  y_test)) 

In [0]:
#train_dataset = tf.data.Dataset.from_tensors((X_train, y_train))
#valid_dataset = tf.data.Dataset.from_tensors((X_valid, y_valid)) 
#test_dataset  = tf.data.Dataset.from_tensors((X_test,  y_test)) 

In [0]:
#train_zip = tf.data.Dataset.zip((X_train, y_train))
#batched_dataset = dataset.batch(4, drop_remainder=True)

In [0]:
def preprocess(image, label):
    """Prepare one (image, age label) pair for the Xception-based model.

    The label is clamped to the range [1, 100]. The image is resized to
    224x224 and passed through Xception's own input preprocessing.

    Returns:
        The tuple ``(final_image, label)``.
    """
    # tf.clip_by_value is a regular tensor op, so the clamp no longer relies
    # on AutoGraph rewriting Python conditionals on tensors inside Dataset.map.
    label = tf.clip_by_value(label, 1, 100)
    resized_image = tf.image.resize(image, [224, 224])
    final_image = keras.applications.xception.preprocess_input(resized_image)
    return final_image, label

In [0]:
# Show the dataset's repr, then try preprocess on a single element.
# NOTE(review): `test` is the name the image-loading cell above iterates over
# before it is defined here — looks like leftover kernel state; confirm.
print(train_dataset)
test = train_dataset.take(1).map(preprocess)

In [0]:
dataset_size = len(training_data)
batch_size = 32

# Training stream: shuffled with a 1000-element buffer and repeated
# indefinitely, then preprocessed, batched and prefetched
train_set = (
    train_dataset
    .shuffle(1000)
    .repeat()
    .map(preprocess)
    .batch(batch_size)
    .prefetch(1)
)

# Validation and test streams: one ordered pass each through the same steps
valid_set, test_set = (
    split.map(preprocess).batch(batch_size).prefetch(1)
    for split in (valid_dataset, test_dataset)
)

In [0]:
# Drop the names of the raw datasets; only the batched *_set pipelines are used from here on
del train_dataset, valid_dataset, test_dataset

# Train Models

### Classification Model

In [0]:
#base_model = keras.applications.xception.Xception(weights="imagenet",
#                                                 include_top=False)
#avg = keras.layers.GlobalAveragePooling2D()(base_model.output)
#output = keras.layers.Dense(len(classes), activation="softmax")(avg)
#model = keras.models.Model(inputs=base_model.input, outputs=output)

In [0]:
# freeze weights of pretrained layers at the beginning of training
#for layer in base_model.layers:
#    layer.trainable = False

#optimizer = keras.optimizers.SGD(lr=0.2, momentum=0.9, decay=0.01)
#model.compile(loss="sparse_categorical_crossentropy", optimizer=optimizer,
#              metrics=["accuracy"])
#history = model.fit(train_set,
#                    steps_per_epoch=int(0.75 * dataset_size / batch_size),
#                    validation_data=valid_set,
#                    validation_steps=int(0.15 * dataset_size / batch_size),
#                    epochs=5)

In [0]:
# unfreeze weights of pretrained layers, also decrease lr so that weights of pretrained layers don't get destroyed
#for layer in base_model.layers:
#    layer.trainable = True

#optimizer = keras.optimizers.SGD(lr=0.01, momentum=0.9, nesterov=True, decay=0.001)
#model.compile(loss="sparse_categorical_crossentropy", optimizer=optimizer,
#              metrics=["accuracy"])
#history = model.fit(train_set,
#                    steps_per_epoch=int(0.75 * dataset_size / batch_size),
#                    validation_data=valid_set,
#                    validation_steps=int(0.15 * dataset_size / batch_size),
#                    epochs=40)

### Regression Model

In [0]:
# Regression model: ImageNet-pretrained Xception without its classification
# top, followed by global average pooling, a 128-unit ReLU layer and a single
# linear output unit (the predicted age).
base_model_reg = keras.applications.xception.Xception(weights="imagenet",
                                                  include_top=False)
avg_reg = keras.layers.GlobalAveragePooling2D()(base_model_reg.output)
# NOTE(review): the pooled output is presumably already one flat vector per
# image, which would make this Flatten a no-op — confirm.
flatten = keras.layers.Flatten()(avg_reg)
hidden = keras.layers.Dense(128, activation="relu")(flatten)
output_reg = keras.layers.Dense(1)(hidden)
model_reg = keras.models.Model(inputs=base_model_reg.input, outputs=output_reg)

In [0]:
# Stage 1: freeze the weights of the pretrained layers so that only the newly
# added layers are trained at the beginning
for layer in base_model_reg.layers:
    layer.trainable = False

# NOTE(review): `lr` is presumably the legacy spelling of `learning_rate`, and
# 0.2 looks very large for RMSprop — confirm both against the installed Keras.
optimizer = keras.optimizers.RMSprop(lr=0.2, momentum=0.9, decay=0.01)
# Compile after changing `trainable` so the setting is picked up
model_reg.compile(loss="mse", 
                  optimizer=optimizer,
                  metrics=["mae"])
# train_set repeats indefinitely, so the number of steps per epoch is given
# explicitly: 75% of the samples for training, 15% for validation
history = model_reg.fit(train_set,
                        steps_per_epoch=int(0.75 * dataset_size / batch_size),
                        validation_data=valid_set,
                        validation_steps=int(0.15 * dataset_size / batch_size),
                        epochs=5)

In [0]:
# Stage 2: unfreeze the weights of the pretrained layers and use a lower
# learning rate so that the pretrained weights don't get destroyed
for layer in base_model_reg.layers:
    layer.trainable = True

# Stop once the monitored metric has not improved for 8 epochs and restore the
# best weights seen so far
early_stopping_cb = keras.callbacks.EarlyStopping(patience=8, restore_best_weights=True)
# NOTE(review): `lr` is presumably the legacy spelling of `learning_rate` — confirm
optimizer = keras.optimizers.SGD(lr=0.01, momentum=0.9, nesterov=True, decay=0.001)
# Recompile so that the changed `trainable` flags take effect
model_reg.compile(loss="mse", optimizer=optimizer,
              metrics=["mae"])
# Note: `history` from the first training stage is overwritten here
history = model_reg.fit(train_set,
                    steps_per_epoch=int(0.75 * dataset_size / batch_size),
                    validation_data=valid_set,
                    validation_steps=int(0.15 * dataset_size / batch_size),
                    callbacks=[early_stopping_cb],
                    epochs=40)

### Evaluating Model

In [0]:
# Persist the trained regression model under the name "Regression_Transfer"
model_reg.save("Regression_Transfer")

In [0]:
from sklearn.metrics import mean_squared_error

# Root-mean-squared error of the regression model on the test split
test_predictions = model_reg.predict(test_set)
final_mse = mean_squared_error(y_true=y_test, y_pred=test_predictions)
final_rmse = np.sqrt(final_mse)
print(final_rmse)

In [0]:
# Position (within the test split) of the sample to inspect
img_index = 6

# Predicted age, then the label stored for the same position, then the image itself
print(test_predictions[img_index])
print(y_test[img_index])
imgplot = plt.imshow(X_test[img_index])