In [1]:
# LIBRARIES
import keras
import numpy as np
import pandas as pd
from sklearn.model_selection import KFold
from sklearn.metrics import roc_curve
from keras import backend as K
from keras.models import Sequential
from keras.layers import Activation
from keras.layers.core import Dense, Flatten
from keras.optimizers import Adam
from keras.metrics import categorical_crossentropy
from keras.preprocessing.image import ImageDataGenerator
from keras.layers.normalization import BatchNormalization
from keras.layers.convolutional import *
from matplotlib import pyplot as plt
from sklearn.metrics import confusion_matrix
import itertools
import os

%matplotlib inline

In [2]:
# Function to get all the gender classes in our dataset and check the gender of every one of the 100 subjects
def get_gender_classes(id_classes):
    gender_classes = []
    gender_dict = {}
    for id_class in id_classes:
        tmp_file_name = 'awe\\' + id_class + '\\annotations.json'
        tmp_file = open(tmp_file_name, 'r')
        file_lines = tmp_file.readlines()
        for line in file_lines:
            if ('"gender"') in line:
                gender = line.split('"gender": ')[1].replace('\n', '').replace('"', '')
                gender_dict[id_class] = gender
                if gender not in gender_classes:
                    gender_classes.append(gender)
    return gender_classes, gender_dict

In [3]:
# Function that converts the ordinal number of some subject to the actual id_class of that subject in the dataset
def int_to_class(integer):
    tmp_class = str(integer)
    for k in range(3 - len(tmp_class)):
        tmp_class = '0' + tmp_class
    return tmp_class

In [4]:
def get_model_name(n):
    return 'model_' + str(n) + '.h5'

In [5]:
cd = os.getcwd()

In [6]:
os.listdir(os.getcwd())

['.ipynb_checkpoints',
 'awe',
 'awe_dataset',
 'ear_recognition.ipynb',
 'ibb_environment.yml',
 'models',
 'README.md',
 'report',
 'test',
 'train']

In [7]:
vgg16_model = keras.applications.vgg16.VGG16()

Downloading data from https://storage.googleapis.com/tensorflow/keras-applications/vgg16/vgg16_weights_tf_dim_ordering_tf_kernels.h5


In [8]:
vgg16_model.summary()

Model: "vgg16"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         [(None, 224, 224, 3)]     0         
_________________________________________________________________
block1_conv1 (Conv2D)        (None, 224, 224, 64)      1792      
_________________________________________________________________
block1_conv2 (Conv2D)        (None, 224, 224, 64)      36928     
_________________________________________________________________
block1_pool (MaxPooling2D)   (None, 112, 112, 64)      0         
_________________________________________________________________
block2_conv1 (Conv2D)        (None, 112, 112, 128)     73856     
_________________________________________________________________
block2_conv2 (Conv2D)        (None, 112, 112, 128)     147584    
_________________________________________________________________
block2_pool (MaxPooling2D)   (None, 56, 56, 128)       0     

In [9]:
model = keras.Sequential()
for layer in vgg16_model.layers:
    model.add(layer)

In [10]:
model._layers.pop()

<tensorflow.python.keras.layers.core.Dense at 0x1e0e0467c40>

In [11]:
for layer in model.layers:
    layer.trainable = False

In [12]:
model.add(keras.layers.Dense(2, activation='softmax'))

In [13]:
predictions_layer = model._layers[-2]

In [14]:
model._layers.remove(predictions_layer)

In [15]:
model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
block1_conv1 (Conv2D)        (None, 224, 224, 64)      1792      
_________________________________________________________________
block1_conv2 (Conv2D)        (None, 224, 224, 64)      36928     
_________________________________________________________________
block1_pool (MaxPooling2D)   (None, 112, 112, 64)      0         
_________________________________________________________________
block2_conv1 (Conv2D)        (None, 112, 112, 128)     73856     
_________________________________________________________________
block2_conv2 (Conv2D)        (None, 112, 112, 128)     147584    
_________________________________________________________________
block2_pool (MaxPooling2D)   (None, 56, 56, 128)       0         
_________________________________________________________________
block3_conv1 (Conv2D)        (None, 56, 56, 256)       2

In [16]:
model.compile(Adam(lr=.0001), loss='categorical_crossentropy', metrics=['accuracy', keras.metrics.AUC()])

In [17]:
# Get the different classes by ID in the dataset
id_classes = []
for i in range(1, 101):
    tmp_class = int_to_class(i)
    id_classes.append(tmp_class)

In [18]:
gender_classes, gender_dict = get_gender_classes(id_classes)

In [21]:
# Store the labels ('m' or 'f') for each one of the images from the training set into a dataframe
training_labels = pd.DataFrame()
image_names = os.listdir(cd + '\\train')
for name in image_names:
    new_name = name.split('(')[1].replace(').png', '')
    new_name = int(new_name)
    tmp_class = int_to_class(new_name)
    tmp_series = pd.Series({'image': name, 'label': gender_dict[tmp_class]}).to_frame().T
    training_labels = pd.concat([training_labels, tmp_series])
    
training_labels.index = range(700)
training_labels

Unnamed: 0,image,label
0,01 (1).png,m
1,01 (10).png,m
2,01 (100).png,m
3,01 (11).png,m
4,01 (12).png,m
...,...,...
695,07 (95).png,m
696,07 (96).png,m
697,07 (97).png,m
698,07 (98).png,m


In [31]:
# Prepare training data and initialize kfold cross-validation
train_data = training_labels
Y = train_data['label']
kfold = KFold(n_splits=5)

In [23]:
# Creating an instance of the ImageDataGenerator class
image_data_gen = ImageDataGenerator(width_shift_range=0.1,
                         height_shift_range=0.1,
                         zoom_range=0.3,
                         fill_mode='nearest',
                         horizontal_flip = True,
                         rescale=1./255)

In [32]:
# 5-fold cross-validation of our model
validation_accuracy, validation_loss, validation_auc = [], [], []

fold_var = 1
save_dir = cd + '\\models\\'

for train_index, val_index in kfold.split(np.zeros(700), Y):
    training_data = train_data.iloc[train_index]
    validation_data = train_data.iloc[val_index]
    train_data_generator = image_data_gen.flow_from_dataframe(training_data, directory = cd + '\\train',
                                                              x_col = "image", y_col = "label",
                                                              class_mode = "categorical", target_size=(224, 224),
                                                              shuffle = True)
    valid_data_generator  = image_data_gen.flow_from_dataframe(validation_data, directory = cd + '\\train',
                                                               x_col = "image", y_col = "label",
                                                               class_mode = "categorical", target_size=(224, 224),
                                                               shuffle = True)
    current_model = model
  
    checkpoint = keras.callbacks.ModelCheckpoint(save_dir + get_model_name(fold_var), 
                                                 monitor=['val_accuracy', 'val_auc'], verbose=2, 
                                                 save_best_only=False, mode='max')
    callbacks_list = [checkpoint]
    
    # Training the model
    current_model.fit(train_data_generator,
                      epochs=5,
                      callbacks=callbacks_list,
                      validation_data=valid_data_generator)
  
    current_model.load_weights(cd + "\\models\\model_" + str(fold_var) + ".h5")
    
    # Validating the model
    results = current_model.evaluate(valid_data_generator)
    results = dict(zip(current_model.metrics_names, results))
    
    # Calculation of various metrics for the model
    validation_accuracy.append(results['accuracy'])
    validation_loss.append(results['loss'])
    validation_auc.append(results['auc'])

    K.clear_session()
    
    fold_var += 1

Found 560 validated image filenames belonging to 2 classes.
Found 140 validated image filenames belonging to 2 classes.
Epoch 1/5
Epoch 00001: saving model to C:\Users\User\Desktop\Fakultet\Slikovna_biometrija\Assignment_3\assignment_3_final\models\model_1.h5
Epoch 2/5
Epoch 00002: saving model to C:\Users\User\Desktop\Fakultet\Slikovna_biometrija\Assignment_3\assignment_3_final\models\model_1.h5
Epoch 3/5
Epoch 00003: saving model to C:\Users\User\Desktop\Fakultet\Slikovna_biometrija\Assignment_3\assignment_3_final\models\model_1.h5
Epoch 4/5
Epoch 00004: saving model to C:\Users\User\Desktop\Fakultet\Slikovna_biometrija\Assignment_3\assignment_3_final\models\model_1.h5
Epoch 5/5

KeyboardInterrupt: 

In [23]:
validation_loss

[0.6760216951370239,
 0.6643847823143005,
 0.6459063291549683,
 0.6473250389099121,
 0.6232863664627075]

In [24]:
validation_accuracy

[0.949999988079071,
 0.8999999761581421,
 0.949999988079071,
 0.8357142806053162,
 0.9142857193946838]

In [25]:
validation_auc

[0.9500000476837158,
 0.9007142782211304,
 0.9507142901420593,
 0.8357143402099609,
 0.9136734008789062]

In [25]:
# Store the labels ('m' or 'f') for each one of the images from the test set into a dataframe
test_labels = pd.DataFrame()
image_names = os.listdir(cd + '\\test')
for name in image_names:
    new_name = name.split('(')[1].replace(').png', '')
    new_name = int(new_name)
    tmp_class = int_to_class(new_name)
    tmp_series = pd.Series({'image': name, 'label': gender_dict[tmp_class]}).to_frame().T
    test_labels = pd.concat([test_labels, tmp_series])
    
test_labels.index = range(300)
test_labels

Unnamed: 0,image,label
0,08 (1).png,m
1,08 (10).png,m
2,08 (100).png,m
3,08 (11).png,m
4,08 (12).png,m
...,...,...
295,10 (95).png,m
296,10 (96).png,m
297,10 (97).png,m
298,10 (98).png,m


In [26]:
test_data = test_labels
Y = test_data['label']

In [27]:
test_data_generator = image_data_gen.flow_from_dataframe(test_data, directory = cd + '\\test',
                                                         x_col = "image", y_col = "label",
                                                         class_mode = "categorical", target_size=(224, 224),
                                                         shuffle = True)

Found 300 validated image filenames belonging to 2 classes.


In [29]:
model.load_weights(cd + "\\models\\model_" + str(5) + ".h5")

In [30]:
# Doing prediction on the test data
Y_predicted = model.predict(test_data_generator)

In [31]:
Y_predicted = np.argmax(Y_predicted, axis=1)

In [32]:
Y_predicted = pd.Series(Y_predicted)
Y_predicted[Y_predicted == 'm'] = 1
Y_predicted[Y_predicted == 'f'] = 0
Y_predicted = Y_predicted.astype('int64')

In [28]:
Y[Y == 'm'] = 1
Y[Y == 'f'] = 0
Y = Y.astype('int64')

In [38]:
# Calculation of the accuracy of the model on the test set
test_accuracy = (Y == Y_predicted).sum() / len(Y)
test_accuracy

0.91

In [82]:
fpr_keras, tpr_keras, thresholds_keras = roc_curve(Y.values, Y_predicted, pos_label=None)

ValueError: ignored