In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
pip install 'h5py==2.10.0' --force-reinstall

Collecting h5py==2.10.0
  Downloading h5py-2.10.0-cp37-cp37m-manylinux1_x86_64.whl (2.9 MB)
[K     |████████████████████████████████| 2.9 MB 4.1 MB/s 
[?25hCollecting six
  Downloading six-1.16.0-py2.py3-none-any.whl (11 kB)
Collecting numpy>=1.7
  Downloading numpy-1.21.1-cp37-cp37m-manylinux_2_12_x86_64.manylinux2010_x86_64.whl (15.7 MB)
[K     |████████████████████████████████| 15.7 MB 75 kB/s 
[?25hInstalling collected packages: six, numpy, h5py
  Attempting uninstall: six
    Found existing installation: six 1.15.0
    Uninstalling six-1.15.0:
      Successfully uninstalled six-1.15.0
  Attempting uninstall: numpy
    Found existing installation: numpy 1.19.5
    Uninstalling numpy-1.19.5:
      Successfully uninstalled numpy-1.19.5
  Attempting uninstall: h5py
    Found existing installation: h5py 3.1.0
    Uninstalling h5py-3.1.0:
      Successfully uninstalled h5py-3.1.0
[31mERROR: pip's dependency resolver does not currently take into account all the packages that are ins

In [3]:
%tensorflow_version 1.x

### Import libs
from __future__ import division
from __future__ import print_function
from __future__ import absolute_import
import random
import pprint
import sys
import time
import math
import numpy as np
import pickle
import cv2
import copy
from matplotlib import pyplot as plt
import pandas as pd
import os

import tensorflow as tf
from tensorflow.keras import backend as K
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.layers import Flatten, Dense, Input, Conv2D, MaxPooling2D, Dropout, TimeDistributed, Layer
from tensorflow.keras.losses import categorical_crossentropy
from tensorflow.keras.models import Model
from tensorflow.keras.utils import Progbar

from google.colab.patches import cv2_imshow

TensorFlow 1.x selected.


In [9]:
#### Config setting
class Config:

    def __init__(self):
        # Tamaños de anchores
        self.anchor_box_scales = [32, 64, 128]

        # Ratios de anchores
        self.anchor_box_ratios = [[1, 1], [1./math.sqrt(2), 2./math.sqrt(2)], [2./math.sqrt(2), 1./math.sqrt(2)]]

        # Tamaño a redimensionar la dimension más pequeña de la imagen
#        self.im_size = 600

        # numero de ROIs procesados simultáneamente
        self.num_rois = 4

        # stride para el modelo RPN (modelo base VGG16)
        self.rpn_stride = 16

        # scaling the stdev
        self.std_scaling = 4.0
        self.classifier_regr_std = [8.0, 8.0, 4.0, 4.0]

        # threshold para el modelo RPN
        self.rpn_min_overlap = 0.3
        self.rpn_max_overlap = 0.7

        # threshold para el clasificador final
        self.classifier_min_overlap = 0.1
        self.classifier_max_overlap = 0.5

        # codificación de las clases
        self.class_mapping = None
        self.model_path = None

# From https://stackoverflow.com/questions/44650888/resize-an-image-without-distortion-opencv
def image_resize(image, width = None, height = None, inter = cv2.INTER_AREA):
    # initialize the dimensions of the image to be resized and
    # grab the image size
    dim = None
    (h, w) = image.shape[:2]

    # if both the width and height are None, then return the
    # original image
    if width is None and height is None:
        return image

    # check to see if the width is None
    if width is None:
        # calculate the ratio of the height and construct the
        # dimensions
        r = height / float(h)
        dim = (int(w * r), height)

    # otherwise, the height is None
    else:
        # calculate the ratio of the width and construct the
        # dimensions
        r = width / float(w)
        dim = (width, int(h * r))

    # resize the image
    resized = cv2.resize(image, dim, interpolation = inter)

    # return the resized image
    return resized

#### Parser los datos del ficheros de anotaciones
def get_data(input_path):
    all_imgs = {} # informacion de las imagenes extraida del fichero de anotaciones y agrupada por imagen
    classes_count = {} # cantidad de objetos de cada clase
    class_mapping = {} # codificacion del nombre de cada clase
    filename_path = "/content/drive/MyDrive/TFM/TrainFasterRCNN/" # Use this as prefix if different from None

    i = 1
    with open(input_path,'r') as f:
        for line in f:
            sys.stdout.write('\r'+'idx=' + str(i))
            i += 1

            line_split = line.strip().split(',')
            # Una misma imagen puede contener varias clases y anotaciones
            (filename,x1,y1,x2,y2,class_name) = line_split

            x1 = int(x1)
            x2 = int(x2)
            y1 = int(y1)
            y2 = int(y2)

            # Change new dataset class name to old one to preserve old training
            if class_name == 'bib_number':
              class_name = 'dorsal'

            if class_name not in classes_count:
                classes_count[class_name] = 1
            else:
                classes_count[class_name] += 1

            if class_name not in class_mapping: # evitar clase con nombre 'bg'
                class_mapping[class_name] = len(class_mapping)

            img = None

            if filename not in all_imgs:
                all_imgs[filename] = {}

                img = cv2.imread(filename if filename_path is None else os.path.join(filename_path, filename))
                
                if img is not None:   
                  img_shape = img.shape[:2]

                  # Reshape image size if it is larger than 1920x1080
                  if img_shape[0] > 1080 or img_shape[1] > 1920:
                    reshape_img = image_resize(img, 1920, 1080)

                    reshaped_img_shape = reshape_img.shape[:2]

                    scale = np.flipud(np.divide(reshaped_img_shape, img_shape))

                    x1, y1 = np.multiply((x1, y1), scale)
                    x2, y2 = np.multiply((x2, y2), scale)

                    x1 = int(x1)
                    x2 = int(x2)
                    y1 = int(y1)
                    y2 = int(y2)

                    img = reshape_img

                  (rows,cols) = img.shape[:2]
                  all_imgs[filename]['filepath'] = filename if filename_path is None else os.path.join(filename_path, filename)
                  all_imgs[filename]['width'] = cols
                  all_imgs[filename]['height'] = rows
                  all_imgs[filename]['bboxes'] = []

                  # cv2.rectangle(img, (x1, y1), (x2, y2), (0, 255, 0), 2)
                  # cv2_imshow(img)

            if img is not None:
              all_imgs[filename]['bboxes'].append({'class': class_name, 'x1': x1, 'x2': x2, 'y1': y1, 'y2': y2})

        all_data = []
        for key in all_imgs:
            all_data.append(all_imgs[key])

        return all_data, classes_count, class_mapping

#### Definicion ROI Pooling Convolutional Layer
class RoiPoolingConv(Layer):
    def __init__(self, pool_size, num_rois, **kwargs):
        self.dim_ordering = K.image_data_format()
        self.pool_size = pool_size
        self.num_rois = num_rois

        super(RoiPoolingConv, self).__init__(**kwargs)

    def build(self, input_shape):
        self.nb_channels = input_shape[0][3]

    def compute_output_shape(self, input_shape):
        return None, self.num_rois, self.pool_size, self.pool_size, self.nb_channels

    def call(self, x, mask=None):
        assert(len(x) == 2)

        # x[0] is image with shape (rows, cols, channels)
        img = x[0]
        # x[1] is roi with shape (num_rois,4) with ordering (x,y,w,h)
        rois = x[1]

        outputs = []
        for roi_idx in range(self.num_rois):
            x = rois[0, roi_idx, 0]
            y = rois[0, roi_idx, 1]
            w = rois[0, roi_idx, 2]
            h = rois[0, roi_idx, 3]

            x = K.cast(x, 'int32')
            y = K.cast(y, 'int32')
            w = K.cast(w, 'int32')
            h = K.cast(h, 'int32')

            # Resized roi of the image to pooling size (7x7)
            rs = tf.image.resize(img[:, y:y+h, x:x+w, :], (self.pool_size, self.pool_size))
            outputs.append(rs)

        final_output = K.concatenate(outputs, axis=0)

        # Reshape to (1, num_rois, pool_size, pool_size, nb_channels)
        # Might be (1, 4, 7, 7, 3)
        final_output = K.reshape(final_output, (1, self.num_rois, self.pool_size, self.pool_size, self.nb_channels))

        final_output = K.permute_dimensions(final_output, (0, 1, 2, 3, 4))

        return final_output

    def get_config(self):
        config = {'pool_size': self.pool_size, 'num_rois': self.num_rois}
        base_config = super(RoiPoolingConv, self).get_config()
        return dict(list(base_config.items()) + list(config.items()))

#### Vgg-16 modelo base

# dimensiones del feature map
def get_img_output_length(width, height):
    def get_output_length(input_length):
        return input_length//16

    return get_output_length(width), get_output_length(height)

# modelo base
def nn_base(input_tensor=None):

    if input_tensor is None:
        img_input = Input(shape=(None, None, 3))
    else:
        img_input = input_tensor

    # Block 1
    x = Conv2D(64, (3, 3), activation='relu', padding='same', name='block1_conv1')(img_input)
    x = Conv2D(64, (3, 3), activation='relu', padding='same', name='block1_conv2')(x)
    x = MaxPooling2D((2, 2), strides=(2, 2), name='block1_pool')(x)

    # Block 2
    x = Conv2D(128, (3, 3), activation='relu', padding='same', name='block2_conv1')(x)
    x = Conv2D(128, (3, 3), activation='relu', padding='same', name='block2_conv2')(x)
    x = MaxPooling2D((2, 2), strides=(2, 2), name='block2_pool')(x)

    # Block 3
    x = Conv2D(256, (3, 3), activation='relu', padding='same', name='block3_conv1')(x)
    x = Conv2D(256, (3, 3), activation='relu', padding='same', name='block3_conv2')(x)
    x = Conv2D(256, (3, 3), activation='relu', padding='same', name='block3_conv3')(x)
    x = MaxPooling2D((2, 2), strides=(2, 2), name='block3_pool')(x)

    # Block 4
    x = Conv2D(512, (3, 3), activation='relu', padding='same', name='block4_conv1')(x)
    x = Conv2D(512, (3, 3), activation='relu', padding='same', name='block4_conv2')(x)
    x = Conv2D(512, (3, 3), activation='relu', padding='same', name='block4_conv3')(x)
    x = MaxPooling2D((2, 2), strides=(2, 2), name='block4_pool')(x)

    # Block 5
    x = Conv2D(512, (3, 3), activation='relu', padding='same', name='block5_conv1')(x)
    x = Conv2D(512, (3, 3), activation='relu', padding='same', name='block5_conv2')(x)
    x = Conv2D(512, (3, 3), activation='relu', padding='same', name='block5_conv3')(x)

    return x

####  modelo RPN
def rpn_layer(base_layers, num_anchors):
    x = Conv2D(512, (3, 3), padding='same', activation='relu', kernel_initializer='normal', name='rpn_conv1')(base_layers)

    x_class = Conv2D(num_anchors, (1, 1), activation='sigmoid', kernel_initializer='uniform', name='rpn_out_class')(x)
    x_regr = Conv2D(num_anchors * 4, (1, 1), activation='linear', kernel_initializer='zero', name='rpn_out_regress')(x)

    return [x_class, x_regr, base_layers]

####  modelo clasificador final
def classifier_layer(base_layers, input_rois, num_rois, nb_classes):
    pooling_regions = 7

    # TimeDistributed layers se utiliza para procesar ROIs de forma independiente.
    # Se indica el número de ROIs de entrada añadiendo una dimensión mas (num_rois)
    # out_roi_pool es una lista de 4 RoI (7x7x512)
    # out_roi_pool.shape = (1, num_rois, pool_size, pool_size, channels)
    out_roi_pool = RoiPoolingConv(pooling_regions, num_rois)([base_layers, input_rois])

    # Flatten out_roi_pool y conectar a 2 Fully-Connected y 2 dropout layers
    out = TimeDistributed(Flatten(name='flatten'))(out_roi_pool)
    out = TimeDistributed(Dense(4096, activation='relu', name='fc1'))(out)
    out = TimeDistributed(Dropout(0.5))(out)
    out = TimeDistributed(Dense(4096, activation='relu', name='fc2'))(out)
    out = TimeDistributed(Dropout(0.5))(out)

    # out_class: prediccion de la clase del objeto
    out_class = TimeDistributed(Dense(nb_classes, activation='softmax', kernel_initializer='zero'), name='dense_class_{}'.format(nb_classes))(out)
    # out_regr: prediccion de las coordenadas de los bboxes
    out_regr = TimeDistributed(Dense(4*(nb_classes-1), activation='linear', kernel_initializer='zero'), name='dense_regress_{}'.format(nb_classes))(out)

    return [out_class, out_regr]

#### Calculo IoU (Intersection of Union)
def union(au, bu, area_intersection):
    area_a = (au[2] - au[0]) * (au[3] - au[1])
    area_b = (bu[2] - bu[0]) * (bu[3] - bu[1])
    area_union = area_a + area_b - area_intersection
    return area_union

def intersection(ai, bi):
    x = max(ai[0], bi[0])
    y = max(ai[1], bi[1])
    w = min(ai[2], bi[2]) - x
    h = min(ai[3], bi[3]) - y
    if w < 0 or h < 0:
        return 0
    return w*h

def iou(a, b):
    if a[0] >= a[2] or a[1] >= a[3] or b[0] >= b[2] or b[1] >= b[3]:
        return 0.0

    area_i = intersection(a, b)
    area_u = union(a, b, area_i)
    return float(area_i) / float(area_u + 1e-6)

#### Calcula propuesta de anchores 'foreground' and 'bg' de la imagen para el entrenamiento del modelo RPN
#def calc_rpn(C, img_data, width, height, resized_width, resized_height, img_length_calc_function):
def calc_rpn(C, img_data, width, height, img_length_calc_function):
    # Por cada punto del feature map obtiene los 9 (3 tamaños*3 ratios) anchores correspondientes en la imagen
    # Por cada anchor calcula el IoU con cada ground-truth bbox, y guarda el mejor anchor con cada gt bbox
    # Del mejor anchor en cada momento con cada gt bbox se almacena:
    #     best_x_for_gt_bbox: coordenadas (x1,x2,y1,y2) del anchor
    #     best_dx_for_gt_bbbox: deltas (diferencias) entre el anchor y el gt bbox
    #     best_iou_for_gt_bbox: IoU del mejor anchor
    #     best_anchor_for_gt_bbox: su punto (x,y) correspondiente en el feature map, tamaño y ratio del anchor
    # Cada mejor anchor se clasifica como 'pos' (IoU>0.7), 'neutral' o 'neg' (IoU<0.3), y se crean las estructuras:
    #     y_rpn_overlap (_x_x9): indica si el anchor se superpone con algún gt bbox de la imagen (si IoU>0.7)
    #     y_is_box_valid (_x_x9): indica si el anchor es 'pos' o 'neg' (se considera objeto o 'bg')
    #     y_rpn_regr (_x_x36): almacena los deltas (tx,ty,tw,th) de los anchores 'pos'
    # Si para un gt bbox no hay 'pos' anchor se selecciona el mejor de los existentes (sea cual sea su IoU)
    #
    # De las estructuras "y_is_box_valid" e "y_rpn_overlap" se extraen anchores 'pos' y 'neg'. Se limitan a un máximo
    # de 256 entre ambos, de forma balanceada, rechazando algunos si necesario y actualizando "y_is_box_valid"
    # Finalmente devuelve una propuesta de regiones que incluyen objetos ('pos') y background ('neg'):
    #    y_rpn_cls (18x_x_) = [y_is_box_valid (9x_x_) + y_rpn_overlap (9x_x_)]
    #    y_rpn_regr (72x_x_) = [(y_rpn_overlap * 4) (36x_x_) + y_rpn_regr (36x_x_)]
    #    num_pos, cantidad de anchores positivos
    downscale = float(C.rpn_stride)
    anchor_sizes = C.anchor_box_scales   # tamaños de los anchores
    anchor_ratios = C.anchor_box_ratios  # ratios de los anchores
    num_anchors = len(anchor_sizes) * len(anchor_ratios)

    # calcula tamaño del feature map para la imagen
#    (output_width, output_height) = img_length_calc_function(resized_width, resized_height)
    (output_width, output_height) = img_length_calc_function(width, height)

    n_anchratios = len(anchor_ratios)

    # crea e inicializa las estructuras necesarias
    y_rpn_overlap = np.zeros((output_height, output_width, num_anchors)) # si anchor se superpone con gt bbox
    y_is_box_valid = np.zeros((output_height, output_width, num_anchors)) # si anchor incluye un objecto
    y_rpn_regr = np.zeros((output_height, output_width, num_anchors * 4))

    num_gt_bboxes = len(img_data['bboxes']) # cantidad de gt bboxes en la imagen
    num_anchors_for_gt_bbox = np.zeros(num_gt_bboxes).astype(int) # cantidad anchores con IoU>0.7 con cada gt box
    best_anchor_for_gt_bbox = -1*np.ones((num_gt_bboxes, 4)).astype(int) # punto (x,y) en el feature map, tamaño y ratio
    best_iou_for_gt_bbox = np.zeros(num_gt_bboxes).astype(np.float32) # IoU del mejor anchor
    best_x_for_gt_bbox = np.zeros((num_gt_bboxes, 4)).astype(int) # coordenadas (x1,x2,y1,y2) del anchor
    best_dx_for_gt_bbox = np.zeros((num_gt_bboxes, 4)).astype(np.float32) # deltas entre el anchor y el gt bbox

    # adecuar las coordenadas de los gt bboxes, considerando el redimensionamiento de la imagen
    gta = np.zeros((num_gt_bboxes, 4))
    for bbox_num, bbox in enumerate(img_data['bboxes']):
        gta[bbox_num, 0] = bbox['x1'] #* (resized_width / float(width))
        gta[bbox_num, 1] = bbox['x2'] #* (resized_width / float(width))
        gta[bbox_num, 2] = bbox['y1'] #* (resized_height / float(height))
        gta[bbox_num, 3] = bbox['y2'] #* (resized_height / float(height))

    # Por cada punto del feature map calcula los 9 anchores correspondientes en la imagen
    # Para cada uno de los diferentes tipos de anchor
    for anchor_size_idx in range(len(anchor_sizes)):
        for anchor_ratio_idx in range(n_anchratios):
            anchor_x = anchor_sizes[anchor_size_idx] * anchor_ratios[anchor_ratio_idx][0]
            anchor_y = anchor_sizes[anchor_size_idx] * anchor_ratios[anchor_ratio_idx][1]

            # para cada uno de los puntos del feature map
            for ix in range(output_width):
                # coordenadas x1 y x2 del anchor en la imagen
                x1_anc = downscale * (ix + 0.5) - anchor_x / 2
                x2_anc = downscale * (ix + 0.5) + anchor_x / 2

                # ignora bboxes que salen de la imagen
#                if x1_anc < 0 or x2_anc > resized_width:
                if x1_anc < 0 or x2_anc > width:
                    continue

                for jy in range(output_height):
                    # coordenadas y1 e y2 del anchor en la imagen
                    y1_anc = downscale * (jy + 0.5) - anchor_y / 2
                    y2_anc = downscale * (jy + 0.5) + anchor_y / 2

                    # ignora bboxes que salen de la imagen
#                    if y1_anc < 0 or y2_anc > resized_height:
                    if y1_anc < 0 or y2_anc > height:
                        continue

                    # Initializa bbox a 'negativo'
                    bbox_type = 'neg'
                    # The best IOU for the (x,y) coord and the current anchor
                    best_iou_for_loc = 0.0

                    # Cada anchor lo comparo con cada gt bbox de la imagen
                    for bbox_num in range(num_gt_bboxes):
                        # IOU of the current gt box and the current anchor box
                        curr_iou = iou([gta[bbox_num, 0], gta[bbox_num, 2], gta[bbox_num, 1], gta[bbox_num, 3]], [x1_anc, y1_anc, x2_anc, y2_anc])

                        if curr_iou > best_iou_for_gt_bbox[bbox_num] or curr_iou > C.rpn_max_overlap:
                            # calculo los centros de gravedad del anchor y gt bbox
                            cx = (gta[bbox_num, 0] + gta[bbox_num, 1]) / 2.0
                            cy = (gta[bbox_num, 2] + gta[bbox_num, 3]) / 2.0
                            cxa = (x1_anc + x2_anc)/2.0
                            cya = (y1_anc + y2_anc)/2.0

                            # calculo los deltas, que son codificados según el autor del Faster:
                            # tx=(xgt-xan)/width_an, ty=(ygt-yan)/height_an, tw=ln(width_gt/width_an), th=ln(height_gt/height_an)
                            tx = (cx - cxa) / (x2_anc - x1_anc)
                            ty = (cy - cya) / (y2_anc - y1_anc)
                            tw = np.log((gta[bbox_num, 1] - gta[bbox_num, 0]) / (x2_anc - x1_anc))
                            th = np.log((gta[bbox_num, 3] - gta[bbox_num, 2]) / (y2_anc - y1_anc))

                        if img_data['bboxes'][bbox_num]['class'] != 'bg':
                            # Cada gt bbox debe ser abarcado por un anchor, buscamos el mejor anchor
                            if curr_iou > best_iou_for_gt_bbox[bbox_num]:
                                # guardamos datos del anchor: (jy,ix) son coordenadas en el feature map, tamaño y ratio
                                best_anchor_for_gt_bbox[bbox_num] = [jy, ix, anchor_ratio_idx, anchor_size_idx]
                                best_iou_for_gt_bbox[bbox_num] = curr_iou # IoU entre el anchor y el gt box
                                best_x_for_gt_bbox[bbox_num,:] = [x1_anc, x2_anc, y1_anc, y2_anc] # coordenadas del anchor
                                best_dx_for_gt_bbox[bbox_num,:] = [tx, ty, tw, th] # deltas entre anchor y gt box

                            # si IOU >threshold el anchor incluye un objeto
                            # (no importa si hay otro bbox mejor, solo refleja superposicion)
                            if curr_iou > C.rpn_max_overlap:
                                bbox_type = 'pos'
                                num_anchors_for_gt_bbox[bbox_num] += 1
                                # actualizamos los deltas si el IoU es el mejor hasta ahora
                                if curr_iou > best_iou_for_loc:
                                    best_iou_for_loc = curr_iou
                                    best_regr = (tx, ty, tw, th)

                            # es ambiguo, no se sabe si incluye o no un objeto
                            if C.rpn_min_overlap < curr_iou < C.rpn_max_overlap:
                                if bbox_type != 'pos':
                                    bbox_type = 'neutral'

                        # actualiza las estructuras en funcion del tipo de anchor
                        if bbox_type == 'neg':
                            y_is_box_valid[jy, ix, anchor_ratio_idx + n_anchratios * anchor_size_idx] = 1
                            y_rpn_overlap[jy, ix, anchor_ratio_idx + n_anchratios * anchor_size_idx] = 0
                        elif bbox_type == 'neutral':
                            y_is_box_valid[jy, ix, anchor_ratio_idx + n_anchratios * anchor_size_idx] = 0
                            y_rpn_overlap[jy, ix, anchor_ratio_idx + n_anchratios * anchor_size_idx] = 0
                        elif bbox_type == 'pos':
                            y_is_box_valid[jy, ix, anchor_ratio_idx + n_anchratios * anchor_size_idx] = 1
                            y_rpn_overlap[jy, ix, anchor_ratio_idx + n_anchratios * anchor_size_idx] = 1
                            start = 4 * (anchor_ratio_idx + n_anchratios * anchor_size_idx)
                            y_rpn_regr[jy, ix, start:start+4] = best_regr

    # aseguramos que cada gt bbox tiene al menos un anchor que la solapa
    for idx in range(num_anchors_for_gt_bbox.shape[0]):
        if num_anchors_for_gt_bbox[idx] == 0:  # cantidad de anchors que tienen IoU>0.7 con cada gt box
            # si no hay anchor con IoU>0.7, selecciono el mejor de los existentes en best_anchor_for_gt_bbox
            if best_anchor_for_gt_bbox[idx, 0] == -1: # no hay bbox con IoU>0
                continue
            y_is_box_valid[best_anchor_for_gt_bbox[idx,0], best_anchor_for_gt_bbox[idx,1],
                           best_anchor_for_gt_bbox[idx,2] + n_anchratios * best_anchor_for_gt_bbox[idx,3]] = 1
            y_rpn_overlap[best_anchor_for_gt_bbox[idx,0], best_anchor_for_gt_bbox[idx,1],
                          best_anchor_for_gt_bbox[idx,2] + n_anchratios * best_anchor_for_gt_bbox[idx,3]] = 1
            start = 4 * (best_anchor_for_gt_bbox[idx,2] + n_anchratios * best_anchor_for_gt_bbox[idx,3])
            y_rpn_regr[best_anchor_for_gt_bbox[idx,0], best_anchor_for_gt_bbox[idx,1], start:start+4] = best_dx_for_gt_bbox[idx, :]

    y_rpn_overlap = np.transpose(y_rpn_overlap, (2, 0, 1))
    y_rpn_overlap = np.expand_dims(y_rpn_overlap, axis=0)

    y_is_box_valid = np.transpose(y_is_box_valid, (2, 0, 1))
    y_is_box_valid = np.expand_dims(y_is_box_valid, axis=0)

    y_rpn_regr = np.transpose(y_rpn_regr, (2, 0, 1))
    y_rpn_regr = np.expand_dims(y_rpn_regr, axis=0)

    # pos_locs almacena las coordenadas de los anchors que contienen objeto y overlap con gt bbox
    pos_locs = np.where(np.logical_and(y_rpn_overlap[0, :, :, :] == 1, y_is_box_valid[0, :, :, :] == 1))
    neg_locs = np.where(np.logical_and(y_rpn_overlap[0, :, :, :] == 0, y_is_box_valid[0, :, :, :] == 1))

    num_pos = len(pos_locs[0])
    num_neg = len(neg_locs[0])

    # la propuesta de regiones debe ser balanceada y limitada a 256 regiones
    num_regions = 256

    # hay mas regiones positivas que negativas
    if len(pos_locs[0]) > num_regions/2:
        # selecciona aleatoriamente tantos anchores positivos como los que exceden de num_regions/2
        val_locs = random.sample(range(num_pos), num_pos - num_regions/2)
        # los anchores seleccionados se marcan como neutrales (antes eran positivos)
        y_is_box_valid[0, pos_locs[0][val_locs], pos_locs[1][val_locs], pos_locs[2][val_locs]] = 0
        num_pos = num_regions/2
    # hay mas regiones negativas que positivas
    if num_neg + num_pos > num_regions:
        # selecciona aleatoriamente tantos anchores negativos como los que exceden de num_pos
        val_locs = random.sample(range(num_neg), num_neg - num_pos)
        # los anchores seleccionados se marcan como neutrales (antes eran negativos)
        y_is_box_valid[0, neg_locs[0][val_locs], neg_locs[1][val_locs], neg_locs[2][val_locs]] = 0

    y_rpn_cls = np.concatenate([y_is_box_valid, y_rpn_overlap], axis=1)
    y_rpn_regr = np.concatenate([np.repeat(y_rpn_overlap, 4, axis=1), y_rpn_regr], axis=1)

    return np.copy(y_rpn_cls), np.copy(y_rpn_regr), num_pos

#### Calcula el nuevo tamaño de la imagen redimensionada
#def get_new_img_size(width, height, img_min_side=300):
#    if width <= height:
#        f = float(img_min_side) / width
#        resized_height = int(f * height)
#        resized_width = img_min_side
#    else:
#        f = float(img_min_side) / height
#        resized_width = int(f * width)
#        resized_height = img_min_side
#
#    return resized_width, resized_height

#### Funcion GENERADORA que devuelve los ground_truth anchors
def get_anchor_gt(all_img_data, C, img_length_calc_function):
    while True:

        for img_data in all_img_data:
            try:
                # lee imagen
                x_img = cv2.imread(img_data['filepath'])
                (width, height) = (img_data['width'], img_data['height'])
                (rows, cols, _) = x_img.shape
                assert cols == width
                assert rows == height

                # calcula tamaño de la imagen redimensionada
#                (resized_width, resized_height) = get_new_img_size(width, height, C.im_size)
                # redimensiona la imagen
#                x_img = cv2.resize(x_img, (resized_width, resized_height), interpolation=cv2.INTER_CUBIC)

                try:
#                    y_rpn_cls, y_rpn_regr, num_pos = calc_rpn(C, img_data, width, height, resized_width, resized_height, img_length_calc_function)
                    y_rpn_cls, y_rpn_regr, num_pos = calc_rpn(C, img_data, width, height, img_length_calc_function)
                except:
                    continue

                # Preprocesa imagen
                x_img = x_img[:,:, (2, 1, 0)]  # BGR -> RGB
                # adecua la imagen para la entrada en la funcion train del modelo RPN
                x_img = np.expand_dims(x_img, axis=0)
                # codifica los deltas con la varianza para normalizar los valores
                y_rpn_regr[:, y_rpn_regr.shape[1]//2:, :, :] *= C.std_scaling

                y_rpn_cls = np.transpose(y_rpn_cls, (0, 2, 3, 1))
                y_rpn_regr = np.transpose(y_rpn_regr, (0, 2, 3, 1))

                yield np.copy(x_img), [np.copy(y_rpn_cls), np.copy(y_rpn_regr)], img_data, num_pos

            except Exception as e:
                print(e)
                continue

#### Define las funciones de pérdida (loss functions) para cada salida de los modelos
lambda_rpn_regr = 1.0
lambda_rpn_class = 1.0

lambda_cls_regr = 1.0
lambda_cls_class = 1.0

epsilon = 1e-4

def rpn_loss_regr(num_anchors):
    def rpn_loss_regr_fixed_num(y_true, y_pred):
        # x is the difference between true value and predicted vaue
        x = y_true[:, :, :, 4 * num_anchors:] - y_pred
        x_abs = K.abs(x) # absolute value of x
        # If x_abs <= 1.0, x_bool = 1
        x_bool = K.cast(K.less_equal(x_abs, 1.0), tf.float32)

        return lambda_rpn_regr * K.sum(
            y_true[:, :, :, :4 * num_anchors] * (x_bool * (0.5 * x * x) + (1 - x_bool) * (x_abs - 0.5))) / K.sum(epsilon + y_true[:, :, :, :4 * num_anchors])

    return rpn_loss_regr_fixed_num

def rpn_loss_cls(num_anchors):
    def rpn_loss_cls_fixed_num(y_true, y_pred):

        return lambda_rpn_class * K.sum(y_true[:, :, :, :num_anchors] * K.binary_crossentropy(y_pred[:, :, :, :], y_true[:, :, :, num_anchors:])) / K.sum(epsilon + y_true[:, :, :, :num_anchors])

    return rpn_loss_cls_fixed_num

def class_loss_regr(num_classes):
    def class_loss_regr_fixed_num(y_true, y_pred):
        x = y_true[:, :, 4*num_classes:] - y_pred
        x_abs = K.abs(x)
        x_bool = K.cast(K.less_equal(x_abs, 1.0), 'float32')
        return lambda_cls_regr * K.sum(y_true[:, :, :4*num_classes] * (x_bool * (0.5 * x * x) + (1 - x_bool) * (x_abs - 0.5))) / K.sum(epsilon + y_true[:, :, :4*num_classes])
    return class_loss_regr_fixed_num

def class_loss_cls(y_true, y_pred):
    return lambda_cls_class * K.mean(categorical_crossentropy(y_true[0, :, :], y_pred[0, :, :]))

# Algoritmo NMS para evitar duplicidades en los bboxes delimitando un mismo objeto
def non_max_suppression_fast(boxes, probs, overlap_thresh=0.9, max_boxes=300):
    # codigo extraido de: http://www.pyimagesearch.com/2015/02/16/faster-non-maximum-suppression-python/
    # Explicacion del proceso:
    #   Paso 1: Ordenar la lista de probabilidades
    #   Paso 2: Seleccionar la probabilidad más alta y copiarla en una lista aparte
    #   Paso 3: Calcular el IoU entre el bbox de la probabilidad seleccionada con el resto de bboxes en la lista
    #           Si (IoU > overlap_threshold) eliminar el bbox y probabilidad de su lista correspondiente
    #   Paso 4: Repetir los pasos 2 y 3 hasta vaciar la lista de probabilidades

    # si no hay bboxes devuelve una lista vacia
    if len(boxes) == 0:
        return []

    # captura las coordenadas de todos los bboxes
    x1 = boxes[:, 0]
    y1 = boxes[:, 1]
    x2 = boxes[:, 2]
    y2 = boxes[:, 3]

    np.testing.assert_array_less(x1, x2)
    np.testing.assert_array_less(y1, y2)

    # las coordenadas de los bboxes son convertidas a floats para las divisiones
    if boxes.dtype.kind == "i":
        boxes = boxes.astype("float")

    # lista de indices seleccionados
    pick = []

    # calculo de las areas de todos los bboxes
    area = (x2 - x1) * (y2 - y1)

    # ordena las probabilidades (scores) de los bboxes en orden ascendente
    # el score más alto está el último
    idxs = np.argsort(probs)

    while len(idxs) > 0:
        # añade el último index (el de mayor score) de la lista "idx" a la lista "pick"
        last = len(idxs) - 1
        i = idxs[last]
        pick.append(i)

        # busca las coordenadas más grandes (xmin,ymin) del top-left de cada bbox y
        # las más grandes (xmax,ymax) del bottom-right de cada bbox
        xx1_int = np.maximum(x1[i], x1[idxs[:last]])
        yy1_int = np.maximum(y1[i], y1[idxs[:last]])
        xx2_int = np.minimum(x2[i], x2[idxs[:last]])
        yy2_int = np.minimum(y2[i], y2[idxs[:last]])
        # calcular el ancho y alto de cada bbox
        ww_int = np.maximum(0, xx2_int - xx1_int)
        hh_int = np.maximum(0, yy2_int - yy1_int)

        # calcula la interseccion y la union
        area_int = ww_int * hh_int
        area_union = area[i] + area[idxs[:last]] - area_int
        # calcula el IoU
        overlap = area_int/(area_union + 1e-6)

        # elimina los indices de la lista "idx" con IoU > overlap_thresh, y el último index tambien
        idxs = np.delete(idxs, np.concatenate(([last], np.where(overlap > overlap_thresh)[0])))

        if len(pick) >= max_boxes:
            break

    # devuelve aquellos bboxes seleccionados, cuyos index están almacenados en la lista "pick"
    boxes = boxes[pick].astype("int")
    probs = probs[pick]
    return boxes, probs

# aplica la correccion de los deltas predichos por el modelo RPN
def apply_regr_rpn(X, T):
    # corrige las coordenadas (x,y,w,h) del anchor según los deltas (tx,ty,tw,th)
    # Segun se indica en el paper original:
    # tx=(cx_gt-cx_anchor)/w_anchor, ty=(cy_gt-cy_anchor)/h_anchor, tw=log(w_gt/w_anchor), tw=log(h_gt/h_anchor)
    # Nota: np.exp() permite trabajar con arrays, mientras que math.exp() sólo con escalares
    try:
        x = X[0, :, :]
        y = X[1, :, :]
        w = X[2, :, :]
        h = X[3, :, :]

        tx = T[0, :, :]
        ty = T[1, :, :]
        tw = T[2, :, :]
        th = T[3, :, :]

        cx = x + w/2.
        cy = y + h/2.
        cx1 = tx * w + cx
        cy1 = ty * h + cy

        w1 = np.exp(tw.astype(np.float64)) * w
        h1 = np.exp(th.astype(np.float64)) * h
        x1 = cx1 - w1/2.
        y1 = cy1 - h1/2.

        x1 = np.round(x1)
        y1 = np.round(y1)
        w1 = np.round(w1)
        h1 = np.round(h1)
        return np.stack([x1, y1, w1, h1])
    except Exception as e:
        print(e)
        return X

# Selecciona las predicciones del modelo RPN para entrenar el modelo clasificador final
def calc_iou(R, img_data, C, class_mapping):
    # adecua las coordenadas de los gt bboxes, considerando el redimensionamiento de la imagen
    bboxes = img_data['bboxes']
#    (width, height) = (img_data['width'], img_data['height'])
    # tamaño de la imagen redimensionada
#    (resized_width, resized_height) = get_new_img_size(width, height, C.im_size)
    gta = np.zeros((len(bboxes), 4))
    for bbox_num, bbox in enumerate(bboxes):
#        gta[bbox_num, 0] = int(round(bbox['x1'] * (resized_width / float(width))/C.rpn_stride))
#        gta[bbox_num, 1] = int(round(bbox['x2'] * (resized_width / float(width))/C.rpn_stride))
#        gta[bbox_num, 2] = int(round(bbox['y1'] * (resized_height / float(height))/C.rpn_stride))
#        gta[bbox_num, 3] = int(round(bbox['y2'] * (resized_height / float(height))/C.rpn_stride))
        gta[bbox_num, 0] = int(round(bbox['x1'] / C.rpn_stride))
        gta[bbox_num, 1] = int(round(bbox['x2'] / C.rpn_stride))
        gta[bbox_num, 2] = int(round(bbox['y1'] / C.rpn_stride))
        gta[bbox_num, 3] = int(round(bbox['y2'] / C.rpn_stride))

    x_roi = [] # almacena coordenadas bbox seleccionadas
    y_class_num = [] # codificación de la clase ([1 0]='dorsal', [0 1]='bg')
    y_class_regr_coords = [] # almacena deltas del bbox seleccionado, sólo si clase es 'dorsal'
    y_class_regr_label = [] # almacena clase del bbox seleccionado

    # Para cada prediccion calcula el IoU con los gt bboxes de la imagen
    for ix in range(R.shape[0]): # R.shape[0]: cantidad de predicciones del modelo RPN (=300 si NMS)
        (x1, y1, x2, y2) = R[ix, :]
        x1 = int(round(x1))
        y1 = int(round(y1))
        x2 = int(round(x2))
        y2 = int(round(y2))

        best_iou = 0.0
        best_bbox = -1
        # Itera sobre los gt bboxes, calculando el IoU y buscando el gt bbox con mayor iou
        for bbox_num in range(len(bboxes)):
            curr_iou = iou([gta[bbox_num, 0], gta[bbox_num, 2], gta[bbox_num, 1], gta[bbox_num, 3]], [x1, y1, x2, y2])

            if curr_iou > best_iou:
                best_iou = curr_iou
                best_bbox = bbox_num

        if best_iou < C.classifier_min_overlap:
            continue
        else:
            # si el IoU es mayor que el threshold selecciono la prediccion para entrenar el clasificador final
            w = x2 - x1
            h = y2 - y1
            x_roi.append([x1, y1, w, h])

            # preparo clase y bbox de la prediccion seleccionada para entrenar el clasificador final
            if C.classifier_min_overlap <= best_iou < C.classifier_max_overlap:
                # clase='bg', no hace falta bbox
                cls_name = 'bg'
            elif C.classifier_max_overlap <= best_iou:
                # clase = la del gt bbox
                cls_name = bboxes[best_bbox]['class']
                # calcula deltas como diferencias entre gt bbox (el de mayor IoU) y la prediccion
                cxg = (gta[best_bbox, 0] + gta[best_bbox, 1]) / 2.0
                cyg = (gta[best_bbox, 2] + gta[best_bbox, 3]) / 2.0
                cx = x1 + w / 2.0
                cy = y1 + h / 2.0

                tx = (cxg - cx) / float(w)
                ty = (cyg - cy) / float(h)
                tw = np.log((gta[best_bbox, 1] - gta[best_bbox, 0]) / float(w))
                th = np.log((gta[best_bbox, 3] - gta[best_bbox, 2]) / float(h))
            else:
                print('roi = {}'.format(best_iou))
                raise RuntimeError

        # codificación de la clase ([1 0]='dorsal', [0 1]='bg')
        class_num = class_mapping[cls_name]
        class_label = len(class_mapping) * [0]
        class_label[class_num] = 1
        y_class_num.append(copy.deepcopy(class_label))

        # crea listas de 4 huecos para almacenar las coordenadas y la clase
        coords = [0] * 4 * (len(class_mapping) - 1)
        labels = [0] * 4 * (len(class_mapping) - 1)
        if cls_name != 'bg':
            label_pos = 4 * class_num
            # normalizo deltas
            sx, sy, sw, sh = C.classifier_regr_std
            coords[label_pos:4+label_pos] = [sx*tx, sy*ty, sw*tw, sh*th]
            # codifico la clase
            labels[label_pos:4+label_pos] = [1, 1, 1, 1]
            # almacena clase y deltas del bbox seleccionado
            y_class_regr_coords.append(copy.deepcopy(coords))
            y_class_regr_label.append(copy.deepcopy(labels))
        else:
            # los deltas son irrelevantes si la clase es 'bg', se ponen a nulo
            y_class_regr_coords.append(copy.deepcopy(coords))
            y_class_regr_label.append(copy.deepcopy(labels))

    if len(x_roi) == 0:
        return None, None, None

    # predicciones del modelo RPN con IoU > threshold
    X = np.array(x_roi)
    # codificacion de la clase para las predicciones seleccionadas 'x_roi'
    Y1 = np.array(y_class_num)
    # clase y deltas de las predicciones seleccionadas 'x_roi'
    Y2 = np.concatenate([np.array(y_class_regr_label),np.array(y_class_regr_coords)], axis=1)

    return np.expand_dims(X, axis=0), np.expand_dims(Y1, axis=0), np.expand_dims(Y2, axis=0)

# define los ROIs a partir de las predicciones de scores y deltas de cada anchor por el modelo RPN
def rpn_to_roi(out_rpn_cls, out_rpn_regr, C, max_boxes=300, overlap_thresh=0.9):
    # Pasos:
    #   1. Calcula los bboxes de los ROIs: obtiene coordenadas de los anchores de cada punto del feature map
    #   2. Cada anchor es corregido por los deltas predichos por el modelo RPN
    #   3. Recorta aquellos bboxes que sobresalgan de la imagen
    #   4. Aplica NMS sobre los bboxes
    # Devuelve las coordenadas de los bboxes seleccionados (no los scores)

    # Decodificación deltas (deltas = deltas*0.25) - p.e. x=(x_gt-x_anc)/(w_anc*var) y w=ln(w_gt/w_anc)/var
    out_rpn_regr = out_rpn_regr / C.std_scaling

    anchor_sizes = C.anchor_box_scales   # (son 3)
    anchor_ratios = C.anchor_box_ratios  # (son 3)

    assert out_rpn_cls.shape[0] == 1
    (rows, cols) = out_rpn_cls.shape[1:3]

    # A.shape = (4, feature_map.height, feature_map.width, num_anchors) = (4,18,25,9) si la imagen es 400x300
    # A almacena las coordenadas de los 9 anchores por cada punto del feature map => 18x25x9=4050 anchores
    A = np.zeros((4, out_rpn_cls.shape[1], out_rpn_cls.shape[2], out_rpn_cls.shape[3]))

    curr_anchor = 0 # indica un anchor en el rango 0~8 (9 anchores)
    for anchor_size in anchor_sizes:
        for anchor_ratio in anchor_ratios:
            # ancho y alto del anchor en el feature map = (ancho * escala) / 16
            anchor_x = (anchor_size * anchor_ratio[0])/C.rpn_stride
            anchor_y = (anchor_size * anchor_ratio[1])/C.rpn_stride

            # regr almacena los deltas del current_anchor en todas las posiciones del feature map
            regr = out_rpn_regr[0, :, :, 4 * curr_anchor:4 * curr_anchor + 4] # shape => (18, 25, 4)
            regr = np.transpose(regr, (2, 0, 1)) # shape => (4, 18, 25)

            # Grid del mismo tamaño que el feature map
            X, Y = np.meshgrid(np.arange(cols),np. arange(rows))

            # Calcula coordenadas (x,y,w,h) del current_anchor en todas las posiciones del feature map
            A[0, :, :, curr_anchor] = X - anchor_x/2
            A[1, :, :, curr_anchor] = Y - anchor_y/2
            A[2, :, :, curr_anchor] = anchor_x
            A[3, :, :, curr_anchor] = anchor_y

            # corrige coordenadas (x,y,w,h) del anchor con deltas (tx,ty,tw,th) predecidos por el modelo RPN
            A[:, :, :, curr_anchor] = apply_regr_rpn(A[:, :, :, curr_anchor], regr)

            # Evita bboxes con altura o anchura menor que 1 (redondea a 1)
            A[2, :, :, curr_anchor] = np.maximum(1, A[2, :, :, curr_anchor])
            A[3, :, :, curr_anchor] = np.maximum(1, A[3, :, :, curr_anchor])

            # Convierte (x, y , w, h) => (x1, y1, x2, y2)
            A[2, :, :, curr_anchor] += A[0, :, :, curr_anchor]
            A[3, :, :, curr_anchor] += A[1, :, :, curr_anchor]

            # Recorta aquellos bboxes que sobresalgan de la imagen (o del feature map)
            A[0, :, :, curr_anchor] = np.maximum(0, A[0, :, :, curr_anchor])
            A[1, :, :, curr_anchor] = np.maximum(0, A[1, :, :, curr_anchor])
            A[2, :, :, curr_anchor] = np.minimum(cols-1, A[2, :, :, curr_anchor])
            A[3, :, :, curr_anchor] = np.minimum(rows-1, A[3, :, :, curr_anchor])

            curr_anchor += 1

    # almacena la informacion en forma de listas
    all_boxes = np.reshape(A.transpose((0, 3, 1, 2)), (4, -1)).transpose((1, 0))  # shape => (4050, 4)
    all_probs = out_rpn_cls.transpose((0, 3, 1, 2)).reshape((-1))                 # shape => (4050,)

    x1 = all_boxes[:, 0]
    y1 = all_boxes[:, 1]
    x2 = all_boxes[:, 2]
    y2 = all_boxes[:, 3]

    # Elimina bboxes con coordenadas erróneas
    idxs = np.where((x1 - x2 >= 0) | (y1 - y2 >= 0))
    all_boxes = np.delete(all_boxes, idxs, 0)
    all_probs = np.delete(all_probs, idxs, 0)

    # Non_max_suppression. Solo capturamos los bboxes, no necesitamos los scores
    result = non_max_suppression_fast(all_boxes, all_probs, overlap_thresh=overlap_thresh, max_boxes=max_boxes)[0]
    return result

In [8]:
##########
# TRAINING
##########

%cd /content/drive/MyDrive/TFM/TrainFasterRCNN/

base_path = './'
train_path = 'annotateTrain.csv' # Training data (annotation file)
output_weight_path = os.path.join(base_path, 'model/model_frcnn_vgg.hdf5')
record_path = os.path.join(base_path, 'model/record.csv') # Record data (almacena diversos parametros tras cada epoch)
base_weight_path = os.path.join(base_path, '../VGG16_Weights/vgg16_weights_tf_dim_ordering_tf_kernels.h5')
config_output_filename = os.path.join(base_path, 'model/model_vgg_config.pickle')

num_rois = 4 # Numero de RoIs a procesar a la vez

# Crea objeto config
C = Config()
C.record_path = record_path
C.model_path = output_weight_path
C.num_rois = num_rois
C.base_net_weights = base_weight_path

print('\nCargando datos de anotaciones... ')
train_imgs, classes_count, class_mapping = get_data(train_path)

# incorpora codificación para la clase 'bg'
if 'bg' not in classes_count:
    classes_count['bg'] = 0
    class_mapping['bg'] = len(class_mapping)
C.class_mapping = class_mapping

print('\nDistribuion de las imagenes de entrenamiento por clase:')
pprint.pprint(classes_count)
print('Num clases (incluyendo bg) = {}'.format(len(classes_count)))
print(class_mapping)

# Almacena archivo de configuracion
with open(config_output_filename, 'wb') as config_f:
    pickle.dump(C,config_f)
    print('Config ha sido almacenado en {}, y debe ser cargado para obtener resultados correctos de test'.format(config_output_filename))

# Aleatoriza las imagenes de train
random.seed(1)
random.shuffle(train_imgs)
print('Num. imagenes de entrenamiento: {}'.format(len(train_imgs)))

# LA FUNCION GENERADORA ES EJECUTADA LLAMANDO AL METODO NEXT() DEL GENERADOR
data_gen_train = get_anchor_gt(train_imgs, C, get_img_output_length)

/content/drive/MyDrive/TFM/TrainFasterRCNN

Cargando datos de anotaciones... 
idx=2910

{'bib_number_1294.jpg', 'bib_number_1276.jpg', 'bib_number_1295.jpg', 'bib_number_1354.jpg', 'bib_number_1357.jpg', 'bib_number_1355.jpg', 'bib_number_1348.jpg', 'bib_number_1846.jpg', 'bib_number_1378.jpg', 'bib_number_1343.jpg', 'bib_number_1352.jpg', 'train.ipynb', 'bib_number_1356.jpg', 'bib_number_1288.jpg', 'bib_number_1622.jpg', 'bib_number_1375.jpg', 'bib_number_1305.jpg', 'bib_number_1379.jpg', 'model', 'annotateTrain.csv', 'bib_number_1370.jpg', 'bib_number_1373.jpg', 'bib_number_1064.jpg', 'bib_number_1307.jpg', 'bib_number_1364.jpg', 'bib_number_1368.jpg', 'bib_number_1713.jpg', 'bib_number_1843.jpg', 'bib_number_1610.jpg', 'bib_number_1310.jpg', 'bib_number_1333.jpg', 'bib_number_1292.jpg', 'bib_number_0126.jpg', 'bib_number_1380.jpg', 'bib_number_0201.jpg', 'bib_number_1330.jpg', 'bib_number_1290.jpg', 'bib_number_0409.jpg', 'bib_number_1358.jpg', 'bib_number_1277.jpg', 'bib_number_130

In [11]:
#### Build the model

# capa Input del modelo VGG (Imagenes RGB)
img_input = Input(shape=(None, None, 3))
# capa Input del modelo RoI Pooling
roi_input = Input(shape=(None, 4))

# define la red base (VGG16)
shared_layers = nn_base(img_input)

# define el modelo RPN
num_anchors = len(C.anchor_box_scales) * len(C.anchor_box_ratios)
rpn = rpn_layer(shared_layers, num_anchors)

# define el modelo clasificador final
classifier = classifier_layer(shared_layers, roi_input, C.num_rois, nb_classes=len(classes_count))

# Creamos los modelos
model_rpn = Model(img_input, rpn[:2])
model_classifier = Model([img_input, roi_input], classifier)

# modelo completo englobando RPN y clasificador final, con la finalidad de almacenar un solo archivo de pesos
model_all = Model([img_input, roi_input], rpn[:2] + classifier)

# we need to save the model and load the model to continue training
if not os.path.isfile(C.model_path):
    # Si comienza el entrenmamiento, carga los pesos del modelo preentrenado VGG16
    try:
        print('This is the first time of your training')
        print('loading weights from {}'.format(C.base_net_weights))
        model_rpn.load_weights(C.base_net_weights, by_name=True)
        model_classifier.load_weights(C.base_net_weights, by_name=True)
    except:
        print('Could not load pretrained model weights. Weights can be found in the keras application folder \
            https://github.com/fchollet/keras/tree/master/keras/applications')

    # Crea dataframe para almacenar losses, accuracy, etc
    record_df = pd.DataFrame(columns=['mean_overlapping_bboxes', 'class_acc', 'loss_rpn_cls', 'loss_rpn_regr', 'loss_class_cls', 'loss_class_regr', 'curr_loss'])
else:
    # Si continua un entrenmamiento previo, carga los pesos guardados del modelo Faster entrenado hasta entonces
    print('Continue training based on previous trained model')
    print('Loading weights from {}'.format(C.model_path))
    model_rpn.load_weights(C.model_path, by_name=True)
    model_classifier.load_weights(C.model_path, by_name=True)

    # Carga los valores registrados de losses, accuracy, etc almacenados en el archivo
    record_df = pd.read_csv(record_path)

    r_mean_overlapping_bboxes = record_df['mean_overlapping_bboxes']
    r_class_acc = record_df['class_acc']
    r_loss_rpn_cls = record_df['loss_rpn_cls']
    r_loss_rpn_regr = record_df['loss_rpn_regr']
    r_loss_class_cls = record_df['loss_class_cls']
    r_loss_class_regr = record_df['loss_class_regr']
    r_curr_loss = record_df['curr_loss']

optimizer = Adam(lr=1e-5)
optimizer_classifier = Adam(lr=1e-5)
model_rpn.compile(optimizer=optimizer, loss=[rpn_loss_cls(num_anchors), rpn_loss_regr(num_anchors)])
model_classifier.compile(optimizer=optimizer_classifier, loss=[class_loss_cls, class_loss_regr(len(classes_count)-1)], metrics={'dense_class_{}'.format(len(classes_count)): 'accuracy'})
model_all.compile(optimizer='sgd', loss='mae')

total_epochs = len(record_df)
r_epochs = len(record_df)

Continue training based on previous trained model
Loading weights from ./model/model_frcnn_vgg.hdf5


In [None]:
epoch_length = 1493 # cantidad de imagenes de training en cada epoch
num_epochs = 5 # cantidad de epochs
iter_num = 0

total_epochs += num_epochs

losses = np.zeros((epoch_length, 5))
rpn_accuracy_rpn_monitor = [] # predicciones del RPN con IoU>threshold con objetos
rpn_accuracy_for_epoch = []

# fija el valor más pequeño de las perdidas totales, para saber cuando almacenar los pesos del modelo en disco
if len(record_df)==0:
    best_loss = np.Inf
else:
    best_loss = np.min(r_curr_loss)

print("Cantidad de registros en el dataframe (Epochs previos): {}".format(len(record_df)))

start_time = time.time()
for epoch_num in range(num_epochs):
    progbar = Progbar(epoch_length)
    r_epochs += 1
    print('Epoch {}/{}'.format(r_epochs, total_epochs))

    while True:
        try:
            if len(rpn_accuracy_rpn_monitor) == epoch_length:
                mean_overlapping_bboxes = float(sum(rpn_accuracy_rpn_monitor))/len(rpn_accuracy_rpn_monitor)
                rpn_accuracy_rpn_monitor = []

            # X es la imagen resized
            # Y incluye estructuras que indican los anchores seleccionados como 'pos' y 'neg' (max 256)
            #    y_rpn_cls (18x_x_) = [y_is_box_valid (9x_x_) + y_rpn_overlap (9x_x_)]
            #    y_rpn_regr (72x_x_) = [(y_rpn_overlap * 4) (36x_x_) + y_rpn_regr (36x_x_)]
            # debug_num_pos indica la cantidad de anchores positivos
            X, Y, img_data, debug_num_pos = next(data_gen_train)

            # Entrena modelo rpn y obtiene valores de perdida global y de cada salida [loss, loss_rpn_cls, loss_rpn_regr]
            loss_rpn = model_rpn.train_on_batch(X, Y)

            # Prediccion del modelo RPN P_rpn = [rpn_cls, rpn_regr] = [scores (_x_x9), deltas (_x_x36)]
            P_rpn = model_rpn.predict_on_batch(X)

            # Corrige los anchores con las predicciones delta del modelo RPN y selecciona bboxes mediante NMS
            # R.shape = (300, 4)
            R = rpn_to_roi(P_rpn[0], P_rpn[1], C, overlap_thresh=0.7, max_boxes=300)

            # Selecciona las predicciones del modelo RPN para entrenar el modelo clasificador final
            # X2: predicciones del modelo RPN con IoU > threshold con los gt bboxes
            # Y1: codificacion de la clase para las predicciones seleccionadas en X2 ([1 0]='dorsal', [0 1]='bg')
            # Y2: clase y deltas de las predicciones seleccionadas en X2
            X2, Y1, Y2 = calc_iou(R, img_data, C, class_mapping)

            # si X2 esta vacio saltamos a la siguiente iteracion
            if X2 is None:
                rpn_accuracy_rpn_monitor.append(0)
                rpn_accuracy_for_epoch.append(0)
                continue

            # Busca predicciones positivas ('dorsal') y negativas ('bg')
            neg_samples = np.where(Y1[0, :, -1] == 1)
            pos_samples = np.where(Y1[0, :, -1] == 0)

            # pasa la estructura neg_samples y pos_samples de tupla a array
            if len(neg_samples) > 0:
                neg_samples = neg_samples[0]
            else:
                neg_samples = []

            if len(pos_samples) > 0:
                pos_samples = pos_samples[0]
            else:
                pos_samples = []

            # acumula las predicciones del RPN con IoU>threshold con objetos
            rpn_accuracy_rpn_monitor.append(len(pos_samples))
            rpn_accuracy_for_epoch.append((len(pos_samples)))

            if C.num_rois > 1:
                # Si numero de predicciones positivas es mayor que 4//2 = 2, seleccionamos 2 aleatoriamente
                if len(pos_samples) < C.num_rois//2:
                    selected_pos_samples = pos_samples.tolist()
                else:
                    selected_pos_samples = np.random.choice(pos_samples, C.num_rois//2, replace=False).tolist()

                # Seleccionamos aleatoriamente (num_rois - num_pos) predicciones negativas (`bg`)
                try:
                    selected_neg_samples = np.random.choice(neg_samples, C.num_rois - len(selected_pos_samples), replace=False).tolist()
                except:
                    if len(neg_samples)==0:
                      continue
                    selected_neg_samples = np.random.choice(neg_samples, C.num_rois - len(selected_pos_samples), replace=True).tolist()

                # Almacenar las predicciones positivas y negativas
                sel_samples = selected_pos_samples + selected_neg_samples
            else:
                # En el caso que num_rois = 1, seleccionamos una prediccion pos o neg aleatoriamente
                selected_pos_samples = pos_samples.tolist()
                selected_neg_samples = neg_samples.tolist()
                if np.random.randint(0, 2):
                    sel_samples = random.choice(neg_samples)
                else:
                    sel_samples = random.choice(pos_samples)

            #  X                     => img_data imagen redimensionada
            #  X2[:, sel_samples, :] => num_rois (4) bboxes que contienen pos y neg seleccionados
            #  Y1[:, sel_samples, :] => codificacion para num_rois bboxes seleccionados en X2
            #  Y2[:, sel_samples, :] => clase y deltas para num_rois bboxes seleccionados en X2
            # Entrena clasificador final y devuelve valor de perdida global e individual [loss, loss_cls, loss_regr, accuracy_cls]
            print(X.shape)
            loss_class = model_classifier.train_on_batch([X, X2[:, sel_samples, :]], [Y1[:, sel_samples, :], Y2[:, sel_samples, :]])

            # almacena las perdidas obtenidas (modelo rpn y clasificador final)
            losses[iter_num, 0] = loss_rpn[1]
            losses[iter_num, 1] = loss_rpn[2]

            losses[iter_num, 2] = loss_class[1]
            losses[iter_num, 3] = loss_class[2]
            losses[iter_num, 4] = loss_class[3]

            iter_num += 1

            progbar.update(iter_num, [('rpn_cls', np.mean(losses[:iter_num, 0])), ('rpn_regr', np.mean(losses[:iter_num, 1])),
                        ('final_cls', np.mean(losses[:iter_num, 2])), ('final_regr', np.mean(losses[:iter_num, 3]))])

            if iter_num == epoch_length:
                loss_rpn_cls = np.mean(losses[:, 0])
                loss_rpn_regr = np.mean(losses[:, 1])
                loss_class_cls = np.mean(losses[:, 2])
                loss_class_regr = np.mean(losses[:, 3])
                class_acc = np.mean(losses[:, 4])

                mean_overlapping_bboxes = float(sum(rpn_accuracy_for_epoch)) / len(rpn_accuracy_for_epoch)
                rpn_accuracy_for_epoch = []

                print('Mean number of bounding boxes from RPN overlapping ground truth boxes: {}'.format(mean_overlapping_bboxes))
                print('Classifier accuracy for bounding boxes from RPN: {}'.format(class_acc))
                print('Loss RPN classifier: {}'.format(loss_rpn_cls))
                print('Loss RPN regression: {}'.format(loss_rpn_regr))
                print('Loss Detector classifier: {}'.format(loss_class_cls))
                print('Loss Detector regression: {}'.format(loss_class_regr))
                print('Total loss: {}'.format(loss_rpn_cls + loss_rpn_regr + loss_class_cls + loss_class_regr))
                print('Elapsed time: {}'.format(time.time() - start_time))

                curr_loss = loss_rpn_cls + loss_rpn_regr + loss_class_cls + loss_class_regr
                if curr_loss < best_loss:
                    print('Total loss decreased from {} to {}, saving weights'.format(best_loss,curr_loss))
                    best_loss = curr_loss
                    model_all.save_weights(C.model_path)

                new_row = {'mean_overlapping_bboxes':round(mean_overlapping_bboxes, 3),
                            'class_acc':round(class_acc, 3),
                            'loss_rpn_cls':round(loss_rpn_cls, 3),
                            'loss_rpn_regr':round(loss_rpn_regr, 3),
                            'loss_class_cls':round(loss_class_cls, 3),
                            'loss_class_regr':round(loss_class_regr, 3),
                            'curr_loss':round(curr_loss, 3)}
                record_df = record_df.append(new_row, ignore_index=True)
                record_df.to_csv(record_path, index=0)

                iter_num = 0
                start_time = time.time()
                break # salir del while true y empezar otro epoch

        except Exception as e:
            print('Exception: {}'.format(e))
            continue

print('Training complete, exiting.')

Cantidad de registros en el dataframe (Epochs previos): 38
Epoch 39/43

(1, 1000, 1500, 3)
Instructions for updating:
Use tf.where in 2.0, which has the same broadcast rule as np.where
   1/1493 [..............................] - ETA: 10:47:21 - rpn_cls: 1.2854 - rpn_regr: 0.0258 - final_cls: 1.8855 - final_regr: 0.5135(1, 1000, 1500, 3)
   2/1493 [..............................] - ETA: 6:20:31 - rpn_cls: 1.4944 - rpn_regr: 0.0213 - final_cls: 2.3484 - final_regr: 0.3852 







(1, 652, 1014, 3)
   3/1493 [..............................] - ETA: 5:18:19 - rpn_cls: 2.0348 - rpn_regr: 0.0490 - final_cls: 2.7930 - final_regr: 0.3138(1, 640, 960, 3)
   4/1493 [..............................] - ETA: 4:35:24 - rpn_cls: 2.3365 - rpn_regr: 0.0600 - final_cls: 2.8682 - final_regr: 0.2949







(1, 1067, 1600, 3)
   5/1493 [..............................] - ETA: 5:46:33 - rpn_cls: 2.3878 - rpn_regr: 0.0632 - final_cls: 2.8874 - final_regr: 0.2860(1, 1067, 1600, 3)
   6/1493 [...................

In [None]:
plt.figure(figsize=(15,5))
plt.subplot(1,2,1)
plt.plot(np.arange(0, r_epochs), record_df['mean_overlapping_bboxes'], 'r')
plt.title('mean_overlapping_bboxes')
plt.subplot(1,2,2)
plt.plot(np.arange(0, r_epochs), record_df['class_acc'], 'r')
plt.title('class_acc')
plt.show()

plt.figure(figsize=(15,5))
plt.subplot(1,2,1)
plt.plot(np.arange(0, r_epochs), record_df['loss_rpn_cls'], 'r')
plt.title('loss_rpn_cls')
plt.subplot(1,2,2)
plt.plot(np.arange(0, r_epochs), record_df['loss_rpn_regr'], 'r')
plt.title('loss_rpn_regr')
plt.show()

plt.figure(figsize=(15,5))
plt.subplot(1,2,1)
plt.plot(np.arange(0, r_epochs), record_df['loss_class_cls'], 'r')
plt.title('loss_class_cls')
plt.subplot(1,2,2)
plt.plot(np.arange(0, r_epochs), record_df['loss_class_regr'], 'r')
plt.title('loss_class_regr')
plt.show()

plt.plot(np.arange(0, r_epochs), record_df['curr_loss'], 'r')
plt.title('total_loss')
plt.show()