<center>
  <h1 style="font-family: Arial, sans-serif; font-weight: bold; font-size: 36px; color: #008CBA; text-shadow: 2px 2px #BDBDBD;">Classifying bees using Deep Learning</h1>
  <h2 style="font-family: Arial, sans-serif; font-weight: bold; font-size: 36px; color: #008CBA; text-shadow: 2px 2px #BDBDBD;">Fabio PEREIRA DE ARAUJO</h2>
  <h3 style="font-family: Arial, sans-serif; font-weight: bold; font-size: 36px; color: #008CBA; text-shadow: 2px 2px #BDBDBD;">Second-year engineering school internship</h3>
</center>



In [1]:
# Mount Google Drive in the Colab runtime; the hierarchy CSV files read
# later in this notebook live under /content/drive/MyDrive.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


## Imports

In [2]:
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
from tensorflow.keras.utils import Sequence
import numpy as np
import cv2 as cv
from tensorflow.keras import regularizers
from tensorflow.keras import optimizers

In [3]:
!pip uninstall opencv-python-headless==4.5.5.62
!pip install opencv-python-headless==4.1.2.30
!pip install -q -U albumentations
!echo "$(pip freeze | grep albumentations) is successfully installed"

Found existing installation: opencv-python-headless 4.7.0.72
Uninstalling opencv-python-headless-4.7.0.72:
  Would remove:
    /usr/local/lib/python3.10/dist-packages/cv2/*
    /usr/local/lib/python3.10/dist-packages/opencv_python_headless-4.7.0.72.dist-info/*
    /usr/local/lib/python3.10/dist-packages/opencv_python_headless.libs/libavcodec-16a334ab.so.59.37.100
    /usr/local/lib/python3.10/dist-packages/opencv_python_headless.libs/libavformat-e0b1067c.so.59.27.100
    /usr/local/lib/python3.10/dist-packages/opencv_python_headless.libs/libavutil-82c407cb.so.57.28.100
    /usr/local/lib/python3.10/dist-packages/opencv_python_headless.libs/libcrypto-47343492.so.1.1
    /usr/local/lib/python3.10/dist-packages/opencv_python_headless.libs/libgfortran-91cc3cb1.so.3.0.0
    /usr/local/lib/python3.10/dist-packages/opencv_python_headless.libs/libopenblas-r0-f650aae0.3.3.so
    /usr/local/lib/python3.10/dist-packages/opencv_python_headless.libs/libpng16-57e5e0a0.so.16.37.0
    /usr/local/lib/p

In [4]:
from albumentations import (Compose, Rotate, HorizontalFlip, VerticalFlip, Affine, RandomBrightnessContrast, ChannelShuffle)
import albumentations as A

## Variables

In [5]:
# Tunable settings (the trailing #@param markers expose them as Colab form fields).
# IMG_SIZE: side length used for image_size and for the model input shape.
# BATCH_SIZE: batch size used when building the datasets.
# lr: initial learning rate passed to the SGD optimiser.
# train_path / val_path: relative dataset directories for the train and validation splits.
IMG_SIZE = 224 #@param
BATCH_SIZE = 16 #@param
lr = 1e-3 #@param
train_path = "stage-2A/res/cap500/train" #@param
val_path = "stage-2A/res/cap500/val" #@param

## Downloading the dataset

In [6]:
# Fetch the image dataset; it is cloned into ./abeilles-cap500 and read
# below from /content/abeilles-cap500/train and /content/abeilles-cap500/val.
!git clone https://github.com/fabiopereira59/abeilles-cap500

Cloning into 'abeilles-cap500'...
remote: Enumerating objects: 24902, done.[K
remote: Counting objects: 100% (6150/6150), done.[K
remote: Compressing objects: 100% (6136/6136), done.[K
remote: Total 24902 (delta 11), reused 6150 (delta 11), pack-reused 18752[K
Receiving objects: 100% (24902/24902), 249.57 MiB | 21.36 MiB/s, done.
Resolving deltas: 100% (14/14), done.
Updating files: 100% (24098/24098), done.


In [4]:
# Retrieve datasets for training (train, val) from the configured paths.
# shuffle=False on the training split keeps the batch order aligned with
# train_ds.file_paths, so images can later be accessed by their path.
train_ds = keras.utils.image_dataset_from_directory(
    directory=train_path,
    labels='inferred',
    label_mode='categorical',
    shuffle=False,
    batch_size=BATCH_SIZE,
    image_size=(IMG_SIZE, IMG_SIZE))

# The validation split must be read from val_path: pointing it at train_path
# would evaluate the model on the very images it is trained on.
validation_ds = keras.utils.image_dataset_from_directory(
    directory=val_path,
    labels='inferred',
    label_mode='categorical',
    batch_size=BATCH_SIZE,
    image_size=(IMG_SIZE, IMG_SIZE))

Found 14917 files belonging to 71 classes.


In [7]:
# Build the train / validation datasets from the cloned repository.
# Options common to both splits: labels inferred from the sub-directory
# names and encoded as categorical vectors, fixed batch size, square images.
_common_ds_options = dict(
    labels='inferred',
    label_mode='categorical',
    batch_size=BATCH_SIZE,
    image_size=(IMG_SIZE, IMG_SIZE),
)

# Training split: shuffling is switched off so that the order of the batches
# matches train_ds.file_paths and images can be accessed by their path.
train_ds = keras.utils.image_dataset_from_directory(
    directory="/content/abeilles-cap500/train",
    shuffle=False,
    **_common_ds_options)

# Validation split: default shuffling is kept.
validation_ds = keras.utils.image_dataset_from_directory(
    directory="/content/abeilles-cap500/val",
    **_common_ds_options)

Found 14917 files belonging to 71 classes.
Found 1832 files belonging to 71 classes.


In [12]:
# Class labels as inferred from the training directory, and their count.
class_names = train_ds.class_names
nb_classes = len(class_names)

# Take the sample counts from the datasets themselves instead of hard-coding
# them: the previous literal for the validation split (1932) disagreed with
# the 1832 files reported when the dataset was loaded, which left 100
# all-zero samples in the validation arrays sized from it.
nb_imgs_train = len(train_ds.file_paths)
nb_imgs_val = len(validation_ds.file_paths)

## Data augmentation: Sequence and Albumentations

In [13]:
# Augmentation pipeline for the training images. Every transform is given
# p=0.5, i.e. it is applied to an image with probability one half.
_train_transforms = [
    Rotate(limit=[0,100], p=0.5),       # rotation with limit range [0, 100]
    HorizontalFlip(p=0.5),
    VerticalFlip(p=0.5),
    Affine(shear=[-45, 45], p=0.5),     # shear range [-45, 45]
    RandomBrightnessContrast(p=0.5),
]
AUGMENTATIONS_TRAIN = Compose(_train_transforms)

In [14]:
class AbeillesSequence(Sequence):
    """Keras ``Sequence`` that reads images from disk and augments them batch by batch.

    Attributes:
        x_train: array of image file paths.
        y_train: array of label rows, row-aligned with ``x_train``.
        classes: the module-level ``class_names`` list.
        batch_size: number of samples per batch.
        augment: callable invoked as ``augment(image=img)``; its ``'image'``
            entry is used as the augmented image.
        indices1: shuffled sample indices, reshuffled at the end of each epoch.
    """

    # Sequence initialisation with different parameters
    def __init__(self, x_train, y_train, batch_size, augmentations):
        self.x_train = x_train
        self.y_train = y_train
        self.classes = class_names
        self.batch_size = batch_size
        self.augment = augmentations
        # The indices give access to the data and are randomised at each
        # epoch to vary the composition of the batches during training.
        self.indices1 = np.arange(len(x_train))
        np.random.shuffle(self.indices1)

    # Number of gradient descent steps (batches) per epoch
    def __len__(self):
        # Use the data held by this instance, not a module-level ``x_train``:
        # the global may be missing or refer to a different array.
        return int(np.ceil(len(self.x_train) / float(self.batch_size)))

    # Apply data augmentation to each image in the batch
    def apply_augmentation(self, bx, by):
        """Load and augment every image of a batch.

        Args:
            bx: array of image file paths for the batch.
            by: labels of the batch; returned unchanged.

        Returns:
            ``(batch_x, batch_y)`` where ``batch_x`` has shape
            ``(len(bx), IMG_SIZE, IMG_SIZE, 3)``.

        Raises:
            IOError: if an image file cannot be read.
        """
        batch_x = np.zeros((bx.shape[0], IMG_SIZE, IMG_SIZE, 3))
        batch_y = by

        for i in range(len(bx)):
            img = cv.imread(bx[i])
            if img is None:
                # cv.imread reports failure by returning None; fail here with
                # the offending path instead of an opaque cvtColor error.
                raise IOError(f"Could not read image: {bx[i]}")
            # OpenCV loads images as BGR; convert to RGB before augmenting.
            img = cv.cvtColor(img, cv.COLOR_BGR2RGB)
            transformed = self.augment(image=img)
            # NOTE(review): no resize happens here, so the augmented image must
            # already be IMG_SIZE x IMG_SIZE x 3 for this assignment to succeed.
            batch_x[i] = transformed['image']

        return batch_x, batch_y

    # Function called for each new batch: selection and augmentation of data
    # idx = batch position (idx = 5 => the 5th batch is taken)
    def __getitem__(self, idx):
        batch_ids = self.indices1[idx * self.batch_size:(idx + 1) * self.batch_size]
        batch_x = self.x_train[batch_ids]
        batch_y = self.y_train[batch_ids]

        batch_x, batch_y = self.apply_augmentation(batch_x, batch_y)

        # Data normalisation with the ResNet preprocessing function
        batch_x = tf.keras.applications.resnet.preprocess_input(batch_x)

        return batch_x, batch_y

    # Function called at the end of an epoch; data access indices are randomised
    def on_epoch_end(self):
        np.random.shuffle(self.indices1)

In [17]:
# Training images are kept as file paths (they are loaded lazily, batch by
# batch); only the labels are materialised in memory here.
x_train = np.array(train_ds.file_paths)
y_train = np.zeros((nb_imgs_train, nb_classes))

# Walk through the (unshuffled) training dataset and copy each batch of
# labels into the matching rows of y_train.
row_start = 0
for _images, batch_labels in train_ds.as_numpy_iterator():
  row_stop = row_start + _images.shape[0]
  y_train[row_start:row_stop] = batch_labels
  row_start = row_stop

In [18]:
# Sequence instantiation; the batch size comes from the shared BATCH_SIZE
# setting so it cannot drift from the one used to build the datasets.
train_ds_aug = AbeillesSequence(x_train, y_train, batch_size=BATCH_SIZE, augmentations=AUGMENTATIONS_TRAIN)

# Materialise the validation set as arrays, then normalise it
x_val = np.zeros((nb_imgs_val, IMG_SIZE, IMG_SIZE, 3))
y_val = np.zeros((nb_imgs_val, nb_classes))

ind_data = 0
for bx, by in validation_ds.as_numpy_iterator():
  x_val[ind_data:ind_data+bx.shape[0]] = bx
  y_val[ind_data:ind_data+bx.shape[0]] = by
  ind_data += bx.shape[0]

# Keep only the rows that were actually filled: if nb_imgs_val overestimates
# the dataset size, the surplus rows would otherwise remain as all-zero
# images with all-zero labels and distort every validation metric.
x_val = x_val[:ind_data]
y_val = y_val[:ind_data]

# Same ResNet preprocessing as the one applied to the training batches
x_val = tf.keras.applications.resnet.preprocess_input(x_val)

## Building the model

### ImageNet weights

In [19]:
# Convolutional backbone: ResNet101 with ImageNet weights and without its
# original classification head.
# NOTE(review): with include_top=False there is no head for `classes` to
# size - confirm whether that argument has any effect here.
conv_base = keras.applications.resnet.ResNet101(
    weights='imagenet',
    include_top=False,
    input_shape=(IMG_SIZE, IMG_SIZE, 3),
    input_tensor=None,
    pooling=None,
    classes=nb_classes,
)

# Classification head: global average pooling followed by an L2-regularised
# softmax layer with one output per class.
model = keras.Sequential()
model.add(conv_base)
model.add(layers.GlobalAveragePooling2D())
model.add(
    layers.Dense(
        nb_classes,
        activation='softmax',
        kernel_regularizer=regularizers.L2(1e-4),
    )
)

Downloading data from https://storage.googleapis.com/tensorflow/keras-applications/resnet/resnet101_weights_tf_dim_ordering_tf_kernels_notop.h5


In [20]:
# Print the layer-by-layer architecture and parameter counts as a sanity check.
model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 resnet101 (Functional)      (None, 7, 7, 2048)        42658176  
                                                                 
 global_average_pooling2d (G  (None, 2048)             0         
 lobalAveragePooling2D)                                          
                                                                 
 dense (Dense)               (None, 71)                145479    
                                                                 
Total params: 42,803,655
Trainable params: 42,698,311
Non-trainable params: 105,344
_________________________________________________________________


## Hierarchical loss

In [21]:
import pandas as pd
import numpy as np

# Directory on the mounted Drive that holds the taxonomy CSV files.
CSV_DIR = "/content/drive/MyDrive/ENSEEIHT/2A/Stage 2A/src/training/csv"

# One row per species with its genus, subfamily and family.
hierarchie = pd.read_csv(f"{CSV_DIR}/hierarchie_especes71.csv")

# The loop below addresses row i as "species i", which only holds when every
# species appears exactly once; otherwise the matrices would be silently wrong.
if not hierarchie["species"].is_unique:
  raise ValueError("hierarchie_especes71.csv must contain exactly one row per species")

species = hierarchie["species"].unique()
nb_species = len(species)

genus = list(hierarchie["genus"].unique())
nb_genus = len(genus)

family = list(hierarchie["family"].unique())
nb_family = len(family)

subfamily = list(hierarchie["subfamily"].unique())
nb_subfamily = len(subfamily)

# Per-class listing; loaded here but not needed to build the matrices below.
data = pd.read_csv(f"{CSV_DIR}/liste_classes_71.csv")

# Binary membership matrices between consecutive taxonomic levels:
# entry [parent, child] is 1 when `child` belongs to `parent`.
# NOTE(review): column i of species_to_genus is applied to output i of the
# model, so the CSV row order is assumed to match train_ds.class_names - verify.
species_to_genus = np.zeros((nb_genus, nb_species))
genus_to_subfamily = np.zeros((nb_subfamily, nb_genus))
subfamily_to_family = np.zeros((nb_family, nb_subfamily))
for i in range(nb_species):
  # species -> genus
  genus_species = hierarchie.at[i, "genus"]
  ind_genus = genus.index(genus_species)
  species_to_genus[ind_genus, i] = 1

  # genus -> subfamily
  subfamily_species = hierarchie.at[i, "subfamily"]
  ind_subfamily = subfamily.index(subfamily_species)
  genus_to_subfamily[ind_subfamily, ind_genus] = 1

  # subfamily -> family
  family_species = hierarchie.at[i, "family"]
  ind_family = family.index(family_species)
  subfamily_to_family[ind_family, ind_subfamily] = 1

In [22]:
from numpy.ma.core import transpose
from keras import backend as K
import math
import tensorflow as tf

# Factory for the hierarchical loss function
def Hierarchicaloss(species_to_genus, genus_to_subfamily, subfamily_to_family, batch_size, alpha=0.1):
    """Build a loss that sums cross-entropies over four taxonomic levels.

    Species-level targets and predictions are projected onto genus, then
    subfamily, then family through the given membership matrices. The
    categorical cross-entropy of each level is weighted by
    ``exp(-alpha * height)`` with height 0 (species) to 3 (family), and the
    weighted sum is divided by ``batch_size``.

    Args:
        species_to_genus: matrix of shape (nb_genus, nb_species).
        genus_to_subfamily: matrix of shape (nb_subfamily, nb_genus).
        subfamily_to_family: matrix of shape (nb_family, nb_subfamily).
        batch_size: divisor applied to the summed loss.
        alpha: decay rate of the per-level weight.

    Returns:
        A function ``HIERARCHICAL_loss(y_true, y_pred)`` usable as a Keras loss.
    """

    def level_weight(level):
        # Weight of a taxonomic level; decreases as the level gets coarser.
        return math.exp(-alpha * level)

    def roll_up(membership, scores):
        # Computes transpose(membership @ transpose(scores)): one row per
        # sample, one column per parent group.
        return K.transpose(tf.raw_ops.MatMul(a=membership, b=scores, transpose_b=True))

    def HIERARCHICAL_loss(y_true, y_pred):
        # Level 0: species. Cast to float64 to match the projected terms.
        species_term = tf.cast(
            level_weight(0) * K.categorical_crossentropy(y_true, y_pred),
            tf.float64,
        )

        # Level 1: genus. The inputs are cast to float64 before the product.
        true_genus = roll_up(species_to_genus, tf.cast(y_true, tf.float64))
        pred_genus = roll_up(species_to_genus, tf.cast(y_pred, tf.float64))
        genus_term = level_weight(1) * K.categorical_crossentropy(true_genus, pred_genus)

        # Level 2: subfamily, derived from the genus-level vectors.
        true_subfamily = roll_up(genus_to_subfamily, true_genus)
        pred_subfamily = roll_up(genus_to_subfamily, pred_genus)
        subfamily_term = level_weight(2) * K.categorical_crossentropy(true_subfamily, pred_subfamily)

        # Level 3: family, derived from the subfamily-level vectors.
        true_family = roll_up(subfamily_to_family, true_subfamily)
        pred_family = roll_up(subfamily_to_family, pred_subfamily)
        family_term = level_weight(3) * K.categorical_crossentropy(true_family, pred_family)

        return (species_term + genus_term + subfamily_term + family_term)/batch_size

    # Return a function
    return HIERARCHICAL_loss

In [23]:
# Instantiate the hierarchical loss. The divisor is taken from BATCH_SIZE so
# it always matches the batch size actually used to build the datasets.
loss=[Hierarchicaloss(species_to_genus, genus_to_subfamily, subfamily_to_family, batch_size=BATCH_SIZE, alpha=0.5)]

## Training the model

In [24]:
# Attach the optimiser, the loss function and the metrics to the model
model.compile(
    optimizer=optimizers.SGD(learning_rate=lr, momentum=0.9),
    loss=loss,
    metrics=[
        'categorical_accuracy',
        tf.keras.metrics.Precision(),
        tf.keras.metrics.Recall(),
    ],
)

In [25]:
# Training callbacks

# Save the weights (only) whenever the validation accuracy reaches a new maximum.
model_checkpoint_cb = tf.keras.callbacks.ModelCheckpoint(
    filepath="/content/poids",
    monitor='val_categorical_accuracy',
    mode='max',
    save_best_only=True,
    save_weights_only=True,
    verbose=1,
)

# Early stopping is currently disabled; kept for reference.
#early_stopping_cb = tf.keras.callbacks.EarlyStopping(
#    monitor="val_categorical_accuracy",
#    min_delta=0.01,
#    patience=8,
#    verbose=1,
#    mode="auto")

# Multiply the learning rate by 0.1 after 5 epochs without improvement of the
# validation accuracy, never going below 1e-5.
reduce_lr_cb = tf.keras.callbacks.ReduceLROnPlateau(
    monitor='val_categorical_accuracy',
    factor=0.1,
    patience=5,
    min_lr=0.00001,
    verbose=1,
)

In [None]:
# Train on the augmented sequence for 150 epochs, validating on the
# in-memory validation arrays; `history` keeps the per-epoch metrics.
history = model.fit(
    train_ds_aug,
    validation_data=(x_val, y_val),
    epochs=150,
    callbacks=[model_checkpoint_cb, reduce_lr_cb],
)

Epoch 1/150
