In [37]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [38]:
from pathlib import Path

project_dir = Path("/content/drive/MyDrive")

In [None]:
import zipfile

zip_path = project_dir / "rare_species 1.zip"
extract_path = project_dir / "rare_species_1"

with zipfile.ZipFile(zip_path, 'r') as zip_ref:
    zip_ref.extractall(extract_path)

print("ZIP extracted with sucess")

ZIP extraído com sucesso!


In [39]:
import ssl
ssl._create_default_https_context = ssl._create_unverified_context

# ============================== #
# Python Standard Library
# ============================== #
from typing import Self, Any
from pathlib import Path

# ============================== #
# External Libraries
# ============================== #
from matplotlib.image import imread

# ============================== #
# Keras: Data Loading & Augmentation
# ============================== #
from keras.utils import image_dataset_from_directory
from keras.layers import Rescaling, RandAugment

# ============================== #
# Keras: Model Building
# ============================== #
from keras import Model
from keras.applications import VGG16
from keras.layers import Flatten, Dense

# ============================== #
# Keras: Training Components
# ============================== #
from keras.optimizers import SGD
from keras.losses import CategoricalCrossentropy
from keras.metrics import CategoricalAccuracy, AUC, F1Score
from keras.callbacks import ModelCheckpoint, CSVLogger, LearningRateScheduler

# ============================== #
# Local Utilities
# ============================== #
from src.utils import show_image, exp_decay_lr_scheduler


In [52]:
import random
import shutil
from pathlib import Path

# List the species
species_list = ["arthropoda_apidae", "arthropoda_attelabidae"]

data_dir_path = project_dir / "data"

def split_and_copy_species(species):
    source_dir = extract_path / species

    # Creat the paths for train/val/test
    for split in ['train', 'val', 'test']:
        (data_dir_path / split / species).mkdir(parents=True, exist_ok=True)

    # Extract and shuffle or images
    images = list(source_dir.glob("*.jpg"))
    random.shuffle(images)

    # Splitting
    n_total = len(images)
    n_train = int(0.7 * n_total)
    n_val = int(0.15 * n_total)

    train_imgs = images[:n_train]
    val_imgs = images[n_train:n_train + n_val]
    test_imgs = images[n_train + n_val:]

    # Copy
    def copy_images(img_list, destination):
        for img in img_list:
            shutil.copy(img, destination / species)

    copy_images(train_imgs, data_dir_path / "train")
    copy_images(val_imgs, data_dir_path / "val")
    copy_images(test_imgs, data_dir_path / "test")

    print(f"{species} -> Train: {len(train_imgs)}, Val: {len(val_imgs)}, Test: {len(test_imgs)}")

# Aply to all species
for species in species_list:
    split_and_copy_species(species)


arthropoda_apidae -> Train: 105, Val: 22, Test: 23
arthropoda_attelabidae -> Train: 21, Val: 4, Test: 5


In [53]:
train_dir_path = data_dir_path / "train"
val_dir_path = data_dir_path / "val"
test_dir_path = data_dir_path / "test"

In [54]:
n_classes = 2 # to edit
batch_size = 64
input_shape = (224, 224, 3)
image_size = (224, 224)
value_range = (0.0, 1.0)

In [64]:
train_ds = image_dataset_from_directory(
    train_dir_path,
    label_mode="categorical",
    batch_size=batch_size,
    image_size=image_size,
    interpolation="bilinear",
    verbose=False
)

In [65]:
val_ds = image_dataset_from_directory(
    val_dir_path,
    label_mode="categorical",
    batch_size=batch_size,
    image_size=image_size,
    interpolation="bilinear",
    shuffle=False,
    verbose=False
)

In [66]:
test_ds = image_dataset_from_directory(
    test_dir_path,
    label_mode="categorical",
    batch_size=batch_size,
    image_size=image_size,
    interpolation="bilinear",
    shuffle=False,
    verbose=False
)

In [67]:
class AugmentedVGG16(Model):
    """
    Pre-trained VG16 + RandAugment
    """

    def __init__(self: Self) -> None:
        """
        Initialization
        """

        super().__init__()

        self.n_classes = n_classes
        self.rescale_layer = Rescaling(scale=1 / 255.0)
        self.augmentation_layer = RandAugment(value_range=value_range)
        self.pre_trained_architecture = VGG16(include_top=False, classes=32)
        self.flatten_layer = Flatten()
        self.dense_layer = Dense(self.n_classes, activation="softmax")

    def call(self: Self, inputs: Any) -> Any:
        """
        Forward call
        """

        x = self.rescale_layer(inputs)
        x = self.augmentation_layer(x)
        x = self.pre_trained_architecture(x)
        x = self.flatten_layer(x)

        return self.dense_layer(x)

In [68]:
epochs = 4
model = AugmentedVGG16()
optimizer = SGD(learning_rate=0.01, name="optimizer")
loss = CategoricalCrossentropy(name="loss")

In [69]:
# metrics
categorical_accuracy = CategoricalAccuracy(name="accuracy")
auc = AUC(name="auc")
f1_score = F1Score(average="macro", name="f1_score")
metrics = [categorical_accuracy, auc, f1_score]

In [70]:
# callbacks
root_dir_path = Path(".")
checkpoint_file_path = root_dir_path / "checkpoint.keras"
metrics_file_path = root_dir_path / "metrics.csv"

checkpoint_callback = ModelCheckpoint(
    checkpoint_file_path,
    monitor="val_loss",
    verbose=0
)
metrics_callback = CSVLogger(metrics_file_path)
lr_scheduler_callback = LearningRateScheduler(exp_decay_lr_scheduler)

callbacks = [
    checkpoint_callback,
    metrics_callback,
    lr_scheduler_callback
]

In [71]:
model.compile(loss=loss, optimizer=optimizer, metrics=metrics)

In [72]:
# train the model, call to the method is somewhat diferent
_ = model.fit(
    train_ds,
    validation_data=val_ds,
    epochs=epochs,
    callbacks=callbacks,
    verbose=2
)

Epoch 1/4
3/3 - 422s - 141s/step - accuracy: 0.6550 - auc: 0.7632 - f1_score: 0.4397 - loss: 3.5700 - val_accuracy: 0.0548 - val_auc: 0.0814 - val_f1_score: 0.0519 - val_loss: 0.9184 - learning_rate: 0.0095
Epoch 2/4
3/3 - 399s - 133s/step - accuracy: 0.4386 - auc: 0.5510 - f1_score: 0.4007 - loss: 312355.1250 - val_accuracy: 0.9452 - val_auc: 0.0000e+00 - val_f1_score: 0.0000e+00 - val_loss: nan - learning_rate: 0.0090
Epoch 3/4
3/3 - 427s - 142s/step - accuracy: 0.8772 - auc: 0.0000e+00 - f1_score: 0.0000e+00 - loss: nan - val_accuracy: 0.9452 - val_auc: 0.0000e+00 - val_f1_score: 0.0000e+00 - val_loss: nan - learning_rate: 0.0086
Epoch 4/4
3/3 - 438s - 146s/step - accuracy: 0.8772 - auc: 0.0000e+00 - f1_score: 0.0000e+00 - loss: nan - val_accuracy: 0.9452 - val_auc: 0.0000e+00 - val_f1_score: 0.0000e+00 - val_loss: nan - learning_rate: 0.0081


In [73]:
# evaluate on the test ds
model.evaluate(
    test_ds,
    return_dict=True,
    verbose=0
)

{'accuracy': 0.9367088675498962, 'auc': 0.0, 'f1_score': 0.0, 'loss': nan}