In [1]:
!pip install detecto


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m23.1.2[0m[39;49m -> [0m[32;49m25.1.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m


In [2]:
from detecto import core, utils
from detecto.visualize import show_labeled_image
from torchvision import transforms
import numpy as np

In [3]:
# Dataset folders, relative to the notebook's working directory:
# images and their XML label files, for the train and test splits.
path_train_images, path_test_images = 'train_images', 'test_images'
path_train_labels, path_test_labels = 'train_labels', 'test_labels'

In [4]:
import os
from PIL import Image
from xml.etree import ElementTree

def find_valid_and_modify(label_folder, image_folder) -> None:
    """Clean up XML label files so they agree with the images on disk.

    For every ``*.xml`` file in ``label_folder`` the image with the same stem
    and a ``.jpg`` extension is looked up in ``image_folder``:

    * if the image does not exist, the label file is deleted;
    * if the image reports a width or height of 0, the label file is deleted;
    * otherwise the ``<size>/<width>`` and ``<size>/<height>`` entries of the
      label are set to the real image dimensions. Missing ``<width>`` /
      ``<height>`` elements are created, and the file is only rewritten when
      a value actually changed.

    Labels without a ``<size>`` element are left untouched.

    Args:
        label_folder: Directory containing the XML label files.
        image_folder: Directory containing the matching ``.jpg`` images.

    Returns:
        None. The function works by deleting / rewriting files in
        ``label_folder`` and printing a message for every deleted label.
    """
    # os.listdir returns a snapshot list, so deleting files while looping is safe;
    # sorting only makes the order of the log messages deterministic.
    for fname in sorted(os.listdir(label_folder)):
        if not fname.endswith('.xml'):
            continue

        xml_path = os.path.join(label_folder, fname)
        # Swap only the extension; str.replace would also rewrite any other
        # '.xml' occurrence inside the file name.
        stem = os.path.splitext(fname)[0]
        img_path = os.path.join(image_folder, stem + '.jpg')

        # No matching image: the label is useless, remove it.
        if not os.path.exists(img_path):
            print(f"Không tìm thấy ảnh: {img_path}, xóa {xml_path}")
            os.remove(xml_path)
            continue

        # Read the real image dimensions.
        with Image.open(img_path) as img:
            width, height = img.size

        # A zero-sized image cannot be used: remove its label as well.
        if width == 0 or height == 0:
            print(f"Ảnh {img_path} có width hoặc height = 0, xóa {xml_path}")
            os.remove(xml_path)
            continue

        # Bring the <size> entry of the label in line with the real image.
        tree = ElementTree.parse(xml_path)
        size_tag = tree.getroot().find('size')
        if size_tag is None:
            continue

        changed = False
        for tag, value in (('width', width), ('height', height)):
            node = size_tag.find(tag)
            if node is None:
                # Missing element: create it instead of failing on None.text.
                node = ElementTree.SubElement(size_tag, tag)
            if node.text != str(value):
                node.text = str(value)
                changed = True

        # Avoid rewriting labels that are already correct.
        if changed:
            tree.write(xml_path)

In [5]:
# Clean the label folders of both splits (train first, then test).
for labels_dir, images_dir in (
    (path_train_labels, path_train_images),
    (path_test_labels, path_test_images),
):
    find_valid_and_modify(labels_dir, images_dir)

In [6]:
# Training-time preprocessing and data augmentation.
# Only a horizontal flip and a photometric jitter are used as augmentation:
# a geometric transform such as RandomRotation moves the pixels but, as far as
# detecto's Dataset is documented, bounding boxes are only adjusted for Resize
# and RandomHorizontalFlip, so rotated images would keep unrotated boxes.
custom_transforms = transforms.Compose([
    transforms.ToPILImage(),
    # NOTE(review): 50 px on the shorter side is very small for detection - confirm intended.
    transforms.Resize(50),
    transforms.RandomHorizontalFlip(),
    transforms.ColorJitter(saturation=0.3),
    transforms.ToTensor(),
    utils.normalize_transform(),
])

In [7]:
# Datasets for the train and test splits.
trained_labels = ['apple', 'banana', 'orange']

# The test split is used for evaluation, so it gets the same preprocessing as
# training but without any random augmentation (keeps results reproducible).
eval_transforms = transforms.Compose([
    transforms.ToPILImage(),
    transforms.Resize(50),
    transforms.ToTensor(),
    utils.normalize_transform(),
])

train_dataset = core.Dataset(image_folder=path_train_images, label_data=path_train_labels, transform=custom_transforms)
test_dataset = core.Dataset(image_folder=path_test_images, label_data=path_test_labels, transform=eval_transforms)

In [8]:
# Batches of 6 images. Only the training data is shuffled; the test loader keeps
# a fixed order so that evaluation (and "first batch" inspection) is repeatable.
train_loader = core.DataLoader(train_dataset, batch_size=6, shuffle=True)
test_loader = core.DataLoader(test_dataset, batch_size=6, shuffle=False)

In [9]:
# Create the detecto model for the classes listed in trained_labels.
model = core.Model(classes=trained_labels)



In [10]:
# Display the structure of the underlying model wrapped by detecto
# (the value of the last expression is rendered as the cell output).
model.get_internal_model()

FasterRCNN(
  (transform): GeneralizedRCNNTransform(
      Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
      Resize(min_size=(800,), max_size=1333, mode='bilinear')
  )
  (backbone): BackboneWithFPN(
    (body): IntermediateLayerGetter(
      (conv1): Conv2d(3, 64, kernel_size=(7, 7), stride=(2, 2), padding=(3, 3), bias=False)
      (bn1): FrozenBatchNorm2d(64, eps=0.0)
      (relu): ReLU(inplace=True)
      (maxpool): MaxPool2d(kernel_size=3, stride=2, padding=1, dilation=1, ceil_mode=False)
      (layer1): Sequential(
        (0): Bottleneck(
          (conv1): Conv2d(64, 64, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (bn1): FrozenBatchNorm2d(64, eps=0.0)
          (conv2): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
          (bn2): FrozenBatchNorm2d(64, eps=0.0)
          (conv3): Conv2d(64, 256, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (bn3): FrozenBatchNorm2d(256, eps=0.0)
          (relu): ReLU(

In [None]:
# Train for two epochs on the training loader, using the test dataset for
# validation; whatever fit() returns is kept in `losses`.
losses = model.fit(
    train_loader,
    val_dataset=test_dataset,
    epochs=2,
    verbose=True,
)

It looks like you're training your model on a CPU. Consider switching to a GPU; otherwise, this method can take hours upon hours or even days to finish. For more information, see https://detecto.readthedocs.io/en/latest/usage/quickstart.html#technical-requirements
Epoch 1 of 2
Begin iterating over training dataset


  boxes = torch.tensor(boxes).view(-1, 4)
 48%|████▊     | 19/40 [11:44<16:35, 47.39s/it]

In [None]:
# Run the model on one test image and draw every predicted box.
# The path is built from path_test_images so it follows the configured folder.
test_image_path = os.path.join(path_test_images, 'apple_77.jpg')
test_image = utils.read_image(test_image_path)
pred = model.predict(test_image)
labels, boxes, scores = pred
show_labeled_image(test_image, boxes, labels)

In [None]:
# Keep only the detections whose score exceeds the confidence threshold,
# then draw the surviving boxes.
conf_threshold = 0.7
filtered_indices = np.where(scores > conf_threshold)
filtered_scores = scores[filtered_indices]
# Misspelled legacy name, kept as an alias for any cell that still refers to it.
filteres_scores = filtered_scores
filtered_boxes = boxes[filtered_indices]
# labels is picked element by element using the positions of the kept scores.
num_list = filtered_indices[0].tolist()
filtered_labels = [labels[i] for i in num_list]
show_labeled_image(test_image, filtered_boxes, filtered_labels)

In [None]:
# Collect model predictions for batches of the test loader.
import itertools

import torch

# Number of test batches to run through the model. Only the first batch is
# evaluated by default; set to None to predict on the whole test loader.
max_eval_batches = 1

y_test_pred = []

with torch.no_grad():
    # islice makes the batch limit explicit instead of a trailing `break`.
    for images, _targets in itertools.islice(test_loader, max_eval_batches):
        outputs = model.predict(images)
        y_test_pred.extend(outputs)