# 0. Download Dataset

In [1]:
import sys
sys.path.append('..')
from utils.gdrive_downloader import GDriveDownloader
from utils.memory_tracker import MemoryTracker, safe_to_device

data_dir = './data'

In [2]:
gdrive_downloader = GDriveDownloader(cache_dir='./cache')
success, message = gdrive_downloader.download_and_extract(
    gdrive_url='https://drive.google.com/file/d/1kUy2tuH-kKBlFCNA0a9sqD2TG4uyvBnV/view?usp=sharing',
    extract_dir=data_dir,
    keep_zip=False,
)
if success:
    print(f'Success: {message}')
else:
    print(f'Error: {message}')

In [3]:
import torch.backends.cudnn as cudnn

cudnn.benchmark = False
cudnn.deterministic = True
cudnn.enabled = False

In [4]:
import ultralytics

ultralytics.checks()

Ultralytics 8.3.40 🚀 Python-3.12.7 torch-2.3.0 CUDA:0 (NVIDIA GeForce GTX 1050 Ti, 4032MiB)
Setup complete ✅ (12 CPUs, 15.5 GB RAM, 271.7/287.3 GB disk)


# 1. Load data

## Extract from xml

In [5]:
import os
import xml.etree.ElementTree as ET

from typing import List

class ImageInfo:
    def __init__(self, path, size, labels, bboxes):
        self.__path = path
        self.__size = size
        self.__labels = labels
        self.__bboxes = bboxes

    def get_path(self):
        return self.__path
    
    def get_size(self):
        return self.__size
    
    def get_labels(self):
        return self.__labels
    
    def get_label(self, idx):
        return self.__labels[idx]
    
    def get_bboxes(self):
        return self.__bboxes
    
    def get_bbox(self, idx):
        return self.__bboxes[idx]
    
    def __repr__(self):
        s = 'Image info:\n'
        s += f'\tPath: {self.get_path()}\n'
        s += f'\tSize: {self.get_size()}\n'
        s += '\tBBoxes:\n' 

        for i in range(len(self.get_labels())):
            s += f'\t\tLabel: {self.get_label(i)} - {self.get_bbox(i)}\n'

        return s


def extract_data_from_xml(root_dir: str) -> List[ImageInfo]:
    xml_path = os.path.join(root_dir, 'words.xml')
    tree = ET.parse(xml_path)
    root = tree.getroot()

    imgs = []

    for img in root:
        bbs_of_img = []
        labels_of_img = []

        for bbs in img.findall('taggedRectangles'):
            for bb in bbs:
                # check tag
                if not bb[0].text or not bb[0].text.isalnum():
                    continue

                bbs_of_img.append(
                    [
                        float(bb.attrib['x']),
                        float(bb.attrib['y']),
                        float(bb.attrib['width']),
                        float(bb.attrib['height']),
                    ]
                )
                labels_of_img.append(bb[0].text.lower())
        
        img_path = os.path.join(root_dir, img[0].text)
        img_sizes = (int(img[1].attrib['x']), int(img[1].attrib['y']))
        img_info = ImageInfo(img_path, img_sizes, labels_of_img, bbs_of_img)
        imgs.append(img_info)
    
    return imgs


In [6]:
dataset_dir = 'data/SceneTrialTrain'
imgs = extract_data_from_xml(dataset_dir)
imgs[0]

Image info:
	Path: data/SceneTrialTrain/apanar_06.08.2002/IMG_1261.JPG
	Size: (1600, 1200)
	BBoxes:
		Label: self - [174.0, 392.0, 274.0, 195.0]
		Label: adhesive - [512.0, 391.0, 679.0, 183.0]
		Label: address - [184.0, 612.0, 622.0, 174.0]
		Label: labels - [863.0, 599.0, 446.0, 187.0]
		Label: 36 - [72.0, 6.0, 95.0, 87.0]
		Label: 89m - [247.0, 2.0, 197.0, 88.0]
		Label: cls - [792.0, 0.0, 115.0, 81.0]
		Label: 250 - [200.0, 848.0, 228.0, 139.0]
		Label: on - [473.0, 878.0, 165.0, 109.0]
		Label: a - [684.0, 878.0, 71.0, 106.0]
		Label: roll - [806.0, 844.0, 218.0, 141.0]

## Convert to YOLOv11 format

- YOLO format has 3 main dir: `train`, `test`, `val`. Each contains `images` and `labels` dir

In [7]:
from typing import List

def convert_to_yolo_format(imgs: List[ImageInfo]):
    yolo_data = []

    for img in imgs:
        width, height = img.get_size()
        yolo_labels = []

        for bbox in img.get_bboxes():
            x, y, w, h = bbox

            center_x = (x + w / 2) / width
            center_y = (y + h / 2) / height
            normalized_w = w / width
            normalized_h = h / height

            # have 1 class so class_id set to 0
            class_id = 0

            yolo_label = f'{class_id} {center_x} {center_y} {normalized_w} {normalized_h}'
            yolo_labels.append(yolo_label)
    
        yolo_data.append((img.get_path(), yolo_labels))
    
    return yolo_data

In [8]:
class_labels = ['text']
yolo_data = convert_to_yolo_format(imgs)
yolo_data[0]

('data/SceneTrialTrain/apanar_06.08.2002/IMG_1261.JPG',
 ['0 0.194375 0.40791666666666665 0.17125 0.1625',
  '0 0.5321875 0.40208333333333335 0.424375 0.1525',
  '0 0.309375 0.5825 0.38875 0.145',
  '0 0.67875 0.5770833333333333 0.27875 0.15583333333333332',
  '0 0.0746875 0.04125 0.059375 0.0725',
  '0 0.2159375 0.03833333333333333 0.123125 0.07333333333333333',
  '0 0.5309375 0.03375 0.071875 0.0675',
  '0 0.19625 0.7645833333333333 0.1425 0.11583333333333333',
  '0 0.3471875 0.7770833333333333 0.103125 0.09083333333333334',
  '0 0.4496875 0.7758333333333334 0.044375 0.08833333333333333',
  '0 0.571875 0.7620833333333333 0.13625 0.1175'])

In [9]:
# save data for later training

import shutil


def save_data(data, save_dir):
    os.makedirs(save_dir, exist_ok=True)

    os.makedirs(os.path.join(save_dir, 'images'), exist_ok=True)
    os.makedirs(os.path.join(save_dir, 'labels'), exist_ok=True)

    for image_path, yolo_labels in data:
        shutil.copy(
            image_path,
            os.path.join(save_dir, "images"),
        )

        image_name = os.path.basename(image_path)
        image_name = os.path.splitext(image_name)[0]

        with open(os.path.join(save_dir, 'labels', f'{image_name}.txt'), 'w') as f:
            for label in yolo_labels:
                f.write(f'{label}\n')

In [10]:
from sklearn.model_selection import train_test_split


seed = 0
val_size = 0.2
test_size = 0.125
is_shuffle = True

train_data, test_data = train_test_split(
    yolo_data,
    test_size=val_size,
    random_state=seed,
    shuffle=is_shuffle,
)

test_data, val_data = train_test_split(
    test_data,
    test_size=test_size,
    random_state=seed,
    shuffle=is_shuffle,
)

save_yolo_data_dir =  'datasets/yolo_data'
os.makedirs(save_yolo_data_dir, exist_ok=True)
save_train_dir = os.path.join(save_yolo_data_dir, 'train')
save_val_dir = os.path.join(save_yolo_data_dir, 'val')
save_test_dir = os.path.join(save_yolo_data_dir, 'test')

save_data(train_data, save_train_dir)
save_data(val_data, save_val_dir)
save_data(test_data, save_test_dir)

In [11]:
import yaml


data_yml = {
    'path': 'yolo_data',
    'train': 'train/images',
    'test': 'test/images',
    'val': 'val/images',
    'nc': 1,
    'names': class_labels,
}

yolo_yaml_path = os.path.join(save_yolo_data_dir, 'data.yml')
with open(yolo_yaml_path, 'w') as f:
    yaml.dump(data_yml, f, default_flow_style=False)

# 2. Model

In [12]:
from ultralytics import YOLO

MemoryTracker.clear_memory()

model = YOLO('yolo11m.pt')

results = model.train(
    data=yolo_yaml_path,
    epochs=100,
    imgsz=640,
    device='cpu',
    workers=2,
    cache=True,
    patience=20,
    plots=True
)

New https://pypi.org/project/ultralytics/8.3.58 available 😃 Update with 'pip install -U ultralytics'
Ultralytics 8.3.40 🚀 Python-3.12.7 torch-2.3.0 CPU (Intel Core(TM) i7-8750H 2.20GHz)
[34m[1mengine/trainer: [0mtask=detect, mode=train, model=yolo11m.pt, data=datasets/yolo_data/data.yml, epochs=100, time=None, patience=20, batch=16, imgsz=640, save=True, save_period=-1, cache=True, device=cpu, workers=2, project=None, name=train22, exist_ok=False, pretrained=True, optimizer=auto, verbose=True, seed=0, deterministic=True, single_cls=False, rect=False, cos_lr=False, close_mosaic=10, resume=False, amp=True, fraction=1.0, profile=False, freeze=None, multi_scale=False, overlap_mask=True, mask_ratio=4, dropout=0.0, val=True, split=val, save_json=False, save_hybrid=False, conf=None, iou=0.7, max_det=300, half=False, dnn=False, plots=True, source=None, vid_stride=1, stream_buffer=False, visualize=False, augment=False, agnostic_nms=False, classes=None, retina_masks=False, embed=None, show=Fa

[34m[1mtrain: [0mScanning /home/jiggle/personal/aio2024/hw/projects/scene-text-recognition/notebooks/datasets/yolo_data/train/labels.cache... 200 images, 7 backgrounds, 0 corrupt: 100%|██████████| 200/200 [00:00<?, ?it/s]




[34m[1mtrain: [0mCaching images (0.2GB RAM): 100%|██████████| 200/200 [00:00<00:00, 445.91it/s]
[34m[1mval: [0mScanning /home/jiggle/personal/aio2024/hw/projects/scene-text-recognition/notebooks/datasets/yolo_data/val/labels.cache... 7 images, 0 backgrounds, 0 corrupt: 100%|██████████| 7/7 [00:00<?, ?it/s]




[34m[1mval: [0mCaching images (0.0GB RAM): 100%|██████████| 7/7 [00:00<00:00, 151.75it/s]


Plotting labels to runs/detect/train22/labels.jpg... 
[34m[1moptimizer:[0m 'optimizer=auto' found, ignoring 'lr0=0.01' and 'momentum=0.937' and determining best 'optimizer', 'lr0' and 'momentum' automatically... 
[34m[1moptimizer:[0m AdamW(lr=0.002, momentum=0.9) with parameter groups 106 weight(decay=0.0), 113 weight(decay=0.0005), 112 bias(decay=0.0)
Image sizes 640 train, 640 val
Using 0 dataloader workers
Logging results to [1mruns/detect/train22[0m
Starting training for 100 epochs...

      Epoch    GPU_mem   box_loss   cls_loss   dfl_loss  Instances       Size


  0%|          | 0/13 [00:00<?, ?it/s]

In [None]:
from ultralytics import YOLO

model_path = './runs/detect/train/weights/best.pt'
model = YOLO(model_path)

metrics = model.eval()