In [1]:
import os
import shutil
import random
import cv2
import numpy as np
from tqdm import tqdm
import time

# Input locations on the mounted Drive.
# Images are read from <img_path>/<subset>; ground-truth boxes from
# <label_path>/<subset>/groundtruth_localization.
img_path = "/content/drive/MyDrive/Phase2_Results"
label_path = "/content/drive/MyDrive/AOLP_Dataset"
# IDs of the Subset_LE images for which a second "<id>_2.txt" label is looked up.
two_plates_file = f"{label_path}/Subset_LE/two_plates.txt"

# Root folder that receives images/<split>, labels/<split> and data.yaml.
output_dir = "/content/drive/MyDrive/AOLP_YOLO"
subsets = [f"Subset_{tag}" for tag in ("AC", "LE", "RP")]

# Split ratios: train and val are fractions of the shuffled dataset,
# whatever is left over becomes the test split.
train, val = 0.7, 0.2

# Helper functions

def convert_box(size, box):
    """
      Turns a corner-format pixel box into a normalised YOLO box.

      size: (width, height) of the image
      box: (x_min, y_min, x_max, y_max)
      returns: (x_center, y_center, width, height), where the x values are
               scaled by 1/width and the y values by 1/height
    """
    # One reciprocal scale factor per axis
    scale_x = 1. / size[0]
    scale_y = 1. / size[1]

    left, right = box[0], box[2]
    top, bottom = box[1], box[3]

    return (
        ((left + right) / 2.0) * scale_x,   # x_center
        ((top + bottom) / 2.0) * scale_y,   # y_center
        (right - left) * scale_x,           # width
        (bottom - top) * scale_y,           # height
    )

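# NOTE: earlier version of read_groundtruth, superseded by the definition below
# (which also skips blank lines, checks the line count and fixes swapped corners).
# def read_groundtruth(txt_path):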
#     """Reads the 4-line coordinate file."""
#     if not os.path.exists(txt_path):
#         return None
#     with open(txt_path, 'r') as f:
#         lines = f.readlines()
#         # Parse lines, handling scientific notation if present
#         try:
#             coords = [float(x.strip()) for x in lines]
#             # Format is: x1, y1, x2, y2
#             return [coords[0], coords[1], coords[2], coords[3]]
#         except ValueError:
#             print(f"Error parsing {txt_path}")
#             return None

def read_groundtruth(txt_path):
    """
      Reads a coordinate file holding one number per line (x1, y1, x2, y2).

      txt_path: path of the ground-truth file
      returns: [x_min, y_min, x_max, y_max] as floats, or None when
                 - the file does not exist,
                 - a non-blank line is not a number,
                 - fewer than 4 numbers are present,
                 - the box has no area (zero width/height, or a NaN value).
               A message is printed for every case except the missing file.
    """
    if not os.path.exists(txt_path):
        return None

    with open(txt_path, 'r') as f:
        # Filter out empty lines just in case
        lines = [l.strip() for l in f if l.strip()]

    try:
        # float() also accepts scientific notation such as "1.2e+02"
        coords = [float(x) for x in lines]
    except ValueError:
        print(f"Error parsing numbers in {txt_path}")
        return None

    # To confirm we actually got 4 coordinates
    if len(coords) < 4:
        print(f"Skipping {txt_path}: Found {len(coords)} lines, expected 4.")
        return None

    xa, ya, xb, yb = coords[:4]

    # The corners may be stored in either order; sort each axis so the
    # result is always (top-left, bottom-right).
    x1, x2 = min(xa, xb), max(xa, xb)
    y1, y2 = min(ya, yb), max(ya, yb)

    # Reordering cannot repair a box whose corners coincide on an axis, and
    # any comparison with NaN is False, so both cases are rejected here
    # instead of being written out as an invalid zero-size label.
    if not (x2 > x1 and y2 > y1):
        print(f"Skipping {txt_path}: box has no area ({xa}, {ya}, {xb}, {yb}).")
        return None

    return [x1, y1, x2, y2]

# Main method
def main():
    """
      Builds the YOLO dataset under output_dir from the configured subsets.

      Steps:
        1. read the list of image IDs that may carry a second plate
        2. pair every image in img_path/<subset> with its label file(s)
        3. shuffle with a fixed seed and cut into train / val / test
           (train and val fractions come from the config, test is the rest)
        4. copy each image and write its YOLO label file (class 0); an image
           without a usable label gets an empty label file
        5. write data.yaml into output_dir

      Images that cannot be read or copied are skipped, together with their
      labels, and the number of skipped images is printed in the summary.
    """
    double_plates_ids = _load_double_plate_ids(two_plates_file)
    dataset = _collect_entries(double_plates_ids)

    # Shuffle and split
    random.seed(42)
    random.shuffle(dataset)

    total_count = len(dataset)
    train_end = int(total_count * train)
    val_end = train_end + int(total_count * val)

    splits = {
        'train': dataset[:train_end],
        'val': dataset[train_end:val_end],
        'test': dataset[val_end:]
    }

    skipped = 0  # images that could not be read or copied

    for split_name, split_data in splits.items():
        img_save_dir = os.path.join(output_dir, "images", split_name)
        lbl_save_dir = os.path.join(output_dir, "labels", split_name)
        os.makedirs(img_save_dir, exist_ok=True)
        os.makedirs(lbl_save_dir, exist_ok=True)

        print(f"Processing {split_name}: {len(split_data)} images...")

        for entry in tqdm(split_data):
            # Read the image to get the dimensions used for normalisation
            img = cv2.imread(entry['src_img'])
            if img is None:
                # cv2.imread signals an unreadable file by returning None
                print(f"Warning: could not read {entry['src_img']}. Skipping this image and its labels.")
                skipped += 1
                continue
            height, width = img.shape[:2]

            dst_img_path = os.path.join(img_save_dir, entry['dst_name'])
            if not _copy_with_retry(entry['src_img'], dst_img_path):
                skipped += 1
                continue  # Skip to the next entry if the image copy failed

            # Process labels
            yolo_lines = []
            for lbl_path in entry['labels']:
                box_coords = read_groundtruth(lbl_path)  # [x1, y1, x2, y2] or None
                if box_coords:
                    # Convert to YOLO (x_c, y_c, w, h)
                    yolo_box = convert_box((width, height), box_coords)
                    # Class ID is 0 (license_plate)
                    yolo_lines.append(f"0 {yolo_box[0]:.6f} {yolo_box[1]:.6f} {yolo_box[2]:.6f} {yolo_box[3]:.6f}")

            # Write label file (left empty when no box was found)
            dst_lbl_name = os.path.splitext(entry['dst_name'])[0] + ".txt"
            with open(os.path.join(lbl_save_dir, dst_lbl_name), 'w') as f:
                f.write('\n'.join(yolo_lines))

    print(f"\nDataset created at {output_dir}")
    print("Structure:")
    print(f"Train: {len(splits['train'])}")
    print(f"Val:   {len(splits['val'])}")
    print(f"Test:  {len(splits['test'])}")
    if skipped:
        print(f"Skipped: {skipped} (unreadable image or failed copy)")

    # Create data.yaml file
    # NOTE: `path` is a fixed Kaggle location, not output_dir.
    yaml_content = f"""
path: /kaggle/input/aolp-yolo/AOLP_YOLO
train: images/train
val: images/val
test: images/test

nc: 1
names: ['license_plate']
    """

    with open(os.path.join(output_dir, "data.yaml"), "w") as f:
        f.write(yaml_content)


def _load_double_plate_ids(list_path):
    """Returns the set of numeric IDs listed in the two-plates file (empty if the file is missing)."""
    ids = set()
    if not os.path.exists(list_path):
        print("two_plates.txt not found, assuming no double plates")
        return ids

    with open(list_path, 'r') as f:
        for line in f:
            # Take the first token if there's extra text. A blank line gives
            # no tokens at all, so it is skipped rather than indexed.
            tokens = line.split()
            if tokens and tokens[0].isdigit():
                ids.add(tokens[0])
    return ids


def _collect_entries(double_plates_ids):
    """Returns one {'src_img', 'labels', 'dst_name'} record per image found in the configured subsets."""
    dataset = []

    for subset in subsets:
        img_dir = os.path.join(img_path, subset)
        # Handle naming difference, image folder is Subset_AC, groundtruth is in Subset_AC/groundtruth_localization
        lbl_dir = os.path.join(label_path, subset, "groundtruth_localization")

        if not os.path.exists(img_dir):
            print(f"Skipping {subset}, path not found: {img_dir}")
            continue

        # os.listdir returns entries in arbitrary order; sorting them makes
        # the seeded shuffle in main() produce the same split on every run.
        images = sorted(
            name for name in os.listdir(img_dir)
            if name.lower().endswith(('.jpg', '.jpeg', '.png'))
        )

        for img_file in images:
            file_id = os.path.splitext(img_file)[0]

            label_files = []

            # Check for standard label
            std_lbl_path = os.path.join(lbl_dir, f"{file_id}.txt")
            if os.path.exists(std_lbl_path):
                label_files.append(std_lbl_path)

            # Check for double plates (only Subset_LE has double plate cases)
            if subset == "Subset_LE" and file_id in double_plates_ids:
                second_lbl_path = os.path.join(lbl_dir, f"{file_id}_2.txt")
                if os.path.exists(second_lbl_path):
                    label_files.append(second_lbl_path)

            # Images without any label file are kept and end up with an
            # empty label file (treated as background).
            dataset.append({
                'src_img': os.path.join(img_dir, img_file),
                'labels': label_files,
                # Prefix with the subset so equal file names from different subsets cannot collide
                'dst_name': f"{subset}_{img_file}"
            })

    return dataset


def _copy_with_retry(src, dst, max_retries=3, delay=1):
    """Copies src to dst, retrying on FileNotFoundError; returns True on success, False otherwise."""
    # NOTE(review): the retry presumably works around transient errors on the
    # mounted Drive - confirm whether other OSError types need retrying too.
    for attempt in range(max_retries):
        try:
            shutil.copy(src, dst)
            return True
        except FileNotFoundError:
            print(f"Warning: FileNotFoundError during copy of {src} to {dst}. Attempt {attempt + 1}/{max_retries}.")
            if attempt < max_retries - 1:
                time.sleep(delay)  # Wait a bit before retry
            else:
                print(f"Error: Failed to copy {src} after {max_retries} attempts. Skipping this image and its labels.")
    return False


# Run the full conversion. This copies every image to output_dir; the recorded
# run below took roughly 28, 8 and 4 minutes for train, val and test.
main()

Processing train: 1434 images...


100%|██████████| 1434/1434 [28:12<00:00,  1.18s/it]


Processing val: 409 images...


100%|██████████| 409/409 [07:36<00:00,  1.12s/it]


Processing test: 206 images...


100%|██████████| 206/206 [03:52<00:00,  1.13s/it]



Dataset created at /content/drive/MyDrive/AOLP_YOLO
Structure:
Train: 1434
Val:   409
Test:  206


In [2]:
# Sanity check: count the .jpg images and the .txt label files under the output folder.
# main() writes one label file per copied image, so the two numbers should match.
# Note: main() also accepts .jpeg/.png images, which the first count would not include.
!find /content/drive/MyDrive/AOLP_YOLO -type f -iname "*.jpg" | wc -l
!find /content/drive/MyDrive/AOLP_YOLO -type f -iname "*.txt" | wc -l

2049
2049
