In [2]:
import os

import numpy as np
import cv2
import supervision as sv
from autodistill.detection import CaptionOntology
from autodistill_grounded_sam import GroundedSAM
# from autodistill_yolov8 import YOLOv8
from ultralytics import YOLO
from IPython.display import Image
import matplotlib.pyplot as plt

import video_multiprocessing as vm
import superimpose_boxes as sb

HOME = os.getcwd()
HOME

'/Users/eric/Desktop/2-Career/Projects/ObjectDetection'

In [3]:
project = 'dog_park'
# project = 'license_plates'
# project = 'hockey'

In [4]:
VIDEO_DIR_PATH = f"{HOME}/{project}/videos"
IMAGE_DIR_PATH = f"{HOME}/{project}/images"
DATASET_DIR_PATH = f"{HOME}/{project}/dataset"
DATASET_TRAIN_DIR_PATH = f"{DATASET_DIR_PATH}/train"
DATASET_VALID_DIR_PATH = f"{DATASET_DIR_PATH}/valid"
os.makedirs(DATASET_DIR_PATH, exist_ok=True)
os.makedirs(DATASET_TRAIN_DIR_PATH, exist_ok=True)
os.makedirs(DATASET_VALID_DIR_PATH, exist_ok=True)
ANNOTATIONS_TRAIN_DIRECTORY_PATH = f"{DATASET_TRAIN_DIR_PATH}/labels"
ANNOTATIONS_VALID_DIRECTORY_PATH = f"{DATASET_VALID_DIR_PATH}/labels"
IMAGES_TRAIN_DIRECTORY_PATH = f"{DATASET_TRAIN_DIR_PATH}/images"
IMAGES_VALID_DIRECTORY_PATH = f"{DATASET_VALID_DIR_PATH}/images"
SAM_DATA_YAML_PATH = f"{DATASET_DIR_PATH}/data.yaml" # sam_data_yaml includes more labels than yolo_data_yaml to assist SAM with detecting license plates
YOLO_DATA_YAML_PATH = f"{DATASET_DIR_PATH}/data.yaml" # yolo_data_yaml includes the consolidated labels for the yolo model training
ANNOTATIONS_TRAIN_FILTERED_DIRECTORY_PATH = f"{DATASET_TRAIN_DIR_PATH}/labels_filtered"
ANNOTATIONS_VALID_FILTERED_DIRECTORY_PATH = f"{DATASET_VALID_DIR_PATH}/labels_filtered"
SUPERIMPOSED_ANNOTATIONS_TRAIN_DIRECTORY_PATH = f"{DATASET_TRAIN_DIR_PATH}/superimposed_annotations"
SUPERIMPOSED_ANNOTATIONS_VALID_DIRECTORY_PATH = f"{DATASET_VALID_DIR_PATH}/superimposed_annotations"


In [5]:
def rename_videos(VIDEO_DIR_PATH):
    for i, video_name in enumerate(os.listdir(VIDEO_DIR_PATH)):
        video_path = f"{VIDEO_DIR_PATH}/{video_name}"
        os.rename(video_path, f"{VIDEO_DIR_PATH}/{i}.mp4")

# do the same for generic files with unknown extension
def rename_files(path):
    # TODO: Haven't tested this yet
    for i, file_name in enumerate(os.listdir(path)):
        file_path = f"{path}/{file_name}"
        # get the extension of the file
        file_extension = file_name.split(".")[-1]
        # os.rename(file_path, f"{path}/{i}.{file_extension}")
        print(file_path, f"{path}/{i}.{file_extension}")

### Convert videos into images

The code below saves every `nth` frame from each video using the `FRAME_STRIDE` parameter. The code for this had to be placed in it's own Python file, [video_multiprocessing.py](/Users/eric/Desktop/2-Career/Projects/ObjectDetectionLL/video_multiprocessing.py), in order for the multiprocessing pool to work properly.

In [6]:
FRAME_STRIDE = 1
video_paths = sv.list_files_with_extensions(
    directory=VIDEO_DIR_PATH,
    extensions=["mov", "mp4"]
)

In [7]:
# Assuming you have already split your video paths
# TEST_VIDEO_PATHS, TRAIN_VIDEO_PATHS = video_paths[:2], video_paths[2:]
TRAIN_VIDEO_PATHS = video_paths

split_videos = False
if split_videos:
    # Now call the function to process videos in parallel
    vm.process_videos_in_parallel(video_paths=TRAIN_VIDEO_PATHS, image_dir_path=IMAGE_DIR_PATH, frame_stride=FRAME_STRIDE)
    # find the names of any files that may be missing for whatever reason
    for i in range(0, len(os.listdir(VIDEO_DIR_PATH))):
        if f"{i}-00000.png" not in os.listdir(IMAGE_DIR_PATH):
            print(f"{i}.png not found")
# count the number of images in the video and image directories
len(os.listdir(VIDEO_DIR_PATH)) , len(os.listdir(IMAGE_DIR_PATH)) 

(15, 330)

## Annotate the dataset

**TASK**: Create a labeled dataset from a directory of images using Meta's *Distillation with No Labels* (DINO) and *Segment Anything Model* (SAM)

**this is not correct. SAM is not multimodal. MULTIMODAL**: SAM is a multimodal model, which means it can accept multiple types of input. In this case, SAM accepts both images and text captions as input. SAM will use its LLM capabilities to understand the text captions and segment the images.

**WHY SAM?**: Before the release of multimodal models, computer vision experts would have to manully annotate thousands of images with special care for their deep learning models. This involved drawing boxes around pixels of interest, being careful to all of the pixels that include the object of interest while minimizing the amount of pixels in the background. Special considerations also needed to be made for objects of interest that appeared partially out of frame and for partially occluded objects of interest in the image. Each carefully drawn bounding polygon is then labeled from the set of classifications that you are hoping to train the target model to detect. This labor intensive process used to be a major obstacle for custom object detection models, and it is something I've had the pleasure of doing as recently as March of 2023, which was the release date of Meta's SAM. So to save time and moeny, we will use SAM as a Base Model to help create a labeled dataset.

**ONTOLOGY**: An Ontology defines how your Base Model is prompted, what your Dataset will describe, and what your Target Model will predict. A simple Ontology is the CaptionOntology which prompts a Base Model with text captions and maps them to class names. Other Ontologies may, for instance, use a CLIP vector or example images instead of a text caption.

**BASE MODEL**: A Base Model is a large foundation model that knows a lot about a lot. Base models are often multimodal and can perform many tasks. They're large, slow, and expensive. Examples of Base Models are Meta's SAM and OpenAI's GPT-4 multimodal variant.

**LABELED DATASET**: The output from the Base Model is a labeled dataset, which is a set of auto-labeled images. First we'll review the data for mistakes; we can delete incorrectly labeled images from the dataset or we can use an annotation tool to edit the labels. Examples of annotation tools include: LabelMe, LabelImg, CVAT, SuperAnnotate, ScaleAI, Roboflow Annotator, and many more. 

**TARGET MODEL**: A Target Model is a model that is trained on a labeled dataset to perform a specific task. Target Models are often small, fast, cheap, and fine-tuned to perform a specific task very well, but they don't generalize well beyond the information described in their dataset. Examples of Target Models are YOLO and DETR. Once we have corrected all mistakes in our labeled dataset, then it is safe to proceed with training our Target Model - in this case YOLOv8.

In [8]:
annotate = False
if annotate:
    labels_dict = {
        "The license plate on the motor vehicle": "license plate", #0
        # "license plate": "license plate", #0
        # "number plate": "license plate", #1
        # "registration plate": "license plate", #2
        # "license": "license plate", #3
        # "license plate number": "license plate", #4
        # "numberplate": "license plate", #5
    }
    ontology=CaptionOntology(labels_dict)

    base_model = GroundedSAM(ontology=ontology)
    dataset = base_model.label(
        input_folder=IMAGE_DIR_PATH,
        extension=".png",
        output_folder=DATASET_DIR_PATH)

    len(dataset)
else:
    os.makedirs(ANNOTATIONS_TRAIN_DIRECTORY_PATH, exist_ok=True)
    os.makedirs(ANNOTATIONS_VALID_DIRECTORY_PATH, exist_ok=True)
    os.makedirs(IMAGES_TRAIN_DIRECTORY_PATH, exist_ok=True)
    os.makedirs(IMAGES_VALID_DIRECTORY_PATH, exist_ok=True)
#     with open(YOLO_DATA_YAML_PATH, "w") as f:
#         sam_data_yaml = f.write('''names:
# - license plate
# nc: 1
# train: /Users/eric/Desktop/2-Career/Projects/ObjectDetection/dataset/train/images
# val: /Users/eric/Desktop/2-Career/Projects/ObjectDetection/dataset/valid/images
# ''')

### Visualize the ontologies that worked best

Assuming more than one ontology was used

In [9]:
if annotate:
    labels_list = list(labels_dict.items())
    labels_path = f'{DATASET_TRAIN_DIR_PATH}/labels'
    for filename in os.listdir(labels_path):
        with open(os.path.join(labels_path, filename), 'r') as f:
            contents = f.readlines()
            labels = []
            for line in contents:
                label_str = line.split(' ')[0]
                mask = line[len(label_str):].strip().split(' ')
                num_segmentation_points = len(mask) // 2
                label_int = int(label_str)
                labels += [(labels_list[label_int][0], labels_list[label_int][1], num_segmentation_points)]
            if len(contents) > 0:
                print(filename, len(contents), labels)

    histogram = {}
    for filename in os.listdir(labels_path):
        with open(os.path.join(labels_path, filename), 'r') as f:
            contents = f.readlines()
            for line in contents:
                label_str = line.split(' ')[0]
                mask = line[len(label_str):].strip().split(' ')
                num_segmentation_points = len(mask) // 2
                label_int = int(label_str)
                label_name = labels_list[label_int][0]
                if label_name not in histogram:
                    histogram[label_name] = 0
                histogram[label_name] += 1

    # plot this histogram
    import matplotlib.pyplot as plt
    plt.bar(range(len(histogram)), list(histogram.values()), align='center')
    plt.xticks(range(len(histogram)), list(histogram.keys()))
    plt.xlabel('Ontology key words')
    plt.ylabel('Number of instances')
    plt.show()

## Review the Annotations

**TASK**: For the sake of time, we'll simply delete any incorrectly annotated images from the labeled dataset. This will take some manual effort, but much less effort than editing masks, which is something to consider if your dataset is small. 

**SUPERIMPOSE ANNOTATIONS FOR REVIEW**: The SAM model has placed all of the images in the `dataset/train/images` directory and the corresponding labels in the `dataset/train/labels` directory. Using Python, I'll superimpose the labels onto the image and save them to a new directory, `dataset/train/superimposed_annotations` so I can visually inspect the annotations. 


In [10]:
def calculate_iou(mask1, mask2):
    '''UNUSED: Calculates Intersection over Union (IoU) for two binary masks'''
    intersection = np.logical_and(mask1, mask2)
    union = np.logical_or(mask1, mask2)
    iou = np.sum(intersection) / np.sum(union)
    return iou

def are_duplicates_based_on_overlap(mask1, mask2, threshold=0.95):
    '''Calculates the mutual overlap for two binary masks and returns a boolean indicating whether they are duplicates'''
    intersection = np.logical_and(mask1, mask2)
    overlap_area_1 = np.sum(intersection) / np.sum(mask1)
    overlap_area_2 = np.sum(intersection) / np.sum(mask2)
    return overlap_area_1 >= threshold and overlap_area_2 >= threshold

def parse_line(line):
    '''Parses a line and returns the original label and coordinates'''
    parts = line.split()
    original_label = int(parts[0])
    coordinates = np.array([float(coord) for coord in parts[1:]]).reshape(-1, 2)
    return original_label, coordinates

def create_mask(coordinates, image_dims):
    # Scale the normalized coordinates to the image dimensions
    scaled_coords = coordinates * np.array(image_dims)
    # Ensure the coordinates are integers, as required by cv2.fillPoly
    scaled_coords = np.round(scaled_coords).astype(np.int32)
    # Create an empty black image and fill it with white in the polygon
    mask = np.zeros(image_dims, dtype=np.uint8)
    cv2.fillPoly(mask, [scaled_coords], color=1)
    return mask

# Read mask lines from the input file
def consolidate_and_correct_masks(annotations_path, annotations_filtered_path):
    # Define the new label groups based on original labels
    label_groups = {
        0: list(range(0, 1)),  # Labels 0-n will be consolidated into new label 0
        # 1: list(range(1, 5)),  # Labels n+1-m will be consolidated into new label 1 for another label
    }

    # Reverse mapping for convenience: original label to new label
    original_to_new_label = {original: new for new, originals in label_groups.items() for original in originals}
    threshold = 0.95  # Threshold for considering masks as duplicates. # HACK: This is completely problem-specific.
    image_dims = (2160, 3840)  # Dimensions of the images in the dataset # HACK: This won't fly with variable image dimensions. To improve, read the image dimensions from the image files themselves using cv2.imread.
    for file in os.listdir(annotations_path):
        if not file.endswith('.txt'):
            continue
        input_file_path = f'{annotations_path}/{file}'
        with open(input_file_path, 'r') as f:
            mask_lines = f.readlines()

        unique_mask_lines = []

        for line_i in mask_lines:
            original_label_i, coords_i = parse_line(line_i)
            new_label_i = original_to_new_label.get(original_label_i)
            mask_i = create_mask(coords_i, image_dims)
            is_duplicate = False
            for line_j in unique_mask_lines:
                new_label_j, coords_j = parse_line(line_j)
                if new_label_j == new_label_i:  # Compare only within the same new label group
                    mask_j = create_mask(coords_j, image_dims)
                    if are_duplicates_based_on_overlap(mask_i, mask_j, threshold=threshold):
                        is_duplicate = True
                        break # break to avoid adding the duplicate mask to the unique mask lines
            if not is_duplicate:
                mask_area = np.sum(mask_i)
                if mask_area < 2000: # HACK: This is completely problem-specific. In this case, we're looking for license plates, and after investigating, I found that their mask area is typically well below 10000.
                    unique_mask_lines.append(f"{new_label_i} {' '.join(map(str, coords_i.flatten()))}")

        # Write the consolidated mask data with new labels to the output file
        output_file_path = f'{annotations_filtered_path}/{file}'
        with open(output_file_path, 'w') as f:
            for i, line in enumerate(unique_mask_lines):
                if i == len(unique_mask_lines) - 1:
                    f.write(line)
                else:
                    f.write(f'{line}\n')

In [11]:
superimpose = False
if superimpose:
    IMAGE_DIR = 'dataset/train/images'
    BBOX_DIR = 'dataset/train/labels'
    OUTPUT_DIR = 'dataset/train/superimposed_annotations'
    sb.superimpose_boxes(IMAGE_DIR, BBOX_DIR, OUTPUT_DIR)

🛑**MANUAL VISUAL INSPECTION**🛑: To review the annotations, I'll go into Finder on my Mac, preview the files one-by-one with the spacebar and arrow keys, and simply delete the image file that have mistakes. Don't skip this step! The quality of your dataset will determine the quality of your model; tedious as it may be, it's worth the effort to ensure your dataset is clean.

In [12]:
# inspection_complete = input("Have you inspected the annotations? (y/n): ")
inspection_complete = 'y'
if inspection_complete == "y":
    print("Great! Let's move on.")
else:
    print("Please inspect the annotations before continuing.")

Great! Let's move on.


**REMOVE MISTAKES**: Now that we've decided which image files to remove, we must reflect these changes in the folders that are actually used to train YOLO. We'll use Python to programatically identify those filenames that need to be removed from both the `dataset/train/images` and `dataset/train/labels` directories. 

In [13]:
if annotate:
    for image_path, annotations_path, _, superimposed_annotations_path in [train_paths, valid_paths]:
        superimposed_annotation_file_names = os.listdir(superimposed_annotations_path)
        image_file_names = os.listdir(image_path)
        # find the names of any files that do not exist in the superimposed annotations directory, but do exist in the images directory, and delete them from both the images and labels directories.
        superimposed_annotation_file_names_copy = superimposed_annotation_file_names.copy()
        count = 0
        deleted = []
        for image_file_name in image_file_names:
            # print(f"{image_file_name}")
            if f"{image_file_name.split('.')[0]}.jpg" in superimposed_annotation_file_names_copy:
                # print(image_file_name)
                superimposed_annotation_file_names_copy.remove(f"{image_file_name.split('.')[0]}.jpg")
                continue
            else:
                count += 1
                os.remove(f"{image_path}/{image_file_name}") 
                os.remove(f"{annotations_path}/{image_file_name[:-4]}.txt")
                deleted.append(image_file_name.split('.')[0])
        print(f"{count} images deleted from {image_path} and their corresponding label files from {annotations_path}.") 
        print(f'{len(os.listdir(image_path))} images remain in {image_path} and {len(os.listdir(annotations_path))} label files remain in {annotations_path}.')
        print(f'Deleted Files: {deleted}')

## Train the target model

**START TRAINING**: Finally, once the annotations are pristine, we can proceed with training our Target Model, YOLOv8. We'll take the pre-trained weights from the YOLOv8 model, [yolov8m.pt](/Users/eric/Desktop/2-Career/Projects/ObjectDetectionLL/yolov8m.pt), and fine-tune them on our labeled dataset. The training session will save the weights to a couple new files, [runs/detect/train/weights/best.pt](/Users/eric/Desktop/2-Career/Projects/ObjectDetectionLL/runs/detect/train/weights/best.pt), which is the best weights file, and [runs/detect/train/weights/last.pt](/Users/eric/Desktop/2-Career/Projects/ObjectDetectionLL/runs/detect/train/weights/last.pt), which is the most recent weights file. 

We'll use the [best.pt](/Users/eric/Desktop/2-Career/Projects/ObjectDetectionLL/runs/detect/train/weights/best.pt) weights file to demonstrate its inference capabilities on our test set, and we can use the [last.pt](/Users/eric/Desktop/2-Career/Projects/ObjectDetectionLL/runs/detect/train/weights/last.pt) weights file to resume training in place of yolov8m.pt if we want to make it more robust with additional epochs and/or data.

In [14]:
# See the proportion of images with no annotations (True Negatives)
for folder in ['train', 'valid']:
    image_dir = f'{project}/dataset/{folder}/images'
    image_filenames = os.listdir(image_dir)
    label_dir = f'{project}/dataset/{folder}/labels'
    label_filenames = os.listdir(label_dir)
    num_empty = 0
    for label_filename in label_filenames:
        with open(os.path.join(label_dir, label_filename), 'r') as file:
            contents = file.read()
            if not contents:
                num_empty += 1
    print(f'Percentage of True Negatives (Backgrounds without LPs) in {folder} set: {num_empty / len(label_filenames) * 100:.2f}% of {len(label_filenames)} images')

Percentage of True Negatives (Backgrounds without LPs) in train set: 0.00% of 295 images
Percentage of True Negatives (Backgrounds without LPs) in valid set: 0.00% of 95 images


In [15]:
from datetime import datetime
datetime_now = datetime.now().strftime("%Y.%m.%d_%H.%M.%S")
start_from_scratch = False

model_size = 'n'
epochs = 100
device = 'mps'
imgsz = 1080 # Check images in letterbox experiment to see how small we can go before the license plates become unrecognizable.
imgsz = int(32 * round(imgsz / 32)) # ensure imgsz is a multiple of 32
seed = None # set the seed for reproducibility
batch = None # higher increases training speed, but may reduce mAP accuracy. Yann LeCun recommends <=32.
save_dir = f'train_{project}_{datetime_now}_{model_size}_im{imgsz}_ep{epochs}_ba{batch}_se{seed}'

yml = YOLO_DATA_YAML_PATH
if start_from_scratch:
    target_model = YOLO(f"yolov8{model_size}.pt")
else:
    if project == 'dog_park':
        latest_train_dir = 'train_dogpark_2023.12.26_17.08.07_n_im640_ep100_baNone_seNone'
    elif project == 'license_plates':
        latest_train_dir = 'train_LP_2023.12.26_14.22.25_n_im640_ep20_baNone_seNone'
    elif project == 'hockey':
        latest_train_dir = 'train_hockey_2023.12.06_14.04.17_n_im640_ep1_baNone_seNone'
    else:
        latest_train_dir = input("Enter the name of the latest training directory: ")

    target_model = YOLO(f'{HOME}/runs/detect/{latest_train_dir}/weights/last.pt')

if seed is None:
    # target_model.train(data=yml, epochs=epochs, device=device, imgsz=imgsz, batch=batch, name=save_dir)
    target_model.train(data=yml, epochs=epochs, device=device, imgsz=imgsz, name=save_dir)
else:
    target_model.train(data=yml, epochs=epochs, device=device, imgsz=imgsz, seed=seed, batch=batch, name=save_dir)



New https://pypi.org/project/ultralytics/8.0.230 available 😃 Update with 'pip install -U ultralytics'
'has_mps' is deprecated, please use 'torch.backends.mps.is_built()'
Ultralytics YOLOv8.0.81 🚀 Python-3.10.12 torch-2.1.0 MPS
[34m[1myolo/engine/trainer: [0mtask=detect, mode=train, model=/Users/eric/Desktop/2-Career/Projects/ObjectDetection/runs/detect/train_dogpark_2023.12.26_17.08.07_n_im640_ep100_baNone_seNone/weights/last.pt, data=/Users/eric/Desktop/2-Career/Projects/ObjectDetection/dog_park/dataset/data.yaml, epochs=100, patience=50, batch=16, imgsz=1088, save=True, save_period=-1, cache=False, device=mps, workers=8, project=None, name=train_dog_park_2023.12.27_11.39.50_n_im1088_ep100_baNone_seNone, exist_ok=False, pretrained=False, optimizer=SGD, verbose=True, seed=0, deterministic=True, single_cls=False, image_weights=False, rect=False, cos_lr=False, close_mosaic=0, resume=False, amp=True, overlap_mask=True, mask_ratio=4, dropout=0.0, val=True, split=val, save_json=False, sa

In [None]:
train_folder = sorted(os.listdir(f"{HOME}/runs/detect/"))
max_number = 0
for file in train_folder:
    if 'train' in file:
        number = int(file.replace('train', '0'))
        if number > max_number:
            max_number = number
if max_number == 0:
    max_number = ''
latest_train_dir = f'train{max_number}'
latest_train_dir

ValueError: invalid literal for int() with base 10: 'dog_park_0_2023.12.26_12.50.03_n_im640_ep20_baNone_seNone'

In [None]:
!ls {HOME}/runs/detect/{latest_train_dir}/

## Evaluate target model

In [None]:
Image(filename=f'{HOME}/runs/detect/{latest_train_dir}/confusion_matrix.png', width=600)

FileNotFoundError: [Errno 2] No such file or directory: '/Users/eric/Desktop/2-Career/Projects/ObjectDetection/runs/detect/train_2023.12.05_17.00.11_n_im640_ep64_baNone_seNone/confusion_matrix.png'

In [None]:
Image(filename=f'{HOME}/runs/detect/{latest_train_dir}/results.png', width=600)

FileNotFoundError: [Errno 2] No such file or directory: '/Users/eric/Desktop/2-Career/Projects/ObjectDetection/runs/detect/train_2023.12.05_17.00.11_n_im640_ep64_baNone_seNone/results.png'

In [None]:
Image(filename=f'{HOME}/runs/detect/{latest_train_dir}/val_batch0_pred.jpg', width=600)


FileNotFoundError: [Errno 2] No such file or directory: '/Users/eric/Desktop/2-Career/Projects/ObjectDetection/runs/detect/train_2023.12.05_17.00.11_n_im640_ep64_baNone_seNone/val_batch0_pred.jpg'