# Train YOLO and a Segmentation Model for road markings on Images from self-generated Data

In [None]:
%pip install opencv-python ultralytics

In [None]:
import os
import numpy as np
import matplotlib.pyplot as plt
import cv2

In [None]:
def preprocess(image, dims=(512, 512)):
    """Rotate a frame by 90 degrees clockwise, center-crop it to a square and resize it.

    Args:
        image: Image array as accepted by ``cv2.rotate`` (rows x columns [x channels]).
        dims: Target size handed to ``cv2.resize``.

    Returns:
        The rotated, cropped and resized image.
    """
    rotated = cv2.rotate(image, cv2.ROTATE_90_CLOCKWISE)

    # The shorter side determines the edge length of the centered square crop.
    rows, cols = rotated.shape[0], rotated.shape[1]
    side = min(rows, cols)
    top = (rows - side) // 2
    left = (cols - side) // 2
    square = rotated[top:top + side, left:left + side]

    return cv2.resize(square, dims)

### Generate Dataset by splitting video into frames

In [None]:
# Safety prompt: everything below deletes the previously extracted frames.
input('Are you sure you want to regenerate the frames? Press Enter to continue...')
print()

videos_path = 'data/traffic-signs-and-road-markings'
video_names = ['video1.mp4', 'video2.mp4']
skip_frames = 20  # only every skip_frames-th frame is written to disk
output_path = 'data/traffic-signs-and-road-markings/frames'

if os.path.exists(output_path):
    print('Deleting existing frames...')
    for file_name in os.listdir(output_path):
        os.remove(os.path.join(output_path, file_name))
else:
    print('Creating output directory...')
    os.makedirs(output_path, exist_ok=True)

print('Extracting frames...')
for video_name in video_names:
    video_path = os.path.join(videos_path, video_name)
    capture = cv2.VideoCapture(video_path)
    if not capture.isOpened():
        # Report unreadable videos explicitly instead of silently saving zero frames.
        print(f'Could not open video {video_path}, skipping')
        print()
        capture.release()
        continue

    fps = capture.get(cv2.CAP_PROP_FPS)
    width = int(capture.get(cv2.CAP_PROP_FRAME_WIDTH))
    height = int(capture.get(cv2.CAP_PROP_FRAME_HEIGHT))

    print(f'Video: {video_name}')
    print(f'FPS: {fps}')
    print(f'Size: {width}x{height}')

    # Frame files are named <video name up to the first dot>_<running number>.jpg
    frame_prefix = video_name.split(".")[0]
    frame_count = 0
    saved_frames = 0
    try:
        while True:
            success, frame = capture.read()
            print(f'Saved frames: {saved_frames}', end='\r')
            if not success:
                break

            if frame_count % skip_frames == 0:
                saved_frames += 1
                frame_name = f'{frame_prefix}_{str(saved_frames).zfill(3)}.jpg'
                cv2.imwrite(os.path.join(output_path, frame_name), preprocess(frame))

            frame_count += 1
    finally:
        # Always free the decoder handle, even if preprocessing or writing fails.
        capture.release()

    print(f'Successfully saved {saved_frames} frames from video {video_name} ({frame_count} frames in total)')
    print()



### Load data

In [None]:
import json
import re

dataset_root = 'data/traffic-signs-and-road-markings'
dataset_path = 'data/traffic-signs-and-road-markings/sr-20-annotated'
images_path = os.path.join(dataset_path, 'images')


def _read_dataset_lines(file_name, **open_options):
    """Return the lines of a text file inside dataset_path (without line breaks)."""
    with open(os.path.join(dataset_path, file_name), 'r', **open_options) as file:
        return file.read().splitlines()


def _read_dataset_json(file_name):
    """Parse a JSON file inside dataset_path."""
    with open(os.path.join(dataset_path, file_name), 'r') as file:
        return json.load(file)


# One class name per line; the German files provide the display names used in plots.
localization_classes = _read_dataset_lines('localization-classes.txt')
segmentation_classes = _read_dataset_lines('segmentation-classes.txt')
localization_classes_german = _read_dataset_lines('localization-classes-german.txt', encoding='utf-8')
segmentation_classes_german = _read_dataset_lines('segmentation-classes-german.txt', encoding='utf-8')

print('Localization classes:', localization_classes)
print('Segmentation classes:', segmentation_classes)

localization_data_list = _read_dataset_json('localization.json')
segmentation_data_list = _read_dataset_json('segmentation.json')

# Index the annotation entries by image file name: the part of the stored image
# reference after the last '%5C' (a URL-encoded backslash).
localization_data = {
    item['data']['image'].split('%5C')[-1]: item for item in localization_data_list
}
segmentation_data = {
    item['data']['image'].split('%5C')[-1]: item for item in segmentation_data_list
}

# Size every dataset image is expected to have.
image_width = 512
image_height = 512
             
# Build one record per image: pixel data, YOLO-style bounding boxes and one binary
# mask per segmentation class.
data = []

for image_name in os.listdir(images_path):
    image_path = os.path.join(images_path, image_name)
    image = plt.imread(image_path)

    if image.shape[0] != image_height or image.shape[1] != image_width:
        print(f'Invalid image size: {image.shape} (expected: {image_height}x{image_width})')
        continue

    localization = localization_data[image_name]
    bounding_boxes = []
    for annotation in localization['annotations']:
        result = annotation['result']
        if not result:
            continue

        for entry in result:
            value = entry['value']
            label = value['rectanglelabels'][0]
            if label not in localization_classes:
                print(f'Unknown localization class: {label}')
                continue

            class_index = localization_classes.index(label)
            # Rectangle values are divided by 100 (treated as percentages) and turned
            # from top-left + size into center + size.
            x_center = (value['x'] + value['width'] / 2) / 100
            y_center = (value['y'] + value['height'] / 2) / 100
            width = value['width'] / 100
            height = value['height'] / 100
            bounding_boxes.append([class_index, x_center, y_center, width, height])

    # create binary mask for each segmentation class from the polygon data
    segmentation = segmentation_data[image_name]
    # Array shape is (classes, rows, columns) = (classes, height, width); the previous
    # (width, height) order only worked because both are 512.
    masks = np.zeros((len(segmentation_classes), image_height, image_width), dtype=np.uint8)
    for annotation in segmentation['annotations']:
        result = annotation['result']
        if not result:
            continue

        for entry in result:
            value = entry['value']
            label = value['polygonlabels'][0]
            if label not in segmentation_classes:
                print(f'Unknown segmentation class: {label}')
                continue

            class_index = segmentation_classes.index(label)
            points = value['points']
            # points are given as percentages of the image size
            polygon = np.array([[int(p[0] / 100 * image_width), int(p[1] / 100 * image_height)] for p in points], dtype=np.int32)
            cv2.fillPoly(masks[class_index], [polygon], 1)

    data.append({
        'name': image_name,
        'image': image,
        'bounding_boxes': bounding_boxes,
        'segmentation_masks': masks
    })

# Natural sort by file name (digit runs compare numerically) so the frames end up in
# order despite the inconsistent naming used when the data was first labelled.
data.sort(key=lambda x: [int(n) if n.isdigit() else n for n in re.split(r'(\d+)', x['name'])])

print([d['name'] for d in data])
print(f'Successfully loaded {len(data)} data entries')


### Visualization

In [None]:
from matplotlib.patches import Patch

# RGB triples (components between 0 and 1) used by the plotting helpers in this cell:
# show_overlayed_masks indexes the list by mask position, show_bounding_boxes by class index.
colors = [
    [0.1, 0.2, 0.5],
    [0.8, 0.1, 0.1],
    [0.1, 0.6, 0.1],
    [0.6, 0.1, 0.6],
    [0.1, 0.6, 0.6],
]

def show_overlayed_masks(ax: plt.Axes, masks: list[np.ndarray], alpha: float = 0.3, legend_fontsize: int = 6, legend_title_fontsize: int = 8):
    """Draw every mask as a colored overlay on ``ax`` and add a legend to that axes.

    Besides the given masks an extra "surrounding" mask is drawn that covers all
    pixels not claimed by any of the masks.

    Args:
        ax: Axes to draw on; the legend is attached to this axes as well.
        masks: Sequence of 2D masks of identical shape. Mask values are used as the
            per-pixel opacity of the overlay.
        alpha: Global opacity passed to ``imshow`` for every overlay.
        legend_fontsize: Font size of the legend entries.
        legend_title_fontsize: Font size of the legend title.
    """
    rows, cols = masks[0].shape
    surrounding_mask = np.ones((rows, cols), dtype=np.uint8)
    for mask in masks:
        # Threshold at 0.5 so that probability masks (e.g. sigmoid outputs, which are
        # always > 0) do not wipe out the whole surrounding; for binary 0/1 masks this
        # is identical to testing "> 0".
        surrounding_mask[mask > 0.5] = 0

    display_masks = [*masks, surrounding_mask]
    for i, mask in enumerate(display_masks):
        overlay = np.zeros((mask.shape[0], mask.shape[1], 4), dtype=np.float32)
        # Wrap around the palette instead of failing when there are more masks than colors.
        overlay[..., :3] = colors[i % len(colors)]
        overlay[..., 3] = mask
        ax.imshow(overlay, alpha=alpha)

    legend_labels = [*segmentation_classes_german, 'Umgebung']
    legend_elements = [
        Patch(facecolor=[*colors[i % len(colors)], 0.6], label=legend_labels[i])
        for i in range(len(segmentation_classes) + 1)
    ]

    # Use ax.legend instead of plt.legend: the latter targets the *current* axes,
    # which is not necessarily `ax` when several subplots exist.
    ax.legend(handles=legend_elements, loc='upper right', fontsize=legend_fontsize, title='Segmentierung', title_fontsize=legend_title_fontsize)

def show_bounding_boxes(ax: plt.Axes, bounding_boxes: list[list[float]], linewidth: float = 1, fontsize: int = 6):
    """Draw labelled bounding boxes on ``ax``.

    Args:
        ax: Axes to draw on.
        bounding_boxes: Boxes as ``[class_index, x_center, y_center, width, height]``
            with coordinates relative to the image size (scaled here by the
            module-level ``image_width`` / ``image_height``).
        linewidth: Line width of the rectangle outline.
        fontsize: Font size of the class label.
    """
    for class_index, x_center, y_center, width, height in bounding_boxes:
        # Wrap around the palette so more classes than colors do not raise IndexError.
        color = colors[class_index % len(colors)]
        x = (x_center - width / 2) * image_width
        y = (y_center - height / 2) * image_height
        rect = plt.Rectangle((x, y), width * image_width, height * image_height, linewidth=linewidth, edgecolor=color, facecolor='none')
        ax.add_patch(rect)
        # Label sits slightly above the top-left corner on a darkened version of the box color.
        ax.text(x + 5, y - 10, localization_classes_german[class_index], color='white', backgroundcolor=[c * 0.5 for c in color], fontsize=fontsize)

def save_data_as_annotated_frames(data: list[dict], folder: str = 'frames', overwrite: bool = False, show_masks: bool = True, show_boxes: bool = True):
    """Render every data entry with its annotations and save it as ``media/<folder>/frame_<i>.png``.

    Args:
        data: Entries with the keys ``'image'``, ``'segmentation_masks'`` and ``'bounding_boxes'``.
        folder: Name of the target folder below ``media``.
        overwrite: If True, files in an existing target folder are deleted first.
        show_masks: Overlay the segmentation masks.
        show_boxes: Draw the bounding boxes.

    Raises:
        FileExistsError: If the target folder exists and ``overwrite`` is False.
    """
    folder_path = os.path.join('media', folder)
    if os.path.exists(folder_path):
        if not overwrite:
            raise FileExistsError(f'Folder {folder_path} already exists')

        for file_name in os.listdir(folder_path):
            os.remove(os.path.join(folder_path, file_name))
    else:
        os.makedirs(folder_path, exist_ok=True)

    for i, entry in enumerate(data):
        fig, ax = plt.subplots()
        ax.imshow(entry['image'])
        ax.axis('off')
        if show_masks:
            show_overlayed_masks(ax, entry['segmentation_masks'])
        if show_boxes:
            show_bounding_boxes(ax, entry['bounding_boxes'])
        frame_path = os.path.join(folder_path, f'frame_{i}.png')
        # Save and close exactly this figure instead of relying on the implicit
        # "current figure", which a helper could have changed.
        fig.savefig(frame_path, bbox_inches='tight', pad_inches=0)
        plt.close(fig)
        # i is zero-based; report the number of frames finished so far.
        print(f'Saved frame {i + 1}/{len(data)}', end='\r')

    print(f'Successfully saved {len(data)} frames')

def join_frames_to_video(source_path = 'media/frames', target_path = 'media', video_name = 'video', fps = 2, dims = (512, 512)):
    """Join the image files of a folder into ``<target_path>/<video_name>.mp4``.

    Files are processed in natural order (``frame_2`` before ``frame_10``); files
    that OpenCV cannot read as images are skipped.

    Args:
        source_path: Folder containing the frame images.
        target_path: Folder the video is written to.
        video_name: File name of the video without extension.
        fps: Frame rate of the resulting video.
        dims: Frame size as ``(width, height)``; differently sized frames are resized.
    """
    import re  # local import so the helper does not depend on another cell's imports

    def natural_key(name):
        # re.split with a capture group alternates text / digit runs, so every odd
        # position is a number and can be compared numerically.
        parts = re.split(r'(\d+)', name)
        return [int(part) if index % 2 else part for index, part in enumerate(parts)]

    # os.listdir returns entries in arbitrary order; sort them so the frames keep
    # their sequence in the video.
    files = sorted(os.listdir(source_path), key=natural_key)

    fourcc = cv2.VideoWriter_fourcc(*'mp4v')
    video_path = os.path.join(target_path, f'{video_name}.mp4')
    out = cv2.VideoWriter(video_path, fourcc, fps, dims, True)

    written_frames = 0
    try:
        for i, file in enumerate(files):
            img = cv2.imread(os.path.join(source_path, file))
            if img is None:
                continue

            # img.shape is (height, width, channels) while dims is (width, height).
            if img.shape[1] != dims[0] or img.shape[0] != dims[1]:
                img = cv2.resize(img, dims)

            out.write(img)
            written_frames += 1
            print(f'Processed frame {i}/{len(files)}', end='\r')
    finally:
        # Finalize the container even if reading or resizing a frame fails.
        out.release()

    print(f'Video with {written_frames} frames saved successfully')

def annotate_frame_with_yolo(model: any, frame: any, dims: tuple[int] = (512, 512)):
    """Run ``model`` on the preprocessed frame and return its plotted prediction.

    The plotted image is passed through ``cv2.COLOR_BGR2RGB`` before it is returned.
    """
    prepared_frame = preprocess(frame, dims)
    predictions = model.predict(source=prepared_frame, save=False)
    plotted = predictions[0].plot()
    return cv2.cvtColor(plotted, cv2.COLOR_BGR2RGB)

def annotate_plot_with_segmentation_models(models: list[any], ax: plt.axes, image: any, dims: tuple[int] = (512, 512), **kwargs):
    """Overlay the masks predicted by every model in ``models`` on ``ax``.

    Each model predicts on a batch containing only ``image``; the channels of its
    output are split into separate masks, resized to ``dims`` and collected in
    model order. Extra keyword arguments are forwarded to ``show_overlayed_masks``.
    """
    batch = np.expand_dims(image, axis=0)
    collected_masks = []
    for segmentation_model in models:
        prediction = segmentation_model.predict(batch, verbose=0)[0]
        # Move the channel axis to the front so iterating yields one mask per channel.
        channels_first = np.transpose(prediction, (2, 0, 1))
        for channel_mask in channels_first:
            collected_masks.append(cv2.resize(channel_mask, dims))

    show_overlayed_masks(ax, collected_masks, **kwargs)
    
def annotate_video(input_path: str, output_path: str, process_frame: callable, dims: tuple[int] = (512, 512), sr: int = 1, max_frames = None):
    """Apply ``process_frame`` to every ``sr``-th frame of a video and write the results to a new video.

    Args:
        input_path: Path of the source video.
        output_path: Path of the video to write (mp4v codec).
        process_frame: Callable that receives a decoded frame and returns the frame to write.
        dims: Frame size passed to the video writer.
        sr: Sampling rate; only every ``sr``-th frame is processed. The output frame
            rate is the input frame rate divided by ``sr``.
        max_frames: Optional upper bound on the number of frames written.

    Raises:
        OSError: If the input path does not exist or the video cannot be opened.
    """
    if not os.path.exists(input_path):
        raise OSError('Input path does not exist')
    capture = cv2.VideoCapture(input_path)
    if not capture.isOpened():
        capture.release()
        raise OSError('Input video could not be opened')

    fps = capture.get(cv2.CAP_PROP_FPS)
    width = int(capture.get(cv2.CAP_PROP_FRAME_WIDTH))
    height = int(capture.get(cv2.CAP_PROP_FRAME_HEIGHT))

    print('Annotating video')
    print(f'FPS: {fps}')
    print(f'Size: {width}x{height}')

    fourcc = cv2.VideoWriter_fourcc(*'mp4v')
    # Keep the fractional frame rate (e.g. 29.97) instead of truncating it, so the
    # playback speed of the output matches the input.
    out = cv2.VideoWriter(output_path, fourcc, fps / sr, dims, True)

    frame_count = 0
    saved_frames = 0
    try:
        while True:
            success, frame = capture.read()
            if not success:
                break

            if frame_count % sr == 0:
                saved_frames += 1
                out.write(process_frame(frame))
                print(f'Processed frames: {saved_frames}', end='\r')

            frame_count += 1

            if max_frames is not None and saved_frames >= max_frames:
                break
    finally:
        # Release both handles even if process_frame raises, so the input is
        # unlocked and the output file is finalized.
        capture.release()
        out.release()

    print('Successfully generated video')

In [None]:
# Preview a single sample: the image with its segmentation masks overlaid.
entry = data[75]
fig, ax = plt.subplots()
ax.imshow(entry['image'])
ax.axis('off')
show_overlayed_masks(ax, entry['segmentation_masks'])
# show_bounding_boxes(ax, entry['bounding_boxes'])
# plt.savefig('media/Segmentierung_Beispiel.png', bbox_inches='tight', pad_inches=0)
plt.show()

In [None]:
# Show a random n x m excerpt of the dataset with the segmentation overlays.
n, m = 2, 3
fig, ax = plt.subplots(n, m, figsize=(m * 3, n * 3))
# Draw distinct random indices instead of shuffling `data` in place: the list is
# shared notebook state, and other cells index into it by fixed position.
sample_indices = np.random.choice(len(data), size=n * m, replace=False)
for x in range(n):
    for y in range(m):
        ax[x, y].axis('off')
        entry = data[sample_indices[x * m + y]]
        ax[x, y].imshow(entry['image'])
        # show_bounding_boxes(ax[x, y], entry['bounding_boxes'], linewidth=2, fontsize=8)
        show_overlayed_masks(ax[x, y], entry['segmentation_masks'], legend_fontsize=10, legend_title_fontsize=12)

plt.subplots_adjust(wspace=0.5, hspace=0.5)
plt.tight_layout()
# plt.savefig('media/Auszug_Eigenes_Datenset_Segmentierung.png', bbox_inches='tight')
plt.show()

In [None]:
entry = data[75]
# 3 images side by side, original, mask 0, mask 1
# NOTE(review): the titles label mask 0 as road and mask 1 as traffic sign; this
# presumably mirrors the order in segmentation-classes.txt - confirm.
fig, axs = plt.subplots(1, 3, figsize=(15, 5))
axs[0].imshow(entry['image'])
axs[0].axis('off')
axs[0].set_title('Original')
axs[1].imshow(entry['segmentation_masks'][0], cmap='viridis')
axs[1].set_title('Maske für "Straße"')
axs[1].axis('off')
axs[2].imshow(entry['segmentation_masks'][1], cmap='viridis')
axs[2].set_title('Maske für "Straßenschild"')
axs[2].axis('off')
# plt.savefig('media/Segmentierungsmasken.png', bbox_inches='tight')
plt.show()

### Fine-tune Pretrained YOLO Model
https://docs.ultralytics.com/de/usage/python/\
https://pytorch.org/hub/ultralytics_yolov5/

In [None]:
# Rebinds dataset_path to the 'yolo-format' folder below dataset_root; the YOLO cells
# that follow build their paths from it.
dataset_path = os.path.join(dataset_root, 'yolo-format')

Convert the dataset exported from Label Studio to YOLO format

In [None]:
import shutil
images_path = os.path.join(dataset_path, 'images')
labels_path = os.path.join(dataset_path, 'labels')

# split data into val and train
train_val_split = 0.8
images_names = os.listdir(images_path)
np.random.shuffle(images_names)
split_index = int(len(images_names) * train_val_split)
train_images_names = images_names[:split_index]
val_images_names = images_names[split_index:]

# Rebuild the train/ and val/ folders from scratch so that files from an earlier
# split cannot leak into the new one.
for subset_name, subset_images_names in (('train', train_images_names), ('val', val_images_names)):
    subset_path = os.path.join(dataset_path, subset_name)
    if os.path.exists(subset_path):
        shutil.rmtree(subset_path)

    subset_images_path = os.path.join(subset_path, 'images')
    subset_labels_path = os.path.join(subset_path, 'labels')
    os.makedirs(subset_images_path, exist_ok=True)
    os.makedirs(subset_labels_path, exist_ok=True)

    for image_name in subset_images_names:
        shutil.copy(os.path.join(images_path, image_name), subset_images_path)
        # Derive the label file from the image stem; str.replace('.jpg', '.txt') would
        # miss other extensions and also rewrite a '.jpg' in the middle of a name.
        label_name = os.path.splitext(image_name)[0] + '.txt'
        shutil.copy(os.path.join(labels_path, label_name), subset_labels_path)

# generate config.yaml
names_string = ','.join([f'"{class_name}"' for class_name in localization_classes])
dataset_yaml = f"""
train: ./train/images
val: ./val/images

nc: {len(localization_classes)}
names: [{names_string}]
"""

# Write UTF-8 explicitly so non-ASCII class names do not depend on the platform's
# default encoding.
with open(os.path.join(dataset_path, 'config.yaml'), 'w', encoding='utf-8') as file:
    file.write(dataset_yaml)

print('Successfully split data into train and val datasets and generated config.yaml')

In [None]:
from ultralytics import YOLO

# Load the checkpoint 'models/yolo11m.pt' and train it for 200 epochs on the dataset
# described by config.yaml inside dataset_path.
model = YOLO('models/yolo11m.pt')
results = model.train(data=os.path.join(dataset_path, 'config.yaml'), epochs=200)
# plt.show()

Load model

In [None]:
from ultralytics import YOLO

# Load a stored checkpoint and validate it on the dataset described by config.yaml.
# NOTE(review): this cell loads 'yolo5su_fine_tuned.pt' while the segmentation video
# cell loads 'yolov5su_fine_tuned.pt', and the training cell starts from yolo11m -
# confirm which checkpoint file name is the intended one.
model = YOLO('models/yolo5su_fine_tuned.pt')
results = model.val(data=os.path.join(dataset_path, 'config.yaml'))


Show results

In [None]:
# Plot the model's predictions for the first five entries that os.listdir returns
# for the images folder.
data_path = os.path.join(dataset_path, 'images')
for image_name in os.listdir(data_path)[:5]:
    results = model.predict(source=os.path.join(data_path, image_name))
    image = results[0].plot()
    # The plotted image is converted from BGR to RGB before matplotlib displays it.
    image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
    plt.imshow(image)
    plt.axis('off')
    plt.show()

Annotate Video with YOLO labels

In [None]:
video_name = 'video.mp4'
video_path = os.path.join(dataset_root, video_name)
output_path = os.path.join(dataset_root, f'{video_name.split(".")[0]}_yolo.mp4')

def process_frame(frame):
    """Return the YOLO-annotated frame in the BGR channel order the video writer expects."""
    annotated_rgb = annotate_frame_with_yolo(model, frame, (512, 512))
    # annotate_frame_with_yolo converts its result to RGB (for matplotlib), but
    # cv2.VideoWriter interprets frames as BGR; convert back so red and blue are
    # not swapped in the written video.
    return cv2.cvtColor(annotated_rgb, cv2.COLOR_RGB2BGR)

annotate_video(video_path, output_path, process_frame)



### Semantic segmentation

Prepare data

In [None]:
from tensorflow.keras.preprocessing.image import ImageDataGenerator

# Edge length (in pixels) that images and masks are resized to for the segmentation model.
img_size = 128
train_val_split = 0.8
split_index = int(len(data) * train_val_split)
# Shuffles the shared `data` list in place before it is cut into train and validation parts.
np.random.shuffle(data)
train_data = data[:split_index]
val_data = data[split_index:]

def images_from_data(data: list[dict]) -> np.ndarray:
    """Stack the images of all entries, resized to img_size x img_size and divided by 255."""
    target_size = (img_size, img_size)
    resized_images = [cv2.resize(sample['image'], target_size) for sample in data]
    return np.array(resized_images) / 255.0

def masks_from_data(data: list[dict], channel: int) -> np.ndarray:
    """Stack one segmentation mask per entry, resized to img_size x img_size.

    Args:
        data: Entries with a ``'segmentation_masks'`` array indexed by class.
        channel: Index of the segmentation class whose mask is extracted.

    Returns:
        Array with a trailing channel axis of size 1.
    """
    target_size = (img_size, img_size)
    # Masks hold class labels, not intensities: nearest-neighbour resizing keeps the
    # values exactly binary instead of blending them at object borders.
    resized_masks = [
        cv2.resize(entry['segmentation_masks'][channel], target_size, interpolation=cv2.INTER_NEAREST)
        for entry in data
    ]
    return np.expand_dims(np.array(resized_masks), axis=-1)

# Data augmentation configuration: images and masks use the very same settings so
# that identically seeded generators apply matching transformations to both.
_augmentation_settings = {
    'rotation_range': 40,
    'width_shift_range': 0.2,
    'height_shift_range': 0.2,
    'shear_range': 0.2,
    'zoom_range': 0.2,
    'horizontal_flip': True,
    'fill_mode': 'nearest',
}

image_datagen = ImageDataGenerator(**_augmentation_settings)
mask_datagen = ImageDataGenerator(**_augmentation_settings)

# Endless generator of augmented (image batch, mask batch) pairs
def augment_data(image_batch, mask_batch):
    """Yield augmented image/mask batches of size 16 forever.

    Both flows are created with the same seed (42); each iteration takes the next
    image batch first and then the next mask batch.
    """
    flow_options = {'batch_size': 16, 'seed': 42}
    image_flow = image_datagen.flow(image_batch, **flow_options)
    mask_flow = mask_datagen.flow(mask_batch, **flow_options)

    while True:
        augmented_images = next(image_flow)
        augmented_masks = next(mask_flow)
        yield augmented_images, augmented_masks

# Convert the train and validation data to numpy arrays: images plus the masks of
# channel 0 (named "road") and channel 1 (named "sign").
train_images, train_road_masks, train_sign_masks = images_from_data(train_data), masks_from_data(train_data, 0), masks_from_data(train_data, 1)
val_images, val_road_masks, val_sign_masks = images_from_data(val_data), masks_from_data(val_data, 0), masks_from_data(val_data, 1)

# Now you can use this augmented data generator during training
train_road_generator = augment_data(train_images, train_road_masks)
train_sign_generator = augment_data(train_images, train_sign_masks)
# The loop runs exactly once; raise the range to preview more augmented batches.
for i in range(1):
    # Sample: Visualize some augmented images
    sample_img, sample_mask = next(train_road_generator)

    # Displaying one image and its corresponding mask
    plt.subplot(1, 2, 1)
    plt.imshow(sample_img[0])
    plt.axis('off')
    plt.title('Augmented Image')

    plt.subplot(1, 2, 2)
    plt.imshow(sample_mask[0, :, :, 0], cmap='gray')
    plt.axis('off')
    plt.title('Augmented Mask')

    plt.show()

Build model\
*The augmentation from above ended up unused because it did not improve accuracy.*\
*The learning curves are extremely unstable because there is not enough data.*

In [None]:
import tensorflow as tf

# Weight of background pixels (y_true == 0); traffic-sign pixels (y_true == 1) are
# weighted with 1 - class_weight.
class_weight = 0.05
# Empties the global custom-object registry before the loss is registered -
# presumably so that re-running this cell does not clash with an earlier registration.
tf.keras.saving.get_custom_objects().clear()

@tf.keras.saving.register_keras_serializable()
def weighted_binary_crossentropy(y_true, y_pred):
    """Binary cross-entropy in which positive pixels count more than background pixels.

    Args:
        y_true: Ground-truth mask with values 0 or 1 (any numeric dtype).
        y_pred: Predicted probabilities.

    Returns:
        The mean of the per-pixel cross-entropy multiplied by the per-pixel weight.
    """
    # The masks are stored as integers; cast them so the float arithmetic below
    # does not depend on the framework casting the targets beforehand.
    y_true = tf.cast(y_true, y_pred.dtype)
    weights = y_true * (1 - class_weight) + (1 - y_true) * class_weight
    return tf.keras.backend.mean(tf.keras.backend.binary_crossentropy(y_true, y_pred) * weights)

In [None]:
import tensorflow as tf
from tensorflow.keras import layers, models
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.preprocessing.image import ImageDataGenerator

# Small convolutional encoder/decoder that predicts one sigmoid mask per input image.
model = models.Sequential([
    # Encoder: two convolution + pooling stages, each followed by dropout
    layers.Input(shape=(img_size, img_size, 3)),
    layers.Conv2D(64, (3, 3), activation='relu', padding='same'),
    layers.MaxPooling2D((2, 2)),
    layers.Dropout(0.25),
    layers.Conv2D(128, (3, 3), activation='relu', padding='same'),
    layers.MaxPooling2D((2, 2)),
    layers.Dropout(0.25),

    # Decoder: two stride-2 transposed convolutions undo the two pooling steps
    layers.Conv2DTranspose(128, (3, 3), strides=(2, 2), padding='same'),
    layers.Conv2D(64, (3, 3), activation='relu', padding='same'),
    layers.Dropout(0.25),
    layers.Conv2DTranspose(64, (3, 3), strides=(2, 2), padding='same'),

    # Output layer: 1x1 convolution with one sigmoid channel
    layers.Conv2D(1, (1, 1), activation='sigmoid', padding='same'),
])

model.compile(optimizer=Adam(), loss=weighted_binary_crossentropy, metrics=['accuracy'])
history = model.fit(
    train_images,
    train_sign_masks,
    epochs=150,
    validation_data=(val_images, val_sign_masks),
)

Plot Training Results

In [None]:
# Plot training and validation accuracy per epoch from the recorded fit history.
plt.plot(history.history['accuracy'], color='red', label='Training')
plt.plot(history.history['val_accuracy'], color='blue', label='Validierung')
plt.legend()
plt.xlabel('Epochen')
# plt.ylim(0)
# plt.gca().xaxis.set_major_locator(matplotlib.ticker.MaxNLocator(integer=True))
# plt.savefig('media/Segmentierung_Straßenschild_Training_Diagramm.png')
plt.ylabel('Genauigkeit')

Show Segmentation Results

In [None]:
# Predict the mask for one validation image and show it as overlay and as grayscale image.
image = val_images[11]
predicted_masks = model.predict(np.expand_dims(image, axis=0))[0]
# Move the channel axis to the front so that predicted_masks[0] is a 2D mask.
predicted_masks = np.transpose(predicted_masks, (2, 0, 1))
fig, ax = plt.subplots()
ax.imshow(image)
show_overlayed_masks(ax, predicted_masks, alpha=1)
ax.axis('off')
plt.show()
plt.imshow(predicted_masks[0], cmap='gray')
# plt.show()
# for row in predicted_masks[0]:
#     print(' '.join([str(round(num, 2)) for num in row]))

# for row in val_masks[0, :, :]:
#     print(' '.join([str(round(num, 2)) for num in row]))

Annotate Video with Segmentations

In [None]:
from tensorflow.keras.saving import load_model
from ultralytics import YOLO

road_model = load_model('models/road_segmentation_97_val_acc.keras')
# NOTE(review): custom_objects is keyed by 'loss' rather than by the function name
# 'weighted_binary_crossentropy' - confirm the key that the saved model expects.
sign_model = load_model('models/sign_segmentation_97_val_acc.keras', custom_objects={'loss': weighted_binary_crossentropy})
yolo = YOLO('models/yolov5su_fine_tuned.pt')

video_name = 'video1.mp4'
video_path = os.path.join(dataset_root, video_name)
output_path = os.path.join(dataset_root, f'{video_name.split(".")[0]}_segmentations.mp4')

# capture = cv2.VideoCapture(video_path)
# for i in range(1):
#     dims = (512, 512)
#     fig, ax = plt.subplots()
#     image = preprocess(frame, dims)
#     image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
#     ax.imshow(image)
#     annotate_plot_with_segmentation_models([road_model, sign_model], ax, cv2.resize(image, sign_model.layers[0].output.shape[1:3]) / 255, alpha=0.7)
#     ax.axis('off')
#     plt.savefig('images/temp_frame.png', bbox_inches='tight', pad_inches=0)
#     cv2.cvtColor(cv2.resize(plt.imread('images/temp_frame.png'), dims), cv2.COLOR_BGR2RGB)

# The rendered frame is exchanged through a temporary PNG; make sure its folder
# exists, otherwise the very first savefig call fails.
os.makedirs('images', exist_ok=True)

def process_frame(frame):
    """Render YOLO boxes and segmentation overlays for one frame and return it for the video writer.

    The figure is saved to 'images/temp_frame.png' and read back with cv2.imread,
    then resized to 512x512.
    """
    dims = (512, 512)
    fig, ax = plt.subplots()
    image = annotate_frame_with_yolo(yolo, frame, dims)
    ax.imshow(image)
    # The segmentation models get the preprocessed frame converted to RGB, resized to
    # the size read from the sign model's first layer and divided by 255.
    annotate_plot_with_segmentation_models([road_model, sign_model], ax, cv2.resize(cv2.cvtColor(preprocess(frame, dims), cv2.COLOR_BGR2RGB), sign_model.layers[0].output.shape[1:3]) / 255, alpha=0.5)
    ax.axis('off')
    # Save and close exactly this figure so no figures pile up over a long video.
    fig.savefig('images/temp_frame.png', bbox_inches='tight', pad_inches=0)
    plt.close(fig)
    return cv2.resize(cv2.imread('images/temp_frame.png'), dims)

annotate_video(video_path, output_path, process_frame)
    