<a href="https://colab.research.google.com/github/ericae9/Autonomous-Vehicle-Object-Detection/blob/main/CV_Data_Processing.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

This notebook processes the [Berkeley DeepDrive Dataset](https://arxiv.org/abs/1805.04687): it splits the dataset into training, validation, and test sets and formats the labels for the [PyTorch Faster R-CNN](http://pytorch.org/vision/stable/models.html#object-detection-instance-segmentation-and-person-keypoint-detection) and [YOLOv4](https://github.com/AlexeyAB/darknet) models.

In [None]:
import glob
import pickle
import random
import torch
import json
from PIL import Image
import os
import shutil
from shutil import copyfile
import time

Mount Google Drive

In [None]:
# Mount Google Drive at /content/drive; every dataset path used in this
# notebook lives under /content/drive/MyDrive/.
from google.colab import drive
drive.mount('/content/drive')

## Split the data into train, validation, and test

In [None]:
# Edit the file path below to go to the location of the images.
# Keep the trailing '/': the next cell builds its glob pattern by appending
# '*' directly to this string.
image_file_path = '/content/drive/MyDrive/CV_Project/images/'

In [None]:
# Collect every entry in the image folder. glob returns paths in arbitrary
# order, so sort them to give the shuffle a stable starting point.
all_images = sorted(glob.iglob(image_file_path + '*'))
# Shuffle with a dedicated, seeded generator so that Restart & Run All always
# reproduces the same train/val/test split; the module-level random state is
# left untouched.
split_seed = 42
random.Random(split_seed).shuffle(all_images)
# 70% train, 10% validation, and the remainder (about 20%) test.
num_training_images = int(len(all_images) * 0.7)
num_val_images = int(len(all_images) * 0.1)
num_train_and_val_images = num_training_images + num_val_images
training_images = all_images[:num_training_images]
val_images = all_images[num_training_images:num_train_and_val_images]
test_images = all_images[num_train_and_val_images:]

## Move images into subfolders to help prevent a Google Drive Timeout Error, which can occur when you try to open a file, in this case an image, in a folder that contains many files

In [None]:
def create_image_folders(image_dir, images, data_split):
    """
    Copies the given images into numbered subfolders, placing at most 1000
    images in each subfolder.

    Subfolder names are formed by appending a running number to image_dir
    (image_dir + '0', image_dir + '1', ...). Subfolder 0 is always created,
    even when images is empty. Missing parent folders are created and
    existing subfolders are reused, so the function can be re-run safely.

    Args:
      image_dir: Path prefix of the subfolders that will contain the images.
      images: List of current locations of the images to copy.
      data_split: train, val, or test, depending on what dataset the given
      images are in.

    Returns:
      Dictionary where the key is the original image location and the value
      is a tuple of the form (subfolder the image was copied into, ending
      in '/', data_split).
    """
    images_per_folder = 1000
    image_locations = dict()
    # The first subfolder exists even if there is nothing to copy.
    os.makedirs(image_dir + '0', exist_ok=True)
    for image_num, image in enumerate(images):
        # Integer division picks the subfolder without float rounding.
        cur_dir_num = image_num // images_per_folder
        if image_num % images_per_folder == 0:
            # First image of a new bucket: make sure its subfolder exists.
            os.makedirs(image_dir + str(cur_dir_num), exist_ok=True)
        new_dir = image_dir + str(cur_dir_num) + '/'
        image_locations[image] = (new_dir, data_split)
        copyfile(image, new_dir + os.path.basename(image))
    return image_locations

In [None]:
# Edit the file paths below to go to the folders the train, validation, and test
# images should be saved in. Each entry is (subfolder prefix, images, split).
split_copy_jobs = (
    ('/content/drive/MyDrive/CV_Project/training_images/train', training_images, 'train'),
    ('/content/drive/MyDrive/CV_Project/val_images/val', val_images, 'val'),
    ('/content/drive/MyDrive/CV_Project/test_images/test', test_images, 'test'),
)
# Copy each split in turn and merge the per-split location mappings.
all_image_locations = dict()
for split_prefix, split_images, split_name in split_copy_jobs:
    all_image_locations.update(create_image_folders(split_prefix, split_images, split_name))

## Get a list of object classes (categories)

In [None]:
# Category name of every bounding box seen so far (duplicates included).
categories = []
# Edit the file path below to go to the original labels file for the training set.
with open('/content/drive/MyDrive/CV_Project/det_train.json', 'rb') as original_train_labels_file:
    original_train_labels = json.load(original_train_labels_file)
# Entries without a 'labels' key have no boxes and contribute nothing.
for image_obj in original_train_labels:
    if 'labels' in image_obj:
        categories.extend(bounding_box['category'] for bounding_box in image_obj['labels'])

In [None]:
# Edit the file path below to go to the original labels file for the validation set.
with open('/content/drive/MyDrive/CV_Project/det_val.json', 'rb') as original_val_labels_file:
    original_val_labels = json.load(original_val_labels_file)
# Append the validation categories to the list started by the training cell.
for image_obj in original_val_labels:
    if 'labels' in image_obj:
        categories.extend(bounding_box['category'] for bounding_box in image_obj['labels'])

In [None]:
# Display the distinct category names found in the training and validation
# labels; the label_to_index mappings below are looked up by these names.
set(categories)

## Format the labels for Faster R-CNN

### Create a dictionary with class label as the key and index as the value

In [None]:
# Class names in index order for Faster R-CNN; index 0 is 'background'.
# NOTE: the YOLO section further down rebinds label_to_index to a mapping
# without 'background', so re-run this cell before re-running the
# Faster R-CNN label cells.
faster_rcnn_classes = [
    'background',
    'pedestrian',
    'rider',
    'car',
    'truck',
    'bus',
    'train',
    'motorcycle',
    'bicycle',
    'traffic light',
    'traffic sign',
    'other person',
    'other vehicle',
    'trailer',
]
label_to_index = {
    class_name: class_index
    for class_index, class_name in enumerate(faster_rcnn_classes)
}

### Create a dictionary with image name as the key and (image location, data split) as the value

In [None]:
# Start from an empty mapping; the cells below fill it by scanning the split
# folders on Drive. This rebinds all_image_locations, discarding the mapping
# built from create_image_folders earlier in the notebook.
all_image_locations = dict()

In [None]:
# Edit the file path below to go to the folder containing the training set.
train_folders_pattern = '/content/drive/MyDrive/CV_Project/training_images/*'
for train_image_folder in glob.iglob(train_folders_pattern):
    print('On folder:', train_image_folder)
    # Key each image by its bare file name; the value is (full path, split).
    all_image_locations.update(
        (os.path.basename(train_image), (train_image, 'train'))
        for train_image in glob.iglob(train_image_folder + '/*')
    )

In [None]:
# Edit the file path below to go to the folder containing the validation set.
val_folders_pattern = '/content/drive/MyDrive/CV_Project/val_images/*'
for val_image_folder in glob.iglob(val_folders_pattern):
    print('On folder:', val_image_folder)
    # Key each image by its bare file name; the value is (full path, split).
    all_image_locations.update(
        (os.path.basename(val_image), (val_image, 'val'))
        for val_image in glob.iglob(val_image_folder + '/*')
    )

In [None]:
# Edit the file path below to go to the folder containing the test set.
test_folders_pattern = '/content/drive/MyDrive/CV_Project/test_images/*'
for test_image_folder in glob.iglob(test_folders_pattern):
    print('On folder:', test_image_folder)
    # Key each image by its bare file name; the value is (full path, split).
    all_image_locations.update(
        (os.path.basename(test_image), (test_image, 'test'))
        for test_image in glob.iglob(test_image_folder + '/*')
    )

### Format the labels

In [None]:
# Per-split lists of (image location, target dict) pairs. The validation
# labels cell below keeps appending to these lists and continues image_id.
train_labels = []
val_labels = []
test_labels = []
image_id = 0
# Annotations whose image was not found by the folder scans above.
num_missing_train_images = 0
# Edit the file path below to go to the original labels file for the training set.
with open('/content/drive/MyDrive/CV_Project/det_train.json', 'rb') as original_train_labels_file:
    original_train_labels = json.load(original_train_labels_file)
for image_obj in original_train_labels:
    # Skip (and count) annotations without a matching image instead of
    # aborting the whole run with a KeyError; the YOLO cells do the same.
    if image_obj['name'] not in all_image_locations:
        num_missing_train_images += 1
        continue
    new_image_location, image_data_split = all_image_locations[image_obj['name']]
    cur_label = {
        'image_id': torch.tensor([image_id])
    }
    boxes = []
    areas = []
    if 'labels' in image_obj:
        num_boxes = len(image_obj['labels'])
        cur_label['iscrowd'] = torch.zeros((num_boxes,), dtype=torch.int64)
        cur_label['labels'] = torch.zeros((num_boxes,), dtype=torch.int64)
        for cur_index, bounding_box in enumerate(image_obj['labels']):
            box2d = bounding_box['box2d']
            # Boxes are stored as [x1, y1, x2, y2] in absolute pixel coordinates.
            boxes.append([box2d['x1'], box2d['y1'], box2d['x2'], box2d['y2']])
            areas.append((box2d['x2'] - box2d['x1']) * (box2d['y2'] - box2d['y1']))
            # Requires the Faster R-CNN label_to_index (the one with 'background').
            cur_label['labels'][cur_index] = label_to_index[bounding_box['category']]
    else:
        # Image without annotations: emit a single 'background' box that
        # covers (almost) the whole image.
        cur_label['iscrowd'] = torch.zeros((1,), dtype=torch.int64)
        cur_label['labels'] = torch.tensor([label_to_index['background']], dtype=torch.int64)
        # Open inside a context manager so the file handle is released right
        # away; only the image size is needed.
        with Image.open(new_image_location) as cur_image:
            image_width, image_height = cur_image.size
        boxes.append([0.5, 0.5, image_width - 0.5, image_height - 0.5])
        areas.append(image_width * image_height)
    # reshape keeps the (N, 4) layout even when 'labels' is an empty list.
    cur_label['boxes'] = torch.tensor(boxes).reshape(len(boxes), 4)
    cur_label['area'] = torch.tensor(areas)
    if image_data_split == 'train':
        train_labels.append((new_image_location, cur_label))
    elif image_data_split == 'val':
        val_labels.append((new_image_location, cur_label))
    else:
        test_labels.append((new_image_location, cur_label))
    image_id += 1
if num_missing_train_images:
    print('Labelled images not found on disk (skipped):', num_missing_train_images)

In [None]:
# Continues from the training labels cell above: appends to train_labels,
# val_labels, and test_labels and keeps counting image_id from where that
# cell stopped, so run that cell first.
# Annotations whose image was not found by the folder scans above.
num_missing_val_images = 0
# Edit the file path below to go to the original labels file for the validation set.
with open('/content/drive/MyDrive/CV_Project/det_val.json', 'rb') as original_val_labels_file:
    original_val_labels = json.load(original_val_labels_file)
for image_obj in original_val_labels:
    # Skip (and count) annotations without a matching image instead of
    # aborting the whole run with a KeyError; the YOLO cells do the same.
    if image_obj['name'] not in all_image_locations:
        num_missing_val_images += 1
        continue
    new_image_location, image_data_split = all_image_locations[image_obj['name']]
    cur_label = {
        'image_id': torch.tensor([image_id])
    }
    boxes = []
    areas = []
    if 'labels' in image_obj:
        num_boxes = len(image_obj['labels'])
        cur_label['iscrowd'] = torch.zeros((num_boxes,), dtype=torch.int64)
        cur_label['labels'] = torch.zeros((num_boxes,), dtype=torch.int64)
        for cur_index, bounding_box in enumerate(image_obj['labels']):
            box2d = bounding_box['box2d']
            # Boxes are stored as [x1, y1, x2, y2] in absolute pixel coordinates.
            boxes.append([box2d['x1'], box2d['y1'], box2d['x2'], box2d['y2']])
            areas.append((box2d['x2'] - box2d['x1']) * (box2d['y2'] - box2d['y1']))
            # Requires the Faster R-CNN label_to_index (the one with 'background').
            cur_label['labels'][cur_index] = label_to_index[bounding_box['category']]
    else:
        # Image without annotations: emit a single 'background' box that
        # covers (almost) the whole image.
        cur_label['iscrowd'] = torch.zeros((1,), dtype=torch.int64)
        cur_label['labels'] = torch.tensor([label_to_index['background']], dtype=torch.int64)
        # Open inside a context manager so the file handle is released right
        # away; only the image size is needed.
        with Image.open(new_image_location) as cur_image:
            image_width, image_height = cur_image.size
        boxes.append([0.5, 0.5, image_width - 0.5, image_height - 0.5])
        areas.append(image_width * image_height)
    # reshape keeps the (N, 4) layout even when 'labels' is an empty list.
    cur_label['boxes'] = torch.tensor(boxes).reshape(len(boxes), 4)
    cur_label['area'] = torch.tensor(areas)
    if image_data_split == 'train':
        train_labels.append((new_image_location, cur_label))
    elif image_data_split == 'val':
        val_labels.append((new_image_location, cur_label))
    else:
        test_labels.append((new_image_location, cur_label))
    image_id += 1
if num_missing_val_images:
    print('Labelled images not found on disk (skipped):', num_missing_val_images)

In [None]:
# Bundle the three splits; each value is a list of
# (image location, target dict) pairs built by the two cells above.
faster_rcnn_labels = {
    'train_labels': train_labels,
    'val_labels': val_labels,
    'test_labels': test_labels
}
# Edit the file path below to go to the location where the labels for Faster
# R-CNN should be stored.
# The context manager flushes and closes the file as soon as the dump is done.
with open('/content/drive/MyDrive/CV_Project/faster_rcnn_labels.p', 'wb') as faster_rcnn_labels_file:
    pickle.dump(faster_rcnn_labels, faster_rcnn_labels_file)

## Format the labels for YOLO

### Create a dictionary with class label as the key and index as the value

In [None]:
# Class names in index order for YOLO: 0-based and without a 'background'
# entry. This rebinds label_to_index, replacing the Faster R-CNN mapping
# defined earlier in the notebook.
yolo_classes = [
    'pedestrian',
    'rider',
    'car',
    'truck',
    'bus',
    'train',
    'motorcycle',
    'bicycle',
    'traffic light',
    'traffic sign',
    'other person',
    'other vehicle',
    'trailer',
]
label_to_index = {
    class_name: class_index
    for class_index, class_name in enumerate(yolo_classes)
}

### Create a dictionary with image name as the key and (image location, data split) as the value

In [None]:
# Edit the file path below to go to the build/darknet/x64/data folder in the
# darknet repository. Before running this cell, clone the darknet repo
# from https://github.com/AlexeyAB/darknet
# Keep the trailing '/': later cells append relative paths directly to it.
# NOTE(review): the cells below glob darknet_path + 'obj/training_images/*'
# (and the val/test equivalents), so the split image folders are presumably
# expected to have been copied under <darknet_path>obj/ beforehand - confirm.
darknet_path = '/content/drive/MyDrive/CV_Project/darknet/build/darknet/x64/data/'
# Rebinds all_image_locations; the cells below refill it with
# image file name -> (image path under darknet_path, data split).
all_image_locations = dict()

In [None]:
# Register every .jpg in the darknet training subfolders under its file name.
for train_image_folder in glob.iglob(f'{darknet_path}obj/training_images/*'):
    print('On folder:', train_image_folder)
    folder_images = glob.iglob(train_image_folder + '/*.jpg')
    all_image_locations.update({
        os.path.basename(train_image): (train_image, 'train')
        for train_image in folder_images
    })

In [None]:
# Register every .jpg in the darknet validation subfolders under its file name.
for val_image_folder in glob.iglob(f'{darknet_path}obj/val_images/*'):
    print('On folder:', val_image_folder)
    folder_images = glob.iglob(val_image_folder + '/*.jpg')
    all_image_locations.update({
        os.path.basename(val_image): (val_image, 'val')
        for val_image in folder_images
    })

In [None]:
# Register every .jpg in the darknet test subfolders under its file name.
for test_image_folder in glob.iglob(f'{darknet_path}obj/test_images/*'):
    print('On folder:', test_image_folder)
    folder_images = glob.iglob(test_image_folder + '/*.jpg')
    all_image_locations.update({
        os.path.basename(test_image): (test_image, 'test')
        for test_image in folder_images
    })

### Create a txt file for each image, containing the bounding box information and corresponding class labels

In [None]:
# Image paths per split, filled while the label txt files are written below;
# they later become the contents of train.txt, val.txt, and test.txt.
# Re-run this cell before re-running the label cells, otherwise the lists
# accumulate duplicate entries.
train_image_locations = []
val_image_locations = []
test_image_locations = []

In [None]:
# Edit the file path below to go to the file path for any image in the training,
# validation, or test set.
# The width and height read here are used to normalize the bounding boxes of
# every image in the cells below, i.e. all images are assumed to share this
# image's dimensions.
with Image.open('/content/drive/MyDrive/CV_Project/darknet/build/darknet/x64/data/obj/training_images/train12/0ac3cbf4-73c76d25.jpg') as cur_image:
    image_width, image_height = cur_image.size

In [None]:
# Edit the file path below to go to the original labels file for the training set.
with open('/content/drive/MyDrive/CV_Project/det_train.json', 'rb') as original_train_labels_file:
    original_train_labels = json.load(original_train_labels_file)
image_num = 0
# Number of label files that already existed and were therefore skipped.
# The validation cell below keeps adding to this counter.
already_done = 0
for image_obj in original_train_labels:
    image_num += 1
    if image_num % 1000 == 0:
        print('Completed', image_num, 'images')
    # Only images that were found under darknet_path get a label file.
    if image_obj['name'] not in all_image_locations:
        continue
    image_location, data_split = all_image_locations[image_obj['name']]
    # The label file sits next to the image: same name, .txt extension.
    # splitext strips only the final extension, so a '.' in a folder name or
    # earlier in the file name no longer truncates the path.
    cur_file_name = os.path.splitext(image_location)[0] + '.txt'
    if data_split == 'train':
        train_image_locations.append(image_location)
    elif data_split == 'val':
        val_image_locations.append(image_location)
    else:
        test_image_locations.append(image_location)
    # Makes the cell resumable: label files from an earlier run are kept.
    if os.path.isfile(cur_file_name):
        already_done += 1
        continue
    # Short pause between file writes (kept from the original notebook).
    time.sleep(0.05)
    if 'labels' in image_obj:
        boxes = []
        for bounding_box in image_obj['labels']:
            box2d = bounding_box['box2d']
            cur_label = label_to_index[bounding_box['category']]
            box_width = box2d['x2'] - box2d['x1']
            box_height = box2d['y2'] - box2d['y1']
            # YOLO format: class x_center y_center width height, all relative
            # to the image size.
            x_center_rel = (box2d['x1'] + (box_width / 2)) / image_width
            y_center_rel = (box2d['y1'] + (box_height / 2)) / image_height
            rel_width = box_width / image_width
            rel_height = box_height / image_height
            cur_box = [cur_label, x_center_rel, y_center_rel, rel_width, rel_height]
            boxes.append(' '.join(map(str, cur_box)))
        cur_file_contents = '\n'.join(boxes)
    else:
        # Images without objects get an empty label file.
        cur_file_contents = ''
    with open(cur_file_name, 'w') as cur_image_file:
        cur_image_file.write(cur_file_contents)

In [None]:
# Continues from the training cell above: already_done and the three
# *_image_locations lists must already exist, so run that cell first.
# Edit the file path below to go to the original labels file for the validation set.
with open('/content/drive/MyDrive/CV_Project/det_val.json', 'rb') as original_val_labels_file:
    original_val_labels = json.load(original_val_labels_file)
image_num = 0
for image_obj in original_val_labels:
    image_num += 1
    if image_num % 1000 == 0:
        print('Completed', image_num, 'images')
    # Only images that were found under darknet_path get a label file.
    if image_obj['name'] not in all_image_locations:
        continue
    image_location, data_split = all_image_locations[image_obj['name']]
    # The label file sits next to the image: same name, .txt extension.
    # splitext strips only the final extension, so a '.' in a folder name or
    # earlier in the file name no longer truncates the path.
    cur_file_name = os.path.splitext(image_location)[0] + '.txt'
    if data_split == 'train':
        train_image_locations.append(image_location)
    elif data_split == 'val':
        val_image_locations.append(image_location)
    else:
        test_image_locations.append(image_location)
    # Makes the cell resumable: label files from an earlier run are kept.
    if os.path.isfile(cur_file_name):
        already_done += 1
        continue
    # Short pause between file writes (kept from the original notebook).
    time.sleep(0.05)
    if 'labels' in image_obj:
        boxes = []
        for bounding_box in image_obj['labels']:
            box2d = bounding_box['box2d']
            cur_label = label_to_index[bounding_box['category']]
            box_width = box2d['x2'] - box2d['x1']
            box_height = box2d['y2'] - box2d['y1']
            # YOLO format: class x_center y_center width height, all relative
            # to the image size.
            x_center_rel = (box2d['x1'] + (box_width / 2)) / image_width
            y_center_rel = (box2d['y1'] + (box_height / 2)) / image_height
            rel_width = box_width / image_width
            rel_height = box_height / image_height
            cur_box = [cur_label, x_center_rel, y_center_rel, rel_width, rel_height]
            boxes.append(' '.join(map(str, cur_box)))
        cur_file_contents = '\n'.join(boxes)
    else:
        # Images without objects get an empty label file.
        cur_file_contents = ''
    with open(cur_file_name, 'w') as cur_image_file:
        cur_image_file.write(cur_file_contents)

In [None]:
# already_done counts label files that existed before the two cells above ran
# and were therefore skipped rather than rewritten.
print('Number of txt files already created:', already_done)

In [None]:
# Create train.txt with the location of each image in the training set:
# one path per line, no trailing newline.
train_list_path = f'{darknet_path}train.txt'
with open(train_list_path, mode='w') as train_file:
    train_file_contents = '\n'.join(train_image_locations)
    train_file.write(train_file_contents)

In [None]:
# Create val.txt with the location of each image in the validation set:
# one path per line, no trailing newline.
val_list_path = f'{darknet_path}val.txt'
with open(val_list_path, mode='w') as val_file:
    val_file_contents = '\n'.join(val_image_locations)
    val_file.write(val_file_contents)

In [None]:
# Create test.txt with the location of each image in the test set:
# one path per line, no trailing newline.
test_list_path = f'{darknet_path}test.txt'
with open(test_list_path, mode='w') as test_file:
    test_file_contents = '\n'.join(test_image_locations)
    test_file.write(test_file_contents)

## Get dataset info

In [None]:
# Object classes that appear in the label files, in reporting order.
object_classes = (
    'pedestrian',
    'rider',
    'car',
    'truck',
    'bus',
    'train',
    'motorcycle',
    'bicycle',
    'traffic light',
    'traffic sign',
    'other person',
    'other vehicle',
    'trailer',
)
# Running totals over all images and all bounding boxes.
num_objects = 0
num_images = 0
# For each class, the number of times an object from that class
# appears in an image.
num_objects_per_class = dict.fromkeys(object_classes, 0)
# For each class, the number of images that contain at least one object
# from that class. 'none' counts the images without any labels.
num_images_with_object_class = {'none': 0, **dict.fromkeys(object_classes, 0)}

In [None]:
# Edit the file path below to go to the original labels file for the training set.
with open('/content/drive/MyDrive/CV_Project/det_train.json', 'rb') as original_train_labels_file:
    original_train_labels = json.load(original_train_labels_file)
for image_obj in original_train_labels:
    num_images += 1
    # Entries without a 'labels' key are tallied under 'none'.
    if 'labels' not in image_obj:
        num_images_with_object_class['none'] += 1
        continue
    box_categories = [box['category'] for box in image_obj['labels']]
    num_objects += len(box_categories)
    # Every box counts towards the per-class object total ...
    for box_category in box_categories:
        num_objects_per_class[box_category] += 1
    # ... but each class counts at most once per image here.
    for image_category in set(box_categories):
        num_images_with_object_class[image_category] += 1

In [None]:
# Edit the file path below to go to the original labels file for the validation set.
with open('/content/drive/MyDrive/CV_Project/det_val.json', 'rb') as original_val_labels_file:
    original_val_labels = json.load(original_val_labels_file)
for image_obj in original_val_labels:
    num_images += 1
    # Entries without a 'labels' key are tallied under 'none'.
    if 'labels' not in image_obj:
        num_images_with_object_class['none'] += 1
        continue
    box_categories = [box['category'] for box in image_obj['labels']]
    num_objects += len(box_categories)
    # Every box counts towards the per-class object total ...
    for box_category in box_categories:
        num_objects_per_class[box_category] += 1
    # ... but each class counts at most once per image here.
    for image_category in set(box_categories):
        num_images_with_object_class[image_category] += 1

In [None]:
# Totals accumulated over the training and validation label files.
print(f'Total number of images: {num_images}')
print(f'Total number of objects: {num_objects}')

In [None]:
# Mean number of boxes per image, rounded to the nearest whole number.
objects_per_image = num_objects / num_images
avg_objs_per_image = round(objects_per_image)
print(f'Average number of objects per image: {avg_objs_per_image}')

In [None]:
# One line per class: "<class> : <number of boxes of that class>".
print('Total number of objects for each class:')
for obj_class, class_object_count in num_objects_per_class.items():
    print(obj_class, ':', class_object_count)

In [None]:
# One line per class: "<class> : <number of images containing that class>";
# the 'none' row counts images without any labels.
print('For each class, number of images that contain at least one object from that class:')
for obj_class, class_image_count in num_images_with_object_class.items():
    print(obj_class, ':', class_image_count)