<a href="https://www.kaggle.com/code/izabelljaro/mapillary-ji?scriptVersionId=157426629" target="_blank"><img align="left" alt="Kaggle" title="Open in Kaggle" src="https://kaggle.com/static/images/open-in-kaggle.svg"></a>

# Mapillary dataset

### new labeling, data augmentation

Source of image augmentation:
https://imgaug.readthedocs.io/en/latest/index.html

In [None]:
import pandas as pd
import os
import numpy as np
import imgaug.augmenters as iaa
import cv2
import matplotlib.pyplot as plt
from imgaug.augmentables.bbs import BoundingBox, BoundingBoxesOnImage

## Filter dataset: prohibitory signs

In [None]:
# Load the per-object annotation table; the first CSV column becomes the index.
grouped_by_object = pd.read_csv(
    "dataset_object_by_object.csv",
    sep=",",
    index_col=0,
)
grouped_by_object

In [None]:
# Count the annotated objects per image name (most frequent first).
name_column = grouped_by_object['name']
number_of_all_images = name_column.value_counts()
number_of_all_images

In [None]:
# Coarse target classes kept for the prohibitory-sign dataset.
# The order is preserved as given.
class_ids = [
    "no-right-left-or-u-turn",
    "speed-limit",
    "road-closed",
    "no-entry",
    "no-stopping-no-parking",
    "other",
]

In [None]:
# Categories to rename: raw Mapillary label -> coarse class.
# The mapping is assembled from per-class groups; every raw label carries the
# 'regulatory--' prefix, which is added once when the final dict is built.
_speed_limit_values = (5, 10, 15, 20, 25, 30, 40, 45, 50, 60, 70, 80, 90, 100, 110, 120)

_raw_signs_by_class = {
    'speed-limit': [
        f'maximum-speed-limit-{value}--g1' for value in _speed_limit_values
    ],
    'no-entry': ['no-entry--g1'],
    'no-right-left-or-u-turn': [
        f'no-{direction}-turn--g{variant}'
        for direction in ('left', 'right', 'u')
        for variant in (1, 2, 3)
    ],
    'no-stopping-no-parking': ['no-parking--g1', 'no-stopping--g15'],
    'road-closed': ['road-closed-to-vehicles--g3'],
    'other': [
        'height-limit--g1',
        'minimum-safe-distance--g1',
        'no-bicycles--g1',
        'no-heavy-goods-vehicles--g1',
        'no-motorcycles--g1',
        'no-motorcycles--g2',
        'no-motor-vehicles-except-motorcycles--g1',
        'no-motor-vehicles-except-motorcycles--g2',
        'no-overtaking-by-heavy-goods-vehicles--g1',
        'no-overtaking--g1',
        'weight-limit--g1',
        'width-limit--g1',
    ],
}

class_renames = {
    f'regulatory--{sign}': target_class
    for target_class, signs in _raw_signs_by_class.items()
    for sign in signs
}

In [None]:
# Collapse the raw labels into the coarse classes defined in class_renames.
relabelled_classes = grouped_by_object['class'].replace(class_renames)
grouped_by_object['class'] = relabelled_classes
grouped_by_object

In [None]:
# Dataframe containing only the needed classes
is_needed_class = grouped_by_object['class'].isin(class_ids)
filtered_obj = grouped_by_object[is_needed_class]
filtered_obj

In [None]:
# Dataset grouped by image: one row per image name, with every column after
# the first gathered into a per-image list.
# NOTE(review): relies on 'name' being the first column of filtered_obj - confirm.
list_columns = filtered_obj.columns[1:]
collect_as_list = {col: (lambda x: list(x)) for col in list_columns}
grouped_df = filtered_obj.groupby(['name']).agg(collect_as_list).reset_index()
grouped_df

In [None]:
# Save dataframe
#grouped_df.to_csv("grouped_by_image_new_classes.csv")

In [None]:
# Delete the unnecessary files: every file whose base name (without extension)
# is not an annotated image in grouped_df.
# WARNING: this permanently removes files from the folders listed below.

image_names_to_keep = grouped_df['name'].tolist() # list of image filenames
# A set gives constant-time membership tests; checking against the list for
# every file on disk made the filtering quadratic.
names_to_keep = set(image_names_to_keep)

folder_paths = ['./train_0/reduced', './train_1/reduced', './train_2/reduced', './val/reduced']

num_deleted_images = 0
for folder_path in folder_paths:
    for file_name in os.listdir(folder_path):
        if os.path.splitext(file_name)[0] in names_to_keep:
            continue

        os.remove(os.path.join(folder_path, file_name))
        num_deleted_images += 1

print('Number of deleted images: ', num_deleted_images)

In [None]:
# Distribution of the classes (all filtered data): objects per class.

class_column = filtered_obj['class']
number_of_classes = class_column.value_counts()
number_of_classes

## Train test val split

In [None]:
# Calculation of the size of each split (train / test / validation)

train_ratio = 0.7
test_ratio = 0.2

num_images = len(grouped_df)
num_train = int(num_images * train_ratio)
num_test = int(num_images * test_ratio)
# Validation receives whatever remains after the truncated train and test shares.
num_val = num_images - (num_train + num_test)

print(
    'number of train images: ', num_train,
    '\nnumber of test images: ', num_test,
    '\nnumber of validation images: ', num_val,
)

In [None]:
# Shuffle the image names reproducibly and cut them into train / test / validation.
# The names are copied into a plain list first: np.random.shuffle works in place,
# and shuffling the DataFrame column itself is not a supported input for it and
# can write through to grouped_df, decoupling names from their annotations.
# NOTE(review): if split files were already saved from the old code, verify that
# the order produced here still matches them before regenerating.
filenames = grouped_df['name'].tolist()
np.random.seed(12)
np.random.shuffle(filenames)

train_end = num_train
test_end = num_train + num_test

train_set = filenames[:train_end]
test_set = filenames[train_end:test_end]
# Slice from an explicit start index; `filenames[-num_val:]` would return the
# whole list when num_val is 0.
val_set = filenames[test_end:]

#with open('train.txt', 'w') as f_train:
#    f_train.write('\n'.join(train_set))

#with open('test.txt', 'w') as f_test:
#    f_test.write('\n'.join(test_set))

#with open('validation.txt', 'w') as f_val:
#    f_val.write('\n'.join(val_set))

In [None]:
# Reload the saved train split (one image name per line) so the stored split is
# the one used from here on. The cell no longer evaluates `train_set` before
# assigning it: that statement displayed nothing and failed with NameError
# whenever the split cell above had not been run.
with open('./mapillary_JI/new_test_train_val_split/train.txt', 'r') as f_train:
    train_set = f_train.read().splitlines()

train_set

In [None]:
# filtered_obj_train: train set grouped by objects
# filtered_obj: grouped by object, filtered
# grouped_df: grouped by image, filtered

# Keep only the objects whose image belongs to the train split.
in_train_split = filtered_obj['name'].isin(train_set)
filtered_obj_train = filtered_obj[in_train_split]

# Objects per class inside the train split.
train_class_column = filtered_obj_train['class']
num_of_classes_train = train_class_column.value_counts()
num_of_classes_train

The data augmentation should be done on the train set, and the final number of images of the classes should match the number of elements of the largest class (speed limit ~3600)

In [None]:
# Show the train-split annotations (one row per object).
filtered_obj_train


In [None]:
# Train set grouped by image: one row per image name, with every column after
# the first gathered into a per-image list.
# Only grouped_df_train is assigned here. The previous chained assignment also
# rebound grouped_df, silently replacing the all-image table with the
# train-only one for any cell executed afterwards.
train_list_columns = filtered_obj_train.columns[1:]
grouped_df_train = (
    filtered_obj_train
    .groupby(['name'])
    .agg({col: (lambda x: list(x)) for col in train_list_columns})
    .reset_index()
)
grouped_df_train

## Data augmentation

In [None]:
# Define the augmentation sequence: each step is built separately and then
# chained in the order listed at the bottom.

# Horizontal flip with a 40% probability
horizontal_flip = iaa.Fliplr(0.4)

# Affine transformations: rotation, per-axis scaling, translation and shear
affine_jitter = iaa.Affine(
    rotate=(-10, 10),
    scale={"x": (0.8, 1.2), "y": (0.8, 1.2)},
    translate_percent={"x": (-0.2, 0.2), "y": (-0.2, 0.2)},
    shear=(-10, 10)
)

# Gaussian blur with random sigma between 0 and 1, applied to about 50% of the images
occasional_blur = iaa.Sometimes(0.5, iaa.GaussianBlur(sigma=(0, 1)))

# Sharpen with 50% probability
occasional_sharpen = iaa.Sometimes(
    0.5,
    iaa.Sharpen(alpha=(0, 1.0), lightness=(0.75, 1.5))
)

# Brightness change via a multiplier drawn from 1.2-1.5; doesn't affect BBs
brighten = iaa.Multiply((1.2, 1.5))

# Strengthen or weaken the contrast in each image
contrast_shift = iaa.LinearContrast((0.75, 1.5))

augmentation = iaa.Sequential([
    horizontal_flip,
    affine_jitter,
    occasional_blur,
    occasional_sharpen,
    brighten,
    contrast_shift,
])

### 1. Data augmentation on 5 example images

In [None]:
# Apply augmentation to the first 5 rows (example images) and plot each image
# before/after with its bounding box.

for index, row in filtered_obj.head(5).iterrows():

    image_path = f"./mapillary_JI/example_images/{row['name']}.jpg"
    image = cv2.imread(image_path)
    if image is None:
        # cv2.imread reports a missing/unreadable file by returning None rather
        # than raising; without this guard the loop died on `image.shape`.
        print('Could not read image, skipped: ', image_path)
        continue

    # Wrap the object's box so imgaug can transform it together with the image
    bbs = BoundingBoxesOnImage([
        BoundingBox(x1=row['xmin'], y1=row['ymin'], x2=row['xmax'], y2=row['ymax'])
    ], shape=image.shape)

    # Apply augmentation to image and box in one call so they stay aligned
    augmented_image, augmented_bboxes = augmentation(image=image, bounding_boxes=bbs)

    # Print bounding boxes before -> after
    box_before = bbs.bounding_boxes[0]
    box_after = augmented_bboxes.bounding_boxes[0]
    print("Bounding box change: (%.4f, %.4f, %.4f, %.4f) -> (%.4f, %.4f, %.4f, %.4f)" % (
        box_before.x1, box_before.y1, box_before.x2, box_before.y2,
        box_after.x1, box_after.y1, box_after.x2, box_after.y2)
    )

    # image with BBs before/after augmentation
    image_before = bbs.draw_on_image(image, size=2)
    image_after = augmented_bboxes.draw_on_image(augmented_image, size=2, color=[0, 0, 255])

    # Visualize the original and augmented images with bounding boxes
    # (OpenCV loads BGR, matplotlib expects RGB)
    plt.figure(figsize=(10, 5))
    plt.subplot(1, 2, 1)
    plt.imshow(cv2.cvtColor(image_before, cv2.COLOR_BGR2RGB))
    plt.title('Original Image with Bounding Box')
    plt.subplot(1, 2, 2)
    plt.imshow(cv2.cvtColor(image_after, cv2.COLOR_BGR2RGB))
    plt.title('Augmented Image with Adjusted Bounding Box')
    plt.show()
    


The bounding boxes are hard to spot because of their small size, but in these examples the boxes were transformed correctly.

### Data augmentation of the train set

In [None]:
# Creating one folder from old 4: the four source folders are merged into
# destination_folder, which is created if it does not exist yet.

source_folder1, source_folder2, source_folder3, source_folder4 = (
    './train_0/reduced',
    './train_1/reduced',
    './train_2/reduced',
    './val/reduced',
)
destination_folder = './all_images'

os.makedirs(destination_folder, exist_ok=True)

# Function to move the contents of a source folder to the destination folder
def move_contents(source_folder, destination_folder):
    """Recursively move everything inside ``source_folder`` into ``destination_folder``.

    Sub-directories are recreated under the destination, their contents are
    moved recursively, and the emptied source sub-directory is removed.
    ``source_folder`` itself is left in place (empty).

    Args:
        source_folder: Path of the directory whose contents are moved.
        destination_folder: Path of an existing directory that receives them.

    Raises:
        OSError: If listing, creating, renaming or removing a path fails.
    """
    # Local imports: only this function needs them.
    import errno
    import shutil

    for item in os.listdir(source_folder):
        source_item_path = os.path.join(source_folder, item)
        destination_item_path = os.path.join(destination_folder, item)

        if os.path.isdir(source_item_path):
            os.makedirs(destination_item_path, exist_ok=True)
            move_contents(source_item_path, destination_item_path)
            os.rmdir(source_item_path)
        else:
            try:
                os.rename(source_item_path, destination_item_path)
            except OSError as exc:
                # os.rename cannot cross filesystem boundaries (EXDEV); fall
                # back to copy-and-delete for that case only and let every
                # other error propagate unchanged.
                if exc.errno != errno.EXDEV:
                    raise
                shutil.move(source_item_path, destination_item_path)

# Merge the four source folders into the destination, in their original order.
for _source_folder in (source_folder1, source_folder2, source_folder3, source_folder4):
    move_contents(_source_folder, destination_folder)

In [None]:
# Inputs and outputs of the train-set augmentation
df = filtered_obj_train
images_dir = './all_images'
output_dir = './augmented_images_train'

os.makedirs(output_dir, exist_ok=True)

# Objects per class in the train set; the largest class sets the target size.
class_counts = df['class'].value_counts()
target_objects_per_class = class_counts.max()

print('Target object per class: ',target_objects_per_class)

In [None]:
# New DataFrame to store augmented images
# It starts empty with the same columns as df; the augmentation cell adds its rows to it.
augmented_df = pd.DataFrame(columns=df.columns)

In [None]:
# Augment the train objects class by class and record the new annotations.
#
# * 'road-closed' is skipped: it was augmented in a separate run whose
#   annotations live in grouped_by_object_augmented_road_closed.csv.
# * Classes that already have target_objects_per_class objects (the largest
#   class) are skipped: they define the target and need no synthetic samples.
# * Every remaining class is oversampled, and one augmented image with a
#   single bounding box is written per selected object.
# NOTE(review): each processed class receives target_objects_per_class
# augmented samples in addition to its originals - confirm that this is the
# intended class balance.

new_rows = []        # collected annotations; concatenated once after the loop
num_unreadable = 0   # source images cv2 could not load
num_box_lost = 0     # samples whose box ended up completely outside the image

for class_name, count in class_counts.items():
    if class_name == 'road-closed':
        continue
    if count >= target_objects_per_class:
        continue

    print('class_name: ', class_name )

    # choosing one class
    class_subset = df[df['class'] == class_name]

    # oversampling factor: repeat the subset often enough to reach the target.
    # int(...) keeps the list repetition below independent of numpy scalar types.
    oversampling_factor = int(target_objects_per_class // count) + 1
    class_subset_oversampled = pd.concat([class_subset] * oversampling_factor, ignore_index=True)

    # subset selection (fixed seed so the same objects are chosen on every run)
    subset_size = min(target_objects_per_class, len(class_subset_oversampled))
    class_subset_selected = class_subset_oversampled.sample(subset_size, random_state=42)

    # image augmentation
    for index, row in class_subset_selected.iterrows():
        original_image_path = f"{images_dir}/{row['name']}.jpg"

        original_image = cv2.imread(original_image_path)
        if original_image is None:
            # cv2.imread returns None for a missing/unreadable file instead of
            # raising; previously this crashed on `.shape` mid-run.
            num_unreadable += 1
            print('Could not read image, skipped: ', original_image_path)
            continue

        # Extract bounding box coordinates
        boundingboxes = BoundingBoxesOnImage([
            BoundingBox(x1=row['xmin'], y1=row['ymin'], x2=row['xmax'], y2=row['ymax'])
        ], shape=original_image.shape)

        # Apply augmentation to image and box together so they stay aligned
        augmented_image, augmented_bboxes = augmentation(image=original_image, bounding_boxes=boundingboxes)

        # The affine step can push the box partly or completely outside the
        # image. Drop boxes that left the image entirely and clip the rest, so
        # only coordinates inside the image are stored.
        augmented_bboxes = augmented_bboxes.remove_out_of_image().clip_out_of_image()
        if not augmented_bboxes.bounding_boxes:
            num_box_lost += 1
            continue
        augmented_bbox = augmented_bboxes.bounding_boxes[0]

        # The oversampled row index keeps repeated augmentations of the same
        # object apart within one class.
        augmented_name = f"{os.path.splitext(row['name'])[0]}_{index}_aug"
        augmented_image_path = os.path.join(output_dir, f"{augmented_name}.jpg")

        cv2.imwrite(augmented_image_path, augmented_image)

        new_rows.append({
            'name': augmented_name,
            'width': row['width'],
            'height': row['height'],
            'class': row['class'],
            'xmin': augmented_bbox.x1,
            'ymin': augmented_bbox.y1,
            'xmax': augmented_bbox.x2,
            'ymax': augmented_bbox.y2,
        })

# One concat for all rows; concatenating inside the loop copied the growing
# frame for every sample.
if new_rows:
    augmented_df = pd.concat([augmented_df, pd.DataFrame(new_rows)], ignore_index=True)

print('Skipped (unreadable image): ', num_unreadable,
      '\nSkipped (box left the image): ', num_box_lost)

augmented_df
    

In [None]:
# Remove augmented images that were generated for the biggest class
# ('speed-limit'); they are not necessary because that class already defines
# the target size.
aug_img_to_delete = augmented_df[augmented_df['class'] == 'speed-limit']['name']

#output_dir: aug_images_train

num_removed = 0
for img_name in aug_img_to_delete:
    path = f"{output_dir}/{img_name}.jpg"
    try:
        os.remove(path)
    except FileNotFoundError:
        # Already removed by an earlier run of this cell; keep the cell re-runnable.
        continue
    num_removed += 1

print('Removed augmented images: ', num_removed)

In [None]:
# Drop the annotation rows that belong to the removed 'speed-limit' images.
is_speed_limit = augmented_df['class'].eq('speed-limit')
augmented_df = augmented_df[~is_speed_limit]
augmented_df

In [None]:
# Save dataframe
# Stores the augmented annotations of this run; the file is read back later as df2.
augmented_df.to_csv("./mapillary_JI/grouped_by_object_augmented_otherall.csv")

In [None]:
# Visualization of augmented images: first show the original example image
# for comparison.

image_path = r'./all_images/_CkxBP6-SGo-ZYsPhVUa8w.jpg'
og = cv2.imread(image_path)
# OpenCV loads BGR, matplotlib expects RGB
og_rgb = cv2.cvtColor(og, cv2.COLOR_BGR2RGB)
plt.imshow(og_rgb)
plt.show()

In [None]:
# Collect the augmented files derived from the example image: the augmented
# file names start with the original image name, so a substring match finds them.
filenames = [filename for filename in os.listdir('./augmented_images_train') if 'CkxBP6-SGo-ZYsPhVUa8w' in filename]
#filenames

In [None]:
# Plot every augmented version of the example image in a two-column grid.
# The grid is sized from the number of files: the fixed 10x2 grid failed with
# IndexError for more than 19 files and with NameError for none.
num_cols = 2
num_rows = max(1, -(-len(filenames) // num_cols))  # ceiling division, at least one row

# 8 inches of height per row keeps the original 10-row figure at (10, 80).
# squeeze=False guarantees a 2-D axes array even for a single row.
fig, axes = plt.subplots(nrows=num_rows, ncols=num_cols, figsize=(10, 8 * num_rows), squeeze=False)
axes = axes.flatten()

# Hide the frame of every cell up front so unused cells stay blank.
for ax in axes:
    ax.axis('off')

for ax, image_filename in zip(axes, filenames):

    img_path = os.path.join('./augmented_images_train', image_filename)
    img = cv2.imread(img_path)

    # OpenCV loads BGR, matplotlib expects RGB
    ax.imshow(cv2.cvtColor(img, cv2.COLOR_BGR2RGB))

#plt.tight_layout()
plt.show()

In [None]:
# The data augmentation was done in two parts; load both saved annotation
# tables (road-closed run and all other classes) so they can be merged.
df1, df2 = (
    pd.read_csv(csv_path, sep=",", index_col=0)
    for csv_path in (
        "./mapillary_JI/grouped_by_object_augmented_road_closed.csv",
        "./mapillary_JI/grouped_by_object_augmented_otherall.csv",
    )
)

In [None]:
# Augmented annotations loaded from grouped_by_object_augmented_road_closed.csv
df1

In [None]:
# Augmented annotations loaded from grouped_by_object_augmented_otherall.csv
df2

In [None]:
# Filtered, relabelled annotations of the original images (one row per object)
filtered_obj

In [None]:
# Merge the two augmentation runs into one table and save it.
augmented_parts = [df1, df2]
object_aug = pd.concat(augmented_parts, ignore_index=True)
object_aug.to_csv("./mapillary_JI/object_aug.csv")

# Append the original annotations to the augmented ones and save the result.
# NOTE(review): filtered_obj covers every filtered image, not only the train
# split - confirm that this is intended for object_all.
all_parts = [object_aug, filtered_obj]
object_all = pd.concat(all_parts, ignore_index=True)
object_all.to_csv("./mapillary_JI/object_all.csv")

object_all

In [None]:
# Dataset grouped by image (one aug image should contain only one object):
# every column after the first is gathered into a per-image list.
aug_list_spec = {col: (lambda x: list(x)) for col in object_aug.columns[1:]}
all_list_spec = {col: (lambda x: list(x)) for col in object_all.columns[1:]}

image_aug = object_aug.groupby(['name']).agg(aug_list_spec).reset_index()
image_all = object_all.groupby(['name']).agg(all_list_spec).reset_index()

image_aug.to_csv("./mapillary_JI/image_aug.csv")
image_all.to_csv("./mapillary_JI/image_all.csv")


In [None]:
# Augmented annotations grouped by image
image_aug

In [None]:
# Augmented plus original annotations grouped by image
image_all