# Notebook to clean and adjust data

In [1]:
import pandas as pd
import os
from PIL import Image

## Constants

In [2]:
# Expert annotation CSV files, one per acquisition type.
EXPERT_BOOM_DATA_PATH = '/exchange/dspro2/M-AI-ZE/data/annotations_expert/annotations_boom.csv'
EXPERT_DRONE_DATA_PATH = '/exchange/dspro2/M-AI-ZE/data/annotations_expert/annotations_drone.csv'
EXPERT_HANDHELD_DATA_PATH = '/exchange/dspro2/M-AI-ZE/data/annotations_expert/annotations_handheld.csv'
# Backward-compatible alias: this input path was originally named "EXPORT_..."
# even though it is read, not written. Cells that still use the old name keep working.
EXPORT_HANDHELD_DATA_PATH = EXPERT_HANDHELD_DATA_PATH

# Image folders whose file extensions are normalised on disk by this notebook.
BOOM_IMAGE_PATH = '/exchange/dspro2/M-AI-ZE/data/images/images_boom'
DRONE_IMAGE_PATH = '/exchange/dspro2/M-AI-ZE/data/images/images_drone'
HANDHELD_IMAGE_PATH = '/exchange/dspro2/M-AI-ZE/data/images/images_handheld'

# Presumably the target folders for resized images -- not referenced by the
# cells visible here; TODO confirm they are still needed.
BOOM_IMAGE_RESIZE_PATH = '/exchange/dspro2/M-AI-ZE/data/images_resized/images_boom'
DRONE_IMAGE_RESIZE_PATH = '/exchange/dspro2/M-AI-ZE/data/images_resized/images_drone'
HANDHELD_IMAGE_RESIZE_PATH = '/exchange/dspro2/M-AI-ZE/data/images_resized/images_handheld'

# Destination of the combined, cleaned annotation table.
EXPORT_PATH = '/exchange/dspro2/M-AI-ZE/data/adjusted/1.1/expert_data_1.1.csv'

## Import data

In [3]:
# Read the three expert annotation tables with identical CSV parsing options.
expert_boom_data, expert_drone_data, expert_handheld_data = (
    pd.read_csv(csv_path, delimiter=",", quotechar='"')
    for csv_path in (
        EXPERT_BOOM_DATA_PATH,
        EXPERT_DRONE_DATA_PATH,
        EXPORT_HANDHELD_DATA_PATH,
    )
)

## Methods

In [4]:
def remove_duplicates(df):
    """Return a new frame with fully duplicated rows removed.

    Rows are compared across all columns and the first occurrence of each
    duplicate group is kept. The original index labels are preserved and
    the input frame is not modified.
    """
    deduplicated = df.drop_duplicates()
    return deduplicated

In [5]:
def order_coordinates(df):
    """Return a copy of ``df`` whose box corners are sorted per axis.

    For every row the smaller of ``x1``/``x2`` is stored in ``x1`` and the
    larger in ``x2``; the same is done for ``y1``/``y2``. All other columns
    and the column order are kept, and the input frame is not modified.
    """
    x_pair = df[['x1', 'x2']]
    y_pair = df[['y1', 'y2']]
    # All four replacement columns are computed from the untouched input
    # before any of them is written into the new frame.
    return df.assign(
        x1=x_pair.min(axis=1),
        x2=x_pair.max(axis=1),
        y1=y_pair.min(axis=1),
        y2=y_pair.max(axis=1),
    )

In [6]:
def remove_no_area_boxes(df):
    """Return only the rows whose box has a non-zero width and height.

    A row is dropped when ``x1 == x2`` or ``y1 == y2``. The result is an
    independent frame that keeps the original index labels; the input
    frame is not modified.
    """
    has_width = df['x1'] != df['x2']
    has_height = df['y1'] != df['y2']
    # Copy the filtered rows so the returned frame is detached from the input.
    return df[has_width & has_height].copy()

In [7]:
def unify_img_suffix(folder_path):
    """Rename the JPEG files in ``folder_path`` so their extension is exactly ``.jpg``.

    The extension is matched case-insensitively against ``.jpg`` and ``.jpeg``
    (e.g. ``.JPG``, ``.Jpeg``, ``.jpeg``, ``.JPEG``). This is the same set of
    suffixes that ``unify_img_suffix_df`` rewrites in the annotation table, so
    file names on disk and in the CSV stay in sync. Files that already end in
    ``.jpg`` and files with any other extension are left untouched.

    Parameters
    ----------
    folder_path : str
        Directory whose direct entries are inspected (no recursion).

    Notes
    -----
    If a different file with the target name already exists, the rename is
    skipped and reported instead of overwriting that file. One line is
    printed per renamed or skipped file.
    """
    jpeg_extensions = {'.jpg', '.jpeg'}
    target_ext = '.jpg'

    for filename in os.listdir(folder_path):
        name, ext = os.path.splitext(filename)

        # Nothing to do for files that are already normalised or are not JPEGs.
        if ext == target_ext or ext.lower() not in jpeg_extensions:
            continue

        old_path = os.path.join(folder_path, filename)
        new_path = os.path.join(folder_path, name + target_ext)

        # os.rename silently replaces an existing target on POSIX; refuse to
        # clobber a distinct file. On case-insensitive file systems the target
        # "exists" but is the very same file, which is fine to rename.
        if os.path.exists(new_path) and not os.path.samefile(old_path, new_path):
            print(f'Skipped: {filename} -> {name}.jpg (target already exists)')
            continue

        os.rename(old_path, new_path)
        print(f'Renamed: {filename} -> {name}.jpg')

In [9]:
def unify_img_suffix_df(df):
    """Return a copy of ``df`` whose ``image`` names end in a lowercase ``.jpg``.

    A trailing ``.jpg`` or ``.jpeg`` in any letter case is replaced by
    ``.jpg``; names with other endings are kept as they are. The input
    frame is not modified.
    """
    normalized_names = df['image'].str.replace(
        r'\.(jpe?g)$', '.jpg', case=False, regex=True
    )
    return df.assign(image=normalized_names)

In [10]:
def clip_negative_coord_values(df):
    """Return a copy of ``df`` with negative box coordinates raised to 0.

    Only the columns ``x1``, ``y1``, ``x2`` and ``y2`` are affected; values
    that are already >= 0 and all other columns are kept unchanged. The
    input frame is not modified.
    """
    clipped = {
        column: df[column].clip(lower=0)
        for column in ('x1', 'y1', 'x2', 'y2')
    }
    return df.assign(**clipped)

## Unify Suffix of the Image Files

In [9]:
# Normalise the image file extensions on disk, one image folder at a time.
for image_folder in (BOOM_IMAGE_PATH, DRONE_IMAGE_PATH, HANDHELD_IMAGE_PATH):
    unify_img_suffix(image_folder)

## Clean CSV Data

In [23]:
# Step 1: start the cleaned tables from the raw annotations, minus duplicated rows.
clean_boom, clean_drone, clean_handheld = (
    remove_duplicates(raw_annotations)
    for raw_annotations in (expert_boom_data, expert_drone_data, expert_handheld_data)
)

In [24]:
# Step 2: sort the corner coordinates of every box in each table.
clean_boom, clean_drone, clean_handheld = map(
    order_coordinates,
    (clean_boom, clean_drone, clean_handheld),
)

In [25]:
# Step 3: drop boxes that have zero width or zero height.
clean_boom, clean_drone, clean_handheld = [
    remove_no_area_boxes(annotations)
    for annotations in (clean_boom, clean_drone, clean_handheld)
]

In [26]:
clean_drone = unify_img_suffix_df(clean_drone)
clean_handheld = unify_img_suffix_df(clean_handheld)
# Add the .jpg suffix for boom images (per the original note, their names are
# listed without one). Names that already end in '.jpg' are left alone so that
# re-running this cell does not produce '.jpg.jpg'.
boom_has_suffix = clean_boom['image'].str.endswith('.jpg', na=False)
clean_boom['image'] = clean_boom['image'].where(boom_has_suffix, clean_boom['image'] + '.jpg')

In [27]:
# Negative coordinate values occur only in the boom data, so only that table is clipped.
clean_boom = clean_boom.pipe(clip_negative_coord_values)

In [28]:
# Drop annotations that refer to image files which do not exist.
missing_boom_images = ['DSC06208_3.jpg', 'DSC06209_0.jpg']
clean_boom = clean_boom[~clean_boom['image'].isin(missing_boom_images)]

## Combine the Data of the Three Image Types and Export

In [18]:
# Tag every table with its acquisition type, then stack them into one frame
# with a fresh 0..n-1 index.
clean_boom = clean_boom.assign(type='boom')
clean_drone = clean_drone.assign(type='drone')
clean_handheld = clean_handheld.assign(type='handheld')
expert_data_combined = pd.concat(
    [clean_boom, clean_drone, clean_handheld],
    ignore_index=True,
)

In [19]:
# Make sure the output folder exists first: to_csv does not create missing
# directories and would otherwise fail on a fresh data checkout.
export_dir = os.path.dirname(EXPORT_PATH)
if export_dir:
    os.makedirs(export_dir, exist_ok=True)
# Write the combined table without the positional index column.
expert_data_combined.to_csv(EXPORT_PATH, index=False)