# Notebook to clean and adjust data
Dataset 1.0

In [1]:
import pandas as pd
import os

In [2]:
# Load the expert annotation CSVs for the three image types
# (comma-delimited, double-quote quoted).
expert_boom_data, expert_drone_data, expert_handheld_data = (
    pd.read_csv(annotation_csv, delimiter=",", quotechar='"')
    for annotation_csv in (
        "/exchange/dspro2/M-AI-ZE/data/annotations_expert/annotations_boom.csv",
        "/exchange/dspro2/M-AI-ZE/data/annotations_expert/annotations_drone.csv",
        "/exchange/dspro2/M-AI-ZE/data/annotations_expert/annotations_handheld.csv",
    )
)

## Methods

In [3]:
def remove_duplicates(df):
    """Return a new DataFrame without fully duplicated rows.

    The first occurrence of each duplicated row is kept and the original
    index labels are preserved. The input frame is not modified.
    """
    deduplicated = df.drop_duplicates(keep='first')
    return deduplicated

In [4]:
def order_coordinates(df):
    """Return a copy of ``df`` in which x1 <= x2 and y1 <= y2 for every row.

    For each axis the smaller of the two values is written to the ``*1``
    column and the larger to the ``*2`` column. The input is not modified.
    """
    ordered = df.copy()
    for low_col, high_col in (('x1', 'x2'), ('y1', 'y2')):
        # Compute both results before assigning, so the second one still
        # sees the original (unsorted) values.
        axis_pair = ordered[[low_col, high_col]]
        smaller = axis_pair.min(axis=1)
        larger = axis_pair.max(axis=1)
        ordered[low_col] = smaller
        ordered[high_col] = larger
    return ordered

In [5]:
def remove_no_area_boxes(df):
    """Return only the rows whose box has non-zero width and non-zero height.

    A row is dropped when ``x1 == x2`` or ``y1 == y2``. The filter is applied
    to a copy, so the input frame is not modified.
    """
    boxes = df.copy()
    zero_width = boxes['x1'] == boxes['x2']
    zero_height = boxes['y1'] == boxes['y2']
    return boxes[~(zero_width | zero_height)]

In [6]:
def unify_img_suffix(folder_path):
    """Rename every JPEG-style file in ``folder_path`` to use a ``.jpg`` suffix.

    The extension is matched case-insensitively against ``.jpg`` / ``.jpeg``
    (e.g. ``.JPG``, ``.Jpeg``, ``.jpeg``, ``.JPEG``), which mirrors the
    case-insensitive pattern used by ``unify_img_suffix_df`` so that file
    names on disk and in the annotation table end up in the same form.
    Entries that already end in exactly ``.jpg`` and entries with any other
    extension are left alone.

    Args:
        folder_path: Directory whose direct entries are inspected.

    Side effects:
        Renames files on disk and prints one line per renamed or skipped
        entry.
    """
    jpeg_extensions = {'.jpg', '.jpeg'}
    target_ext = '.jpg'

    for filename in os.listdir(folder_path):
        name, ext = os.path.splitext(filename)

        if ext == target_ext or ext.lower() not in jpeg_extensions:
            continue

        old_path = os.path.join(folder_path, filename)
        new_path = os.path.join(folder_path, name + target_ext)

        # os.rename may silently replace an existing target. Only proceed
        # when the target is free, or when it is the very same file (a
        # case-only rename on a case-insensitive filesystem).
        if os.path.exists(new_path) and not os.path.samefile(old_path, new_path):
            print(f'Skipped: {filename} -> {name}.jpg (target already exists)')
            continue

        os.rename(old_path, new_path)
        print(f'Renamed: {filename} -> {name}.jpg')

In [7]:
def unify_img_suffix_df(df):
    """Return a copy of ``df`` whose ``image`` names end in ``.jpg``.

    A trailing ``.jpg`` / ``.jpeg`` in any letter case is replaced by the
    lower-case ``.jpg``; other names are left unchanged. The input frame is
    not modified.
    """
    normalized_names = df['image'].str.replace(
        r'\.(jpe?g)$', '.jpg', case=False, regex=True
    )
    result = df.copy()
    result['image'] = normalized_names
    return result

In [8]:
def clip_negative_coord_values(df):
    """Return a copy of ``df`` with negative box coordinates raised to 0.

    Only the ``x1``, ``y1``, ``x2`` and ``y2`` columns are clipped; all
    other columns are carried over unchanged. The input is not modified.
    """
    coord_columns = ['x1', 'y1', 'x2', 'y2']
    clipped = df.copy()
    non_negative = clipped[coord_columns].clip(lower=0)
    clipped[coord_columns] = non_negative
    return clipped

## Unify Suffix of the Image Files

In [9]:
# Rename the image files on disk for each of the three image folders.
for _image_folder in (
    '/exchange/dspro2/M-AI-ZE/data/images/images_boom',
    '/exchange/dspro2/M-AI-ZE/data/images/images_drone',
    '/exchange/dspro2/M-AI-ZE/data/images/images_handheld',
):
    unify_img_suffix(_image_folder)

## Clean CSV Data

In [10]:
# Step 1: drop fully duplicated annotation rows from each raw table.
clean_boom, clean_drone, clean_handheld = (
    remove_duplicates(raw_annotations)
    for raw_annotations in (expert_boom_data, expert_drone_data, expert_handheld_data)
)

In [11]:
# Step 2: make sure x1 <= x2 and y1 <= y2 in every table.
clean_boom, clean_drone, clean_handheld = map(
    order_coordinates,
    (clean_boom, clean_drone, clean_handheld),
)

In [12]:
# Step 3: discard boxes with zero width or zero height.
clean_boom = clean_boom.pipe(remove_no_area_boxes)
clean_drone = clean_drone.pipe(remove_no_area_boxes)
clean_handheld = clean_handheld.pipe(remove_no_area_boxes)

In [13]:
# Step 4: normalise the image-name extension of the drone and handheld tables.
clean_drone, clean_handheld = map(unify_img_suffix_df, (clean_drone, clean_handheld))
# The boom table gets '.jpg' appended to every image name instead
# (presumably its names are stored without an extension; verify against the CSV).
clean_boom = clean_boom.assign(image=clean_boom['image'] + '.jpg')

In [14]:
# Step 5: clip negative coordinates to 0. Applied to the boom table only,
# since (per the original note) it is the only one with negative values.
clean_boom = clean_boom.pipe(clip_negative_coord_values)

In [15]:
# Preview the first five cleaned boom annotations.
clean_boom.head(n=5)

Unnamed: 0,image,x1,y1,x2,y2,user,day,month,year,hour,minute
4,DSC00965_0.jpg,2503,161,2515,324,Ethan,5,7,2016,17,59
5,DSC00965_1.jpg,1954,1779,2000,1925,Tyr,8,12,2016,16,9
6,DSC00965_1.jpg,450,1757,569,1833,Tyr,8,12,2016,16,9
7,DSC00965_1.jpg,2151,1394,2410,1402,Tyr,8,12,2016,16,9
8,DSC00965_1.jpg,2757,1485,2849,1593,Tyr,8,12,2016,16,9


In [16]:
# Preview the first five cleaned drone annotations.
clean_drone.head(n=5)

Unnamed: 0,image,x1,y1,x2,y2,user,day,month,year,hour,minute
0,J_170823_134140.jpg,1518,0,1795,415,Tyr,26,1,2018,10,41
1,J_170823_134140.jpg,2157,0,2375,549,Tyr,26,1,2018,10,41
2,J_170823_134140.jpg,1166,0,1449,389,Tyr,26,1,2018,10,41
3,J_170823_134140.jpg,3217,3755,3622,4000,Tyr,26,1,2018,10,41
4,J_170823_134140.jpg,3659,2242,3989,2530,Tyr,26,1,2018,10,41


In [17]:
# Preview the first five cleaned handheld annotations.
clean_handheld.head(n=5)

Unnamed: 0,image,x1,y1,x2,y2,user,day,month,year,hour,minute
0,DSC00025.jpg,1864,1648,2864,2064,Tyr,26,5,2016,16,48
1,DSC00026.jpg,2872,1416,3152,2016,Tyr,26,5,2016,16,48
3,DSC00028.jpg,3248,1540,3572,2048,Tyr,26,5,2016,16,48
4,DSC00028.jpg,3444,1916,3852,2116,Tyr,26,5,2016,16,48
7,DSC00031.jpg,2016,1352,2968,1760,Tyr,26,5,2016,16,48


## Combine the Data of the Three Image Types and Export

In [18]:
# Tag every table (in place) with the image type it came from, then stack
# the three tables into one frame with a fresh 0..n-1 index.
for _frame, _image_type in (
    (clean_boom, 'boom'),
    (clean_drone, 'drone'),
    (clean_handheld, 'handheld'),
):
    _frame['type'] = _image_type

expert_data_combined = pd.concat(
    [clean_boom, clean_drone, clean_handheld],
    ignore_index=True,
)

In [19]:
# Write the combined annotations to disk without the DataFrame index column.
expert_data_combined.to_csv(
    path_or_buf='/exchange/dspro2/M-AI-ZE/data/adjusted/1.0/expert_data_1.0.csv',
    index=False,
)