# Notebook to Create Data Splits

In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import pandas as pd
import data_functions as func
from sklearn.model_selection import train_test_split
import os

## Define Constants
Fill in the constants and then run the notebook to create a data split

In [3]:
# Seed handed to the split helper so the same split can be reproduced.
RANDOM_STATE = 42
# CSV with the bounding boxes; the notebook reads its 'image' and 'type' columns.
DATASET_PATH = '/exchange/dspro2/M-AI-ZE/data/adjusted/1.1/expert_data_1.1.csv'
# Folder the images are copied from.
ORIGINAL_IMAGE_FOLDER_PATH = '/exchange/dspro2/M-AI-ZE/data/images'
# Root folder under which <SPLIT_ID>/images and <SPLIT_ID>/labels are created.
EXPORT_PATH = '/exchange/dspro2/M-AI-ZE/data/adjusted/1.1/splits'
# Sub-folder name identifying this particular split.
SPLIT_ID = 'SID03'
# Fractions of the images assigned to the train and validation splits.
TRAIN_SIZE = 0.8
VAL_SIZE = 0
# When False the notebook skips every validation-specific step.
USE_VALIDATION = False
# The test size is defined by 1 - TRAIN_SIZE - VAL_SIZE

# Target size for the resize step (that step is currently commented out).
NEW_IMG_SIZE = (640, 640)

# Fail fast on inconsistent split settings instead of discovering them later
# inside the helpers or, worse, in a silently wrong split.
if not 0 < TRAIN_SIZE <= 1:
    raise ValueError('TRAIN_SIZE must be in (0, 1], got {}'.format(TRAIN_SIZE))
# Small tolerance so float sums such as 0.7 + 0.2 + 0.1 are not rejected.
if VAL_SIZE < 0 or TRAIN_SIZE + VAL_SIZE > 1 + 1e-9:
    raise ValueError(
        'VAL_SIZE must be >= 0 and TRAIN_SIZE + VAL_SIZE must not exceed 1, '
        'got TRAIN_SIZE={} and VAL_SIZE={}'.format(TRAIN_SIZE, VAL_SIZE)
    )
if USE_VALIDATION and VAL_SIZE <= 0:
    raise ValueError('USE_VALIDATION is True but VAL_SIZE is not positive')

## Split and Check Data

In [4]:
# Load the full bounding-box table.
bboxes_df = pd.read_csv(DATASET_PATH)

# Reduce it to the distinct (image, type) pairs; the split is done on these
# rows rather than on individual bounding boxes.
image_columns = ['image', 'type']
images_df = bboxes_df[image_columns].drop_duplicates()

#### Check original type ratio

In [5]:
# Print the type distribution of the complete, unsplit image table.
# Note: the helper labels its first argument as "TRAIN DATA" in the printout,
# so the header in the output below refers to the full dataset here.
func.check_type_ratio(images_df)

------TRAIN DATA:------
Boom portion: 50.93936491564135%
Drone portion: 41.5530833271937%
Handheld portion: 7.50755175716496%


In [6]:
# Call the split helper once; its result is unpacked into three frames when a
# validation split is requested and into two frames otherwise.
split_frames = func.train_val_test_split(
    images_df, TRAIN_SIZE, VAL_SIZE, RANDOM_STATE, USE_VALIDATION
)

if USE_VALIDATION:
    images_train_df, images_eval_df, images_test_df = split_frames
else:
    images_train_df, images_test_df = split_frames

#### Check type ratios of the splits

In [7]:
# Print the type distribution of every split that was produced.
if not USE_VALIDATION:
    # No validation frame exists, so train and test are passed by keyword.
    func.check_type_ratio(train_df=images_train_df, test_df=images_test_df)
else:
    func.check_type_ratio(images_train_df, images_eval_df, images_test_df)

------TRAIN DATA:------
Boom portion: 50.93939952109044%
Drone portion: 41.55461410941241%
Handheld portion: 7.505986369497145%
------TEST DATA:------
Boom portion: 50.93922651933702%
Drone portion: 41.546961325966855%
Handheld portion: 7.513812154696133%


### Copy Images to Their Folders

In [8]:
# Image folders of this split live under <EXPORT_PATH>/<SPLIT_ID>/images.
# os.path.join replaces manual '/' concatenation so that a trailing slash on
# EXPORT_PATH does not end up as a doubled separator in the result.
images_root = os.path.join(EXPORT_PATH, SPLIT_ID, 'images')

image_path_train = os.path.join(images_root, 'train')

if USE_VALIDATION:
    # Only defined when a validation split is requested.
    image_path_val = os.path.join(images_root, 'val')

image_path_test = os.path.join(images_root, 'test')

In [9]:
# Collect the image folders needed for this run (train, optional val, test)
# and create them; folders that already exist are left untouched.
image_dirs = [image_path_train]
if USE_VALIDATION:
    image_dirs.append(image_path_val)
image_dirs.append(image_path_test)

for image_dir in image_dirs:
    os.makedirs(image_dir, exist_ok=True)

In [10]:
# Pair every split's image table with its destination folder, in the order
# train, (optional) validation, test, then copy each set from the original
# image folder.
copy_jobs = [(images_train_df, image_path_train)]
if USE_VALIDATION:
    copy_jobs.append((images_eval_df, image_path_val))
copy_jobs.append((images_test_df, image_path_test))

for split_images_df, target_dir in copy_jobs:
    func.copy_imgs_to_folder(split_images_df, target_dir, ORIGINAL_IMAGE_FOLDER_PATH)

## Normalize Train and Validation Splits

In [11]:
# Select, for every split, the bounding-box rows whose image belongs to it.
train_image_names = set(images_train_df["image"])
bboxes_train = bboxes_df[bboxes_df["image"].isin(train_image_names)]

if USE_VALIDATION:
    val_image_names = set(images_eval_df["image"])
    bboxes_val = bboxes_df[bboxes_df["image"].isin(val_image_names)]

test_image_names = set(images_test_df["image"])
bboxes_test = bboxes_df[bboxes_df["image"].isin(test_image_names)]

### Get the Original Sizes of the Images

In [12]:
# Train split: look up the image sizes from the exported train folder and
# attach them to the train bounding boxes.
img_size_dict_train = func.get_image_size_dict(image_path_train)
bboxes_train = func.add_image_size_to_df(bboxes_train, img_size_dict_train)

# Validation split: same two steps, only when a validation split exists.
if USE_VALIDATION:
    img_size_dict_val = func.get_image_size_dict(image_path_val)
    bboxes_val = func.add_image_size_to_df(bboxes_val, img_size_dict_val)

### Resize the Images

In [13]:
# NOTE: resizing is currently disabled. Every statement in this cell is
# commented out, so no resize call is executed and NEW_IMG_SIZE is not used
# by any active code in this notebook. Uncomment the lines below to resize
# the exported train (and validation) images to NEW_IMG_SIZE.
#func.resize_images(image_path_train, NEW_IMG_SIZE)
#if USE_VALIDATION:
    #func.resize_images(image_path_val, NEW_IMG_SIZE)

### Normalize Bounding Boxes to the Range 0 to 1 and Add the Center Point, Width, and Height

In [14]:
# Run the train (and, if enabled, validation) boxes through the preparation
# helper. The test boxes are deliberately not passed through this step; they
# are written out unchanged in a later cell.
# NOTE(review): each name is rebound to the helper's result, so re-running
# this cell on its own feeds already-prepared boxes back into the helper.
# Whether that is harmless depends on the helper; prefer Restart & Run All.
bboxes_train = func.prepare_bboxes(bboxes_train)
if USE_VALIDATION:
    bboxes_val = func.prepare_bboxes(bboxes_val)

## Save Labels

In [15]:
# Label folders of this split live under <EXPORT_PATH>/<SPLIT_ID>/labels.
# os.path.join replaces manual '/' concatenation so that a trailing slash on
# EXPORT_PATH does not end up as a doubled separator in the result.
labels_root = os.path.join(EXPORT_PATH, SPLIT_ID, 'labels')

# One train label folder per image type.
label_path_train_boom = os.path.join(labels_root, 'train', 'boom')
label_path_train_drone = os.path.join(labels_root, 'train', 'drone')
label_path_train_handheld = os.path.join(labels_root, 'train', 'handheld')

if USE_VALIDATION:
    # Only defined when a validation split is requested.
    label_path_val_boom = os.path.join(labels_root, 'val', 'boom')
    label_path_val_drone = os.path.join(labels_root, 'val', 'drone')
    label_path_val_handheld = os.path.join(labels_root, 'val', 'handheld')

# The test labels go into a single folder (no per-type sub-folders).
label_path_test = os.path.join(labels_root, 'test')


In [16]:
# Collect the label folders needed for this run and create them in one pass;
# folders that already exist are left untouched.
label_dirs = [
    label_path_train_boom,
    label_path_train_drone,
    label_path_train_handheld,
]

if USE_VALIDATION:
    label_dirs += [
        label_path_val_boom,
        label_path_val_drone,
        label_path_val_handheld,
    ]

label_dirs.append(label_path_test)

for label_dir in label_dirs:
    os.makedirs(label_dir, exist_ok=True)

### Store Unnormalized Test Data as CSV

In [17]:
# Write the test bounding boxes, exactly as selected from the dataset, to a
# single CSV inside the test label folder (the DataFrame index is not written).
test_csv_path = label_path_test + '/bboxes_test.csv'
bboxes_test.to_csv(test_csv_path, index=False)

### Store Train and Validation Labels as .txt Files

In [18]:
# Write the train labels one image type at a time, each type into its own
# folder (order: boom, drone, handheld).
train_label_targets = (
    ('boom', label_path_train_boom),
    ('drone', label_path_train_drone),
    ('handheld', label_path_train_handheld),
)
for image_type, label_dir in train_label_targets:
    func.store_lables_as_txt(bboxes_train[bboxes_train['type'] == image_type], label_dir)

# Same procedure for the validation labels, when a validation split exists.
if USE_VALIDATION:
    val_label_targets = (
        ('boom', label_path_val_boom),
        ('drone', label_path_val_drone),
        ('handheld', label_path_val_handheld),
    )
    for image_type, label_dir in val_label_targets:
        func.store_lables_as_txt(bboxes_val[bboxes_val['type'] == image_type], label_dir)