# Notebook to Create Data Splits

In [1]:
%load_ext autoreload
%autoreload 2

In [35]:
import pandas as pd
import data_functions as func
from sklearn.model_selection import train_test_split
import os

## Define Constants
Fill in the constants below, then run the notebook from top to bottom to create a data split.

In [3]:
# Seed handed to both train_test_split calls so the split is reproducible.
RANDOM_STATE = 42
# CSV that is loaded into the bounding-box DataFrame; its 'image' and 'type' columns are used below.
DATASET_PATH = '/exchange/dspro2/M-AI-ZE/data/adjusted/1.1/expert_data_1.1.csv'
# Source image folder handed to func.copy_imgs_to_folder.
ORIGINAL_IMAGE_FOLDER_PATH = '/exchange/dspro2/M-AI-ZE/data/images'
# Root output folder; image and label folders are created below EXPORT_PATH/SPLIT_ID.
EXPORT_PATH = '/exchange/dspro2/M-AI-ZE/data/adjusted/1.1/splits'
# Sub-folder name that identifies this particular split.
SPLIT_ID = 'SID01'
# Fraction of the images assigned to the training split.
TRAIN_SIZE = 0.8
# Fraction of the images assigned to the validation split.
VAL_SIZE = 0.1
# The test size is defined by 1 - TRAIN_SIZE - VAL_SIZE

# Target size passed to func.resize_images for the train and validation images.
NEW_IMG_SIZE = (640, 640)

## Define Methods

In [4]:
def train_val_test_split(df, train_size, val_size):
    """Split ``df`` into train, validation and test frames, stratified by ``type``.

    The split is done in two steps: the training rows are separated from a
    temporary remainder, then that remainder is divided into validation and
    test rows. Both steps are stratified on the ``type`` column and seeded
    with the notebook-level ``RANDOM_STATE``.

    Args:
        df: DataFrame with a ``type`` column.
        train_size: Fraction of ``df`` used for training, strictly between
            0 and 1.
        val_size: Fraction of ``df`` used for validation. Must be positive
            and smaller than ``1 - train_size`` so the test split is not empty.

    Returns:
        Tuple ``(train_df, eval_df, test_df)``.

    Raises:
        ValueError: If the fractions do not leave a non-empty train,
            validation and test portion.
    """
    # Rounding removes float noise such as 1 - 0.8 == 0.19999999999999996.
    val_test_size = round(1 - train_size, 5)
    if not 0 < val_test_size < 1:
        raise ValueError(
            f'train_size must be strictly between 0 and 1, got {train_size}'
        )

    # Share of the remainder (validation + test) that becomes the test split.
    test_size_prop = round((1 / val_test_size) * (val_test_size - val_size), 5)
    if not 0 < test_size_prop < 1:
        raise ValueError(
            'val_size must be positive and smaller than 1 - train_size, '
            f'got train_size={train_size}, val_size={val_size}'
        )

    # Validate first, split second: no work is done for an invalid configuration.
    train_df, temp_df = train_test_split(
        df,
        test_size = val_test_size,
        stratify = df['type'],
        random_state = RANDOM_STATE
    )

    eval_df, test_df = train_test_split(
        temp_df,
        test_size = test_size_prop,
        stratify = temp_df['type'],
        random_state = RANDOM_STATE
    )
    return train_df, eval_df, test_df
    

In [5]:
def _print_type_ratio(title, df):
    """Print the percentage of 'boom', 'drone' and 'handheld' rows in ``df``."""
    print(f'------{title}:------')
    total_rows = df.shape[0]
    if total_rows == 0:
        # An empty frame has no meaningful ratio; avoid dividing by zero.
        print('No rows.')
        return
    for label, type_name in (('Boom', 'boom'), ('Drone', 'drone'), ('Handheld', 'handheld')):
        # int() keeps the arithmetic in plain Python numbers so the printed value is unchanged.
        type_rows = int((df['type'] == type_name).sum())
        print(f'{label} portion: {(100 / total_rows) * type_rows}%')


def check_type_ratio(train_df, eval_df = None, test_df = None):
    """Print the share of each image type for up to three DataFrames.

    Args:
        train_df: DataFrame with a ``type`` column; always reported, under
            the heading "TRAIN DATA".
        eval_df: Optional DataFrame reported under "EVALUATION DATA".
        test_df: Optional DataFrame reported under "TEST DATA".

    Returns:
        None. The percentages are written to standard output. A frame
        without rows is reported as "No rows." instead of raising
        ``ZeroDivisionError``.
    """
    _print_type_ratio('TRAIN DATA', train_df)

    if eval_df is not None:
        _print_type_ratio('EVALUATION DATA', eval_df)

    if test_df is not None:
        _print_type_ratio('TEST DATA', test_df)
        
    

## Split and Check Data

In [6]:
# Load the full annotation table from the configured CSV.
bboxes_df = pd.read_csv(DATASET_PATH)
# Reduce to one row per (image, type) pair so the split is made on images, not on CSV rows.
images_df = bboxes_df[['image', 'type']].drop_duplicates()
# An image listed under more than one type would appear twice here and could be
# assigned to two different splits, leaking data between train, validation and test.
assert images_df['image'].is_unique, 'some images are listed with more than one type'

#### Check original type ratio

In [7]:
# check_type_ratio prints its first argument under the "TRAIN DATA" heading;
# here that argument is the full, unsplit image list.
check_type_ratio(images_df)

------TRAIN DATA:------
Boom portion: 50.93936491564135%
Drone portion: 41.5530833271937%
Handheld portion: 7.50755175716496%


In [8]:
# Stratified three-way split of the image list using the configured fractions.
images_train_df, images_eval_df, images_test_df = train_val_test_split(
    images_df,
    train_size=TRAIN_SIZE,
    val_size=VAL_SIZE,
)

#### Check Type Ratios of the Splits

In [9]:
# Report the type shares of all three splits so they can be compared with the full data set.
check_type_ratio(
    images_train_df,
    eval_df=images_eval_df,
    test_df=images_test_df,
)

------TRAIN DATA:------
Boom portion: 50.93939952109044%
Drone portion: 41.55461410941241%
Handheld portion: 7.505986369497145%
------EVALUATION DATA:------
Boom portion: 50.92114959469418%
Drone portion: 41.56226971260133%
Handheld portion: 7.516580692704496%
------TEST DATA:------
Boom portion: 50.95729013254787%
Drone portion: 41.531664212076585%
Handheld portion: 7.511045655375552%


### Copy Images to Their Split Folders

In [10]:
# Image folders of this split: <EXPORT_PATH>/<SPLIT_ID>/images/{train,val,test}.
# os.path.join keeps the separators correct even if EXPORT_PATH ends with '/'.
image_path_train = os.path.join(EXPORT_PATH, SPLIT_ID, 'images', 'train')
image_path_val = os.path.join(EXPORT_PATH, SPLIT_ID, 'images', 'val')
image_path_test = os.path.join(EXPORT_PATH, SPLIT_ID, 'images', 'test')

In [11]:
# Create the three image folders; folders that already exist are left as they are.
for image_dir in (image_path_train, image_path_val, image_path_test):
    os.makedirs(image_dir, exist_ok=True)

In [12]:
# Hand each split's image list and its target folder to the copy helper, in train/val/test order.
for split_images_df, split_image_dir in (
    (images_train_df, image_path_train),
    (images_eval_df, image_path_val),
    (images_test_df, image_path_test),
):
    func.copy_imgs_to_folder(split_images_df, split_image_dir, ORIGINAL_IMAGE_FOLDER_PATH)

## Normalize Train and Validation Splits

In [13]:
# Select the bounding-box rows that belong to the images of each split.
# .copy() makes each result an independent frame: the train and validation frames
# are later passed to helpers that return extended versions of them, and working
# on a copy rules out SettingWithCopyWarning and accidental coupling to bboxes_df.
bboxes_train = bboxes_df[bboxes_df["image"].isin(set(images_train_df["image"]))].copy()
bboxes_val = bboxes_df[bboxes_df["image"].isin(set(images_eval_df["image"]))].copy()
bboxes_test = bboxes_df[bboxes_df["image"].isin(set(images_test_df["image"]))].copy()

### Get the Original Sizes of the Images

In [16]:
# Collect the image sizes from the train and validation folders filled above.
# NOTE(review): presumably this has to run before the resize cell, which is given
# the same folders and likely rewrites the files in place — confirm in data_functions.
img_size_dict_train = func.get_image_size_dict(image_path_train)
img_size_dict_val = func.get_image_size_dict(image_path_val)

# Attach the collected sizes to the bounding-box frames. Both names are rebound to
# the helper's result, so re-running this cell passes the already-extended frames again.
bboxes_train = func.add_image_size_to_df(bboxes_train, img_size_dict_train)
bboxes_val = func.add_image_size_to_df(bboxes_val, img_size_dict_val)

### Resize the Images

In [17]:
# Only the train and validation folders are passed to the resize helper; the test folder is not.
for image_dir in (image_path_train, image_path_val):
    func.resize_images(image_dir, NEW_IMG_SIZE)

### Normalize Bounding Boxes to the Range 0 to 1 and Add the Center Point, Width, and Height

In [18]:
# NOTE(review): judging by the section heading, prepare_bboxes normalizes the box
# coordinates and adds center/width/height columns — confirm in data_functions.
# Both names are rebound to the helper's result, so running this cell a second time
# hands the already-prepared frames back to prepare_bboxes; restart and run all instead.
bboxes_train = func.prepare_bboxes(bboxes_train)
bboxes_val = func.prepare_bboxes(bboxes_val)

## Save Labels

In [36]:
# Label folders of this split: <EXPORT_PATH>/<SPLIT_ID>/labels/{train,val,test}.
# os.path.join keeps the separators correct even if EXPORT_PATH ends with '/'.
label_path_train = os.path.join(EXPORT_PATH, SPLIT_ID, 'labels', 'train')
label_path_val = os.path.join(EXPORT_PATH, SPLIT_ID, 'labels', 'val')
label_path_test = os.path.join(EXPORT_PATH, SPLIT_ID, 'labels', 'test')

In [37]:
# Create the three label folders; folders that already exist are left as they are.
for label_dir in (label_path_train, label_path_val, label_path_test):
    os.makedirs(label_dir, exist_ok=True)

### Store Unnormalized Test Data as CSV

In [38]:
# The test boxes were not passed through the size/normalization helpers, so they are
# written out as they were selected from the CSV, without the DataFrame index.
test_csv_path = label_path_test + '/bboxes_test.csv'
bboxes_test.to_csv(test_csv_path, index=False)

### Store Train and Validation Labels as .txt Files

In [39]:
# Write the prepared train and validation boxes to their label folders, train first.
for split_bboxes, split_label_dir in (
    (bboxes_train, label_path_train),
    (bboxes_val, label_path_val),
):
    func.store_lables_as_txt(split_bboxes, split_label_dir)