# Notebook to Create Data Splits

In [1]:
import os

import pandas as pd
from sklearn.model_selection import train_test_split

## Define Constants
Fill in the constants below, then run the whole notebook to create the data split.

In [2]:
# --- Reproducibility ---------------------------------------------------------
# Seed used for both stratified splits; it is also embedded in the export file names.
RANDOM_STATE: int = 42

# --- Split proportions -------------------------------------------------------
# Fractions of the full data set. The test share is whatever remains:
# 1 - TRAIN_SIZE - VAL_SIZE.
TRAIN_SIZE: float = 0.8
VAL_SIZE: float = 0.1

# --- Input / output ----------------------------------------------------------
# CSV file that is read and split.
DATASET_PATH: str = '/exchange/dspro2/M-AI-ZE/data/adjusted/1.0/expert_data_1.0.csv'
# Directory the three split CSV files are written to.
EXPORT_PATH: str = '/exchange/dspro2/M-AI-ZE/data/adjusted/1.0/splits'
# Identifier embedded in the export file names.
SPLIT_ID: str = 'SID01'

## Define Methods

In [3]:
def train_val_test_split(df, train_size, val_size, stratify_col='type', random_state=None):
    """Split ``df`` into stratified train, validation and test frames.

    The split happens in two steps: the training rows are separated from a
    hold-out part first, then the hold-out part is divided into validation and
    test rows. Both steps are stratified on ``stratify_col`` and use the same
    seed.

    Args:
        df: DataFrame to split; must contain the column ``stratify_col``.
        train_size: Fraction of ``df`` that goes into the training frame.
        val_size: Fraction of ``df`` that goes into the validation frame. The
            test frame receives the remainder, ``1 - train_size - val_size``.
        stratify_col: Column whose class proportions are preserved in every
            split. Defaults to ``'type'``.
        random_state: Seed passed to both ``train_test_split`` calls. ``None``
            (the default) falls back to the notebook-level ``RANDOM_STATE``.

    Returns:
        Tuple ``(train_df, eval_df, test_df)``.

    Raises:
        ValueError: If ``train_size`` does not leave a hold-out fraction
            strictly between 0 and 1, or if ``val_size`` does not leave a test
            fraction strictly between 0 and 1 of that hold-out part.
    """
    seed = RANDOM_STATE if random_state is None else random_state

    # Rounding removes floating-point noise such as 1 - 0.8 == 0.19999999999999996.
    holdout_size = round(1 - train_size, 5)
    if not 0 < holdout_size < 1:
        raise ValueError(
            'train_size must leave a hold-out fraction strictly between 0 and 1, '
            f'got train_size={train_size!r}'
        )

    # Share of the hold-out part (not of the whole frame) that becomes test data.
    test_share = round((1 / holdout_size) * (holdout_size - val_size), 5)
    if not 0 < test_share < 1:
        raise ValueError(
            'val_size must be greater than 0 and smaller than 1 - train_size '
            f'({holdout_size}), got val_size={val_size!r}'
        )

    train_df, holdout_df = train_test_split(
        df,
        test_size=holdout_size,
        stratify=df[stratify_col],
        random_state=seed,
    )

    eval_df, test_df = train_test_split(
        holdout_df,
        test_size=test_share,
        stratify=holdout_df[stratify_col],
        random_state=seed,
    )
    return train_df, eval_df, test_df
    

In [4]:
def _print_type_ratio(title, frame):
    """Print the percentage of boom, drone and handheld rows in ``frame``."""
    total = frame.shape[0]
    counts = {
        kind: frame[frame['type'] == kind].shape[0]
        for kind in ('boom', 'drone', 'handheld')
    }

    print(f'------{title} DATA:------')
    for kind, count in counts.items():
        # An empty frame has no ratio; report 0.0 instead of dividing by zero.
        share = (100 / total) * count if total else 0.0
        print(f'{kind.capitalize()} portion: {share}%')


def check_type_ratio(train_df, eval_df=None, test_df=None):
    """Print the share of each ``type`` value for up to three data frames.

    For every frame that is passed, the percentage of rows whose ``type``
    column equals ``'boom'``, ``'drone'`` and ``'handheld'`` is printed under
    a header naming the split.

    Args:
        train_df: Frame reported under the "TRAIN DATA" header. It is always
            printed, so passing any other frame as the only argument still
            uses that header.
        eval_df: Optional frame reported under "EVALUATION DATA".
        test_df: Optional frame reported under "TEST DATA".

    Returns:
        None. The ratios are written to standard output. A frame without rows
        is reported as 0.0% for every type.
    """
    _print_type_ratio('TRAIN', train_df)

    for title, frame in (('EVALUATION', eval_df), ('TEST', test_df)):
        if frame is not None:
            _print_type_ratio(title, frame)
        
    

## Split and Check Data

In [5]:
# Read the complete, not yet split data set from the CSV file at DATASET_PATH.
data_set = pd.read_csv(DATASET_PATH)

#### Check original type ratio

In [6]:
# Print the class distribution of the complete data set before splitting.
# Only the first argument is passed, so the output appears under the
# "TRAIN DATA" header even though it covers all rows.
check_type_ratio(data_set)

------TRAIN DATA:------
Boom portion: 53.37677370427286%
Drone portion: 39.74394795138404%
Handheld portion: 6.879278344343105%


In [7]:
# Stratified split into train / validation / test frames using the configured fractions.
train_df, eval_df, test_df = train_val_test_split(data_set, TRAIN_SIZE, VAL_SIZE)

#### Check type ratios of the splits

In [8]:
# Print the class distribution of each split so it can be compared with the full data set.
check_type_ratio(train_df, eval_df, test_df)

------TRAIN DATA:------
Boom portion: 53.37715632834816%
Drone portion: 39.7437976325604%
Handheld portion: 6.879046039091442%
------EVALUATION DATA:------
Boom portion: 53.377906396567205%
Drone portion: 39.74653228220736%
Handheld portion: 6.875561321225427%
------TEST DATA:------
Boom portion: 53.372580323288766%
Drone portion: 39.74256635402115%
Handheld portion: 6.884853322690081%


## Save Split

In [9]:
# Create the export directory first so that a fresh split location does not
# make to_csv fail because the directory is missing.
os.makedirs(EXPORT_PATH, exist_ok=True)

# Each split is written without its index to
# <EXPORT_PATH>/<split>_data_<SPLIT_ID>_RS<RANDOM_STATE>.csv
for split_name, split_df in (('train', train_df), ('eval', eval_df), ('test', test_df)):
    split_df.to_csv(
        f'{EXPORT_PATH}/{split_name}_data_{SPLIT_ID}_RS{RANDOM_STATE}.csv',
        index=False,
    )