# Data Split

In [8]:
import pandas as pd
import numpy as np
import os
import sys
module_path = os.path.abspath(os.path.join('../..'))
if module_path not in sys.path:
    sys.path.append(module_path)
import pickle as pickle

In [10]:
# Load the two traces from their pickle files.
# NOTE: read_pickle unpickles arbitrary objects; only point it at trusted files.
source_trace = pd.read_pickle("../../Data/Trace/source_trace_c_l.pkl")
target_trace = pd.read_pickle("../../Data/Trace/target_trace_c_l.pkl")
trace_dict = dict(source=source_trace, target=target_trace)

# Split patterns: key -> [train, val, test] fractions. The key is also used
# as the sub-directory name under training_data_path when saving.
split_dict = {
    "811split": [0.8, 0.1, 0.1],
    "641620split": [0.64, 0.16, 0.2],
}

# Ordering label -> value of the `sort` flag handed to split_data.
order_dict = dict(sorted=True, random=False)

# Root directory for the generated train / val / test pickles.
training_data_path = "../../Data/Training/"

## Split functions

In [27]:
def split_data(data, ratio=(0.8, 0.1, 0.1), sort=True):
    """Split a table into consecutive train / validation / test partitions.

    Args:
        data: pandas DataFrame. Must contain a 'Start Time' column when
            ``sort`` is true.
        ratio: Sequence ``(train, val, test)`` of fractions. Only the first
            two entries size a partition; the test partition receives every
            remaining row, so rounding losses end up in the test set.
        sort: If true, rows are ordered by ascending 'Start Time' before
            slicing; otherwise they are shuffled with ``sample(frac=1)``.

    Returns:
        dict with keys "train", "val" and "test" mapping to DataFrames.
    """
    data_size = data.shape[0]
    train_size = int(ratio[0] * data_size)
    val_size = int(ratio[1] * data_size)
    if sort:
        ordered = data.sort_values(by='Start Time', ascending=True)
    else:
        ordered = data.sample(frac=1)
    val_end = train_size + val_size
    # .iloc keeps the slicing positional regardless of the index type.
    return {"train": ordered.iloc[:train_size],
            "val": ordered.iloc[train_size:val_end],
            "test": ordered.iloc[val_end:]}


def split_data_alt(data, ratio=(0.8, 0.1, 0.1)):
    """Hold out the latest rows as test, then shuffle the rest into train / val.

    Rows are ordered by ascending 'Start Time'; the last
    ``int(ratio[2] * len(data))`` rows become the test partition. The
    remaining rows are shuffled with ``sample(frac=1)``; the first
    ``int(ratio[0] * len(data))`` of them form the training partition and
    whatever is left forms the validation partition.

    Args:
        data: pandas DataFrame with a 'Start Time' column.
        ratio: Sequence ``(train, val, test)`` of fractions of the full
            table. ``ratio[1]`` is not read; validation gets the remainder.

    Returns:
        dict with keys "train", "val" and "test" mapping to DataFrames.
    """
    data_size = data.shape[0]
    train_size = int(ratio[0] * data_size)
    test_size = int(ratio[2] * data_size)
    ordered = data.sort_values(by='Start Time', ascending=True)
    # Slice from an explicit boundary. With a negative offset, test_size == 0
    # becomes [-0:] / [:-0], which puts every row in test and none in train/val.
    test_start = data_size - test_size
    data_test = ordered.iloc[test_start:]
    shuffled_rest = ordered.iloc[:test_start].sample(frac=1)
    return {"train": shuffled_rest.iloc[:train_size],
            "val": shuffled_rest.iloc[train_size:],
            "test": data_test}


def split_data_scale_test(data, test_ratio=0.2, scale=0.1, train_ratio=0.8, sort=True):
    """Build a scaled-down train / val set plus a fixed-size test set.

    After ordering, the last ``int(test_ratio * len(data))`` rows become the
    test partition. The first ``int(scale * len(data))`` rows form the scaled
    subset, of which the leading ``train_ratio`` share is the training
    partition and the remainder the validation partition.

    Args:
        data: pandas DataFrame. Must contain a 'Start Time' column when
            ``sort`` is true.
        test_ratio: Fraction of all rows taken from the end as test data.
        scale: Fraction of all rows taken from the start for train + val.
        train_ratio: Fraction of the scaled subset used for training.
        sort: If true, rows are ordered by ascending 'Start Time'; otherwise
            they are shuffled with ``sample(frac=1)``.

    Returns:
        dict with keys "train", "val" and "test" mapping to DataFrames.
    """
    data_size = data.shape[0]
    test_size = int(test_ratio * data_size)
    # Explicit boundary instead of a negative offset: [-0:] would select
    # every row as test data when test_size rounds down to 0.
    test_start = data_size - test_size
    # Cap the scaled subset at the test boundary so train/val never share
    # rows with test when scale + test_ratio exceeds 1.
    scale_size = min(int(scale * data_size), test_start)
    train_size = int(train_ratio * scale_size)
    if sort:
        ordered = data.sort_values(by='Start Time', ascending=True)
    else:
        ordered = data.sample(frac=1)
    data_test = ordered.iloc[test_start:]
    data_rest = ordered.iloc[:scale_size]
    return {"train": data_rest.iloc[:train_size],
            "val": data_rest.iloc[train_size:],
            "test": data_test}


def split_data_scale_test_alt(data, test_ratio=0.2, scale=0.1, train_ratio=0.8):
    """Scaled train / val split with shuffling, plus a fixed-size test set.

    Rows are ordered by ascending 'Start Time'. The last
    ``int(test_ratio * len(data))`` rows become the test partition. The first
    ``int(scale * len(data))`` rows are shuffled with ``sample(frac=1)`` and
    then divided into training (leading ``train_ratio`` share) and
    validation (the remainder).

    Args:
        data: pandas DataFrame with a 'Start Time' column.
        test_ratio: Fraction of all rows taken from the end as test data.
        scale: Fraction of all rows taken from the start for train + val.
        train_ratio: Fraction of the scaled subset used for training.

    Returns:
        dict with keys "train", "val" and "test" mapping to DataFrames.
    """
    data_size = data.shape[0]
    test_size = int(test_ratio * data_size)
    # Explicit boundary instead of a negative offset: [-0:] would select
    # every row as test data when test_size rounds down to 0.
    test_start = data_size - test_size
    # Cap the scaled subset at the test boundary so train/val never share
    # rows with test when scale + test_ratio exceeds 1.
    scale_size = min(int(scale * data_size), test_start)
    train_size = int(train_ratio * scale_size)
    ordered = data.sort_values(by='Start Time', ascending=True)
    data_test = ordered.iloc[test_start:]
    shuffled_rest = ordered.iloc[:scale_size].sample(frac=1)
    return {"train": shuffled_rest.iloc[:train_size],
            "val": shuffled_rest.iloc[train_size:],
            "test": data_test}

## Split Data

In [25]:
# Write every (ordering, split pattern, trace) combination to
# <training_data_path><split_pattern>/<trace>_<subset>_<ordering>.pkl
for sorting, sort_flag in order_dict.items():
    order_suffix = sorting + ".pkl"
    for split_pattern, ratio in split_dict.items():
        out_dir = training_data_path + split_pattern + "/"
        for trace_type, trace in trace_dict.items():
            data_sets = split_data(trace, ratio=ratio, sort=sort_flag)
            for single_set, subset in data_sets.items():
                file_name = "_".join([trace_type, single_set, order_suffix])
                subset.to_pickle(out_dir + file_name)

In [26]:
# Write the "alt" variant of every split pattern for each trace to
# <training_data_path><split_pattern>/<trace>_<subset>_alt.pkl
for split_pattern in split_dict:
    for trace_type in trace_dict:
        # split_data_alt takes no `sort` parameter (it always orders by
        # 'Start Time'); passing one raised TypeError and also relied on the
        # `sorting` loop variable left over from an earlier cell.
        data_sets = split_data_alt(trace_dict[trace_type],
                                   ratio=split_dict[split_pattern])
        for single_set in data_sets:
            file_name = "_".join([trace_type, single_set, "alt.pkl"])
            data_sets[single_set].to_pickle(training_data_path + split_pattern + "/" + file_name)

## Scale test split

In [30]:
# For each scale, build a chronologically sorted scaled split of every trace
# and write it to <training_data_path><trace>_<subset>_<scale>.pkl
scale_list = [0.01, 0.05, 0.1, 0.5]
# NOTE: rebinds the module-level output path used by the earlier cells.
training_data_path = "../../Data/Training/scale/"
for scale in scale_list:
    scale_suffix = str(scale) + ".pkl"
    for trace_type, trace in trace_dict.items():
        data_sets = split_data_scale_test(trace, test_ratio=0.2, scale=scale,
                                          train_ratio=0.8, sort=True)
        for single_set, subset in data_sets.items():
            file_name = "_".join([trace_type, single_set, scale_suffix])
            subset.to_pickle(training_data_path + file_name)