## Setup
1. Place this file into a folder that contains the dataset folders you want to merge (and no other folders).
2. Specify the name of the new dataset (`NEW_DATASET_NAME`). A folder with this name will be created for the merged dataset. This folder will be overwritten on every run.
3. Specify how to shard the new merged dataset (how many samples to save per file: `NEW_SAMPLES_PER_FILE`).


> NOTE:
> all variable definitions (names, dimensions, domains, etc.) need to match across all datasets!

> NOTE:
> domains of the performance attributes will be updated to match the merged data!

In [None]:
# Name of the merged dataset: passed as `name` to the new Dataset further down
# and excluded from the list of folders that are merged.
NEW_DATASET_NAME = "merged"
# Shard size of the merged dataset: passed as `samples_perfile` when the merged
# dataframe is imported.
NEW_SAMPLES_PER_FILE = 100

# -----------------------------------------------------------------------------
# RUN!
No need to modify anything else in this file, but you may want to inspect the outputs and printouts to check everything is as expected.

In [None]:
from aixd.data.encoders import json_load
import os
import pandas as pd
from aixd.data import Dataset

In [None]:
root_folder = os.getcwd()

# Every sub-folder of the working directory is treated as a dataset to merge, except:
#   - the output folder of a previous run (NEW_DATASET_NAME), and
#   - hidden folders such as ".ipynb_checkpoints", which Jupyter creates next to the
#     notebook and which would otherwise be mistaken for a dataset.
# The names are sorted because os.listdir returns entries in arbitrary order; this keeps
# the dataset picked as dataset_names[0] and the row order of the merged data reproducible.
dataset_names = sorted(
    name
    for name in os.listdir(root_folder)
    if os.path.isdir(os.path.join(root_folder, name))
    and name != NEW_DATASET_NAME
    and not name.startswith(".")
)
print(f"Found following subfolders: {dataset_names}")

#### Merge data
Reads all data and combines them into one dataframe 
(all datasets, design parameters+performance attributes)

In [None]:
def _load_shards(directory):
    """Read every ``.pkl`` shard in *directory* and stack them into one dataframe.

    Raises:
        ValueError: if *directory* contains no ``.pkl`` file.
    """
    # Sorted so the row order does not depend on the arbitrary order of os.listdir.
    filenames = sorted(name for name in os.listdir(directory) if name.endswith(".pkl"))
    if not filenames:
        raise ValueError(f"No .pkl files found in '{directory}'")

    # NOTE: pd.read_pickle unpickles arbitrary objects; only merge datasets you trust.
    shards = [pd.read_pickle(os.path.join(directory, name)) for name in filenames]
    return pd.concat(shards, axis=0)


def load_df(root_folder, dataset_name):
    """Load one sharded dataset into a single dataframe.

    Reads all pickled dataframes from the ``design_parameters`` and
    ``performance_attributes`` sub-folders of ``root_folder/dataset_name``,
    joins the two on the ``uid`` column (inner join) and drops ``uid``.

    Args:
        root_folder: Folder that contains the dataset folder.
        dataset_name: Name of the dataset folder inside *root_folder*.

    Returns:
        A dataframe with the columns of both parts, without the ``uid`` column.

    Raises:
        ValueError: if either sub-folder contains no ``.pkl`` file.
    """
    dataset_dir = os.path.join(root_folder, dataset_name)

    df_dp_all = _load_shards(os.path.join(dataset_dir, "design_parameters"))
    df_pa_all = _load_shards(os.path.join(dataset_dir, "performance_attributes"))

    # Inner join: only samples present in both parts are kept.
    df_all = pd.merge(df_dp_all, df_pa_all, how="inner", on=["uid"])
    return df_all.drop(columns=["uid"])

In [None]:
# One dataframe per source dataset, then all of them stacked row-wise.
dfs = [load_df(root_folder, dataset_name) for dataset_name in dataset_names]
df_all = pd.concat(dfs)
    


Preview the dataframe containing the merged data. 
The last column "error" is for internal purposes, not part of the variables.

In [None]:
# Bare expression as the last statement of the cell, so the notebook displays the merged dataframe.
df_all

#### Dataset object

Restores a Dataset object from one of the datasets' json files


In [None]:
# The first source dataset found serves as the template for the merged dataset.
template_dataset_dir = os.path.join(root_folder, dataset_names[0])
picked_dataset_object_path = os.path.join(template_dataset_dir, "dataset_object.json")
dataset_temp = Dataset.from_dataset_object(picked_dataset_object_path)


In [None]:
# The merged dataset reuses the variable definitions of the template dataset.
dataset_new = Dataset(
    name=NEW_DATASET_NAME,
    root_path=root_folder,
    file_format="json",
    design_par=dataset_temp.design_par,
    perf_attributes=dataset_temp.perf_attributes,
    overwrite=True,
)


In [None]:
# Summarise the data objects of the dataset restored from disk (dataset_temp); its
# design_par / perf_attributes are the ones handed to dataset_new.
# NOTE(review): presumably prints or displays an overview — confirm against the aixd API.
dataset_temp.summary_dataobjects()

Import data from dataframe

In [None]:
# Fill the new dataset with the merged dataframe, NEW_SAMPLES_PER_FILE samples per file.
# NOTE(review): flag_fromscratch=True presumably discards data already stored for this
# dataset before importing — confirm against the aixd API.
dataset_new.import_data_from_df(df_all, samples_perfile=NEW_SAMPLES_PER_FILE, flag_fromscratch=True)