In [10]:
import warnings
from pathlib import Path

import numpy as np
from tqdm import tqdm
from sklearn.model_selection import KFold

from gfos.data.utils import load_layout
from gfos.data.dataset import sample_configs
from gfos.utils.misc import seed_everything

# --- Run configuration -------------------------------------------------------
# Seed handed to seed_everything() below.
SEED = 42
# Root of the layout .npz collection.
# NOTE(review): machine-specific absolute path; switch to the relative
# alternative below (or your own location) when running elsewhere.
LAYOUT_DIR = r"H:\data\gfos\predict-ai-model-runtime\npz_all\npz\layout"
# LAYOUT_DIR = "../../input/npz_all/npz/layout/"
# Passed to load_layout() as model_type / compile_type respectively.
SOURCE = "xla"
SEARCH = "default"
# Upper bound on the number of config indices kept per training file
# (values <= 0 mean "keep every config").
MAX_CONFIGS = 10240
# Number of cross-validation folds.
NUM_FOLDS = 10
# Directory that receives the generated index files.
OUTPUT_ROOT = "../../data/indices_combine"


data_root = Path(OUTPUT_ROOT)
# parents=True so a fresh checkout without the intermediate "data" directory
# does not fail with FileNotFoundError.
data_root.mkdir(exist_ok=True, parents=True)

seed_everything(SEED)
warnings.filterwarnings("ignore")

In [20]:
# Folder <LAYOUT_DIR>/<SOURCE>/<SEARCH>/combined/train, created on demand
# together with any missing parent directories.
out_dir = Path(LAYOUT_DIR).joinpath(SOURCE, SEARCH, "combined", "train")
out_dir.mkdir(parents=True, exist_ok=True)

In [11]:
# Load the layout file listings for the configured search space and for the
# "random" search space of the same source (same call order as before).
default_files, random_files = (
    load_layout(LAYOUT_DIR, model_type=SOURCE, compile_type=compile_type)
    for compile_type in (SEARCH, "random")
)


In [12]:
# Lookup table: file stem (model id) -> path of the "random" training file.
random_id2path = dict(
    (Path(random_file).stem, random_file)
    for random_file in random_files["train"]
)

In [21]:
# Merge each default-search training archive with the random-search archive
# of the same model id and write the result to out_dir/<model_id>.npz.
for file in tqdm(default_files["train"]):
    model_id = Path(file).stem
    # Raises KeyError when no random-search counterpart exists for this model.
    random_path = random_id2path[model_id]

    # Context managers close both archives once their arrays are in memory;
    # previously every iteration leaked two open NpzFile handles.
    with np.load(file) as default_npz, np.load(random_path) as random_data:
        # Materialise every array of the default archive; only the two
        # config-dependent entries are extended below, the rest is copied as is.
        default_data = dict(default_npz)

        # Append the random-search entries after the default ones (axis 0).
        default_data["node_config_feat"] = np.concatenate(
            [default_data["node_config_feat"], random_data["node_config_feat"]],
            axis=0,
        )
        default_data["config_runtime"] = np.concatenate(
            [default_data["config_runtime"], random_data["config_runtime"]]
        )

    np.savez_compressed(out_dir / f"{model_id}.npz", **default_data)

 16%|█▋        | 10/61 [02:14<07:09,  8.43s/it]

In [None]:
def _select_train_indices(config_runtime, max_configs):
    """Pick the config indices of one file that are kept for training.

    The selection is made of the fastest third and the slowest third of the
    budget (by runtime) plus a random draw, without repetition, from the
    configs in between.

    Args:
        config_runtime: 1-D array of runtimes, one entry per config.
        max_configs: Maximum number of indices to keep; values <= 0 keep all.

    Returns:
        Integer index array into ``config_runtime``. When the budget covers
        every config, all indices are returned in ascending-runtime order.
    """
    n_total = len(config_runtime)
    n_keep = min(max_configs, n_total) if max_configs > 0 else n_total
    order = np.argsort(config_runtime)

    if n_keep >= n_total:
        # Nothing to sub-sample (also covers empty and very small files).
        return order

    # Derive the third from the effective budget; here n_total > n_keep, so
    # the middle pool always holds more candidates than are requested.
    third = n_keep // 3
    n_middle = n_keep - 2 * third
    # Explicit end offsets instead of negative slicing: ``order[-0:]`` would
    # select the whole array when ``third`` is 0.
    middle_pool = order[third : n_total - third]

    return np.concatenate(
        [
            order[:third],  # Good configs.
            order[n_total - third :],  # Bad configs.
            # replace=False keeps the sampled indices unique.
            np.random.choice(middle_pool, n_middle, replace=False),
        ]
    )


# Files to split: the combined archives written to out_dir, one per default
# training file. Sorted so the fold assignment does not depend on listing order.
# NOTE(review): assumes the indices are meant for the combined archives (the
# output directory is "indices_combine") - confirm.
combined_files = sorted(
    str(out_dir / f"{Path(f).stem}.npz") for f in default_files["train"]
)

kfold = KFold(n_splits=NUM_FOLDS, shuffle=True, random_state=SEED)

for fold, (train_pos, valid_pos) in enumerate(kfold.split(combined_files)):
    train_files = [combined_files[i] for i in train_pos]
    valid_files = [combined_files[i] for i in valid_pos]

    fold_dir = data_root / f"{SOURCE}_{SEARCH}" / f"{fold}"
    train_dir = fold_dir / "train"
    train_dir.mkdir(exist_ok=True, parents=True)
    valid_dir = fold_dir / "valid"
    valid_dir.mkdir(exist_ok=True, parents=True)

    # Training files: keep a runtime-stratified subset of the configs.
    for file in train_files:
        model_id = Path(file).stem
        with np.load(file) as npz:
            config_runtime = npz["config_runtime"]

        keep_indices = _select_train_indices(config_runtime, MAX_CONFIGS)
        np.save(train_dir / f"{model_id}.npy", keep_indices)

    # Validation files: keep every config.
    for file in valid_files:
        model_id = Path(file).stem
        with np.load(file) as npz:
            all_indices = np.arange(len(npz["config_runtime"]))
        np.save(valid_dir / f"{model_id}.npy", all_indices)