In [1]:
import math
import os
import random
import sys
from itertools import combinations_with_replacement

import numpy as np
import pandas as pd
from numba import jit
from tqdm.notebook import tqdm

sys.path.append(os.path.join(sys.path[0], '../..'))

from data.kcost_dataset import KCostDataSetSplit
from data.io import Writer, Reader
import lsm.cost as CostFunc
from lsm.lsmtype import LSMTree, LSMSystem, Policy

In [2]:
def wl_to_array(wl_dict):
    """Flatten a workload mapping into a fixed-order tuple.

    Args:
        wl_dict: Mapping that provides the keys 'id', 'z0', 'z1', 'q' and 'w'.
            Any additional keys are ignored.

    Returns:
        The 5-tuple ``(id, z0, z1, q, w)`` read from ``wl_dict``.

    Raises:
        KeyError: If one of the five keys is missing.
    """
    field_order = ('id', 'z0', 'z1', 'q', 'w')
    return tuple(wl_dict[field] for field in field_order)

# Load config/endure.toml, located two directories above the first sys.path
# entry, then build the project's Reader/Writer helpers from that config.
config = Reader.read_config(
    os.path.join(sys.path[0], '../..', 'config', 'endure.toml')
)
reader, writer = Reader(config), Writer(config)
# Bare expression so the notebook displays the loaded configuration.
config

{'project': {'name': 'ENDURE',
  'log_level': 'INFO',
  'data_dir': '/Users/ndhuynh/sandbox/data',
  'experiments': ['CostSurfaceExp']},
 'system': {'B': 4, 'phi': 1, 's': 0.0, 'E': 1024, 'H': 10, 'N': 100000000},
 'exp_config': {'Exp01': {'wl_ids': [0,
    1,
    2,
    3,
    4,
    5,
    6,
    7,
    8,
    9,
    10,
    11,
    12,
    13,
    14]}},
 'inputs': {'workloads': [{'id': 0,
    'z0': 0.25,
    'z1': 0.25,
    'q': 0.25,
    'w': 0.25},
   {'id': 1, 'z0': 0.97, 'z1': 0.01, 'q': 0.01, 'w': 0.01},
   {'id': 2, 'z0': 0.01, 'z1': 0.97, 'q': 0.01, 'w': 0.01},
   {'id': 3, 'z0': 0.01, 'z1': 0.01, 'q': 0.97, 'w': 0.01},
   {'id': 4, 'z0': 0.01, 'z1': 0.01, 'q': 0.01, 'w': 0.97},
   {'id': 5, 'z0': 0.49, 'z1': 0.49, 'q': 0.01, 'w': 0.01},
   {'id': 6, 'z0': 0.49, 'z1': 0.01, 'q': 0.49, 'w': 0.01},
   {'id': 7, 'z0': 0.49, 'z1': 0.01, 'q': 0.01, 'w': 0.49},
   {'id': 8, 'z0': 0.01, 'z1': 0.49, 'q': 0.49, 'w': 0.01},
   {'id': 9, 'z0': 0.01, 'z1': 0.49, 'q': 0.01, 'w': 0.49},
 

In [3]:
def create_k_levels(levels: int, max_T: int):
    """Enumerate every non-increasing tuple of ``levels`` values from max_T..1.

    Args:
        levels: Length of each generated tuple.
        max_T: Largest value allowed in a tuple; values range over
            ``max_T, max_T - 1, ..., 1``.

    Returns:
        A list of tuples in the order produced by
        ``itertools.combinations_with_replacement`` over the descending range.

    Note:
        The whole list is materialized, and its length grows combinatorially
        with both arguments.
    """
    descending_values = range(max_T, 0, -1)
    return [*combinations_with_replacement(descending_values, levels)]

In [4]:
# Exclusive upper bound for the randomly drawn T values (passed as `high`
# to np.random.randint in gen_file).
TMAX = 50
# Fixed number of K_<i> columns; shorter level assignments are zero-padded
# up to this length.
MAX_LEVELS = 16
# Number of random (workload, h, T) samples drawn per generated file.
SAMPLES = 1_000_000

def _sample_k_levels(levels, max_T, count):
    """Draw up to ``count`` distinct level assignments uniformly at random.

    A level assignment is a non-increasing tuple of ``levels`` integers taken
    from ``max_T, ..., 1`` -- the same objects that
    ``combinations_with_replacement(range(max_T, 0, -1), levels)`` yields.
    They are sampled directly instead of being enumerated, because the number
    of such tuples grows combinatorially and cannot be materialized for the
    larger (levels, max_T) pairs.

    Args:
        levels: Length of each tuple (must be >= 0).
        max_T: Largest value allowed in a tuple.
        count: Maximum number of distinct tuples to return.

    Returns:
        A list of ``min(count, number of possible tuples)`` distinct tuples,
        in the order they were drawn.

    Raises:
        ValueError: If ``levels`` is negative.
    """
    levels, max_T = int(levels), int(max_T)
    if levels < 0:
        raise ValueError('levels must be non-negative')
    if count <= 0 or (levels > 0 and max_T < 1):
        return []
    if levels == 0:
        # The only assignment of length zero is the empty tuple.
        return [()]

    # Stars and bars: choosing `levels` items with repetition from `max_T`
    # values is a bijection with choosing a `levels`-subset of `pool` slots,
    # so a uniform subset maps to a uniform multiset.
    pool = max_T + levels - 1
    target = min(count, math.comb(pool, levels))
    chosen = {}  # dict keeps draw order and rejects duplicates
    while len(chosen) < target:
        slots = sorted(random.sample(range(pool), levels))
        # slot - rank is a non-decreasing index in [0, max_T - 1]; subtracting
        # it from max_T gives the non-increasing values max_T..1.
        combo = tuple(max_T - (slot - rank) for rank, slot in enumerate(slots))
        chosen[combo] = None
    return list(chosen)


def gen_file(file_id, assignments_per_sample=10):
    """Generate one shard of random cost samples and write it as feather.

    Draws ``SAMPLES`` random workloads (four fractions normalized to sum to
    one, rounded to 3 decimals), ``h`` values in [0, 9.5] rounded to 2
    decimals, and integer ``T`` values in [2, TMAX). For each sample, up to
    ``assignments_per_sample`` distinct random level assignments are costed
    with ``EndureKHybridCost.calc_cost`` and one row per assignment is stored.

    Args:
        file_id: Identifier used in the output name ``train_<file_id>.feather``.
        assignments_per_sample: Maximum number of level assignments evaluated
            for each (workload, h, T) sample. Defaults to 10.

    Returns:
        None. The resulting table is written to ``train_<file_id>.feather``
        in the current working directory.
    """
    system = config['system']
    cf = CostFunc.EndureKHybridCost(**system)
    wls = np.random.rand(SAMPLES, 4)
    wls = np.around(wls / wls.sum(axis=1).reshape(SAMPLES, 1), 3)
    hs = np.around(9.5 * np.random.rand(SAMPLES), 2)
    Ts = np.random.randint(low=2, high=TMAX, size=SAMPLES)

    # Loop-invariant pieces of every row, built once.
    system_cols = {key: system[key] for key in ('B', 'phi', 's', 'E', 'H', 'N')}
    k_names = [f'K_{level_idx}' for level_idx in range(MAX_LEVELS)]

    rows = []
    fname = f'train_{file_id}.feather'
    for wl, h, T in tqdm(zip(wls, hs, Ts), total=SAMPLES, desc=fname):
        z0, z1, q, w = wl
        # NOTE(review): cf.L presumably returns the number of levels for
        # (h, T); the meaning of the third argument is not visible here.
        levels = int(cf.L(h, T, True))
        for level_assign in _sample_k_levels(levels, T - 1, assignments_per_sample):
            # Zero-pad so every row has exactly MAX_LEVELS K_<i> columns.
            k = np.pad(level_assign, (0, MAX_LEVELS - len(level_assign)))
            k_cost = cf.calc_cost(h, T, k, z0, z1, q, w)
            row = {
                'h': h,
                'T': T,
                'z0': z0,
                'z1': z1,
                'q': q,
                'w': w,
                **system_cols,
                'k_cost': k_cost,
            }
            row.update(zip(k_names, k))
            rows.append(row)

    pd.DataFrame(rows).to_feather(fname)

In [None]:
# Generate NUM_FILES shards by calling gen_file once per index 0..NUM_FILES-1;
# each call writes its own train_<idx>.feather file.
NUM_FILES = 20
for idx in range(NUM_FILES):
    gen_file(idx)

train_0.feather:   0%|          | 0/1000000 [00:00<?, ?it/s]