In [1]:
import os

import tqdm
import numpy as np
import tensorflow as tf
import tensorflow_gnn as tfgnn
import tensorflow_ranking as tfr

from tpu_graphs.baselines.layout.data import get_npz_split, get_npz_dataset
from tpu_graphs.baselines.layout.models import ResModel
from gfos.data.utils import load_layout
from gfos.metrics import kendall


In [4]:
# Directory holding the layout "xla/default" npz collection. The location is
# machine-specific, so it can be overridden through the GFOS_LAYOUT_DATA_DIR
# environment variable; the original local path remains the default.
data_root_dir = os.environ.get(
    "GFOS_LAYOUT_DATA_DIR",
    r"H:\data\gfos\predict-ai-model-runtime\npz_all\npz\layout\xla\default",
)
# Number of configurations requested per graph from
# `get_graph_tensors_dataset` and passed to the warm-up `model.forward` call.
num_configs = 16
# Passed to `get_npz_dataset` as `max_train_configs`.
max_configs = 1000
# Number of graphs batched together in the validation pipeline.
batch_size = 8

In [5]:
# Load the npz dataset rooted at `data_root_dir`. The returned object exposes
# a `.validation` split, which is the only split used in this notebook.
# NOTE(review): `min_train_configs=-1` presumably disables the lower bound on
# configs per training graph, and `cache_dir=None` presumably disables on-disk
# caching -- confirm against `get_npz_dataset`.
partition = get_npz_dataset(
    data_root_dir,
    min_train_configs=-1,
    max_train_configs=max_configs,
    cache_dir=None,
)

100%|██████████| 61/61 [00:30<00:00,  2.00it/s]
100%|██████████| 7/7 [00:03<00:00,  1.94it/s]
100%|██████████| 8/8 [00:00<00:00, 10.95it/s]


In [6]:
def _graph_and_label(graph: tfgnn.GraphTensor):
    """Pair a graph with its scaled runtime labels.

    The "runtimes" feature of the "g" node set is cast to float32 and divided
    by 1e7. Only the ranking of the runtimes is required, so dividing by a
    large constant is harmless (the raw runtimes are in the 100K range).

    Args:
        graph: graph tensor carrying a "runtimes" feature on node set "g".

    Returns:
        A `(graph, label)` tuple: the unchanged input graph and the scaled
        float32 runtimes.
    """
    raw_runtimes = graph.node_sets["g"]["runtimes"]
    scaled_runtimes = tf.cast(raw_runtimes, tf.float32) / 1e7
    return graph, scaled_runtimes

# Validation input pipeline, built one stage at a time:
#   1. graph tensors with `num_configs` configurations each,
#   2. grouped into batches of `batch_size` graphs (the last, smaller batch is kept),
#   3. each batch merged into a single graph tensor with one component per graph,
#   4. paired with its scaled runtime labels.
valid_ds = partition.validation.get_graph_tensors_dataset(num_configs)
valid_ds = valid_ds.batch(batch_size, drop_remainder=False)
valid_ds = valid_ds.map(tfgnn.GraphTensor.merge_batch_to_components)
valid_ds = valid_ds.map(_graph_and_label)

In [2]:
import gzip
import json


# Load the gzip-compressed JSON record of a training run. The context manager
# guarantees the file handle is closed once parsing finishes (the previous
# bare `gzip.open(...)` call left it open).
with gzip.open(
    "../../src/tpu_graphs/output/run_aaa7c9876d5bde19db56594f7334657c.jsonz"
) as run_file:
    args = json.load(run_file)

# Display the recorded training curve (epochs, losses and OPA values).
args["train_curve"]

{'epoch': [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17],
 'train_loss': [30.041467666625977,
  29.264562606811523,
  28.696121215820312,
  28.339834213256836,
  28.21565055847168,
  27.840320587158203,
  27.37835121154785,
  26.948945999145508,
  26.4381103515625,
  25.360946655273438,
  24.65704345703125,
  23.751651763916016,
  23.117752075195312,
  22.623207092285156,
  22.0583438873291,
  21.676252365112305,
  22.130577087402344,
  21.463748931884766],
 'train_opa': [0.602922797203064,
  0.6148240566253662,
  0.6387278437614441,
  0.6591513156890869,
  0.6567494869232178,
  0.673766016960144,
  0.6938405632972717,
  0.7094371318817139,
  0.7420185804367065,
  0.7807303667068481,
  0.7961590886116028,
  0.8080283403396606,
  0.813093900680542,
  0.8177865743637085,
  0.8253482580184937,
  0.83106529712677,
  0.825078547000885,
  0.8349705934524536],
 'val_loss': [30.23636817932129,
  28.91695785522461,
  29.183935165405273,
  29.106632232666016,
  28.099651336669922

In [7]:
# NOTE(review): the positional arguments 16 and 119 are undocumented here --
# confirm their meaning against the `ResModel` constructor.
model = ResModel(16, 119)

# Pull exactly one example graph; the single-element unpacking fails loudly if
# the dataset does not yield exactly one item after `take(1)`.
(sample_graph,) = partition.validation.get_graph_tensors_dataset(
    num_configs
).take(1)

# Run one forward pass on the example graph.
# NOTE(review): presumably this creates the model's variables so the next
# cell can assign checkpoint weights to them -- confirm.
model.forward(sample_graph, num_configs, backprop=False)

# The toy example is no longer needed.
del sample_graph


In [8]:
# Load a saved Keras model and copy its variables into `model`.
# Exactly one checkpoint should be active at a time; the commented-out blocks
# below are alternative checkpoints kept for reference.

# xla random
# loaded = tf.keras.models.load_model(
#     "../../src/tpu_graphs/output/model_801628c441d4b633a0fe36b72248f8e5/",
#     custom_objects={"opa_metric": tfr.keras.metrics.OPAMetric},
# )

# xla default
# `custom_objects` maps the serialized name "opa_metric" to the ranking metric
# class so that deserialization can resolve it.
loaded = tf.keras.models.load_model(
    "../../src/tpu_graphs/output/model_5260f25ba9d0eae9c5e563a16848fd08",
    custom_objects={"opa_metric": tfr.keras.metrics.OPAMetric},
)

# model.forward(subconfigs_graph, num_configs=100, backprop=False)
# # nlp default
# loaded = tf.keras.models.load_model(
#     "../../src/tpu_graphs/output/model_aaa7c9876d5bde19db56594f7334657c",
#     custom_objects={"opa_metric": tfr.keras.metrics.OPAMetric},
# )


# Copy the loaded values into `model`, pairing variables by position.
# NOTE(review): this assumes `model.variables` and `loaded.variables` list the
# same variables in the same order. `zip` stops silently at the shorter list,
# so a length mismatch would leave some variables un-restored -- confirm.
for v, lv in zip(model.variables, loaded.variables):
    v.assign(lv.value())

In [9]:
# Maximum number of configurations scored in a single `model.forward` call.
_INFERENCE_CONFIGS_BATCH_SIZE = 100

# Collected (graph_id, "i;j;k;...") pairs, one per validation graph.
test_rankings = []

assert partition.validation.graph_id is not None

for graph in tqdm.tqdm(
    partition.validation.iter_graph_tensors(),
    total=partition.validation.graph_id.shape[-1],
    desc="Inference",
):
    # Loop-invariant for this graph, so look it up once.
    node_set_g = graph.node_sets["g"]
    # Use a dedicated name: assigning to `num_configs` here would overwrite
    # the notebook-level setting that the dataset and model warm-up cells read,
    # changing their behaviour if they are re-run after this cell.
    graph_num_configs = node_set_g["runtimes"].shape[-1]
    all_scores = []

    for i in range(0, graph_num_configs, _INFERENCE_CONFIGS_BATCH_SIZE):
        end_i = min(i + _INFERENCE_CONFIGS_BATCH_SIZE, graph_num_configs)
        # Take a cut of the configs: same ops and edges, but only the
        # configuration features and runtimes in columns [i, end_i).
        subconfigs_graph = tfgnn.GraphTensor.from_pieces(
            edge_sets=graph.edge_sets,
            node_sets={
                "op": graph.node_sets["op"],
                "nconfig": tfgnn.NodeSet.from_fields(
                    sizes=graph.node_sets["nconfig"].sizes,
                    features={
                        "feats": graph.node_sets["nconfig"]["feats"][
                            :, i:end_i
                        ],
                    },
                ),
                "g": tfgnn.NodeSet.from_fields(
                    sizes=tf.constant([1]),
                    features={
                        "graph_id": node_set_g["graph_id"],
                        "runtimes": node_set_g["runtimes"][:, i:end_i],
                        "kept_node_ratio": node_set_g["kept_node_ratio"],
                    },
                ),
            },
        )
        h = model.forward(
            subconfigs_graph, num_configs=end_i - i, backprop=False
        )
        # NOTE(review): `h[0]` is presumably the score vector for the single
        # graph in this cut -- confirm against `ResModel.forward`.
        all_scores.append(h[0])

    # Stitch the per-cut scores back together in configuration order.
    all_scores = tf.concat(all_scores, axis=0)
    graph_id = node_set_g["graph_id"][0].numpy().decode()
    # `tf.argsort` sorts ascending by default, so the string lists config
    # indices from the lowest predicted score to the highest.
    sorted_indices = (
        tf.strings.join(tf.strings.as_string(tf.argsort(all_scores)), ";")
        .numpy()
        .decode()
    )
    test_rankings.append((graph_id, sorted_indices))

Inference: 100%|██████████| 7/7 [23:47<00:00, 203.99s/it]


In [14]:
# Root directory of the layout npz files (machine-specific absolute path).
LAYOUT_DIR = r"H:\data\gfos\predict-ai-model-runtime\npz_all\npz\layout"
# Index the layout files for the xla/default collection. The "valid" entry is
# iterated later as a sequence of npz file paths.
layouts = load_layout(LAYOUT_DIR, model_type="xla", compile_type="default")


In [11]:
# Instantiate the OPA ranking metric under a CPU device scope. It is only
# referenced by the commented-out OPA computation in the scoring cell.
with tf.device("/cpu"):
    opa_metric = tfr.keras.metrics.OPAMetric()

In [22]:
from gfos.metrics import topk_error

# graph_id -> "i;j;k;..." ranking string produced by the inference cell.
pred_dict = dict(test_rankings)
scores, opas = [], []

for file in layouts["valid"]:
    # Drop the 4-character extension to recover the graph id used as key.
    filename = os.path.basename(file)[:-4]
    pred = list(map(int, pred_dict[filename].split(";")))

    # Ground-truth ordering: config indices sorted by measured runtime.
    gt = np.argsort(np.load(file)["config_runtime"])
    gt_trimmed = gt[:len(pred)]

    scores.append(topk_error(np.array(pred), gt_trimmed, top_k=100, index=True))
    # OPA evaluation is currently disabled, so `opas` stays empty:
    # opas.append(opa_metric(pred[None], gt[:len(pred)][None]))


In [None]:
# Show the per-file `topk_error` values collected in the scoring cell.
scores

In [23]:
# Average `topk_error` across all validation files.
np.mean(scores)

0.9857142857142858