In [1]:
import gzip
import json
import os

import numpy as np
import tensorflow as tf
import tensorflow_gnn as tfgnn
import tensorflow_ranking as tfr
import tqdm

from gfos.data.utils import load_layout
from gfos.metrics import metric_for_layout_collections
from tpu_graphs.baselines.layout.data import get_npz_split, get_npz_dataset
from tpu_graphs.baselines.layout.models import ResModel


In [65]:
# Directory holding the validation split of the NLP / default layout
# collection. It can be overridden with the GFOS_VALID_DIR environment
# variable so the notebook is not tied to one machine; the fallback is the
# original local path.
data_root_dir = os.environ.get(
    "GFOS_VALID_DIR",
    r"H:\data\gfos\predict-ai-model-runtime\npz_all\npz\layout\nlp\default\valid",
)
# Configs per graph: passed as `min_configs` when loading the split and to
# `get_graph_tensors_dataset` when building the pipeline.
num_configs = 16
# NOTE(review): the loading cell passes the literal `max_configs=2000` rather
# than this value, so this constant is unused by the visible cells — confirm
# which one is intended.
max_configs = 1000
# Number of graphs per batch in the validation pipeline.
batch_size = 8

In [3]:
# Show the GPUs visible to TensorFlow (an empty list means none were found).
tf.config.list_physical_devices('GPU')

[PhysicalDevice(name='/physical_device:GPU:0', device_type='GPU')]

In [88]:
# Load the validation split from `data_root_dir` without an on-disk cache.
# NOTE(review): `max_configs` is the literal 2000 here, while the config cell
# defines `max_configs = 1000` and nothing else reads it — confirm which value
# is intended and use the constant.
valid_dataset = get_npz_split(
    data_root_dir,
    min_configs=num_configs,
    max_configs=2000,
    cache_dir=None,
)

100%|██████████| 20/20 [00:14<00:00,  1.41it/s]


In [89]:
def _graph_and_label(graph: tfgnn.GraphTensor):
    """Pair a graph with the label derived from its runtimes.

    The label is the "runtimes" feature of node set "g", cast to float32 and
    divided by 1e7.

    Returns:
        A `(graph, label)` tuple; `graph` is passed through unchanged.
    """
    # Only the ranking is required, so the runtimes (in the 100K range) are
    # simply divided by a large number.
    runtimes = graph.node_sets["g"]["runtimes"]
    scaled_runtimes = tf.cast(runtimes, tf.float32) / 1e7
    return graph, scaled_runtimes

# Build the validation pipeline one stage at a time:
# per-graph tensors -> batches -> one merged GraphTensor per batch -> (graph, label).
valid_ds = valid_dataset.get_graph_tensors_dataset(num_configs)
valid_ds = valid_ds.batch(batch_size, drop_remainder=False)
valid_ds = valid_ds.map(tfgnn.GraphTensor.merge_batch_to_components)
valid_ds = valid_ds.map(_graph_and_label)

In [82]:
# Peek at the first batch only, to inspect the structure of the merged graph.
for graph, label in valid_ds.take(1):
    print(graph)

GraphTensor(
  context=Context(features={}, sizes=[1 1 1 1 1 1 1 1], shape=(), indices_dtype=tf.int32),
  node_set_names=['op', 'nconfig', 'g'],
  edge_set_names=['config', 'feed', 'g_op', 'g_config', 'sampled_config', 'sampled_feed'])


In [92]:
# Checkpoints from earlier training runs, all under ../../src/tpu_graphs/output/:
#   xla random  -> model_801628c441d4b633a0fe36b72248f8e5/
#   xla default -> model_5260f25ba9d0eae9c5e563a16848fd08
#   nlp default -> model_aaa7c9876d5bde19db56594f7334657c   (used below)
# The alternatives are loaded with the same `load_model` call; only the path
# changes.

# Arguments recorded for the nlp/default run. The `with` block makes sure the
# gzip handle is closed once the JSON has been parsed.
with gzip.open(
    "../../src/tpu_graphs/output/run_aaa7c9876d5bde19db56594f7334657c.jsonz"
) as run_file:
    args = json.load(run_file)["args"]

# NOTE(review): the constructor arguments are hard-coded instead of being
# derived from `args`. If they differ from the ones used for training, the
# variable shapes of `model` will not match those of `loaded` — confirm
# against `args`.
model = ResModel(16, 101)
# Example graph to invoke `model.forward`.
(sample_graph,) = valid_dataset.get_graph_tensors_dataset(num_configs).take(1)
model.forward(sample_graph, num_configs)
del sample_graph  # No longer need a toy example.

# Restore the trained nlp/default model; the custom metric has to be supplied
# for deserialization.
loaded = tf.keras.models.load_model(
    "../../src/tpu_graphs/output/model_aaa7c9876d5bde19db56594f7334657c",
    custom_objects={"opa_metric": tfr.keras.metrics.OPAMetric},
)

In [94]:
# Copy the trained weights from the restored checkpoint (`loaded`) into the
# freshly built `model`, pairing trainable variables by position.
target_vars = model.trainable_variables
source_vars = loaded.trainable_variables
assert len(target_vars) == len(source_vars), (
    f"variable count mismatch: model has {len(target_vars)}, "
    f"checkpoint has {len(source_vars)}"
)

# Validate every pair before assigning anything, so that a mismatch cannot
# leave `model` with only some of its variables overwritten. All mismatches
# are reported at once, with the variable name, to make diagnosis easier.
shape_mismatches = [
    f"[{idx}] {sv.name}: {sv.shape} != {tv.shape}"
    for idx, (tv, sv) in enumerate(zip(target_vars, source_vars))
    if sv.shape != tv.shape
]
assert not shape_mismatches, "shape mismatch:\n" + "\n".join(shape_mismatches)

for tv, sv in zip(target_vars, source_vars):
    tv.assign(sv)

AssertionError: (80, 32) != (190, 32)

In [75]:
# Maximum number of configs scored in one forward pass.
_INFERENCE_CONFIGS_BATCH_SIZE = 100


def _slice_graph_configs(graph, begin, end):
    """Return a GraphTensor holding only configs `[begin, end)` of `graph`.

    The "op" node set and all edge sets are reused as they are. The "feats"
    feature of node set "nconfig" and the "runtimes" feature of node set "g"
    are sliced along their second axis; the remaining "g" features are kept.
    """
    node_set_g = graph.node_sets["g"]
    node_set_nconfig = graph.node_sets["nconfig"]
    return tfgnn.GraphTensor.from_pieces(
        edge_sets=graph.edge_sets,
        node_sets={
            "op": graph.node_sets["op"],
            "nconfig": tfgnn.NodeSet.from_fields(
                sizes=node_set_nconfig.sizes,
                features={
                    "feats": node_set_nconfig["feats"][:, begin:end],
                },
            ),
            "g": tfgnn.NodeSet.from_fields(
                sizes=tf.constant([1]),
                features={
                    "graph_id": node_set_g["graph_id"],
                    "runtimes": node_set_g["runtimes"][:, begin:end],
                    "kept_node_ratio": node_set_g["kept_node_ratio"],
                },
            ),
        },
    )


# List of (graph_id, ";"-joined config indices sorted by ascending score).
test_rankings = []

assert valid_dataset.graph_id is not None

for graph in tqdm.tqdm(
    valid_dataset.iter_graph_tensors(),
    total=valid_dataset.graph_id.shape[-1],
    desc="Inference",
):
    # Deliberately NOT called `num_configs`: that name is the notebook-level
    # setting, and rebinding it here would change the behaviour of the earlier
    # cells whenever they are re-run after this one.
    graph_num_configs = graph.node_sets["g"]["runtimes"].shape[-1]
    all_scores = []

    # Score the configs in chunks of at most _INFERENCE_CONFIGS_BATCH_SIZE.
    for i in range(0, graph_num_configs, _INFERENCE_CONFIGS_BATCH_SIZE):
        end_i = min(i + _INFERENCE_CONFIGS_BATCH_SIZE, graph_num_configs)
        # Take a cut of the configs.
        subconfigs_graph = _slice_graph_configs(graph, i, end_i)
        h = model.forward(
            subconfigs_graph, num_configs=end_i - i, backprop=False
        )
        all_scores.append(h[0])

    all_scores = tf.concat(all_scores, axis=0)
    graph_id = graph.node_sets["g"]["graph_id"][0].numpy().decode()
    # argsort is ascending, so the indices run from lowest to highest score.
    sorted_indices = (
        tf.strings.join(tf.strings.as_string(tf.argsort(all_scores)), ";")
        .numpy()
        .decode()
    )
    test_rankings.append((graph_id, sorted_indices))

Inference:   0%|          | 0/20 [00:00<?, ?it/s]


AttributeError: 'ResModel' object has no attribute 'forward'

In [None]:
# Root of the layout collection. It can be overridden with the GFOS_LAYOUT_DIR
# environment variable so the notebook is not tied to one machine; the
# fallback is the original local path.
LAYOUT_DIR = os.environ.get(
    "GFOS_LAYOUT_DIR",
    r"H:\data\gfos\predict-ai-model-runtime\npz_all\npz\layout",
)
# Layout files for the nlp/default collection; the scoring cell reads the
# "valid" entry.
layouts = load_layout(LAYOUT_DIR, model_type="nlp", compile_type="default")


In [None]:
# Ordered Pair Accuracy metric from TensorFlow Ranking. Being a Keras metric
# object, it keeps running state across calls until that state is reset.
opa_metric = tfr.keras.metrics.OPAMetric()

In [None]:
# Score the predicted rankings against the measured runtimes, one validation
# layout file at a time.
pred_dict = dict(test_rankings)  # graph id -> ";"-joined config indices
scores = []
opas = []


for file in layouts["valid"]:
    # Predictions are keyed by the file name without its extension.
    filename = os.path.splitext(os.path.basename(file))[0]
    pred = [int(i) for i in pred_dict[filename].split(";")]

    # np.load on an .npz archive returns an NpzFile that keeps the file open;
    # the `with` block closes it once the runtimes have been read into memory.
    with np.load(file) as layout_npz:
        runtime = layout_npz["config_runtime"]
    gt = np.argsort(runtime)

    score = metric_for_layout_collections(pred, gt[:len(pred)])

    # A Keras metric accumulates over every call, so without a reset each
    # value appended to `opas` would be a running mean over all files seen so
    # far rather than the value for this file.
    opa_metric.reset_state()
    # NOTE(review): OPAMetric is called as (y_true, y_pred) with labels and
    # scores, but it receives two index permutations here, with the prediction
    # in the first position; `gt` is also not truncated to len(pred) as it is
    # for `score`. Confirm the intended inputs before trusting `opas`.
    opa = opa_metric(pred, gt)
    scores.append(score)
    opas.append(opa)


In [None]:
# Debug view: ground-truth ordering (argsort of the runtimes) left over from
# the last file processed by the scoring loop.
gt

In [None]:
# Per-file values returned by metric_for_layout_collections.
scores

In [None]:
# Mean of the per-file scores over the validation files.
np.mean(scores)