Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Sweep across KV cache layouts #662

Open
wants to merge 77 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from 61 commits
Commits
Show all changes
77 commits
Select commit Hold shift + click to select a range
91f4398
layout
morgandu May 16, 2024
18ba546
Add flag to flatten
yeandy May 20, 2024
e78faee
Update config
yeandy May 20, 2024
6c04467
tmp
yeandy May 20, 2024
b839d63
Fix delimiter
yeandy May 20, 2024
baaeda7
Fix delimiter
yeandy May 20, 2024
f0588ee
Fix delimiter
yeandy May 20, 2024
d85990b
Fix
yeandy May 21, 2024
cbb90bf
Fix
yeandy May 21, 2024
929ad8d
Fix
yeandy May 21, 2024
f3cbd3e
Fix write
yeandy May 21, 2024
0a6de55
Fix
yeandy May 21, 2024
f907515
Fix
yeandy May 21, 2024
8333659
Fix
yeandy May 21, 2024
8c0707d
Fix
yeandy May 21, 2024
674ba29
Fix
yeandy May 21, 2024
95f846f
Fix
yeandy May 21, 2024
858f4e3
Catch OOM
yeandy May 21, 2024
1c20d61
Fix
yeandy May 22, 2024
a9b381a
Update tensorboard dir
yeandy May 22, 2024
ec1d76e
add prefill
yeandy May 22, 2024
63edcc2
Fix
yeandy May 22, 2024
268ee64
Fix
yeandy May 22, 2024
c1490f6
Fix
yeandy May 22, 2024
0fe1d51
Fix
yeandy May 22, 2024
554d014
Fix
yeandy May 22, 2024
6458a89
Fix
yeandy May 22, 2024
76bda5d
Fix
yeandy May 22, 2024
42bd412
Fix
yeandy May 22, 2024
df5c66e
Fix
yeandy May 23, 2024
c44da0c
Fix
yeandy May 23, 2024
d84f4c9
Fix
yeandy May 23, 2024
185563d
Add layout control
morgandu May 22, 2024
4291c4c
test
yeandy May 23, 2024
c638c63
test
yeandy May 23, 2024
128a691
test
yeandy May 23, 2024
8387cd7
test
yeandy May 23, 2024
b422007
test
yeandy May 24, 2024
f69f405
test
yeandy May 24, 2024
c37ad18
test
yeandy May 24, 2024
4b3be09
test
yeandy May 24, 2024
65c2289
test
yeandy May 24, 2024
d5637e4
merge
yeandy May 24, 2024
9c0d6a5
Fix
yeandy May 24, 2024
58405b1
Serialize config json
yeandy May 24, 2024
187cb3d
Add kv cache layout control and tests
morgandu May 30, 2024
958938c
Fix string concat
yeandy May 30, 2024
e37e495
Merge mor--kv-cache-layout
yeandy May 30, 2024
a7fb24b
Remove duplicates
yeandy May 30, 2024
cb2db18
Rename enable_profiler
yeandy May 30, 2024
7311225
json default val
yeandy May 30, 2024
000e935
Enable kv cache layout control
morgandu May 30, 2024
ceda588
Fix failed tests
yeandy May 30, 2024
69bfc92
Add instructions
yeandy May 30, 2024
ca107c5
merge
yeandy May 30, 2024
1d417e4
Address comments
yeandy May 31, 2024
0c9a6f3
Fix typing
yeandy May 31, 2024
296119b
debug
yeandy May 31, 2024
58fec1e
Debug
yeandy May 31, 2024
74a1e70
Debug
yeandy May 31, 2024
70dd34c
Fix
yeandy May 31, 2024
bd29b6e
Lint
yeandy May 31, 2024
65e12a4
Merge
yeandy Jun 3, 2024
746bbda
Remove redundant
yeandy Jun 3, 2024
3d0214e
Fix
yeandy Jun 4, 2024
0151bb0
Address comments
yeandy Jun 6, 2024
951200e
Fix
yeandy Jun 6, 2024
22cae3b
Fix typo
yeandy Jun 10, 2024
081891f
Remove unused attention
yeandy Jun 10, 2024
0be7296
Fix typo
yeandy Jun 10, 2024
b6fd256
Fix
yeandy Jun 10, 2024
220cfd2
Merge branch 'main' into mor--kv-cache-layout-reformat-output
yeandy Jun 11, 2024
4b9c8a3
Merge branch 'main' into mor--kv-cache-layout-reformat-output
yeandy Jun 17, 2024
7fd12c6
Merge branch 'main' into mor--kv-cache-layout-reformat-output
yeandy Jun 17, 2024
7c9ccae
Merge branch 'main' into mor--kv-cache-layout-reformat-output
yeandy Jun 18, 2024
5d798a6
change the sweeping to prefill and ar cache only (#712)
morgandu Jun 18, 2024
4f0a12c
Remove unused
yeandy Jun 18, 2024
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
5 changes: 5 additions & 0 deletions MaxText/common_types.py
Original file line number Diff line number Diff line change
Expand Up @@ -31,11 +31,16 @@
ScanIn = partitioning.ScanIn

AxisNames = tuple[str, ...]
AxisIdxes = tuple[int, ...]

BATCH = "activation_batch"
LENGTH = "activation_length"
HEAD = "activation_heads"
D_KV = "activation_kv"
CACHE_BATCH = "cache_batch"
CACHE_SEQUENCE = "cache_sequence"
CACHE_HEADS = "cache_heads"
CACHE_KV = "cache_kv"

MODEL_MODE_AUTOREGRESSIVE = "autoregressive"
MODEL_MODE_PREFILL = "prefill"
Expand Down
9 changes: 9 additions & 0 deletions MaxText/configs/base.yml
Original file line number Diff line number Diff line change
Expand Up @@ -282,6 +282,15 @@ inference_microbenchmark_prefill_lengths: "64,128,256,512,1024"
inference_microbenchmark_stages: "prefill,generate"
inference_microbenchmark_loop_iters: 10
inference_microbenchmark_log_file_path: ""
inference_metadata_file: "" # path to a json file

# KV Cache layout control
# Logical layout: 0,1,2,3 ; CACHE_BATCH, CACHE_SEQUENCE, CACHE_HEADS, CACHE_KV
# Default layout: 1,2,0,3 ; CACHE_SEQUENCE, CACHE_HEADS, CACHE_BATCH, CACHE_KV
prefill_key_axis_order: "1,2,0,3"
prefill_value_axis_order: "1,2,0,3"
ar_key_axis_order: "1,2,0,3"
ar_value_axis_order: "1,2,0,3"

# Checkpoint Structured logging
enable_checkpoint_cloud_logger: False
Expand Down
33 changes: 30 additions & 3 deletions MaxText/inference_microbenchmark.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,9 @@
import json
import sys

from collections.abc import MutableMapping
from typing import Any, Dict, Optional

from jetstream.engine import token_utils

import max_utils
Expand Down Expand Up @@ -170,10 +173,25 @@ def collate_results(config, results, model_size, cache_size, num_model_params, i
return results


def write_results(results, filename):
def flatten_dict(dictionary, prefix='', sep='_'):
  """Recursively collapse a nested mapping into a single-level dict.

  Nested keys are joined with ``sep``, e.g. ``{'a': {'b': 1}}`` becomes
  ``{'a_b': 1}``. Non-mapping values are copied through unchanged.

  Args:
    dictionary: the (possibly nested) mapping to flatten.
    prefix: key prefix carried down from enclosing levels; empty at top level.
    sep: separator placed between joined key components.

  Returns:
    A flat dict mapping joined keys to leaf values.
  """
  flat = {}
  for key, value in dictionary.items():
    compound_key = f"{prefix}{sep}{key}" if prefix else key
    if isinstance(value, MutableMapping):
      flat.update(flatten_dict(value, compound_key, sep=sep))
    else:
      flat[compound_key] = value
  return flat


def write_results(results, filename, flatten_microbenchmark_results=False):
  """Write the microbenchmark results to a json file.

  Args:
    results: dict of (possibly nested) microbenchmark results.
    filename: path of the json file to write; writing is skipped when empty.
    flatten_microbenchmark_results: when truthy, add a 'flattened_results'
      key holding a single-level copy of the results (see flatten_dict).
      Defaults to False so pre-existing two-argument callers keep working.

  Returns:
    The results dict, augmented with 'flattened_results' when requested.
  """
  if flatten_microbenchmark_results:
    results['flattened_results'] = flatten_dict(results)
  if filename != "":
    with open(filename, "w", encoding="utf-8") as f:
      json.dump(results, f, indent=2)
  return results


def print_results_for_analyze(results):
Expand Down Expand Up @@ -218,7 +236,7 @@ def summarize_prefill_result(engine, params, tokens, true_length):
}


def main(config):
def main(config, inference_metadata: Optional[Dict[str, Any]] = None):
engine = maxengine.MaxEngine(config)
params = engine.load_params()
prefill_lengths = [int(l) for l in config.inference_microbenchmark_prefill_lengths.split(",")]
Expand Down Expand Up @@ -277,8 +295,17 @@ def main(config):
config, engine, params, decode_state, engine.max_concurrent_decodes, cache_size, model_size, benchmark_loop_iters)

results = collate_results(config, benchmark_results, model_size, cache_size, num_model_params)
write_results(results, filename=config.inference_microbenchmark_log_file_path)
print_results_for_analyze(results)
if inference_metadata:
flatten_microbenchmark_results = pyconfig.string_to_bool(inference_metadata.get('flatten_microbenchmark_results', 'false'))
else:
flatten_microbenchmark_results = 'false'
results = write_results(
results,
filename=config.inference_microbenchmark_log_file_path,
flatten_microbenchmark_results=flatten_microbenchmark_results
)
return results


if __name__ == "__main__":
Expand Down
154 changes: 154 additions & 0 deletions MaxText/inference_microbenchmark_sweep.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,154 @@
"""
Copyright 2024 Google LLC

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

https://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
"""

"""Sweep across inference microbenchmarks."""

import os
import sys
import json
import jsonlines
import inference_microbenchmark
import max_utils
import pyconfig
from jax._src.lib import xla_extension


def main():
  """Sweep inference microbenchmarks across KV cache layout configurations.

  The config's `inference_metadata_file` must point to a json file
  containing the following keys:
    - key_value_axis_order_product_id_list: colon-delimited string of
      key_value_axis_order_product_id values
    - prefill_key_axis_order_list: colon-delimited string of
      prefill_key_axis_order values
    - prefill_value_axis_order_list: colon-delimited string of
      prefill_value_axis_order values
    - ar_key_axis_order_list: colon-delimited string of ar_key_axis_order values
    - ar_value_axis_order_list: colon-delimited string of ar_value_axis_order values
    - accelerator: name of the accelerator
    - flatten_microbenchmark_results: whether or not to flatten results;
      should be true so each run's metrics can be exported as flat key/value
      pairs below

  Writes one json line per swept configuration to
  inference_microbenchmark_sweep_results.jsonl in the working directory.
  """
  pyconfig.initialize(sys.argv)
  config = pyconfig.config

  with open(config.inference_metadata_file, encoding='utf-8') as json_file:
    inference_metadata = json.load(json_file)
    print(f"inference_metadata: {inference_metadata}")

  # The list fields are colon-delimited because each axis order is itself a
  # comma-separated string such as "1,2,0,3".
  key_value_axis_order_product_id_list = inference_metadata['key_value_axis_order_product_id_list'].split(':')
  prefill_key_axis_order_list = inference_metadata['prefill_key_axis_order_list'].split(':')
  prefill_value_axis_order_list = inference_metadata['prefill_value_axis_order_list'].split(':')
  ar_key_axis_order_list = inference_metadata['ar_key_axis_order_list'].split(':')
  ar_value_axis_order_list = inference_metadata['ar_value_axis_order_list'].split(':')

  results = []
  for (
      key_value_axis_order_product_id,
      prefill_key_axis_order,
      prefill_value_axis_order,
      ar_key_axis_order,
      ar_value_axis_order,
  ) in zip(
      key_value_axis_order_product_id_list,
      prefill_key_axis_order_list,
      prefill_value_axis_order_list,
      ar_key_axis_order_list,
      ar_value_axis_order_list,
  ):
    print(f"key_value_axis_order_product_id {key_value_axis_order_product_id}")
    print(f"prefill_key_axis_order {prefill_key_axis_order}")
    print(f"prefill_value_axis_order {prefill_value_axis_order}")
    print(f"ar_key_axis_order {ar_key_axis_order}")
    print(f"ar_value_axis_order {ar_value_axis_order}")

    # Manually update the config.
    # Don't set key_value_axis_order_product_id; otherwise it will recompute
    # ar_key_axis_order and ar_value_axis_order.
    quant = 'bf16' if not config.quantization else config.quantization
    run_name = (
        f"{inference_metadata['accelerator']}-{config.model_name}-"
        f"{quant}-{key_value_axis_order_product_id}-{prefill_key_axis_order}-"
        f"{ar_key_axis_order}"
    )
    tensorboard_dir = os.path.join(config.base_output_directory, run_name, "tensorboard", "")
    checkpoint_dir = os.path.join(config.base_output_directory, run_name, "checkpoint", "")
    metrics_dir = os.path.join(config.base_output_directory, run_name, "metrics", "")
    pyconfig._config.keys['prefill_key_axis_order'] = prefill_key_axis_order  # pylint: disable=protected-access
    pyconfig._config.keys['prefill_value_axis_order'] = prefill_value_axis_order  # pylint: disable=protected-access
    pyconfig._config.keys['ar_key_axis_order'] = ar_key_axis_order  # pylint: disable=protected-access
    pyconfig._config.keys['ar_value_axis_order'] = ar_value_axis_order  # pylint: disable=protected-access
    pyconfig._config.keys['tensorboard_dir'] = tensorboard_dir  # pylint: disable=protected-access
    # NOTE(review): checkpoint_dir and metrics_dir may not be consumed by the
    # benchmark itself; they are updated here only to keep the persisted
    # config consistent with run_name.
    pyconfig._config.keys['checkpoint_dir'] = checkpoint_dir  # pylint: disable=protected-access
    pyconfig._config.keys['metrics_dir'] = metrics_dir  # pylint: disable=protected-access
    pyconfig._config.keys['run_name'] = run_name  # pylint: disable=protected-access
    # write_config_raw_keys_for_gcs runs once during pyconfig.initialize, so
    # re-run it here to persist the axis orders updated for this iteration.
    max_utils.write_config_raw_keys_for_gcs(pyconfig._config.keys)  # pylint: disable=protected-access

    # Prepare metadata (dimensions) json for XLML.
    dimensions_json = {
        "base_output_directory": config.base_output_directory,
        "model_name": config.model_name,
        "tokenizer": config.tokenizer_path,
        "weight_dtype": config.weight_dtype,
        "inference_microbenchmark_prefill_lengths": f"{config.inference_microbenchmark_prefill_lengths}",
        "inference_microbenchmark_stages": config.inference_microbenchmark_stages,
        "inference_microbenchmark_loop_iters": f"{config.inference_microbenchmark_loop_iters}",
        "max_prefill_predict_length": f"{config.max_prefill_predict_length}",
        "max_target_length": f"{config.max_target_length}",
        "per_device_batch_size": f"{config.per_device_batch_size}",
        "ici_fsdp_parallelism": f"{config.ici_fsdp_parallelism}",
        "ici_autoregressive_parallelism": f"{config.ici_autoregressive_parallelism}",
        "ici_tensor_parallelism": f"{config.ici_tensor_parallelism}",
        "profiler": f"{config.profiler}",
        "scan_layers": f"{config.scan_layers}",
        "quantization": config.quantization,
        "quantize_kvcache": f"{config.quantize_kvcache}",
        "attention": config.attention,
        "key_value_axis_order_product_id": f"{key_value_axis_order_product_id}",
        "prefill_key_axis_order": f"{prefill_key_axis_order}",
        "prefill_value_axis_order": f"{prefill_value_axis_order}",
        "ar_key_axis_order": f"{ar_key_axis_order}",
        "ar_value_axis_order": f"{ar_value_axis_order}",
        # Non-serializable config values (e.g. mesh objects) are replaced
        # with a placeholder string rather than failing the dump.
        "config_json_string": json.dumps(
            pyconfig._config.keys,  # pylint: disable=protected-access
            default=lambda x: f"<<non-serializable: {type(x).__qualname__}>>"
        )
    }
    dimensions_json = {
        **dimensions_json,
        **inference_metadata,
    }
    try:
      microbenchmark_results = inference_microbenchmark.main(config, inference_metadata=inference_metadata)
      metrics = microbenchmark_results['flattened_results']
      metrics = {k.lower(): v for k, v in metrics.items()}
      dimensions_json['oom'] = 'False'
    except xla_extension.XlaRuntimeError:
      # Treat an XLA runtime failure as out-of-memory for this layout and
      # keep sweeping the remaining configurations.
      metrics = {}
      dimensions_json['oom'] = 'True'

    final = {'metrics': metrics, 'dimensions': dimensions_json}
    print(f"Result: {final}")
    results.append(final)

  print(f"All results {results}")
  path = 'inference_microbenchmark_sweep_results.jsonl'
  with jsonlines.open(path, mode="w") as writer:
    writer.write_all(results)


if __name__ == "__main__":
main()