Commit f044eb8

kaiyux and wangruohui authored

Update TensorRT-LLM (NVIDIA#302)

* Update TensorRT-LLM

---------

Co-authored-by: wangruohui <12756472+wangruohui@users.noreply.github.com>
1 parent 4de32a8 commit f044eb8

File tree: 203 files changed, +12477 -4175 lines changed


README.md

Lines changed: 9 additions & 10 deletions
@@ -8,7 +8,6 @@ TensorRT-LLM
 [![python](https://img.shields.io/badge/python-3.10.12-green)](https://www.python.org/downloads/release/python-31012/)
 [![cuda](https://img.shields.io/badge/cuda-12.2-green)](https://developer.nvidia.com/cuda-downloads)
 [![trt](https://img.shields.io/badge/TRT-9.1-green)](https://developer.nvidia.com/tensorrt)
-[![version](https://img.shields.io/badge/release-0.5.0-green)](./setup.py)
 [![license](https://img.shields.io/badge/license-Apache%202-blue)](./LICENSE)
 
 [Architecture](./docs/source/architecture.md)&nbsp;&nbsp;&nbsp;|&nbsp;&nbsp;&nbsp;[Results](./docs/source/performance.md)&nbsp;&nbsp;&nbsp;|&nbsp;&nbsp;&nbsp;[Examples](./examples/)&nbsp;&nbsp;&nbsp;|&nbsp;&nbsp;&nbsp;[Documentation](./docs/source/)
@@ -173,13 +172,13 @@ Lovelace architectures. Certain limitations may, however, apply.
 Various numerical precisions are supported in TensorRT-LLM. The support for
 some of those numerical features require specific architectures:
 
-| | FP32 | FP16 | BF16 | FP8 | INT8 | INT4 |
-| :--------------------------- | :---- | :---- | :---- | :--- | :--- | :--- |
-| Volta (SM70) | Y | Y | N | N | Y | Y |
-| Turing (SM75) | Y | Y | N | N | Y | Y |
-| Ampere (SM80, SM86) | Y | Y | Y | N | Y | Y |
-| Ada-Lovelace (SM89) | Y | Y | Y | Y | Y | Y |
-| Hopper (SM90) | Y | Y | Y | Y | Y | Y |
+|                     | FP32 | FP16 | BF16 | FP8  | INT8 | INT4 |
+| :------------------ | :--- | :--- | :--- | :--- | :--- | :--- |
+| Volta (SM70)        | Y    | Y    | N    | N    | Y    | Y    |
+| Turing (SM75)       | Y    | Y    | N    | N    | Y    | Y    |
+| Ampere (SM80, SM86) | Y    | Y    | Y    | N    | Y    | Y    |
+| Ada-Lovelace (SM89) | Y    | Y    | Y    | Y    | Y    | Y    |
+| Hopper (SM90)       | Y    | Y    | Y    | Y    | Y    | Y    |
 
 In this release of TensorRT-LLM, the support for FP8 and quantized data types
 (INT8 or INT4) is not implemented for all the models. See the
@@ -217,8 +216,7 @@ The list of supported models is:
 * [Bert](examples/bert)
 * [Blip2](examples/blip2)
 * [BLOOM](examples/bloom)
-* [ChatGLM-6B](examples/chatglm6b)
-* [ChatGLM2-6B](examples/chatglm2-6b/)
+* [ChatGLM](examples/chatglm), including ChatGLM-6B, ChatGLM2-6B, ChatGLM2-6B-32k, ChatGLM3-6B, ChatGLM3-6B-32k
 * [Falcon](examples/falcon)
 * [GPT](examples/gpt)
 * [GPT-J](examples/gptj)
@@ -230,6 +228,7 @@ The list of supported models is:
 * [OPT](examples/opt)
 * [SantaCoder](examples/gpt)
 * [StarCoder](examples/gpt)
+* [InternLM](examples/internlm)
 
 ## Performance
 

benchmarks/cpp/gptSessionBenchmark.cpp

Lines changed: 38 additions & 9 deletions
@@ -18,12 +18,12 @@
 #include "tensorrt_llm/plugins/api/tllmPlugin.h"
 #include "tensorrt_llm/runtime/gptJsonConfig.h"
 #include "tensorrt_llm/runtime/gptSession.h"
+#include "tensorrt_llm/runtime/memoryCounters.h"
 #include "tensorrt_llm/runtime/tllmLogger.h"
 
 #include <NvInfer.h>
 #include <chrono>
 #include <cxxopts.hpp>
-#include <iostream>
 #include <sstream>
 #include <string>
 
@@ -39,14 +39,22 @@ void benchmarkGptSession(std::string const& modelName, std::filesystem::path con
     std::shared_ptr<nvinfer1::ILogger> const& logger, int warmUp, int numRuns, int duration,
     GptSession::Config& sessionConfig, bool cudaGraphMode)
 {
-    auto const json = GptJsonConfig::parse(dataPath / "config.json");
+
+    std::string modelNameHyphen = modelName;
+    std::filesystem::path jsonFileName = dataPath / "config.json";
+    if (tc::strStartsWith(modelName, "chatglm"))
+    {
+        std::replace(modelNameHyphen.begin(), modelNameHyphen.end(), '_', '-');
+        jsonFileName = dataPath / (modelNameHyphen + std::string("-config.json"));
+    }
+    auto const json = GptJsonConfig::parse(jsonFileName);
     auto const modelConfig = json.getModelConfig();
     auto const inputPacked = modelConfig.usePackedInput();
     SizeType deviceCount{0};
     TLLM_CUDA_CHECK(cudaGetDeviceCount(&deviceCount));
     auto const worldConfig
         = WorldConfig::mpi(*logger, deviceCount, json.getTensorParallelism(), json.getPipelineParallelism());
-    auto const enginePath = dataPath / json.engineFilename(worldConfig, modelName);
+    auto const enginePath = dataPath / json.engineFilename(worldConfig, modelNameHyphen);
     auto const dtype = modelConfig.getDataType();
     auto const useHalf = (dtype == nvinfer1::DataType::kHALF);
 
@@ -78,10 +86,15 @@ void benchmarkGptSession(std::string const& modelName, std::filesystem::path con
     auto constexpr endId = 50256;
     auto constexpr padId = 50256;
 
+    auto& memoryCounter = MemoryCounters::getInstance();
+    TLLM_LOG_INFO(memoryCounter.toString());
+
     for (auto const batchSize : batchSizes)
     {
         try
         {
+            TLLM_LOG_INFO(memoryCounter.toString());
+
             std::vector<SizeType> inputLenghtsHost(batchSize, maxInputLength);
             auto inputLenghts
                 = bufferManager.copyFrom(inputLenghtsHost, ITensor::makeShape({batchSize}), MemoryType::kGPU);
@@ -99,6 +112,9 @@ void benchmarkGptSession(std::string const& modelName, std::filesystem::path con
                 inputIds = bufferManager.copyFrom(
                     inputsHost, ITensor::makeShape({batchSize, maxInputLength}), MemoryType::kGPU);
             }
+
+            TLLM_LOG_INFO(memoryCounter.toString());
+
             GenerationInput generationInput{
                 endId, padId, std::move(inputIds), std::move(inputLenghts), inputPacked};
 
@@ -107,6 +123,8 @@ void benchmarkGptSession(std::string const& modelName, std::filesystem::path con
                 bufferManager.emptyTensor(MemoryType::kGPU, nvinfer1::DataType::kINT32),
                 bufferManager.emptyTensor(MemoryType::kGPU, nvinfer1::DataType::kINT32)};
 
+            TLLM_LOG_INFO(memoryCounter.toString());
+
             for (auto r = 0; r < warmUp; ++r)
             {
                 SizeType numSteps = 0;
@@ -118,6 +136,8 @@ void benchmarkGptSession(std::string const& modelName, std::filesystem::path con
             }
             cudaDeviceSynchronize();
 
+            TLLM_LOG_INFO(memoryCounter.toString());
+
             int iterIdx = 0;
             float curDuration = 0;
             while (iterIdx < numRuns || curDuration / 1000 < duration)
@@ -134,6 +154,9 @@ void benchmarkGptSession(std::string const& modelName, std::filesystem::path con
                 iterIdx += 1;
                 curDuration += std::chrono::duration<float, std::milli>(end - start).count();
             }
+
+            TLLM_LOG_INFO(memoryCounter.toString());
+
             printf("Benchmarking done. Iteration: %d, duration: %.2f sec.\n", iterIdx, curDuration / 1000);
 
             if (worldConfig.getRank() == 0)
@@ -159,14 +182,15 @@ void benchmarkGptSession(std::string const& modelName, std::filesystem::path con
             // We can ignore the OOM exception and continue the rest of the benchmark
             if (worldConfig.getRank() == 0)
             {
-                printf("%s", e.what());
+                TLLM_LOG_EXCEPTION(e);
                 printf(
                     "[BENCHMARK] batch_size %d input_length %d output_length %d latency(ms) N/A tokensPerSec N/A\n",
                     batchSize, maxInputLength, maxNewTokens);
             }
             continue;
         }
     }
+    TLLM_LOG_INFO(memoryCounter.toString());
 }
 }
 
@@ -200,8 +224,8 @@ int main(int argc, char* argv[])
     options.add_options()("duration", "Minimal duration of iterations to measure in seconds.",
         cxxopts::value<int>()->default_value("60"));
 
-    options.add_options()(
-        "num_micro_batches", "Number of micro batches if enabling pipeline parallelism.", cxxopts::value<int>());
+    options.add_options()("ctx_micro_batch_size", "Batch size for context phase.", cxxopts::value<int>());
+    options.add_options()("gen_micro_batch_size", "Batch size for generation phase.", cxxopts::value<int>());
     options.add_options()("max_tokens_in_paged_kvcache", "Max tokens in paged K-V Cache.", cxxopts::value<int>());
     options.add_options()(
         "kv_cache_free_gpu_mem_fraction", "K-V Cache Free Gpu Mem Fraction.", cxxopts::value<float>());
@@ -281,10 +305,15 @@
     }
 
     GptSession::Config sessionConfig{0, 0, 0};
-    // Argument: Number of micro batches
-    if (result.count("num_micro_batches"))
+    // Argument: Batch size for context phase
+    if (result.count("ctx_micro_batch_size"))
+    {
+        sessionConfig.ctxMicroBatchSize = result["ctx_micro_batch_size"].as<int>();
+    }
+    // Argument: Batch size for generation phase
+    if (result.count("gen_micro_batch_size"))
     {
-        sessionConfig.numMicroBatches = result["num_micro_batches"].as<int>();
+        sessionConfig.genMicroBatchSize = result["gen_micro_batch_size"].as<int>();
     }
     // Argument: Max tokens in paged K-V Cache
     if (result.count("max_tokens_in_paged_kvcache"))

benchmarks/python/allowed_configs.py

Lines changed: 23 additions & 7 deletions
@@ -48,6 +48,7 @@ class BuildConfig(BaseModel, extra=Extra.allow):
     # default value to be None, not 0 or 1 to prevent misuse
     rotary_pct: Optional[float] = None
     bias: bool = True
+    quantization: Optional[str] = None
 
 
 class ModelConfig(BaseModel):
@@ -121,7 +122,7 @@ class ModelConfig(BaseModel):
                     max_input_len=512,
                     max_output_len=200,
                     builder_opt=None,
-                    use_smooth_quant=True,
+                    quantization="int8_sq_per_tensor",
                 )),
     "gpt_350m_sq_per_token_channel":
     ModelConfig(name="gpt_350m_sq_per_token_channel",
@@ -138,9 +139,7 @@ class ModelConfig(BaseModel):
                     max_input_len=512,
                     max_output_len=200,
                     builder_opt=None,
-                    use_smooth_quant=True,
-                    per_token=True,
-                    per_channel=True,
+                    quantization="int8_sq_per_token_channel",
                 )),
     "gpt-next_2b":
     ModelConfig(name="gpt-next_2b",
@@ -318,7 +317,7 @@ class ModelConfig(BaseModel):
                     max_input_len=512,
                     max_output_len=200,
                     builder_opt=None,
-                    use_smooth_quant=True)),
+                    quantization="int8_sq_per_tensor")),
     "gptj_6b":
     ModelConfig(name="gptj_6b",
                 family="gptj",
@@ -354,7 +353,7 @@ class ModelConfig(BaseModel):
                     builder_opt=None,
                 )),
     "chatglm_6b":
-    ModelConfig(name="chatglm_6b",
+    ModelConfig(name="chatglm-6b",
                 family="chatglm",
                 benchmark_type="gpt",
                 build_config=BuildConfig(
@@ -371,7 +370,7 @@ class ModelConfig(BaseModel):
                     remove_input_padding=False,
                 )),
     "chatglm2_6b":
-    ModelConfig(name="chatglm2_6b",
+    ModelConfig(name="chatglm2-6b",
                 family="chatglm2",
                 benchmark_type="gpt",
                 build_config=BuildConfig(
@@ -387,6 +386,23 @@ class ModelConfig(BaseModel):
                     builder_opt=None,
                     remove_input_padding=False,
                 )),
+    "chatglm3_6b":
+    ModelConfig(name="chatglm3-6b",
+                family="chatglm3",
+                benchmark_type="gpt",
+                build_config=BuildConfig(
+                    num_layers=28,
+                    num_heads=32,
+                    hidden_size=4096,
+                    vocab_size=65024,
+                    hidden_act='swiglu',
+                    n_positions=2048,
+                    max_batch_size=256,
+                    max_input_len=512,
+                    max_output_len=200,
+                    builder_opt=None,
+                    remove_input_padding=False,
+                )),
     "bloom_560m":
     ModelConfig(name="bloom_560m",
                 family="bloom",

benchmarks/python/benchmark.py

Lines changed: 36 additions & 24 deletions
@@ -18,15 +18,11 @@
 from time import time
 
 import torch
-from allowed_configs import get_allowed_models
-from bert_benchmark import BERTBenchmark
-from gpt_benchmark import GPTBenchmark
 from mem_monitor import mem_monitor
 
-from tensorrt_llm.logger import logger
-
 
 def parse_arguments():
+    from allowed_configs import get_allowed_models
     parser = argparse.ArgumentParser(
         description='Benchmark TensorRT-LLM models.')
     parser.add_argument('-m',
@@ -172,18 +168,7 @@ def parse_arguments():
         help=
         'Quick sanity check with num_layer=1; will be silently ignored if --engine_dir is specified.'
     )
-    parser.add_argument(
-        '--enable_fp8',
-        default=False,
-        action='store_true',
-        help='Use FP8 Linear layer for LMHead, Attention QKV/Dense, and MLP.')
-    parser.add_argument(
-        '--fp8_kv_cache',
-        default=False,
-        action="store_true",
-        help=
-        'By default, we use dtype for KV cache. fp8_kv_cache chooses fp8 quantization for KV'
-    )
+
     parser.add_argument('--csv',
                         default=False,
                         action="store_true",
@@ -199,11 +184,38 @@ def parse_arguments():
         help=
         'Use latency-optimized all-reduce for tensor parallelism. Gives better performance with NVLink.'
     )
+    parser.add_argument(
+        '--strongly_typed',
+        default=False,
+        action='store_true',
+        help=
+        'This option is introduced with trt 9.1.0.1+ and will reduce the building time significantly for fp8.'
+    )
+    parser.add_argument(
+        '--quantization',
+        type=str,
+        default=None,
+        choices=[
+            'fp8', 'fp8_gemm', 'fp8_kv_cache', 'int8_sq_per_tensor',
+            'int8_sq_per_token_channel', 'int8_weight_only', 'int4_weight_only',
+            'int4_weight_only_awq', 'int4_weight_only_gptq'
+        ],
+        help="Optimize the model with specified quantization recipe")
 
     return parser.parse_args()
 
 
 def main(args):
+    # We import tensorrt_llm here because MPI is initialized when
+    # tensorrt_llm is imported, but mpi4py does not work well with
+    # the start method `spawn` of Python multiprocessing,
+    # so we set the start method first, then initialize MPI.
+    from allowed_configs import get_allowed_models
+    from bert_benchmark import BERTBenchmark
+    from gpt_benchmark import GPTBenchmark
+
+    from tensorrt_llm.logger import logger
+
     logger.set_level(args.log_level)
 
     # Batch size
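
The comment block added in this hunk explains why the heavy imports moved into main(): importing tensorrt_llm initializes MPI, and mpi4py does not work well with the spawn start method of Python multiprocessing, so the start method has to be fixed before those imports run. A minimal, self-contained sketch of that ordering, using only the standard library (the real script defers tensorrt_llm and the benchmark modules in the same way):

    import multiprocessing as mp
    from multiprocessing import Process, Queue

    def monitor(q: Queue) -> None:
        # Child started with the 'spawn' method; it re-imports __main__, so any
        # module-level import that initializes MPI would run again here.
        q.put("monitor started cleanly")

    def main() -> None:
        # Heavy imports (e.g. tensorrt_llm, whose import initializes MPI) are
        # deferred to this point, after the start method has already been set.
        q = Queue()
        p = Process(target=monitor, args=(q,))
        p.start()
        print(q.get())
        p.join()

    if __name__ == "__main__":
        # Fix the start method first, then do anything that initializes MPI.
        mp.set_start_method("spawn")
        main()
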
@@ -235,10 +247,10 @@ def main(args):
                                    args.max_output_len,
                                    args.max_batch_size,
                                    force_num_layer_1=args.force_num_layer_1,
-                                   enable_fp8=args.enable_fp8,
-                                   fp8_kv_cache=args.fp8_kv_cache,
                                    enable_cuda_graph=args.enable_cuda_graph,
-                                   enable_custom_all_reduce=args.enable_custom_all_reduce)
+                                   enable_custom_all_reduce=args.enable_custom_all_reduce,
+                                   strongly_typed=args.strongly_typed,
+                                   quantization=args.quantization)
     elif args.model in get_allowed_models(benchmark_type="bert"):
         benchmarker = BERTBenchmark(args.engine_dir,
                                     args.model,
@@ -273,8 +285,8 @@ def main(args):
     # Launch a subprocess to monitor memory usage
     q1 = Queue()  # q1 is used for sending signal to subprocess
     q2 = Queue()  # q2 is used for receiving results from subprocess
-    p = Process(target=mem_monitor, args=(q1, q2))
-    p.start()
+    mem_monitor_process = Process(target=mem_monitor, args=(q1, q2))
+    mem_monitor_process.start()
 
     iter_idx = 0
     try:
@@ -301,14 +313,14 @@ def main(args):
 
     except Exception as e:
         print("Found exception during benchmarking", e.with_traceback())
-        p.kill()
+        mem_monitor_process.kill()
         raise e
     logger.debug("Sending signal to mem monitor process, start")
     q1.put(1)
     logger.debug("Sending signal to mem monitor process, done")
     peak_gpu_used = q2.get()
     logger.debug("Get peak gpu memory usage from mem monitor process, done")
-    p.join()
+    mem_monitor_process.join()
     logger.debug("Memory monitor process joined")
 
     latency = round(sum(latencies) / iter_idx, 3)
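
Renaming p to mem_monitor_process makes the two-queue protocol in the surrounding lines easier to follow: q1 carries the stop signal to the monitor subprocess and q2 returns the peak GPU memory reading. A minimal self-contained sketch of that protocol, with a random sampler standing in for the real GPU query that this file gets from mem_monitor:

    import random
    import time
    from multiprocessing import Process, Queue

    def mem_monitor(q1: Queue, q2: Queue) -> None:
        # Sample until a stop signal arrives on q1, then report the peak on q2.
        peak = 0
        while q1.empty():
            peak = max(peak, random.randint(0, 100))  # stand-in for a GPU memory query
            time.sleep(0.1)
        q2.put(peak)

    if __name__ == "__main__":
        q1, q2 = Queue(), Queue()  # q1: stop signal, q2: result
        mem_monitor_process = Process(target=mem_monitor, args=(q1, q2))
        mem_monitor_process.start()
        time.sleep(1.0)  # ... the benchmark loop would run here ...
        q1.put(1)  # ask the monitor to stop
        peak_gpu_used = q2.get()  # collect the peak reading
        mem_monitor_process.join()
        print("peak (illustrative units):", peak_gpu_used)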
