Commit f044eb8

kaiyux and wangruohui authored

Update TensorRT-LLM (NVIDIA#302)

* Update TensorRT-LLM

---------

Co-authored-by: wangruohui <12756472+wangruohui@users.noreply.github.com>
1 parent 4de32a8 commit f044eb8

File tree: 203 files changed, +12477 -4175 lines changed


README.md

Lines changed: 9 additions & 10 deletions
@@ -8,7 +8,6 @@ TensorRT-LLM
 [![python](https://img.shields.io/badge/python-3.10.12-green)](https://www.python.org/downloads/release/python-31012/)
 [![cuda](https://img.shields.io/badge/cuda-12.2-green)](https://developer.nvidia.com/cuda-downloads)
 [![trt](https://img.shields.io/badge/TRT-9.1-green)](https://developer.nvidia.com/tensorrt)
-[![version](https://img.shields.io/badge/release-0.5.0-green)](./setup.py)
 [![license](https://img.shields.io/badge/license-Apache%202-blue)](./LICENSE)
 
 [Architecture](./docs/source/architecture.md)&nbsp;&nbsp;&nbsp;|&nbsp;&nbsp;&nbsp;[Results](./docs/source/performance.md)&nbsp;&nbsp;&nbsp;|&nbsp;&nbsp;&nbsp;[Examples](./examples/)&nbsp;&nbsp;&nbsp;|&nbsp;&nbsp;&nbsp;[Documentation](./docs/source/)
@@ -173,13 +172,13 @@ Lovelace architectures. Certain limitations may, however, apply.
 Various numerical precisions are supported in TensorRT-LLM. The support for
 some of those numerical features require specific architectures:
 
-| | FP32 | FP16 | BF16 | FP8 | INT8 | INT4 |
-| :--------------------------- | :---- | :---- | :---- | :--- | :--- | :--- |
-| Volta (SM70) | Y | Y | N | N | Y | Y |
-| Turing (SM75) | Y | Y | N | N | Y | Y |
-| Ampere (SM80, SM86) | Y | Y | Y | N | Y | Y |
-| Ada-Lovelace (SM89) | Y | Y | Y | Y | Y | Y |
-| Hopper (SM90) | Y | Y | Y | Y | Y | Y |
+|                     | FP32 | FP16 | BF16 | FP8  | INT8 | INT4 |
+| :------------------ | :--- | :--- | :--- | :--- | :--- | :--- |
+| Volta (SM70)        | Y    | Y    | N    | N    | Y    | Y    |
+| Turing (SM75)       | Y    | Y    | N    | N    | Y    | Y    |
+| Ampere (SM80, SM86) | Y    | Y    | Y    | N    | Y    | Y    |
+| Ada-Lovelace (SM89) | Y    | Y    | Y    | Y    | Y    | Y    |
+| Hopper (SM90)       | Y    | Y    | Y    | Y    | Y    | Y    |
 
 In this release of TensorRT-LLM, the support for FP8 and quantized data types
 (INT8 or INT4) is not implemented for all the models. See the
@@ -217,8 +216,7 @@ The list of supported models is:
 * [Bert](examples/bert)
 * [Blip2](examples/blip2)
 * [BLOOM](examples/bloom)
-* [ChatGLM-6B](examples/chatglm6b)
-* [ChatGLM2-6B](examples/chatglm2-6b/)
+* [ChatGLM](examples/chatglm), including ChatGLM-6B, ChatGLM2-6B, ChatGLM2-6B-32k, ChatGLM3-6B, ChatGLM3-6B-32k
 * [Falcon](examples/falcon)
 * [GPT](examples/gpt)
 * [GPT-J](examples/gptj)
@@ -230,6 +228,7 @@ The list of supported models is:
 * [OPT](examples/opt)
 * [SantaCoder](examples/gpt)
 * [StarCoder](examples/gpt)
+* [InternLM](examples/internlm)
 
 ## Performance
 

benchmarks/cpp/gptSessionBenchmark.cpp

Lines changed: 38 additions & 9 deletions
@@ -18,12 +18,12 @@
 #include "tensorrt_llm/plugins/api/tllmPlugin.h"
 #include "tensorrt_llm/runtime/gptJsonConfig.h"
 #include "tensorrt_llm/runtime/gptSession.h"
+#include "tensorrt_llm/runtime/memoryCounters.h"
 #include "tensorrt_llm/runtime/tllmLogger.h"
 
 #include <NvInfer.h>
 #include <chrono>
 #include <cxxopts.hpp>
-#include <iostream>
 #include <sstream>
 #include <string>
 
@@ -39,14 +39,22 @@ void benchmarkGptSession(std::string const& modelName, std::filesystem::path con
     std::shared_ptr<nvinfer1::ILogger> const& logger, int warmUp, int numRuns, int duration,
     GptSession::Config& sessionConfig, bool cudaGraphMode)
 {
-    auto const json = GptJsonConfig::parse(dataPath / "config.json");
+
+    std::string modelNameHyphen = modelName;
+    std::filesystem::path jsonFileName = dataPath / "config.json";
+    if (tc::strStartsWith(modelName, "chatglm"))
+    {
+        std::replace(modelNameHyphen.begin(), modelNameHyphen.end(), '_', '-');
+        jsonFileName = dataPath / (modelNameHyphen + std::string("-config.json"));
+    }
+    auto const json = GptJsonConfig::parse(jsonFileName);
     auto const modelConfig = json.getModelConfig();
     auto const inputPacked = modelConfig.usePackedInput();
     SizeType deviceCount{0};
     TLLM_CUDA_CHECK(cudaGetDeviceCount(&deviceCount));
     auto const worldConfig
         = WorldConfig::mpi(*logger, deviceCount, json.getTensorParallelism(), json.getPipelineParallelism());
-    auto const enginePath = dataPath / json.engineFilename(worldConfig, modelName);
+    auto const enginePath = dataPath / json.engineFilename(worldConfig, modelNameHyphen);
     auto const dtype = modelConfig.getDataType();
     auto const useHalf = (dtype == nvinfer1::DataType::kHALF);
 
@@ -78,10 +86,15 @@ void benchmarkGptSession(std::string const& modelName, std::filesystem::path con
     auto constexpr endId = 50256;
     auto constexpr padId = 50256;
 
+    auto& memoryCounter = MemoryCounters::getInstance();
+    TLLM_LOG_INFO(memoryCounter.toString());
+
     for (auto const batchSize : batchSizes)
     {
         try
         {
+            TLLM_LOG_INFO(memoryCounter.toString());
+
             std::vector<SizeType> inputLenghtsHost(batchSize, maxInputLength);
             auto inputLenghts
                 = bufferManager.copyFrom(inputLenghtsHost, ITensor::makeShape({batchSize}), MemoryType::kGPU);
@@ -99,6 +112,9 @@ void benchmarkGptSession(std::string const& modelName, std::filesystem::path con
                 inputIds = bufferManager.copyFrom(
                     inputsHost, ITensor::makeShape({batchSize, maxInputLength}), MemoryType::kGPU);
             }
+
+            TLLM_LOG_INFO(memoryCounter.toString());
+
             GenerationInput generationInput{
                 endId, padId, std::move(inputIds), std::move(inputLenghts), inputPacked};
 
@@ -107,6 +123,8 @@ void benchmarkGptSession(std::string const& modelName, std::filesystem::path con
                 bufferManager.emptyTensor(MemoryType::kGPU, nvinfer1::DataType::kINT32),
                 bufferManager.emptyTensor(MemoryType::kGPU, nvinfer1::DataType::kINT32)};
 
+            TLLM_LOG_INFO(memoryCounter.toString());
+
             for (auto r = 0; r < warmUp; ++r)
             {
                 SizeType numSteps = 0;
@@ -118,6 +136,8 @@ void benchmarkGptSession(std::string const& modelName, std::filesystem::path con
             }
             cudaDeviceSynchronize();
 
+            TLLM_LOG_INFO(memoryCounter.toString());
+
             int iterIdx = 0;
             float curDuration = 0;
             while (iterIdx < numRuns || curDuration / 1000 < duration)
@@ -134,6 +154,9 @@ void benchmarkGptSession(std::string const& modelName, std::filesystem::path con
                 iterIdx += 1;
                 curDuration += std::chrono::duration<float, std::milli>(end - start).count();
             }
+
+            TLLM_LOG_INFO(memoryCounter.toString());
+
             printf("Benchmarking done. Iteration: %d, duration: %.2f sec.\n", iterIdx, curDuration / 1000);
 
             if (worldConfig.getRank() == 0)
@@ -159,14 +182,15 @@ void benchmarkGptSession(std::string const& modelName, std::filesystem::path con
             // We can ignore the OOM exception and continue the rest of the benchmark
             if (worldConfig.getRank() == 0)
             {
-                printf("%s", e.what());
+                TLLM_LOG_EXCEPTION(e);
                 printf(
                     "[BENCHMARK] batch_size %d input_length %d output_length %d latency(ms) N/A tokensPerSec N/A\n",
                     batchSize, maxInputLength, maxNewTokens);
             }
             continue;
         }
     }
+    TLLM_LOG_INFO(memoryCounter.toString());
 }
 }
 
@@ -200,8 +224,8 @@ int main(int argc, char* argv[])
     options.add_options()("duration", "Minimal duration of iterations to measure in seconds.",
         cxxopts::value<int>()->default_value("60"));
 
-    options.add_options()(
-        "num_micro_batches", "Number of micro batches if enabling pipeline parallelism.", cxxopts::value<int>());
+    options.add_options()("ctx_micro_batch_size", "Batch size for context phase.", cxxopts::value<int>());
+    options.add_options()("gen_micro_batch_size", "Batch size for generation phase.", cxxopts::value<int>());
     options.add_options()("max_tokens_in_paged_kvcache", "Max tokens in paged K-V Cache.", cxxopts::value<int>());
     options.add_options()(
         "kv_cache_free_gpu_mem_fraction", "K-V Cache Free Gpu Mem Fraction.", cxxopts::value<float>());
@@ -281,10 +305,15 @@
     }
 
     GptSession::Config sessionConfig{0, 0, 0};
-    // Argument: Number of micro batches
-    if (result.count("num_micro_batches"))
+    // Argument: Batch size for context phase
+    if (result.count("ctx_micro_batch_size"))
+    {
+        sessionConfig.ctxMicroBatchSize = result["ctx_micro_batch_size"].as<int>();
+    }
+    // Argument: Batch size for generation phase
+    if (result.count("gen_micro_batch_size"))
     {
-        sessionConfig.numMicroBatches = result["num_micro_batches"].as<int>();
+        sessionConfig.genMicroBatchSize = result["gen_micro_batch_size"].as<int>();
     }
     // Argument: Max tokens in paged K-V Cache
     if (result.count("max_tokens_in_paged_kvcache"))

benchmarks/python/allowed_configs.py

Lines changed: 23 additions & 7 deletions
@@ -48,6 +48,7 @@ class BuildConfig(BaseModel, extra=Extra.allow):
     # default value to be None, not 0 or 1 to prevent misuse
     rotary_pct: Optional[float] = None
     bias: bool = True
+    quantization: Optional[str] = None
 
 
 class ModelConfig(BaseModel):
@@ -121,7 +122,7 @@ class ModelConfig(BaseModel):
                     max_input_len=512,
                     max_output_len=200,
                     builder_opt=None,
-                    use_smooth_quant=True,
+                    quantization="int8_sq_per_tensor",
                 )),
     "gpt_350m_sq_per_token_channel":
     ModelConfig(name="gpt_350m_sq_per_token_channel",
@@ -138,9 +139,7 @@ class ModelConfig(BaseModel):
                     max_input_len=512,
                     max_output_len=200,
                     builder_opt=None,
-                    use_smooth_quant=True,
-                    per_token=True,
-                    per_channel=True,
+                    quantization="int8_sq_per_token_channel",
                 )),
     "gpt-next_2b":
     ModelConfig(name="gpt-next_2b",
@@ -318,7 +317,7 @@ class ModelConfig(BaseModel):
                     max_input_len=512,
                     max_output_len=200,
                     builder_opt=None,
-                    use_smooth_quant=True)),
+                    quantization="int8_sq_per_tensor")),
     "gptj_6b":
     ModelConfig(name="gptj_6b",
                 family="gptj",
@@ -354,7 +353,7 @@ class ModelConfig(BaseModel):
                     builder_opt=None,
                 )),
     "chatglm_6b":
-    ModelConfig(name="chatglm_6b",
+    ModelConfig(name="chatglm-6b",
                 family="chatglm",
                 benchmark_type="gpt",
                 build_config=BuildConfig(
@@ -371,7 +370,7 @@ class ModelConfig(BaseModel):
                     remove_input_padding=False,
                 )),
     "chatglm2_6b":
-    ModelConfig(name="chatglm2_6b",
+    ModelConfig(name="chatglm2-6b",
                 family="chatglm2",
                 benchmark_type="gpt",
                 build_config=BuildConfig(
@@ -387,6 +386,23 @@ class ModelConfig(BaseModel):
                     builder_opt=None,
                     remove_input_padding=False,
                 )),
+    "chatglm3_6b":
+    ModelConfig(name="chatglm3-6b",
+                family="chatglm3",
+                benchmark_type="gpt",
+                build_config=BuildConfig(
+                    num_layers=28,
+                    num_heads=32,
+                    hidden_size=4096,
+                    vocab_size=65024,
+                    hidden_act='swiglu',
+                    n_positions=2048,
+                    max_batch_size=256,
+                    max_input_len=512,
+                    max_output_len=200,
+                    builder_opt=None,
+                    remove_input_padding=False,
+                )),
     "bloom_560m":
     ModelConfig(name="bloom_560m",
                 family="bloom",

benchmarks/python/benchmark.py

Lines changed: 36 additions & 24 deletions
@@ -18,15 +18,11 @@
 from time import time
 
 import torch
-from allowed_configs import get_allowed_models
-from bert_benchmark import BERTBenchmark
-from gpt_benchmark import GPTBenchmark
 from mem_monitor import mem_monitor
 
-from tensorrt_llm.logger import logger
-
 
 def parse_arguments():
+    from allowed_configs import get_allowed_models
     parser = argparse.ArgumentParser(
         description='Benchmark TensorRT-LLM models.')
     parser.add_argument('-m',
@@ -172,18 +168,7 @@ def parse_arguments():
         help=
         'Quick sanity check with num_layer=1; will be silently ignored if --engine_dir is specified.'
     )
-    parser.add_argument(
-        '--enable_fp8',
-        default=False,
-        action='store_true',
-        help='Use FP8 Linear layer for LMHead, Attention QKV/Dense, and MLP.')
-    parser.add_argument(
-        '--fp8_kv_cache',
-        default=False,
-        action="store_true",
-        help=
-        'By default, we use dtype for KV cache. fp8_kv_cache chooses fp8 quantization for KV'
-    )
+
     parser.add_argument('--csv',
                         default=False,
                         action="store_true",
@@ -199,11 +184,38 @@ def parse_arguments():
         help=
         'Use latency-optimized all-reduce for tensor parallelism. Gives better performance with NVLink.'
     )
+    parser.add_argument(
+        '--strongly_typed',
+        default=False,
+        action='store_true',
+        help=
+        'This option is introduced with trt 9.1.0.1+ and will reduce the building time significantly for fp8.'
+    )
+    parser.add_argument(
+        '--quantization',
+        type=str,
+        default=None,
+        choices=[
+            'fp8', 'fp8_gemm', 'fp8_kv_cache', 'int8_sq_per_tensor',
+            'int8_sq_per_token_channel', 'int8_weight_only', 'int4_weight_only',
+            'int4_weight_only_awq', 'int4_weight_only_gptq'
+        ],
+        help="Optimize the model with specified quantization recipe")
 
     return parser.parse_args()
 
 
 def main(args):
+    # We import tensorrt_llm here because MPI is initialized when
+    # tensorrt_llm is imported, but mpi4py does not work well with
+    # the start method `spawn` of Python multiprocessing,
+    # so we set the start method first, then initialize MPI.
+    from allowed_configs import get_allowed_models
+    from bert_benchmark import BERTBenchmark
+    from gpt_benchmark import GPTBenchmark
+
+    from tensorrt_llm.logger import logger
+
     logger.set_level(args.log_level)
 
     # Batch size
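
The comment block added in this hunk explains why the heavy imports moved into main(): importing tensorrt_llm initializes MPI, and mpi4py does not work well with the spawn start method of Python multiprocessing, so the start method has to be fixed before those imports run. A minimal, self-contained sketch of that ordering, using only the standard library (the real script defers tensorrt_llm and the benchmark modules in the same way):

    import multiprocessing as mp
    from multiprocessing import Process, Queue

    def monitor(q: Queue) -> None:
        # Child started with the 'spawn' method; it re-imports __main__, so any
        # module-level import that initializes MPI would run again here.
        q.put("monitor started cleanly")

    def main() -> None:
        # Heavy imports (e.g. tensorrt_llm, whose import initializes MPI) are
        # deferred to this point, after the start method has already been set.
        q = Queue()
        p = Process(target=monitor, args=(q,))
        p.start()
        print(q.get())
        p.join()

    if __name__ == "__main__":
        # Fix the start method first, then do anything that initializes MPI.
        mp.set_start_method("spawn")
        main()
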
@@ -235,10 +247,10 @@ def main(args):
                                    args.max_output_len,
                                    args.max_batch_size,
                                    force_num_layer_1=args.force_num_layer_1,
-                                   enable_fp8=args.enable_fp8,
-                                   fp8_kv_cache=args.fp8_kv_cache,
                                    enable_cuda_graph=args.enable_cuda_graph,
-                                   enable_custom_all_reduce=args.enable_custom_all_reduce)
+                                   enable_custom_all_reduce=args.enable_custom_all_reduce,
+                                   strongly_typed=args.strongly_typed,
+                                   quantization=args.quantization)
     elif args.model in get_allowed_models(benchmark_type="bert"):
         benchmarker = BERTBenchmark(args.engine_dir,
                                     args.model,
@@ -273,8 +285,8 @@ def main(args):
     # Launch a subprocess to monitor memory usage
     q1 = Queue()  # q1 is used for sending signal to subprocess
     q2 = Queue()  # q2 is used for receiving results from subprocess
-    p = Process(target=mem_monitor, args=(q1, q2))
-    p.start()
+    mem_monitor_process = Process(target=mem_monitor, args=(q1, q2))
+    mem_monitor_process.start()
 
     iter_idx = 0
     try:
@@ -301,14 +313,14 @@ def main(args):
 
     except Exception as e:
         print("Found exception during benchmarking", e.with_traceback())
-        p.kill()
+        mem_monitor_process.kill()
         raise e
     logger.debug("Sending signal to mem monitor process, start")
     q1.put(1)
     logger.debug("Sending signal to mem monitor process, done")
     peak_gpu_used = q2.get()
     logger.debug("Get peak gpu memory usage from mem monitor process, done")
-    p.join()
+    mem_monitor_process.join()
     logger.debug("Memory monitor process joined")
 
     latency = round(sum(latencies) / iter_idx, 3)
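
Renaming p to mem_monitor_process makes the two-queue protocol in the surrounding lines easier to follow: q1 carries the stop signal to the monitor subprocess and q2 returns the peak GPU memory reading. A minimal self-contained sketch of that protocol, with a random sampler standing in for the real GPU query that this file gets from mem_monitor:

    import random
    import time
    from multiprocessing import Process, Queue

    def mem_monitor(q1: Queue, q2: Queue) -> None:
        # Sample until a stop signal arrives on q1, then report the peak on q2.
        peak = 0
        while q1.empty():
            peak = max(peak, random.randint(0, 100))  # stand-in for a GPU memory query
            time.sleep(0.1)
        q2.put(peak)

    if __name__ == "__main__":
        q1, q2 = Queue(), Queue()  # q1: stop signal, q2: result
        mem_monitor_process = Process(target=mem_monitor, args=(q1, q2))
        mem_monitor_process.start()
        time.sleep(1.0)  # ... the benchmark loop would run here ...
        q1.put(1)  # ask the monitor to stop
        peak_gpu_used = q2.get()  # collect the peak reading
        mem_monitor_process.join()
        print("peak (illustrative units):", peak_gpu_used)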
