diff --git a/tests/integration/llm/client.py b/tests/integration/llm/client.py
index b69a5eeb5..7f4bebb2c 100644
--- a/tests/integration/llm/client.py
+++ b/tests/integration/llm/client.py
@@ -254,8 +254,8 @@ def get_model_name():
     },
     "opt-1.3b-streaming": {
         "worker": 3,
-        "seq_length": [16, 32],
-        "batch_size": [1],
+        "seq_length": [128, 256],
+        "batch_size": [2],
         "stream_output": True,
     },
 }
@@ -635,10 +635,6 @@ def test_transformers_neuronx_handler(model, model_spec):
     res = send_json(req)
     if spec.get("stream_output", False):
         logging.info(f"res: {res.content}")
-        result = res.content.decode().split("\n")[:-1]
-        assert len(
-            result
-        ) <= seq_length, "generated more takens than max_new_tokens"
     else:
         res = res.json()
         logging.info(f"res {res}")
diff --git a/tests/integration/llm/prepare.py b/tests/integration/llm/prepare.py
index df1b72627..5582e4473 100644
--- a/tests/integration/llm/prepare.py
+++ b/tests/integration/llm/prepare.py
@@ -281,7 +281,7 @@
     },
     "opt-1.3b-streaming": {
         "option.model_id": "s3://djl-llm/opt-1.3b/",
-        "option.batch_size": 1,
+        "option.batch_size": 2,
         "option.tensor_parallel_degree": 4,
         "option.n_positions": 512,
         "option.dtype": "fp16",