remove usage of SERVING_LOAD_MODELS and OPTION_MODEL_ID in examples/docs/tests
siddvenk committed Jun 6, 2024
1 parent b178dc9 commit e6767cc
Showing 7 changed files with 21 additions and 32 deletions.
2 changes: 1 addition & 1 deletion .github/workflows/llm_integration.yml
@@ -562,7 +562,7 @@ jobs:
  working-directory: tests/integration
  run: |
    rm -rf models
-   echo -en "SERVING_LOAD_MODELS=test::MPI=/opt/ml/model\nOPTION_MAX_ROLLING_BATCH_SIZE=2\nOPTION_OUTPUT_FORMATTER=jsonlines\nOPTION_TENSOR_PARALLEL_DEGREE=1\nOPTION_MODEL_ID=gpt2\nOPTION_TASK=text-generation\nOPTION_ROLLING_BATCH=lmi-dist" > docker_env
+   echo -en "OPTION_MAX_ROLLING_BATCH_SIZE=2\nOPTION_OUTPUT_FORMATTER=jsonlines\nTENSOR_PARALLEL_DEGREE=1\nHF_MODEL_ID=gpt2\nOPTION_TASK=text-generation\nOPTION_ROLLING_BATCH=lmi-dist" > docker_env
    ./launch_container.sh deepjavalibrary/djl-serving:$DJLSERVING_DOCKER_TAG nocode lmi
    python3 llm/client.py lmi_dist gpt2
    docker rm -f $(docker ps -aq)
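Expanded for readability, the updated `echo -en` command writes a `docker_env` file with these contents:

```
OPTION_MAX_ROLLING_BATCH_SIZE=2
OPTION_OUTPUT_FORMATTER=jsonlines
TENSOR_PARALLEL_DEGREE=1
HF_MODEL_ID=gpt2
OPTION_TASK=text-generation
OPTION_ROLLING_BATCH=lmi-dist
```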
23 changes: 4 additions & 19 deletions engines/python/setup/djl_python/tests/test_test_model.py
@@ -61,17 +61,14 @@ def test_all_code(self):

    def test_with_env(self):
        envs = {
-           "OPTION_MODEL_ID": "NousResearch/Nous-Hermes-Llama2-13b",
-           "SERVING_LOAD_MODELS": "test::MPI=/opt/ml/model",
+           "HF_MODEL_ID": "NousResearch/Nous-Hermes-Llama2-13b",
            "OPTION_ROLLING_BATCH": "auto",
            "OPTION_TGI_COMPAT": "true"
        }
        for key, value in envs.items():
            os.environ[key] = value
        huggingface.get_rolling_batch_class_from_str = override_rolling_batch
        handler = TestHandler(huggingface)
-       self.assertEqual(handler.serving_properties["model_id"],
-                        envs["OPTION_MODEL_ID"])
        self.assertEqual(handler.serving_properties["rolling_batch"],
                         envs["OPTION_ROLLING_BATCH"])
        self.assertEqual(handler.serving_properties["tgi_compat"],
@@ -100,17 +97,14 @@ def test_with_env(self):

    def test_with_tgi_compat_env(self):
        envs = {
-           "OPTION_MODEL_ID": "NousResearch/Nous-Hermes-Llama2-13b",
-           "SERVING_LOAD_MODELS": "test::MPI=/opt/ml/model",
+           "HF_MODEL_ID": "NousResearch/Nous-Hermes-Llama2-13b",
            "OPTION_ROLLING_BATCH": "auto",
            "OPTION_TGI_COMPAT": "true"
        }
        for key, value in envs.items():
            os.environ[key] = value
        huggingface.get_rolling_batch_class_from_str = override_rolling_batch
        handler = TestHandler(huggingface)
-       self.assertEqual(handler.serving_properties["model_id"],
-                        envs["OPTION_MODEL_ID"])
        self.assertEqual(handler.serving_properties["rolling_batch"],
                         envs["OPTION_ROLLING_BATCH"])
        self.assertEqual(handler.serving_properties["tgi_compat"],
@@ -161,17 +155,11 @@ def test_all_code_chat(self):
        self.assertEqual(len(result), len(inputs))

    def test_with_env_chat(self):
-       envs = {
-           "OPTION_MODEL_ID": "TheBloke/Llama-2-7B-Chat-fp16",
-           "SERVING_LOAD_MODELS": "test::MPI=/opt/ml/model",
-           "OPTION_ROLLING_BATCH": "auto"
-       }
+       envs = {"OPTION_ROLLING_BATCH": "auto"}
        for key, value in envs.items():
            os.environ[key] = value
        huggingface.get_rolling_batch_class_from_str = override_rolling_batch
        handler = TestHandler(huggingface)
-       self.assertEqual(handler.serving_properties["model_id"],
-                        envs["OPTION_MODEL_ID"])
        self.assertEqual(handler.serving_properties["rolling_batch"],
                         envs["OPTION_ROLLING_BATCH"])
        inputs = [{
@@ -248,8 +236,7 @@ def test_exception_handling(self):
    @unittest.skip
    def test_profiling(self, logging_method):
        envs = {
-           "OPTION_MODEL_ID": "TheBloke/Llama-2-7B-Chat-fp16",
-           "SERVING_LOAD_MODELS": "test::MPI=/opt/ml/model",
+           "HF_MODEL_ID": "TheBloke/Llama-2-7B-Chat-fp16",
            "OPTION_ROLLING_BATCH": "auto",
            "DJL_PYTHON_PROFILING": "true",
            "DJL_PYTHON_PROFILING_TOP_OBJ": "60"
@@ -259,8 +246,6 @@ def test_profiling(self, logging_method):
            os.environ[key] = value
        huggingface.get_rolling_batch_class_from_str = override_rolling_batch
        handler = TestHandler(huggingface)
-       self.assertEqual(handler.serving_properties["model_id"],
-                        envs["OPTION_MODEL_ID"])
        self.assertEqual(handler.serving_properties["rolling_batch"],
                         envs["OPTION_ROLLING_BATCH"])
        inputs = [{
15 changes: 9 additions & 6 deletions serving/docs/lmi/deployment_guide/configurations.md
@@ -125,17 +125,20 @@ You can find these configurations in the respective [user guides](../user_guides

All LMI Configuration keys available in the `serving.properties` format can be specified as environment variables.

- The translation for `engine` is unique. The configuration `engine=<engine>` is translated to `SERVING_LOAD_MODELS=test::<engine>=/opt/ml/model`.
- For example:
+ The property `option.model_id` is unique. It is translated to `HF_MODEL_ID`.

- * `engine=Python` is translated to environment variable `SERVING_LOAD_MODELS=test::Python=/opt/ml/model`
- * `engine=MPI` is translated to environment variable `SERVING_LOAD_MODELS=test::MPI=/opt/ml/model`
+ The property `engine` is translated to `OPTION_ENGINE`.
+ By default, LMI will use the Python engine. You can use `OPTION_ENGINE=Python` to explicitly set the engine.
+ To use the MPI engine, you should also provide `OPTION_MPI_MODE=true`.
+ In general, we recommend that you do not specify engine or mpi configurations through environment variables.
+ LMI will infer the correct engine and operating mode based on `option.rolling_batch` if provided.
+ If `option.rolling_batch` is not provided, LMI will infer the recommended backend and set the engine configuration accordingly.

Configuration keys that start with `option.` can be specified as environment variables using the `OPTION_` prefix.
The configuration `option.<property>` is translated to environment variable `OPTION_<PROPERTY>`. For example:

- * `option.model_id` is translated to environment variable `OPTION_MODEL_ID`
* `option.tensor_parallel_degree` is translated to environment variable `OPTION_TENSOR_PARALLEL_DEGREE`
* `option.rolling_batch` is translated to environment variable `OPTION_ROLLING_BATCH`
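As an illustrative aside (not part of the commit diff), the new translation rules combine as in this sketch; `gpt2` is just a placeholder model id:

```
# serving.properties style
option.model_id=gpt2
option.rolling_batch=auto
option.tensor_parallel_degree=4

# equivalent environment-variable style
HF_MODEL_ID=gpt2                  # model_id maps to HF_MODEL_ID, not OPTION_MODEL_ID
OPTION_ROLLING_BATCH=auto
OPTION_TENSOR_PARALLEL_DEGREE=4
```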

Configuration keys that do not start with `option.` can be specified as environment variables using the `SERVING_` prefix.
The configuration `<property>` is translated to environment variable `SERVING_<PROPERTY>`. For example:
3 changes: 2 additions & 1 deletion serving/docs/lmi/deployment_guide/deploying-your-endpoint.md
@@ -176,7 +176,8 @@ The following options may be added to the `ModelDataSource` field to support unc
This mechanism is useful when deploying SageMaker endpoints with network isolation.
Model artifacts will be downloaded by SageMaker and mounted to the container rather than being downloaded by the container at runtime.

- If you use this mechanism to deploy the container, you should set `option.model_id=/opt/ml/model` in serving.properties, or `OPTION_MODEL_ID=/opt/ml/model` in environment variables depending on which configuration style you are using.
+ If you use this mechanism to deploy the container, you do not need to specify the `option.model_id` or `HF_MODEL_ID` config.
+ LMI will load the model artifacts from the model directory by default, which is where SageMaker downloads and mounts the model artifacts from S3.

Follow this link for a detailed overview of this option: https://docs.aws.amazon.com/sagemaker/latest/dg/large-model-inference-uncompressed.html
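To make the new behavior concrete, here is a hedged sketch (not from this commit) of creating a SageMaker model with uncompressed, SageMaker-mounted artifacts and no model id configuration; the model name, role ARN, image URI, and bucket are placeholders:

```python
import boto3

sm = boto3.client("sagemaker")
sm.create_model(
    ModelName="lmi-uncompressed-demo",  # hypothetical name
    ExecutionRoleArn="arn:aws:iam::123456789012:role/SageMakerRole",  # placeholder
    PrimaryContainer={
        "Image": "<lmi-container-image-uri>",  # placeholder LMI image
        "ModelDataSource": {
            "S3DataSource": {
                "S3Uri": "s3://my-bucket/my-model/",  # uncompressed artifact prefix
                "S3DataType": "S3Prefix",
                "CompressionType": "None",
            }
        },
        # No HF_MODEL_ID / option.model_id: LMI loads from the mounted model dir.
        "Environment": {"OPTION_ROLLING_BATCH": "auto"},
    },
)
```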

2 changes: 1 addition & 1 deletion serving/docs/lmi/deployment_guide/testing-custom-script.md
@@ -48,7 +48,7 @@ from djl_python import huggingface
from djl_python.test_model import TestHandler

envs = {
-   "OPTION_MODEL_ID": "NousResearch/Nous-Hermes-Llama2-13b",
+   "HF_MODEL_ID": "NousResearch/Nous-Hermes-Llama2-13b",
    "OPTION_MPI_MODE": "true",
    "OPTION_ROLLING_BATCH": "lmi-dist",
    "OPTION_TENSOR_PARALLEL_DEGREE": 4
6 changes: 3 additions & 3 deletions serving/docs/lmi/tutorials/trtllm_aot_tutorial.md
@@ -50,7 +50,7 @@ docker pull 763104351884.dkr.ecr.us-east-1.amazonaws.com/djl-inference:0.27.0-te
The below configurations help you configure the inference optimization parameters. You can check all the configurations of the TensorRT-LLM LMI handler [in our docs](../user_guides/trt_llm_user_guide.md#advanced-tensorrt-llm-configurations).

```
- OPTION_MODEL_ID={{s3url}}
+ HF_MODEL_ID={{s3url}}
OPTION_TENSOR_PARALLEL_DEGREE=8
OPTION_MAX_ROLLING_BATCH_SIZE=128
OPTION_DTYPE=fp16
@@ -87,7 +87,7 @@ In the below example, the model artifacts will be saved to `$MODEL_REPO_DIR` cre
docker run --runtime=nvidia --gpus all --shm-size 12gb \
-v $MODEL_REPO_DIR:/tmp/trtllm \
-p 8080:8080 \
-   -e OPTION_MODEL_ID=$OPTION_MODEL_ID \
+   -e HF_MODEL_ID=$HF_MODEL_ID \
-e OPTION_TENSOR_PARALLEL_DEGREE=$OPTION_TENSOR_PARALLEL_DEGREE \
-e OPTION_MAX_ROLLING_BATCH_SIZE=$OPTION_MAX_ROLLING_BATCH_SIZE \
-e OPTION_DTYPE=$OPTION_DTYPE \
@@ -115,7 +115,7 @@ aws s3 cp $MODEL_REPO_DIR s3://YOUR_S3_FOLDER_NAME/ --recursive
**Note:** After uploading the model artifacts to S3, you can simply update the model_id (env var or in `serving.properties`) to the newly created S3 URL containing the compiled model artifacts, and reuse the rest of the environment variables or `serving.properties` when deploying on SageMaker. Here, you can check the [tutorial](https://github.com/deepjavalibrary/djl-demo/blob/master/aws/sagemaker/large-model-inference/sample-llm/trtllm_rollingbatch_deploy_llama_13b.ipynb) on how to run inference using the TensorRT-LLM DLC. The below snippet shows an example of the updated model_id.

```
- OPTION_MODEL_ID=s3://YOUR_S3_FOLDER_NAME
+ HF_MODEL_ID=s3://YOUR_S3_FOLDER_NAME
OPTION_TENSOR_PARALLEL_DEGREE=8
OPTION_MAX_ROLLING_BATCH_SIZE=128
OPTION_DTYPE=fp16
@@ -254,7 +254,7 @@ Finally, you can use one of the following configuration to load your model on Sa

### 1. Environment variables:
```
- OPTION_MODEL_ID=s3://lmi-llm/trtllm/0.5.0/baichuan-13b-tp2/
+ HF_MODEL_ID=s3://lmi-llm/trtllm/0.5.0/baichuan-13b-tp2/
OPTION_TENSOR_PARALLEL_DEGREE=2
OPTION_MAX_ROLLING_BATCH_SIZE=64
```