From 0328be0882763637200ea3a06e38a9e83fbdba26 Mon Sep 17 00:00:00 2001
From: Siddharth Venkatesan
Date: Sun, 11 Dec 2022 15:06:27 -0800
Subject: [PATCH] Fix fp32 issues with DS fork wheel for stable diffusion, fix
 llm tests

---
 .github/workflows/llm_integration.yml     | 11 +++++++-
 .../setup/djl_python/stable-diffusion.py  | 28 +++++++++++++------
 tests/integration/llm/client.py           | 23 +++++++++++----
 tests/integration/llm/prepare.py          |  5 ++++
 4 files changed, 53 insertions(+), 14 deletions(-)

diff --git a/.github/workflows/llm_integration.yml b/.github/workflows/llm_integration.yml
index 00a9e2f06..06aa37da1 100644
--- a/.github/workflows/llm_integration.yml
+++ b/.github/workflows/llm_integration.yml
@@ -162,7 +162,7 @@ jobs:
       with:
         python-version: '3.10.x'
     - name: Install pip dependencies
-      run: pip3 install requests
+      run: pip3 install requests pillow
     - name: Build container name
       run: ./serving/docker/scripts/docker_name_builder.sh deepspeed ${{ github.event.inputs.djl-version }}
     - name: Download models and dockers
@@ -178,6 +178,15 @@
         serve
         python3 llm/client.py stable-diffusion stable-diffusion-v1-4
         docker rm -f $(docker ps -aq)
+    - name: Test stable-diffusion-v1-5
+      working-directory: tests/integration
+      run: |
+        rm -rf models
+        python3 llm/prepare.py stable-diffusion stable-diffusion-v1-5
+        ./launch_container.sh deepjavalibrary/djl-serving:$DJLSERVING_DOCKER_TAG $PWD/models deepspeed \
+        serve
+        python3 llm/client.py stable-diffusion stable-diffusion-v1-5
+        docker rm -f $(docker ps -aq)
     - name: Test bloom-7b
       working-directory: tests/integration
       run: |
diff --git a/engines/python/setup/djl_python/stable-diffusion.py b/engines/python/setup/djl_python/stable-diffusion.py
index 506934b8a..ff8550708 100644
--- a/engines/python/setup/djl_python/stable-diffusion.py
+++ b/engines/python/setup/djl_python/stable-diffusion.py
@@ -56,14 +56,7 @@
         self.device = int(os.getenv("LOCAL_RANK", "0"))
         self.tensor_parallel_degree = int(
             properties.get("tensor_parallel_degree", 1))
-        self.ds_config = {
-            "replace_with_kernel_inject": True,
-            # TODO: Figure out why cuda graph doesn't work for stable diffusion via DS
-            "enable_cuda_graph": False,
-            "replace_method": "auto",
-            "dtype": self.data_type,
-            "mp_size": self.tensor_parallel_degree
-        }
+        self.ds_config = self._get_ds_config_for_dtype(self.data_type)

         if not self.model_id:
             config_file = os.path.join(self.model_dir, "model_index.json")
@@ -91,6 +84,25 @@
         self.pipeline = pipeline
         self.initialized = True

+    def _get_ds_config_for_dtype(self, dtype):
+        # This is a workaround for two issues with DeepSpeed 0.7.5:
+        # 1. No kernel injection is available for stable diffusion using fp32 (kernels are only written for fp16)
+        # 2. Changes in our bf16 fork raise an error, but the original deepspeed codebase defaults to fp16
+        #    when dtype is not set explicitly. We need to be explicit here with this config.
+        ds_config = {
+            # TODO: Figure out why cuda graph doesn't work for stable diffusion via DS
+            "enable_cuda_graph": False,
+            "dtype": dtype,
+            "mp_size": self.tensor_parallel_degree
+        }
+        if dtype == torch.float16:
+            ds_config["replace_with_kernel_inject"] = True
+            ds_config["replace_method"] = "auto"
+        else:
+            ds_config["replace_with_kernel_inject"] = False
+            ds_config["replace_method"] = None
+        return ds_config
+
     def inference(self, inputs: Input):
         try:
             content_type = inputs.get_property("Content-Type")
diff --git a/tests/integration/llm/client.py b/tests/integration/llm/client.py
index 2e8c7e77c..6b32e94b2 100644
--- a/tests/integration/llm/client.py
+++ b/tests/integration/llm/client.py
@@ -3,6 +3,8 @@
 import subprocess as sp
 import logging
 import math
+from PIL import Image
+from io import BytesIO

 logging.basicConfig(level=logging.INFO)
 parser = argparse.ArgumentParser(description='Build the LLM configs')
@@ -79,6 +81,11 @@
         "steps": [1, 2],
         "worker": 2
     },
+    "stable-diffusion-v1-5": {
+        "max_memory_per_gpu": 16.0,
+        "size": [256, 512],
+        "steps": [1, 2]
+    },
 }


@@ -97,7 +104,7 @@ def check_worker_number(desired):
 def send_json(data):
     headers = {'content-type': 'application/json'}
     res = requests.post(endpoint, headers=headers, json=data)
-    return res.json(), res.status_code
+    return res


 def get_gpu_memory():
@@ -138,7 +145,8 @@ def test_handler(model, model_spec):
             params = {"max_length": seq_length}
             req["parameters"] = params
             logging.info(f"req {req}")
-            res, _ = send_json(req)
+            res = send_json(req)
+            res = res.json()
             logging.info(f"res {res}")
             result = [item[0]['generated_text'] for item in res]
             assert len(result) == batch_size
@@ -162,7 +170,8 @@ def test_ds_raw_model(model):
             "use_pipeline": spec["use_pipeline"]
         }
         logging.info(f"req: {req}")
-        res, _ = send_json(req)
+        res = send_json(req)
+        res = res.json()
         logging.info(f"res: {res}")
         assert len(res["outputs"]) == batch_size
         memory_usage = get_gpu_memory()
@@ -185,8 +194,12 @@ def test_sd_handler(model, model_spec):
             params = {"height": size, "width": size, "steps": step}
             req["parameters"] = params
             logging.info(f"req: {req}")
-            res, status_code = send_json(req)
-            assert status_code == 200
+            res = send_json(req)
+            assert res.status_code == 200
+            try:
+                img = Image.open(BytesIO(res.content)).convert("RGB")
+            except Exception as e:
+                raise IOError("failed to deserialize image from response") from e
             memory_usage = get_gpu_memory()
             logging.info(memory_usage)
             for memory in memory_usage:
diff --git a/tests/integration/llm/prepare.py b/tests/integration/llm/prepare.py
index 90ac0d147..1ed12a955 100644
--- a/tests/integration/llm/prepare.py
+++ b/tests/integration/llm/prepare.py
@@ -69,6 +69,11 @@
         "option.tensor_parallel_degree": 2,
         "option.dtype": "fp16"
     },
+    "stable-diffusion-v1-5": {
+        "option.s3url": "s3://djl-llm/stable-diffusion-v1-5/",
+        "option.tensor_parallel_degree": 4,
+        "option.dtype": "fp32"
+    },
 }
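
Review note: the dict built by _get_ds_config_for_dtype maps one-to-one onto
deepspeed.init_inference keyword arguments. Below is a rough sketch of how such
a dtype-gated config is consumed, assuming DeepSpeed 0.7.x can wrap a diffusers
pipeline directly (as in DeepSpeed's stable diffusion examples). The model id
and the init_inference call site are illustrative assumptions; the handler's
actual call site is outside this diff's context.

    import deepspeed
    import torch
    from diffusers import StableDiffusionPipeline

    # Illustrative only: model id and wiring are assumptions, not from this patch.
    # fp16 gets kernel injection; fp32 must skip it, since DeepSpeed 0.7.5 only
    # ships fp16 kernels for stable diffusion.
    dtype = torch.float16
    pipe = StableDiffusionPipeline.from_pretrained(
        "runwayml/stable-diffusion-v1-5", torch_dtype=dtype).to("cuda")
    ds_config = {
        "enable_cuda_graph": False,
        "dtype": dtype,
        "mp_size": 1,
        "replace_with_kernel_inject": dtype == torch.float16,
        "replace_method": "auto" if dtype == torch.float16 else None,
    }
    # Wraps the pipeline in a DeepSpeed InferenceEngine; calls forward to the
    # underlying module, so the generation API is unchanged.
    pipe = deepspeed.init_inference(pipe, **ds_config)
    image = pipe("an astronaut riding a horse",
                 num_inference_steps=2).images[0]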