Add python backend support #86

Merged
8 commits, merged on Nov 23, 2022
5 changes: 5 additions & 0 deletions Dockerfile
@@ -0,0 +1,5 @@
FROM moyix/triton_with_ft:22.09

# Install dependencies for the Python backend: torch, transformers, bitsandbytes, accelerate
RUN python3 -m pip install --disable-pip-version-check -U torch --extra-index-url https://download.pytorch.org/whl/cu116
RUN python3 -m pip install --disable-pip-version-check -U transformers bitsandbytes accelerate
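
The extra packages installed here (transformers, bitsandbytes, accelerate) are what the new Python backend needs to load Hugging Face checkpoints with optional fp16 or int8 weights and automatic device placement. Below is a minimal sketch of how the use_half, use_int8, and use_auto_device_map options defined in the config template further down could map onto the transformers API; it is an illustration under those assumptions, not the PR's actual loading code.

```python
# Hypothetical loader showing how the installed dependencies are typically used.
# Parameter names mirror config_template.pbtxt, but this is not the PR's model.py.
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

def load_model(org_name, model_name, use_half="1", use_int8="0", use_auto_device_map="1"):
    repo = f"{org_name}/{model_name}"  # e.g. "Salesforce/codegen-350M-multi"
    kwargs = {}
    if use_int8 == "1":
        kwargs["load_in_8bit"] = True      # needs bitsandbytes + accelerate
    elif use_half == "1":
        kwargs["torch_dtype"] = torch.float16
    if use_auto_device_map == "1":
        kwargs["device_map"] = "auto"      # accelerate places layers across devices
    tokenizer = AutoTokenizer.from_pretrained(repo)
    model = AutoModelForCausalLM.from_pretrained(repo, **kwargs)
    return tokenizer, model
```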
20 changes: 12 additions & 8 deletions copilot_proxy/utils/codegen.py
@@ -73,13 +73,17 @@ def to_word_list_format(word_dict, tokenizer):
return np.array([flat_ids, offsets], dtype="int32").transpose((1, 0, 2))

def generate(self, data):
model_name = "fastertransformer"
prompt = data['prompt']
n = data.get('n', 1)
model_name = data["model"]
# Ugly hack to set the data type correctly: Hugging Face models want int32, but FasterTransformer needs uint32.
# The uint32-to-int32 conversion could be done in the model instead, but that would be inefficient.
np_type = np.int32 if model_name.startswith("py-") else np.uint32

input_start_ids = np.expand_dims(self.tokenizer.encode(prompt).ids, 0)
input_start_ids = np.repeat(input_start_ids, n, axis=0).astype(np.uint32)
input_start_ids = np.repeat(input_start_ids, n, axis=0).astype(np_type)
prompt_len = input_start_ids.shape[1]
input_len = prompt_len * np.ones([input_start_ids.shape[0], 1]).astype(np.uint32)
input_len = prompt_len * np.ones([input_start_ids.shape[0], 1]).astype(np_type)
max_tokens = data.get('max_tokens', 16)
prompt_tokens: int = input_len[0][0]
requested_tokens = max_tokens + prompt_tokens
@@ -90,7 +94,7 @@ def generate(self, data):
f"{requested_tokens} tokens ({prompt_tokens} in your prompt; {max_tokens} for the completion). "
f"Please reduce your prompt; or completion length."
)
output_len = np.ones_like(input_len).astype(np.uint32) * max_tokens
output_len = np.ones_like(input_len).astype(np_type) * max_tokens
num_logprobs = data.get('logprobs', -1)
if num_logprobs is None:
num_logprobs = 1
@@ -105,17 +109,17 @@ def generate(self, data):

top_p = data.get('top_p', 1.0)
frequency_penalty = data.get('frequency_penalty', 1.0)
runtime_top_k = top_k * np.ones([input_start_ids.shape[0], 1]).astype(np.uint32)
runtime_top_k = top_k * np.ones([input_start_ids.shape[0], 1]).astype(np_type)
runtime_top_p = top_p * np.ones([input_start_ids.shape[0], 1]).astype(np.float32)
beam_search_diversity_rate = 0.0 * np.ones([input_start_ids.shape[0], 1]).astype(np.float32)
random_seed = np.random.randint(0, 2 ** 31 - 1, (input_start_ids.shape[0], 1), dtype=np.int32)
temperature = temperature * np.ones([input_start_ids.shape[0], 1]).astype(np.float32)
len_penalty = 1.0 * np.ones([input_start_ids.shape[0], 1]).astype(np.float32)
repetition_penalty = frequency_penalty * np.ones([input_start_ids.shape[0], 1]).astype(np.float32)
is_return_log_probs = want_logprobs * np.ones([input_start_ids.shape[0], 1]).astype(np.bool_)
beam_width = (1 * np.ones([input_start_ids.shape[0], 1])).astype(np.uint32)
start_ids = self.PAD_CHAR * np.ones([input_start_ids.shape[0], 1]).astype(np.uint32)
end_ids = self.PAD_CHAR * np.ones([input_start_ids.shape[0], 1]).astype(np.uint32)
beam_width = (1 * np.ones([input_start_ids.shape[0], 1])).astype(np_type)
start_ids = self.PAD_CHAR * np.ones([input_start_ids.shape[0], 1]).astype(np_type)
end_ids = self.PAD_CHAR * np.ones([input_start_ids.shape[0], 1]).astype(np_type)

stop_words = data.get('stop', [])
if stop_words is None:
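
For context on why the np_type switch above matters: these arrays are eventually packed into Triton inference inputs, and Triton validates each tensor's dtype against the data_type declared in the target model's config.pbtxt (uint32 for the FasterTransformer model, int32 for the new py-* models). A rough sketch of that packing with the tritonclient library follows; the helper name and request flow are illustrative, not the proxy's exact code.

```python
# Illustrative only: the dtype of input_start_ids must already match what the
# target model's config.pbtxt declares (uint32 for fastertransformer, int32 for py-*).
import numpy as np
import tritonclient.grpc as client_util
from tritonclient.utils import np_to_triton_dtype

def prepare_tensor(name, array):
    t = client_util.InferInput(name, array.shape, np_to_triton_dtype(array.dtype))
    t.set_data_from_numpy(array)
    return t

client = client_util.InferenceServerClient(url="triton:8001")
np_type = np.int32  # would be np.uint32 when targeting the fastertransformer model
input_start_ids = np.array([[1, 2, 3]], dtype=np_type)
input_len = np.array([[input_start_ids.shape[1]]], dtype=np_type)
output_len = np.array([[16]], dtype=np_type)
inputs = [
    prepare_tensor("input_ids", input_start_ids),
    prepare_tensor("input_lengths", input_len),
    prepare_tensor("request_output_len", output_len),
]
result = client.infer("py-model", inputs)
output_ids = result.as_numpy("output_ids")
```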
5 changes: 4 additions & 1 deletion docker-compose.yaml
@@ -1,11 +1,14 @@
version: '3.3'
services:
triton:
image: moyix/triton_with_ft:22.09
build:
context: .
dockerfile: Dockerfile
Collaborator:
As mentioned in my other comment, I think the dependencies should be added in moyix/triton_with_ft

Collaborator Author:
Sounds good to me. I can make a PR to that repo instead

command: bash -c "CUDA_VISIBLE_DEVICES=${GPUS} mpirun -n 1 --allow-run-as-root /opt/tritonserver/bin/tritonserver --model-repository=/model"
shm_size: '2gb'
volumes:
- ${MODEL_DIR}:/model
- ${HF_CACHE_DIR}:/root/.cache/huggingface
Collaborator:
If no HF_CACHE_DIR is set this breaks the deployment. Maybe we should default to true?

Collaborator Author:
The current version sets HF_CACHE_DIR to /tmp/hf_cache. The reason I didn't default it to true is that Docker messes up the file permissions by setting many of them to root (unless rootless Docker is used). So the cache is shared only if the user knows what they're doing.

I could default it to true and warn the user about the permission issues. Not sure which is the better option.

@fdegier, @moyix thoughts?

Collaborator (@fdegier, Oct 25, 2022):
@thakkarparth007 If a user does not use the cache, the volume is set to an empty value, which cannot be mounted and causes docker compose up to fail. That's why I suggested defaulting to true, but I understand the permission issues you mentioned.

I think adding something like HF_DATASETS_CACHE="fauxpilot/.hf-cache" and removing the cache option would always cache without messing up permissions? The cache path needs to be verified; consider it just an example.

Collaborator Author:
Whoops, I just saw your comment @fdegier

Currently, if you look at setup.sh (https://github.com/moyix/fauxpilot/pull/86/files#diff-4209d788ad32c40cbda3c66b3de47eefb929308ca703bb77a6382625986add17R148), you'll see that HF_CACHE_DIR is set to /tmp/hf_cache if the user doesn't want to share their Hugging Face cache.

But yes, perhaps it'll be better to store the cache in the fauxpilot directory itself. Updated it!

ports:
- "8000:8000"
- "8001:8001"
180 changes: 180 additions & 0 deletions python_backend/config_template.pbtxt
@@ -0,0 +1,180 @@
name: "py-model"
backend: "python"
max_batch_size: 4
input [
{
name: "input_ids"
data_type: TYPE_INT32
dims: [ -1 ]
},
{
# UNUSED
name: "start_id"
data_type: TYPE_INT32
dims: [ 1 ]
reshape: { shape: [ ] }
optional: true
},
{
# UNUSED
name: "end_id"
data_type: TYPE_INT32
dims: [ 1 ]
reshape: { shape: [ ] }
optional: true
},
{
name: "input_lengths"
data_type: TYPE_INT32
dims: [ 1 ]
reshape: { shape: [ ] }
},
{
name: "request_output_len"
data_type: TYPE_INT32
dims: [ -1 ]
},
{
name: "runtime_top_k"
data_type: TYPE_INT32
dims: [ 1 ]
reshape: { shape: [ ] }
optional: true
},
{
name: "runtime_top_p"
data_type: TYPE_FP32
dims: [ 1 ]
reshape: { shape: [ ] }
optional: true
},
{
# UNUSED
name: "beam_search_diversity_rate"
data_type: TYPE_FP32
dims: [ 1 ]
reshape: { shape: [ ] }
optional: true
},
{
name: "temperature"
data_type: TYPE_FP32
dims: [ 1 ]
reshape: { shape: [ ] }
optional: true
},
{
# UNUSED
name: "len_penalty"
data_type: TYPE_FP32
dims: [ 1 ]
reshape: { shape: [ ] }
optional: true
},
{
# UNUSED
name: "repetition_penalty"
data_type: TYPE_FP32
dims: [ 1 ]
reshape: { shape: [ ] }
optional: true
},
{
# UNUSED
name: "random_seed"
data_type: TYPE_INT32
dims: [ 1 ]
reshape: { shape: [ ] }
optional: true
},
{
# UNUSED
name: "is_return_log_probs"
data_type: TYPE_BOOL
dims: [ 1 ]
reshape: { shape: [ ] }
optional: true
},
{
# UNUSED
name: "beam_width"
data_type: TYPE_INT32
dims: [ 1 ]
reshape: { shape: [ ] }
optional: true
},
{
# UNUSED
name: "bad_words_list"
data_type: TYPE_INT32
dims: [ 2, -1 ]
optional: true
},
{
# UNUSED
name: "stop_words_list"
data_type: TYPE_INT32
dims: [ 2, -1 ]
optional: true
}
]
output [
{
name: "output_ids"
data_type: TYPE_INT32
dims: [ -1, -1, -1 ]
},
{
name: "sequence_length"
data_type: TYPE_INT32
dims: [ -1, -1 ]
} #,
# Following is currently unsupported, but should be supported in the future
# {
# name: "cum_log_probs"
# data_type: TYPE_FP32
# dims: [ -1 ]
# },
# {
# name: "output_log_probs"
# data_type: TYPE_FP32
# dims: [ -1, -1 ]
# }
]
# Run a single instance of this model; KIND_CPU only controls where Triton schedules the backend process, the Python code can still move the model to GPU
instance_group [
{
count: 1
kind: KIND_CPU
}
]
parameters {
key: "use_half"
value: {
string_value: "1"
}
}
parameters {
key: "model_name"
value: {
string_value: "${model_name}" # e.g. "codegen-350M-multi"
}
}
parameters {
key: "org_name"
value: {
string_value: "${org_name}" # e.g. "Salesforce"
}
}
parameters {
key: "use_int8",
value: {
string_value: "${use_int8}" # e.g. "0" or "1"
}
}
parameters {
key: "use_auto_device_map",
value: {
string_value: "${use_auto_device_map}" # e.g. "0" or "1"
}
}
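
The ${...} placeholders above are filled in by init_model.py (the next file), and the rendered parameters block is read at startup by the model.py that the script copies next to it. That model.py is not part of this diff; the following is only a sketch of how a Triton Python backend typically consumes such a config, with model loading and generation stubbed out and output shapes simplified.

```python
# Sketch of a Triton Python backend reading the parameters above.
# NOT the PR's model.py: loading and generation are stubbed out.
import json
import numpy as np
import triton_python_backend_utils as pb_utils

class TritonPythonModel:
    def initialize(self, args):
        cfg = json.loads(args["model_config"])
        params = {k: v["string_value"] for k, v in cfg["parameters"].items()}
        self.model_id = f'{params["org_name"]}/{params["model_name"]}'  # e.g. "Salesforce/codegen-350M-multi"
        self.use_half = params.get("use_half", "1") == "1"
        self.use_int8 = params.get("use_int8", "0") == "1"
        # A real backend would load the Hugging Face model here
        # (see the loader sketch under the Dockerfile above).

    def execute(self, requests):
        responses = []
        for request in requests:
            input_ids = pb_utils.get_input_tensor_by_name(request, "input_ids").as_numpy()
            requested_len = pb_utils.get_input_tensor_by_name(request, "request_output_len").as_numpy()
            # Placeholder: a real backend would generate up to requested_len new tokens;
            # here the prompt is echoed back and the output shapes are simplified.
            output_ids = input_ids.astype(np.int32)
            seq_len = np.full((input_ids.shape[0], 1), input_ids.shape[1], dtype=np.int32)
            responses.append(pb_utils.InferenceResponse(output_tensors=[
                pb_utils.Tensor("output_ids", output_ids),
                pb_utils.Tensor("sequence_length", seq_len),
            ]))
        return responses
```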
44 changes: 44 additions & 0 deletions python_backend/init_model.py
@@ -0,0 +1,44 @@
"""
A simple script that sets up the model directory of a given model for Triton.
"""

import argparse
import os
import shutil
from pathlib import Path
from string import Template

SCRIPT_DIR = Path(__file__).parent
CONFIG_TEMPLATE_PATH = os.path.join(SCRIPT_DIR, 'config_template.pbtxt')

parser = argparse.ArgumentParser()
parser.add_argument("--model_dir", type=str, required=True)
parser.add_argument("--model_name", type=str, required=True)
parser.add_argument("--org_name", type=str, required=True)
parser.add_argument("--use_half", type=str, default="1")
parser.add_argument("--use_int8", type=str, default="0")
parser.add_argument("--use_auto_device_map", type=str, default="1")
args = parser.parse_args()


# Step 1: Make model directory
model_dir_path = Path(os.path.join(Path(args.model_dir), f"py-{args.org_name}-{args.model_name}/py-model/1"))
model_dir_path.mkdir(parents=True, exist_ok=True)

# Step 2: copy model.py
shutil.copy(os.path.join(SCRIPT_DIR, 'model.py'), os.path.join(model_dir_path, 'model.py'))

# Step 3: Generate config.pbtxt
with open(CONFIG_TEMPLATE_PATH, 'r') as f:
    template = Template(f.read())

config = template.substitute(
    org_name=args.org_name,
    model_name=args.model_name,
    use_half=args.use_half,
    use_int8=args.use_int8,
    use_auto_device_map=args.use_auto_device_map,
)
with open(os.path.join(model_dir_path, '../config.pbtxt'), 'w') as f:
    f.write(config)
    print(f"Config written to {os.path.abspath(f.name)}")