Add python backend support #86

Merged
8 commits, merged on Nov 23, 2022
5 changes: 5 additions & 0 deletions Dockerfile
@@ -0,0 +1,5 @@
FROM moyix/triton_with_ft:22.09

# Install dependencies for the Python backend: torch, transformers, bitsandbytes, accelerate
RUN python3 -m pip install --disable-pip-version-check -U torch --extra-index-url https://download.pytorch.org/whl/cu116
RUN python3 -m pip install --disable-pip-version-check -U transformers bitsandbytes accelerate
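
The extra packages installed here (transformers, bitsandbytes, accelerate) are what the new Python backend needs to load Hugging Face checkpoints with optional fp16 or int8 weights and automatic device placement. Below is a minimal sketch of how the use_half, use_int8, and use_auto_device_map options defined in the config template further down could map onto the transformers API; it is an illustration under those assumptions, not the PR's actual loading code.

```python
# Hypothetical loader showing how the installed dependencies are typically used.
# Parameter names mirror config_template.pbtxt, but this is not the PR's model.py.
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

def load_model(org_name, model_name, use_half="1", use_int8="0", use_auto_device_map="1"):
    repo = f"{org_name}/{model_name}"  # e.g. "Salesforce/codegen-350M-multi"
    kwargs = {}
    if use_int8 == "1":
        kwargs["load_in_8bit"] = True      # needs bitsandbytes + accelerate
    elif use_half == "1":
        kwargs["torch_dtype"] = torch.float16
    if use_auto_device_map == "1":
        kwargs["device_map"] = "auto"      # accelerate places layers across devices
    tokenizer = AutoTokenizer.from_pretrained(repo)
    model = AutoModelForCausalLM.from_pretrained(repo, **kwargs)
    return tokenizer, model
```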
20 changes: 12 additions & 8 deletions copilot_proxy/utils/codegen.py
@@ -73,13 +73,17 @@ def to_word_list_format(word_dict, tokenizer):
return np.array([flat_ids, offsets], dtype="int32").transpose((1, 0, 2))

def generate(self, data):
model_name = "fastertransformer"
prompt = data['prompt']
n = data.get('n', 1)
model_name = data["model"]
# Ugly hack to set the data type correctly: Hugging Face models want int32, but FasterTransformer needs uint32.
# The uint32-to-int32 conversion could be done in the model instead, but that would be inefficient.
np_type = np.int32 if model_name.startswith("py-") else np.uint32

input_start_ids = np.expand_dims(self.tokenizer.encode(prompt).ids, 0)
input_start_ids = np.repeat(input_start_ids, n, axis=0).astype(np.uint32)
input_start_ids = np.repeat(input_start_ids, n, axis=0).astype(np_type)
prompt_len = input_start_ids.shape[1]
input_len = prompt_len * np.ones([input_start_ids.shape[0], 1]).astype(np.uint32)
input_len = prompt_len * np.ones([input_start_ids.shape[0], 1]).astype(np_type)
max_tokens = data.get('max_tokens', 16)
prompt_tokens: int = input_len[0][0]
requested_tokens = max_tokens + prompt_tokens
@@ -90,7 +94,7 @@ def generate(self, data):
f"{requested_tokens} tokens ({prompt_tokens} in your prompt; {max_tokens} for the completion). "
f"Please reduce your prompt; or completion length."
)
output_len = np.ones_like(input_len).astype(np.uint32) * max_tokens
output_len = np.ones_like(input_len).astype(np_type) * max_tokens
num_logprobs = data.get('logprobs', -1)
if num_logprobs is None:
num_logprobs = 1
@@ -105,17 +109,17 @@ def generate(self, data):

top_p = data.get('top_p', 1.0)
frequency_penalty = data.get('frequency_penalty', 1.0)
runtime_top_k = top_k * np.ones([input_start_ids.shape[0], 1]).astype(np.uint32)
runtime_top_k = top_k * np.ones([input_start_ids.shape[0], 1]).astype(np_type)
runtime_top_p = top_p * np.ones([input_start_ids.shape[0], 1]).astype(np.float32)
beam_search_diversity_rate = 0.0 * np.ones([input_start_ids.shape[0], 1]).astype(np.float32)
random_seed = np.random.randint(0, 2 ** 31 - 1, (input_start_ids.shape[0], 1), dtype=np.int32)
temperature = temperature * np.ones([input_start_ids.shape[0], 1]).astype(np.float32)
len_penalty = 1.0 * np.ones([input_start_ids.shape[0], 1]).astype(np.float32)
repetition_penalty = frequency_penalty * np.ones([input_start_ids.shape[0], 1]).astype(np.float32)
is_return_log_probs = want_logprobs * np.ones([input_start_ids.shape[0], 1]).astype(np.bool_)
beam_width = (1 * np.ones([input_start_ids.shape[0], 1])).astype(np.uint32)
start_ids = self.PAD_CHAR * np.ones([input_start_ids.shape[0], 1]).astype(np.uint32)
end_ids = self.PAD_CHAR * np.ones([input_start_ids.shape[0], 1]).astype(np.uint32)
beam_width = (1 * np.ones([input_start_ids.shape[0], 1])).astype(np_type)
start_ids = self.PAD_CHAR * np.ones([input_start_ids.shape[0], 1]).astype(np_type)
end_ids = self.PAD_CHAR * np.ones([input_start_ids.shape[0], 1]).astype(np_type)

stop_words = data.get('stop', [])
if stop_words is None:
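
For context on why the np_type switch above matters: these arrays are eventually packed into Triton inference inputs, and Triton validates each tensor's dtype against the data_type declared in the target model's config.pbtxt (uint32 for the FasterTransformer model, int32 for the new py-* models). A rough sketch of that packing with the tritonclient library follows; the helper name and request flow are illustrative, not the proxy's exact code.

```python
# Illustrative only: the dtype of input_start_ids must already match what the
# target model's config.pbtxt declares (uint32 for fastertransformer, int32 for py-*).
import numpy as np
import tritonclient.grpc as client_util
from tritonclient.utils import np_to_triton_dtype

def prepare_tensor(name, array):
    t = client_util.InferInput(name, array.shape, np_to_triton_dtype(array.dtype))
    t.set_data_from_numpy(array)
    return t

client = client_util.InferenceServerClient(url="triton:8001")
np_type = np.int32  # would be np.uint32 when targeting the fastertransformer model
input_start_ids = np.array([[1, 2, 3]], dtype=np_type)
input_len = np.array([[input_start_ids.shape[1]]], dtype=np_type)
output_len = np.array([[16]], dtype=np_type)
inputs = [
    prepare_tensor("input_ids", input_start_ids),
    prepare_tensor("input_lengths", input_len),
    prepare_tensor("request_output_len", output_len),
]
result = client.infer("py-model", inputs)
output_ids = result.as_numpy("output_ids")
```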
5 changes: 4 additions & 1 deletion docker-compose.yaml
@@ -1,11 +1,14 @@
version: '3.3'
services:
triton:
image: moyix/triton_with_ft:22.09
build:
context: .
dockerfile: Dockerfile
Collaborator:
As mentioned in my other comment, I think the dependencies should be added in moyix/triton_with_ft

Collaborator Author:
Sounds good to me. I can make a PR to that repo instead

command: bash -c "CUDA_VISIBLE_DEVICES=${GPUS} mpirun -n 1 --allow-run-as-root /opt/tritonserver/bin/tritonserver --model-repository=/model"
shm_size: '2gb'
volumes:
- ${MODEL_DIR}:/model
- ${HF_CACHE_DIR}:/root/.cache/huggingface
Collaborator:
If no HF_CACHE_DIR is set this breaks the deployment. Maybe we should default to true?

Collaborator Author:
The current version sets HF_CACHE_DIR to /tmp/hf_cache. The reason I didn't default it to true is that Docker messes up the file permissions by setting many of them to root (unless rootless Docker is used). So the cache is shared only if the user knows what they're doing.

I could default it to true and warn the user about the permission issues. Not sure which is the better option.

@fdegier, @moyix thoughts?

Collaborator (@fdegier, Oct 25, 2022):
@thakkarparth007 If a user does not use the cache, the volume is set to an empty value, which cannot be mounted and causes docker compose up to fail. That's why I suggested defaulting to true, but I understand the permission issues you mentioned.

I think adding something like HF_DATASETS_CACHE="fauxpilot/.hf-cache" and removing the cache option would always cache without messing up permissions? The cache path needs to be verified; consider it just an example.

Collaborator Author:
Whoops, I just saw your comment @fdegier

Currently, if you look at setup.sh (https://github.com/moyix/fauxpilot/pull/86/files#diff-4209d788ad32c40cbda3c66b3de47eefb929308ca703bb77a6382625986add17R148), you'll see that HF_CACHE_DIR is set to /tmp/hf_cache if the user doesn't want to share their Hugging Face cache.

But yes, perhaps it'll be better to store the cache in the fauxpilot directory itself. Updated it!

ports:
- "8000:8000"
- "8001:8001"
180 changes: 180 additions & 0 deletions python_backend/config_template.pbtxt
@@ -0,0 +1,180 @@
name: "py-model"
backend: "python"
max_batch_size: 4
input [
{
name: "input_ids"
data_type: TYPE_INT32
dims: [ -1 ]
},
{
# UNUSED
name: "start_id"
data_type: TYPE_INT32
dims: [ 1 ]
reshape: { shape: [ ] }
optional: true
},
{
# UNUSED
name: "end_id"
data_type: TYPE_INT32
dims: [ 1 ]
reshape: { shape: [ ] }
optional: true
},
{
name: "input_lengths"
data_type: TYPE_INT32
dims: [ 1 ]
reshape: { shape: [ ] }
},
{
name: "request_output_len"
data_type: TYPE_INT32
dims: [ -1 ]
},
{
name: "runtime_top_k"
data_type: TYPE_INT32
dims: [ 1 ]
reshape: { shape: [ ] }
optional: true
},
{
name: "runtime_top_p"
data_type: TYPE_FP32
dims: [ 1 ]
reshape: { shape: [ ] }
optional: true
},
{
# UNUSED
name: "beam_search_diversity_rate"
data_type: TYPE_FP32
dims: [ 1 ]
reshape: { shape: [ ] }
optional: true
},
{
name: "temperature"
data_type: TYPE_FP32
dims: [ 1 ]
reshape: { shape: [ ] }
optional: true
},
{
# UNUSED
name: "len_penalty"
data_type: TYPE_FP32
dims: [ 1 ]
reshape: { shape: [ ] }
optional: true
},
{
# UNUSED
name: "repetition_penalty"
data_type: TYPE_FP32
dims: [ 1 ]
reshape: { shape: [ ] }
optional: true
},
{
# UNUSED
name: "random_seed"
data_type: TYPE_INT32
dims: [ 1 ]
reshape: { shape: [ ] }
optional: true
},
{
# UNUSED
name: "is_return_log_probs"
data_type: TYPE_BOOL
dims: [ 1 ]
reshape: { shape: [ ] }
optional: true
},
{
# UNUSED
name: "beam_width"
data_type: TYPE_INT32
dims: [ 1 ]
reshape: { shape: [ ] }
optional: true
},
{
# UNUSED
name: "bad_words_list"
data_type: TYPE_INT32
dims: [ 2, -1 ]
optional: true
},
{
# UNUSED
name: "stop_words_list"
data_type: TYPE_INT32
dims: [ 2, -1 ]
optional: true
}
]
output [
{
name: "output_ids"
data_type: TYPE_INT32
dims: [ -1, -1, -1 ]
},
{
name: "sequence_length"
data_type: TYPE_INT32
dims: [ -1, -1 ]
} #,
# Following is currently unsupported, but should be supported in the future
# {
# name: "cum_log_probs"
# data_type: TYPE_FP32
# dims: [ -1 ]
# },
# {
# name: "output_log_probs"
# data_type: TYPE_FP32
# dims: [ -1, -1 ]
# }
]
# Run a single instance of this model; KIND_CPU only controls where Triton schedules the backend process, the Python code can still move the model to GPU
instance_group [
{
count: 1
kind: KIND_CPU
}
]
parameters {
key: "use_half"
value: {
string_value: "1"
}
}
parameters {
key: "model_name"
value: {
string_value: "${model_name}" # e.g. "codegen-350M-multi"
}
}
parameters {
key: "org_name"
value: {
string_value: "${org_name}" # e.g. "Salesforce"
}
}
parameters {
key: "use_int8",
value: {
string_value: "${use_int8}" # e.g. "0" or "1"
}
}
parameters {
key: "use_auto_device_map",
value: {
string_value: "${use_auto_device_map}" # e.g. "0" or "1"
}
}
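
The ${...} placeholders above are filled in by init_model.py (the next file), and the rendered parameters block is read at startup by the model.py that the script copies next to it. That model.py is not part of this diff; the following is only a sketch of how a Triton Python backend typically consumes such a config, with model loading and generation stubbed out and output shapes simplified.

```python
# Sketch of a Triton Python backend reading the parameters above.
# NOT the PR's model.py: loading and generation are stubbed out.
import json
import numpy as np
import triton_python_backend_utils as pb_utils

class TritonPythonModel:
    def initialize(self, args):
        cfg = json.loads(args["model_config"])
        params = {k: v["string_value"] for k, v in cfg["parameters"].items()}
        self.model_id = f'{params["org_name"]}/{params["model_name"]}'  # e.g. "Salesforce/codegen-350M-multi"
        self.use_half = params.get("use_half", "1") == "1"
        self.use_int8 = params.get("use_int8", "0") == "1"
        # A real backend would load the Hugging Face model here
        # (see the loader sketch under the Dockerfile above).

    def execute(self, requests):
        responses = []
        for request in requests:
            input_ids = pb_utils.get_input_tensor_by_name(request, "input_ids").as_numpy()
            requested_len = pb_utils.get_input_tensor_by_name(request, "request_output_len").as_numpy()
            # Placeholder: a real backend would generate up to requested_len new tokens;
            # here the prompt is echoed back and the output shapes are simplified.
            output_ids = input_ids.astype(np.int32)
            seq_len = np.full((input_ids.shape[0], 1), input_ids.shape[1], dtype=np.int32)
            responses.append(pb_utils.InferenceResponse(output_tensors=[
                pb_utils.Tensor("output_ids", output_ids),
                pb_utils.Tensor("sequence_length", seq_len),
            ]))
        return responses
```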
44 changes: 44 additions & 0 deletions python_backend/init_model.py
@@ -0,0 +1,44 @@
"""
A simple script that sets up the model directory of a given model for Triton.
"""

import argparse
import os
import shutil
from pathlib import Path
from string import Template

SCRIPT_DIR = Path(__file__).parent
CONFIG_TEMPLATE_PATH = os.path.join(SCRIPT_DIR, 'config_template.pbtxt')

parser = argparse.ArgumentParser()
parser.add_argument("--model_dir", type=str, required=True)
parser.add_argument("--model_name", type=str, required=True)
parser.add_argument("--org_name", type=str, required=True)
parser.add_argument("--use_half", type=str, default="1")
parser.add_argument("--use_int8", type=str, default="0")
parser.add_argument("--use_auto_device_map", type=str, default="1")
args = parser.parse_args()


# Step 1: Make model directory
model_dir_path = Path(os.path.join(Path(args.model_dir), f"py-{args.org_name}-{args.model_name}/py-model/1"))
model_dir_path.mkdir(parents=True, exist_ok=True)

# Step 2: copy model.py
shutil.copy(os.path.join(SCRIPT_DIR, 'model.py'), os.path.join(model_dir_path, 'model.py'))

# Step 3: Generate config.pbtxt
with open(CONFIG_TEMPLATE_PATH, 'r') as f:
    template = Template(f.read())

config = template.substitute(
    org_name=args.org_name,
    model_name=args.model_name,
    use_half=args.use_half,
    use_int8=args.use_int8,
    use_auto_device_map=args.use_auto_device_map,
)
with open(os.path.join(model_dir_path, '../config.pbtxt'), 'w') as f:
    f.write(config)
    print(f"Config written to {os.path.abspath(f.name)}")