Skip to content
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,7 @@
dtype=torch.float,
injection_policy={Wav2Vec2EncoderLayer: ('attention.out_proj','feed_forward.output_dense')},
replace_with_kernel_inject=False)
model.to(f'cuda:{local_rank}')
model.to(f'cuda:{local_rank}')
def map_to_array(batch):
speech, _ = sf.read(batch["file"])
batch["speech"] = speech
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -193,7 +193,7 @@ def main():
required=False,
help="Path to pre-trained model or shortcut name selected in the list: " + ", ".join(MODEL_CLASSES.keys()),
)

parser.add_argument("--prompt", type=str, default="")
parser.add_argument("--length", type=int, default=20)
parser.add_argument("--stop_token", type=str, default=None, help="Token at which text generation is stopped")
Expand All @@ -214,7 +214,7 @@ def main():
parser.add_argument("--padding_text", type=str, default="", help="Deprecated, the use of `--prefix` is preferred.")
parser.add_argument("--xlm_language", type=str, default="", help="Optional language when used with the XLM model.")

parser.add_argument("--local_rank", type=int, default=0, help="local rank")
parser.add_argument("--local_rank", type=int, default=0, help="local rank")
parser.add_argument("--seed", type=int, default=42, help="random seed for initialization")
parser.add_argument("--no_cuda", action="store_true", help="Avoid using CUDA when available")
parser.add_argument("--num_return_sequences", type=int, default=1, help="The number of samples to generate.")
Expand All @@ -235,7 +235,7 @@ def main():
args.n_gpu,
args.fp16,
)

set_seed(args)

# Initialize the model and tokenizer
Expand All @@ -256,9 +256,9 @@ def main():
if args.ds_inference:
import deepspeed.module_inject as module_inject
import deepspeed
injection_policy={gpt2_transformer:
injection_policy={gpt2_transformer:
module_inject.replace_policy.HFGPT2LayerPolicy}
model = deepspeed.init_inference(model,
model = deepspeed.init_inference(model,
mp_size=1,
dtype=(torch.half if args.fp16 else torch.float),
injection_policy=injection_policy,
Expand Down Expand Up @@ -293,7 +293,7 @@ def main():
prefix = args.prefix if args.prefix else args.padding_text
for ppt in prompt_text:
eprompt.append(tokenizer.encode(prefix + ppt, add_special_tokens=False, return_tensors="pt"))

latencies = []
for encoded_prompt, ppt in zip(eprompt, prompt_text):
encoded_prompt = encoded_prompt.to(args.device)
Expand All @@ -302,10 +302,10 @@ def main():
input_ids = None
else:
input_ids = encoded_prompt

torch.cuda.synchronize()
t0 = time.time()

output_sequences = model.generate(
input_ids=input_ids,
max_length=args.length + len(encoded_prompt[0]),
Expand Down
31 changes: 31 additions & 0 deletions inference/huggingface/text-generation/test-bloom.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,31 @@
import os
import torch
import deepspeed
import transformers

# Pipeline class to mimic HF pipeline
from utils import Pipeline

# Model and generation configuration for this example.
checkpoint = 'bigscience/bloom-3b'
inference_dtype = torch.float16
new_tokens = 100

# Get local gpu rank from torch.distributed/deepspeed launcher
rank = int(os.getenv('LOCAL_RANK', '0'))
n_ranks = int(os.getenv('WORLD_SIZE', '1'))

# Build the tokenizer plus (meta-tensor) model wrapper.
pipe = Pipeline(model_name=checkpoint, dtype=inference_dtype)

# Replace the meta model with a DeepSpeed inference engine that loads the
# real weights from the checkpoint manifest generated by the Pipeline.
pipe.model = deepspeed.init_inference(pipe.model,
                                      mp_size=n_ranks,
                                      dtype=inference_dtype,
                                      replace_with_kernel_inject=True,
                                      base_dir=pipe.repo_root,
                                      checkpoint=pipe.checkpoints_json)

output = pipe('DeepSpeed is', num_tokens=new_tokens, do_sample=False)
print(output)
2 changes: 1 addition & 1 deletion inference/huggingface/text-generation/test-gpt2.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,6 @@
mp_size=world_size,
dtype=torch.half,
replace_with_kernel_inject=True)

string = generator("DeepSpeed is", min_length=50, max_length=50, do_sample=True, use_cache=True)
print(string)
2 changes: 1 addition & 1 deletion inference/huggingface/text-generation/test-gptj.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,6 @@
mp_size=world_size,
dtype=torch.half,
replace_with_kernel_inject=True)

string = generator("DeepSpeed is", min_length=50, max_length=50, do_sample=True, use_cache=True)
print(string)
77 changes: 77 additions & 0 deletions inference/huggingface/text-generation/utils.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,77 @@
'''
Helper classes and functions for examples
'''

import io
from pathlib import Path
import json
import deepspeed
import torch
from huggingface_hub import snapshot_download
from transformers import AutoConfig, AutoModelForCausalLM, AutoTokenizer

class Pipeline():
    '''Example helper class, meant to mimic HF pipelines.

    Loads the tokenizer and a causal-LM model for ``model_name``. With
    ``is_meta=True`` the model is instantiated on the meta device (no
    weights materialized) and a checkpoint manifest (``checkpoints.json``)
    is written so DeepSpeed can load the real weights later; otherwise the
    weights are loaded eagerly via ``from_pretrained``.
    '''
    def __init__(self,
                 model_name='bigscience/bloom-3b',
                 dtype=torch.float16,
                 is_meta=True
                 ):
        self.model_name = model_name
        self.dtype = dtype
        self.tokenizer = AutoTokenizer.from_pretrained(model_name)

        if is_meta:
            # When meta tensors are enabled, defer weight loading to the
            # checkpoint files listed in the generated checkpoints.json.
            self.config = AutoConfig.from_pretrained(self.model_name)
            self.repo_root, self.checkpoints_json = self.generate_json()

            with deepspeed.OnDevice(dtype=self.dtype, device="meta"):
                self.model = AutoModelForCausalLM.from_config(self.config, torch_dtype=self.dtype)
        else:
            self.model = AutoModelForCausalLM.from_pretrained(self.model_name)

        self.model.eval()


    def __call__(self,
                 inputs=None,
                 num_tokens=100,
                 do_sample=False):
        '''Generate ``num_tokens`` new tokens for a prompt (str) or list of prompts.

        Returns the decoded output strings, one per input prompt.
        '''
        # None (rather than a mutable list default) means "use the demo prompt".
        if inputs is None:
            inputs = ["test"]
        input_list = [inputs] if isinstance(inputs, str) else inputs
        return self.generate_outputs(input_list, num_tokens=num_tokens, do_sample=do_sample)


    def generate_json(self):
        '''Download the model snapshot and write a DeepSpeed checkpoint manifest.

        Returns a ``(repo_root, checkpoints_json_path)`` tuple.
        '''
        repo_root = snapshot_download(self.model_name, allow_patterns=["*"], local_files_only=False, revision=None)

        checkpoints_json = "checkpoints.json"

        with io.open(checkpoints_json, "w", encoding="utf-8") as f:
            # Collect torch checkpoint shards. The previous glob
            # "*.[bp][it][n]" only matched three-character extensions
            # (".bin" but never ".pt", while also matching e.g. ".btn"),
            # so test the suffix explicitly instead.
            file_list = [
                str(entry)
                for entry in Path(repo_root).rglob("*")
                if entry.is_file() and entry.suffix in (".bin", ".pt")
            ]
            data = {"type": self.config.model_type, "checkpoints": file_list, "version": 1.0}
            json.dump(data, f)

        return repo_root, checkpoints_json


    def generate_outputs(self,
                         inputs=None,
                         num_tokens=100,
                         do_sample=False):
        '''Tokenize ``inputs``, run ``model.generate`` on the current CUDA device, and decode.'''
        # None (rather than a mutable list default) means "use the demo prompt".
        if inputs is None:
            inputs = ["test"]
        generate_kwargs = dict(max_new_tokens=num_tokens, do_sample=do_sample)

        input_tokens = self.tokenizer.batch_encode_plus(inputs, return_tensors="pt", padding=True)
        for t in input_tokens:
            if torch.is_tensor(input_tokens[t]):
                # Move every tensor field (input_ids, attention_mask) to the GPU.
                input_tokens[t] = input_tokens[t].to(torch.cuda.current_device())

        outputs = self.model.generate(**input_tokens, **generate_kwargs)
        outputs = self.tokenizer.batch_decode(outputs, skip_special_tokens=True)

        return outputs