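"""
vllm_benchmark.py

Benchmarks Mistral-7B text generation with the vLLM engine against plain
Hugging Face transformers generation, reporting word counts, throughput
(words/sec), per-prompt latency, and wall-clock times for a fixed set of
instructions.

Assumes a CUDA-capable GPU and that the HF_TOKEN environment variable holds a
Hugging Face access token with access to the Mistral checkpoints.
"""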
import os
import torch
from time import perf_counter
from vllm import LLM, SamplingParams
from huggingface_hub import snapshot_download
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig

def create_prompt(sample):
    """
    Formats a given sample into the Alpaca-style instruction prompt used with the
    mistral-7B-instruct model.
    Args:
        sample (str): The input sample containing an instruction and response.
    Returns:
        str: The formatted prompt for mistral-7B-instruct.
    """
    bos_token = "<s>"
    original_system_message = "Below is an instruction that describes a task. Write a response that appropriately completes the request."
    system_message = "Use the provided input to create an instruction that could have been used to generate the response with an LLM."
    response = sample.replace(original_system_message, "").replace("\n\n### Instruction\n", "").replace("\n### Response\n", "").strip()
    # NOTE: the original referenced an undefined `input`; assuming the raw sample
    # text is the intended "### Input" section.
    input_text = sample.strip()
    eos_token = "</s>"

    full_prompt = ""
    full_prompt += bos_token
    full_prompt += "### Instruction:"
    full_prompt += "\n" + system_message
    full_prompt += "\n\n### Input:"
    full_prompt += "\n" + input_text
    full_prompt += "\n\n### Response:"
    full_prompt += "\n" + response
    full_prompt += eos_token
    return full_prompt

def download_vllm_model():
    """
    Downloads the Mistral-7B-Instruct-v0.1 model from the Hugging Face Hub for use with vLLM.
    Returns:
        str: The path to the directory where the model is stored.
    """
    MODEL_DIR = '/model'
    os.makedirs(MODEL_DIR, exist_ok=True)
    # Read the access token from the environment instead of hard-coding it.
    snapshot_download('mistralai/Mistral-7B-Instruct-v0.1', local_dir=MODEL_DIR, token=os.environ.get("HF_TOKEN"))
    return MODEL_DIR

def generate_vllm_outputs(instructions, model_dir):
    """
    Generates outputs with the vLLM engine for the given instructions.
    Args:
        instructions (List[str]): A list of instruction strings to generate outputs for.
        model_dir (str): The path to the directory containing the model weights.
    Returns:
        List[vllm.RequestOutput]: The generated outputs, one per input instruction.
    """
    sampling_params = SamplingParams(temperature=0.75,
                                     top_p=1,
                                     max_tokens=8000,
                                     presence_penalty=1.15)
    llm = LLM(model=model_dir, dtype=torch.float16)
    outputs = llm.generate(instructions, sampling_params)
    return outputs

def calculate_vllm_num_of_words(outputs):
    """
    Calculates the total number of words in a list of outputs generated by vLLM.
    Args:
        outputs (List[vllm.RequestOutput]): The outputs returned by llm.generate().
    Returns:
        int: The total number of words in all the generated outputs.
    """
    num_of_words = 0
    for output in outputs:
        generated_text = output.outputs[0].text
        # Count whitespace-separated words in the generated text.
        num_of_words += len(generated_text.split())
    return num_of_words

def calculate_throughput(num_of_words, total_time_taken):
    """
    Calculates the throughput of a process in words per second.
    Args:
        num_of_words (int): The total number of words processed.
        total_time_taken (float): The total time taken to process the words, in seconds.
    Returns:
        float: The throughput, measured in words per second.
    """
    throughput = num_of_words / total_time_taken
    return throughput

def prompt_latency(num_of_words, time_taken_for_a_query):
    """
    Calculates the prompt latency, i.e. the average time taken to generate each word of a query.
    Args:
        num_of_words (int): The total number of words in the query.
        time_taken_for_a_query (float): The total time taken to process the entire query, in seconds.
    Returns:
        float: The prompt latency, measured in seconds per word.
    """
    # Latency is seconds per word: time divided by word count.
    latency = time_taken_for_a_query / num_of_words
    return latency
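
# Illustrative numbers only: a run that generates 1,200 words in 60 s overall has
# a throughput of calculate_throughput(1200, 60.0) == 20.0 words/sec, and a query
# that takes 5 s to produce 100 words has prompt_latency(100, 5.0) == 0.05 sec/word.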

def load_model(model_name):
    """
    Loads a pre-trained causal language model from the Hugging Face Model Hub.
    Args:
        model_name (str): The Hugging Face model name or path (e.g. a Mistral model).
    Returns:
        AutoModelForCausalLM: The loaded pre-trained language model.
    """
    model = AutoModelForCausalLM.from_pretrained(
        model_name,
        use_cache=False
    )
    # Cast to float16 and move to the GPU in one step.
    model = model.to(dtype=torch.float16, device='cuda')
    return model

def load_tokenizer(model_name):
    """
    Loads a pre-trained tokenizer associated with a language model from the Hugging Face Model Hub.
    Args:
        model_name (str): The Hugging Face model name or path (e.g. a Mistral model).
    Returns:
        transformers.AutoTokenizer: The loaded pre-trained tokenizer.
    """
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    tokenizer.pad_token = tokenizer.eos_token
    tokenizer.padding_side = "right"
    return tokenizer

def generate_llm_response(prompt, model, tokenizer):
    """
    Generates a response from a language model (LLM) based on a given prompt.
    Args:
        prompt (str): The input prompt for generating the response.
        model (transformers.AutoModelForCausalLM): The pre-trained language model.
        tokenizer (transformers.AutoTokenizer): The associated pre-trained tokenizer.
    Returns:
        str: The generated response from the language model.
    """
    encoded_input = tokenizer(prompt, return_tensors="pt", add_special_tokens=True)
    model_inputs = encoded_input.to('cuda')
    generated_ids = model.generate(**model_inputs, max_new_tokens=8000, do_sample=True, pad_token_id=tokenizer.eos_token_id)
    # Skip special tokens so the prompt can be stripped cleanly from the decoded text.
    decoded_output = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)
    return decoded_output[0].replace(prompt, "")
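
# Illustrative usage: generate_llm_response("Explain photosynthesis.", model, tokenizer)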

def calculate_llm_num_words(instructions, model, tokenizer):
    """
    Calculates the total number of words in the generated responses from a language model (LLM)
    based on a list of input instructions.
    Args:
        instructions (List[str]): A list of input instructions to generate responses for.
        model (transformers.AutoModelForCausalLM): The pre-trained language model.
        tokenizer (transformers.AutoTokenizer): The associated pre-trained tokenizer.
    Returns:
        int: The total number of words in all the generated responses.
    """
    num_of_words = 0
    for instruction in instructions:
        output = generate_llm_response(instruction, model, tokenizer)
        num_of_words += len(output.split())
    return num_of_words

if __name__ == '__main__':
    MODEL_DIR = download_vllm_model()
    # Use the same checkpoint for the transformers baseline as for vLLM so the
    # two backends are compared on equal footing.
    default_model_name = 'mistralai/Mistral-7B-Instruct-v0.1'
    instructions = [
        "Elaborate on the cultural heritage of Nepal.",
        "How did the Industrial Revolution impact European societies?",
        "Provide a concise overview of the theory of relativity by Albert Einstein.",
        "Explain the principles behind blockchain technology.",
        "Who were the key figures in the Renaissance and their contributions to art and science?",
        "Describe the process of photosynthesis and its significance in ecosystems.",
        "What are the main features of the Great Barrier Reef and its ecological importance?",
        "Explore the origins and development of jazz music in the United States.",
        "Give a brief history of the internet and its transformative effects on communication.",
        "What are the major causes and consequences of climate change?",
        "Who was Ada Lovelace, and what role did she play in the development of computer programming?",
        "Examine the impact of the Silk Road on cultural exchange between East and West.",
        "What is dark matter, and why is it important in our understanding of the universe?",
        "Explore the history and significance of the Rosetta Stone in deciphering ancient languages."
    ]

    # Measure time taken for vLLM generation
    start_time_vllm = perf_counter()
    vllm_outputs = generate_vllm_outputs(instructions, MODEL_DIR)
    end_time_vllm = perf_counter()
    total_time_taken_for_generation_vllm = end_time_vllm - start_time_vllm

    # Calculate time taken for a single query for vLLM
    time_taken_for_a_query_vllm = total_time_taken_for_generation_vllm / len(instructions)

    # Print vLLM results
    vllm_num_of_words = calculate_vllm_num_of_words(vllm_outputs)
    throughput_vllm = calculate_throughput(vllm_num_of_words, total_time_taken_for_generation_vllm)
    vllm_prompt_latency = prompt_latency(vllm_num_of_words, time_taken_for_a_query_vllm)
    print("Number of words generated by vLLM: ", vllm_num_of_words)
    print("Throughput with vLLM (words/sec): ", throughput_vllm)
    print("Latency for a prompt with vLLM (sec/word): ", vllm_prompt_latency)
    print("Total time taken for vLLM generation (sec): ", total_time_taken_for_generation_vllm)
    print("Time taken for a single query with vLLM (sec): ", time_taken_for_a_query_vllm)

    # Measure time taken for Hugging Face transformers generation
    model = load_model(default_model_name)
    tokenizer = load_tokenizer(default_model_name)
    start_time_llm = perf_counter()
    llm_num_of_words = calculate_llm_num_words(instructions, model, tokenizer)
    end_time_llm = perf_counter()
    total_time_taken_for_generation_llm = end_time_llm - start_time_llm

    # Calculate time taken for a single query for the transformers baseline
    time_taken_for_a_query_llm = total_time_taken_for_generation_llm / len(instructions)

    # Print transformers baseline results
    throughput_llm = calculate_throughput(llm_num_of_words, total_time_taken_for_generation_llm)
    llm_prompt_latency = prompt_latency(llm_num_of_words, time_taken_for_a_query_llm)
    print("\nNumber of words generated by HF transformers: ", llm_num_of_words)
    print("Throughput with HF transformers (words/sec): ", throughput_llm)
    print("Latency for a prompt with HF transformers (sec/word): ", llm_prompt_latency)
    print("Total time taken for HF transformers generation (sec): ", total_time_taken_for_generation_llm)
    print("Time taken for a single query with HF transformers (sec): ", time_taken_for_a_query_llm)