From a6f8916133b7ea2b712f032d95812d9efbf85bf7 Mon Sep 17 00:00:00 2001
From: Gnurro
Date: Tue, 14 Nov 2023 18:41:26 +0100
Subject: [PATCH 1/6] Replace usage of buggy tokenizer method argument with decoding of prompt tokens for proper model output culling

---
 backends/huggingface_local_api.py | 4 ++--
 backends/llama2_hf_local_api.py   | 5 ++---
 2 files changed, 4 insertions(+), 5 deletions(-)

diff --git a/backends/huggingface_local_api.py b/backends/huggingface_local_api.py
index 9a8474e255..731357ae16 100644
--- a/backends/huggingface_local_api.py
+++ b/backends/huggingface_local_api.py
@@ -163,7 +163,7 @@ def generate_response(self, messages: List[Dict], model: str,
         prompt_tokens = self.tokenizer.apply_chat_template(messages, return_tensors="pt")
         prompt_tokens = prompt_tokens.to(self.device)
 
-        prompt_text = self.tokenizer.apply_chat_template(messages, tokenize=False)
+        prompt_text = self.tokenizer.batch_decode(prompt_tokens)[0]
         prompt = {"inputs": prompt_text, "max_new_tokens": max_new_tokens,
                   "temperature": self.temperature, "return_full_text": return_full_text}
 
@@ -189,7 +189,7 @@ def generate_response(self, messages: List[Dict], model: str,
             do_sample=do_sample
         )
 
-        model_output = self.tokenizer.batch_decode(model_output_ids, skip_special_tokens=True)[0]
+        model_output = self.tokenizer.batch_decode(model_output_ids)[0]
 
         # cull input context; equivalent to transformers.pipeline method:
         if not return_full_text:
diff --git a/backends/llama2_hf_local_api.py b/backends/llama2_hf_local_api.py
index 5a42e4b69d..aa151a8d26 100644
--- a/backends/llama2_hf_local_api.py
+++ b/backends/llama2_hf_local_api.py
@@ -100,7 +100,7 @@ def generate_response(self, messages: List[Dict], model: str,
             prompt_tokens = self.tokenizer.apply_chat_template(messages, return_tensors="pt")
             prompt_tokens = prompt_tokens.to(self.device)
             # apply chat template for records:
-            prompt_text = self.tokenizer.apply_chat_template(messages, tokenize=False)
+            prompt_text = self.tokenizer.batch_decode(prompt_tokens)[0]
             prompt = {"inputs": prompt_text, "max_new_tokens": max_new_tokens,
                       "temperature": self.temperature}
 
@@ -119,8 +119,7 @@ def generate_response(self, messages: List[Dict], model: str,
                 max_new_tokens=max_new_tokens
             )
 
-            model_output = self.tokenizer.batch_decode(model_output_ids, skip_special_tokens=True,
-                                                       clean_up_tokenization_spaces=False)[0]
+            model_output = self.tokenizer.batch_decode(model_output_ids)[0]
 
             response = {
                 "role": "assistant",

From 79578f0b3158293314966381f33a00b647556086 Mon Sep 17 00:00:00 2001
From: Gnurro
Date: Tue, 14 Nov 2023 18:48:24 +0100
Subject: [PATCH 2/6] Applying decoding changes for non-chat as well

---
 backends/llama2_hf_local_api.py | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/backends/llama2_hf_local_api.py b/backends/llama2_hf_local_api.py
index aa151a8d26..304f4c31b9 100644
--- a/backends/llama2_hf_local_api.py
+++ b/backends/llama2_hf_local_api.py
@@ -152,8 +152,7 @@ def generate_response(self, messages: List[Dict], model: str,
                 max_new_tokens=max_new_tokens
             )
 
-            model_output = self.tokenizer.batch_decode(model_output_ids, skip_special_tokens=True,
-                                                       clean_up_tokenization_spaces=False)[0]
+            model_output = self.tokenizer.batch_decode(model_output_ids)[0]
 
             response_text = model_output.replace(prompt, '').strip()
 

From c559726a1ccf643a75da58a56f8be95ed0d4cbb4 Mon Sep 17 00:00:00 2001
From: Gnurro
Date: Wed, 15 Nov 2023 13:44:18 +0100
Subject: [PATCH 3/6] Added flattening of assistant/assistant message pairs

---
 backends/huggingface_local_api.py | 3 +++
 backends/llama2_hf_local_api.py   | 3 +++
 2 files changed, 6 insertions(+)

diff --git a/backends/huggingface_local_api.py b/backends/huggingface_local_api.py
index 731357ae16..ced0ad0760 100644
--- a/backends/huggingface_local_api.py
+++ b/backends/huggingface_local_api.py
@@ -158,6 +158,9 @@ def generate_response(self, messages: List[Dict], model: str,
             if msg_idx > 0 and message['role'] == "user" and messages[msg_idx - 1]['role'] == "user":
                 messages[msg_idx - 1]['content'] += f" {message['content']}"
                 del messages[msg_idx]
+            elif msg_idx > 0 and message['role'] == "assistant" and messages[msg_idx - 1]['role'] == "assistant":
+                messages[msg_idx - 1]['content'] += f" {message['content']}"
+                del messages[msg_idx]
 
         # apply chat template & tokenize:
         prompt_tokens = self.tokenizer.apply_chat_template(messages, return_tensors="pt")
diff --git a/backends/llama2_hf_local_api.py b/backends/llama2_hf_local_api.py
index 304f4c31b9..5a39b97cae 100644
--- a/backends/llama2_hf_local_api.py
+++ b/backends/llama2_hf_local_api.py
@@ -95,6 +95,9 @@ def generate_response(self, messages: List[Dict], model: str,
                 if msg_idx > 0 and message['role'] == "user" and messages[msg_idx - 1]['role'] == "user":
                     messages[msg_idx - 1]['content'] += f" {message['content']}"
                     del messages[msg_idx]
+                elif msg_idx > 0 and message['role'] == "assistant" and messages[msg_idx - 1]['role'] == "assistant":
+                    messages[msg_idx - 1]['content'] += f" {message['content']}"
+                    del messages[msg_idx]
 
             # apply chat template & tokenize
             prompt_tokens = self.tokenizer.apply_chat_template(messages, return_tensors="pt")

From 016f2c13a50d126e5a2d92c31e47532354ce0038 Mon Sep 17 00:00:00 2001
From: Gnurro
Date: Wed, 15 Nov 2023 13:54:55 +0100
Subject: [PATCH 4/6] Add removal of llama2 EOS token at the end of model outputs

---
 backends/huggingface_local_api.py | 5 ++++-
 backends/llama2_hf_local_api.py   | 9 +++++++--
 2 files changed, 11 insertions(+), 3 deletions(-)

diff --git a/backends/huggingface_local_api.py b/backends/huggingface_local_api.py
index ced0ad0760..c9b1a771fa 100644
--- a/backends/huggingface_local_api.py
+++ b/backends/huggingface_local_api.py
@@ -197,10 +197,13 @@ def generate_response(self, messages: List[Dict], model: str,
         # cull input context; equivalent to transformers.pipeline method:
         if not return_full_text:
             response_text = model_output.replace(prompt_text, '').strip()
+            # remove llama2 EOS token at the end of output:
+            if response_text[-4:len(response_text)] == "</s>":
+                response_text = response_text[:-4]
         else:
             response_text = model_output.strip()
 
-        response = {'response': model_output}
+        response = {'response': response_text}
         return prompt, response, response_text
 
     def supports(self, model_name: str):
diff --git a/backends/llama2_hf_local_api.py b/backends/llama2_hf_local_api.py
index 5a39b97cae..d0809970f4 100644
--- a/backends/llama2_hf_local_api.py
+++ b/backends/llama2_hf_local_api.py
@@ -123,13 +123,18 @@ def generate_response(self, messages: List[Dict], model: str,
             )
 
             model_output = self.tokenizer.batch_decode(model_output_ids)[0]
+            # cull prompt from output:
+            model_output = model_output.replace(prompt_text, "").strip()
+            # remove EOS token at the end of output:
+            if model_output[-4:len(model_output)] == "</s>":
+                model_output = model_output[:-4]
 
             response = {
                 "role": "assistant",
-                "content": model_output.replace(prompt_text, ''),
+                "content": model_output,
             }
 
-            response_text = model_output.replace(prompt_text, '').strip()
+            response_text = model_output
 
         else:  # default (text completion)
             prompt = "\n".join([message["content"] for message in messages])

From bdbc71a4ea788a5bfb27b4a4fa586e9970cabb0e Mon Sep 17 00:00:00 2001
From: Gnurro
Date: Thu, 16 Nov 2023 23:36:14 +0100
Subject: [PATCH 5/6] Add deepcopy of input messages to prevent reference issues

---
 backends/huggingface_local_api.py | 20 ++++++++++++--------
 backends/llama2_hf_local_api.py   | 22 +++++++++++++---------
 2 files changed, 25 insertions(+), 17 deletions(-)

diff --git a/backends/huggingface_local_api.py b/backends/huggingface_local_api.py
index c9b1a771fa..b00c4ab9bb 100644
--- a/backends/huggingface_local_api.py
+++ b/backends/huggingface_local_api.py
@@ -6,6 +6,7 @@
 import transformers
 from transformers import AutoTokenizer, AutoModelForCausalLM
 import os
+import copy
 
 logger = backends.get_logger(__name__)
 
@@ -153,17 +154,20 @@ def generate_response(self, messages: List[Dict], model: str,
         logger.info(f"Finished loading huggingface model: {model}")
         logger.info(f"Model device map: {self.model.hf_device_map}")
 
+        # deepcopy messages to prevent reference issues:
+        current_messages = copy.deepcopy(messages)
+
         # flatten consecutive user messages:
-        for msg_idx, message in enumerate(messages):
-            if msg_idx > 0 and message['role'] == "user" and messages[msg_idx - 1]['role'] == "user":
-                messages[msg_idx - 1]['content'] += f" {message['content']}"
-                del messages[msg_idx]
-            elif msg_idx > 0 and message['role'] == "assistant" and messages[msg_idx - 1]['role'] == "assistant":
-                messages[msg_idx - 1]['content'] += f" {message['content']}"
-                del messages[msg_idx]
+        for msg_idx, message in enumerate(current_messages):
+            if msg_idx > 0 and message['role'] == "user" and current_messages[msg_idx - 1]['role'] == "user":
+                current_messages[msg_idx - 1]['content'] += f" {message['content']}"
+                del current_messages[msg_idx]
+            elif msg_idx > 0 and message['role'] == "assistant" and current_messages[msg_idx - 1]['role'] == "assistant":
+                current_messages[msg_idx - 1]['content'] += f" {message['content']}"
+                del current_messages[msg_idx]
 
         # apply chat template & tokenize:
-        prompt_tokens = self.tokenizer.apply_chat_template(messages, return_tensors="pt")
+        prompt_tokens = self.tokenizer.apply_chat_template(current_messages, return_tensors="pt")
         prompt_tokens = prompt_tokens.to(self.device)
 
         prompt_text = self.tokenizer.batch_decode(prompt_tokens)[0]
diff --git a/backends/llama2_hf_local_api.py b/backends/llama2_hf_local_api.py
index d0809970f4..280082b984 100644
--- a/backends/llama2_hf_local_api.py
+++ b/backends/llama2_hf_local_api.py
@@ -6,6 +6,7 @@
 import transformers
 from transformers import AutoTokenizer, AutoModelForCausalLM
 import os
+import copy
 
 logger = backends.get_logger(__name__)
 
@@ -89,18 +90,21 @@ def generate_response(self, messages: List[Dict], model: str,
         # turn off redundant transformers warnings:
         transformers.logging.set_verbosity_error()
 
+        # deepcopy messages to prevent reference issues:
+        current_messages = copy.deepcopy(messages)
+
         if model in self.chat_models:  # chat completion
             # flatten consecutive user messages:
-            for msg_idx, message in enumerate(messages):
-                if msg_idx > 0 and message['role'] == "user" and messages[msg_idx - 1]['role'] == "user":
-                    messages[msg_idx - 1]['content'] += f" {message['content']}"
-                    del messages[msg_idx]
-                elif msg_idx > 0 and message['role'] == "assistant" and messages[msg_idx - 1]['role'] == "assistant":
-                    messages[msg_idx - 1]['content'] += f" {message['content']}"
-                    del messages[msg_idx]
+            for msg_idx, message in enumerate(current_messages):
+                if msg_idx > 0 and message['role'] == "user" and current_messages[msg_idx - 1]['role'] == "user":
+                    current_messages[msg_idx - 1]['content'] += f" {message['content']}"
+                    del current_messages[msg_idx]
+                elif msg_idx > 0 and message['role'] == "assistant" and current_messages[msg_idx - 1]['role'] == "assistant":
+                    current_messages[msg_idx - 1]['content'] += f" {message['content']}"
+                    del current_messages[msg_idx]
 
             # apply chat template & tokenize
-            prompt_tokens = self.tokenizer.apply_chat_template(messages, return_tensors="pt")
+            prompt_tokens = self.tokenizer.apply_chat_template(current_messages, return_tensors="pt")
             prompt_tokens = prompt_tokens.to(self.device)
             # apply chat template for records:
             prompt_text = self.tokenizer.batch_decode(prompt_tokens)[0]
@@ -137,7 +141,7 @@ def generate_response(self, messages: List[Dict], model: str,
             response_text = model_output
 
         else:  # default (text completion)
-            prompt = "\n".join([message["content"] for message in messages])
+            prompt = "\n".join([message["content"] for message in current_messages])
 
             prompt_tokens = self.tokenizer.encode(
                 prompt,

From a9cd314563f34725029fc21471e611151db0d87c Mon Sep 17 00:00:00 2001
From: Gnurro
Date: Tue, 21 Nov 2023 13:33:21 +0100
Subject: [PATCH 6/6] Change returned response dict to proper format containing complete model output

---
 backends/huggingface_local_api.py |  3 ++-
 backends/llama2_hf_local_api.py   | 14 ++++++--------
 2 files changed, 8 insertions(+), 9 deletions(-)

diff --git a/backends/huggingface_local_api.py b/backends/huggingface_local_api.py
index b00c4ab9bb..7c51b0d36e 100644
--- a/backends/huggingface_local_api.py
+++ b/backends/huggingface_local_api.py
@@ -198,6 +198,8 @@ def generate_response(self, messages: List[Dict], model: str,
 
         model_output = self.tokenizer.batch_decode(model_output_ids)[0]
 
+        response = {'response': model_output}
+
         # cull input context; equivalent to transformers.pipeline method:
         if not return_full_text:
             response_text = model_output.replace(prompt_text, '').strip()
@@ -207,7 +209,6 @@ def generate_response(self, messages: List[Dict], model: str,
         else:
             response_text = model_output.strip()
 
-        response = {'response': response_text}
         return prompt, response, response_text
 
     def supports(self, model_name: str):
diff --git a/backends/llama2_hf_local_api.py b/backends/llama2_hf_local_api.py
index 280082b984..0c164b2c3d 100644
--- a/backends/llama2_hf_local_api.py
+++ b/backends/llama2_hf_local_api.py
@@ -127,18 +127,16 @@ def generate_response(self, messages: List[Dict], model: str,
             )
 
             model_output = self.tokenizer.batch_decode(model_output_ids)[0]
+
+            response = {"response": model_output}
+
             # cull prompt from output:
-            model_output = model_output.replace(prompt_text, "").strip()
+            response_text = model_output.replace(prompt_text, "").strip()
             # remove EOS token at the end of output:
-            if model_output[-4:len(model_output)] == "</s>":
-                model_output = model_output[:-4]
+            if response_text[-4:len(response_text)] == "</s>":
+                response_text = response_text[:-4]
 
-            response = {
-                "role": "assistant",
-                "content": model_output,
-            }
 
-            response_text = model_output
 
         else:  # default (text completion)
             prompt = "\n".join([message["content"] for message in current_messages])