From a6f8916133b7ea2b712f032d95812d9efbf85bf7 Mon Sep 17 00:00:00 2001
From: Gnurro
Date: Tue, 14 Nov 2023 18:41:26 +0100
Subject: [PATCH 1/6] Replace usage of buggy tokenizer method argument with decoding of prompt tokens for proper model output culling

---
 backends/huggingface_local_api.py | 4 ++--
 backends/llama2_hf_local_api.py   | 5 ++---
 2 files changed, 4 insertions(+), 5 deletions(-)

diff --git a/backends/huggingface_local_api.py b/backends/huggingface_local_api.py
index 9a8474e255..731357ae16 100644
--- a/backends/huggingface_local_api.py
+++ b/backends/huggingface_local_api.py
@@ -163,7 +163,7 @@ def generate_response(self, messages: List[Dict], model: str,
         prompt_tokens = self.tokenizer.apply_chat_template(messages, return_tensors="pt")
         prompt_tokens = prompt_tokens.to(self.device)
 
-        prompt_text = self.tokenizer.apply_chat_template(messages, tokenize=False)
+        prompt_text = self.tokenizer.batch_decode(prompt_tokens)[0]
         prompt = {"inputs": prompt_text, "max_new_tokens": max_new_tokens,
                   "temperature": self.temperature, "return_full_text": return_full_text}
 
@@ -189,7 +189,7 @@ def generate_response(self, messages: List[Dict], model: str,
             do_sample=do_sample
         )
 
-        model_output = self.tokenizer.batch_decode(model_output_ids, skip_special_tokens=True)[0]
+        model_output = self.tokenizer.batch_decode(model_output_ids)[0]
 
         # cull input context; equivalent to transformers.pipeline method:
         if not return_full_text:
diff --git a/backends/llama2_hf_local_api.py b/backends/llama2_hf_local_api.py
index 5a42e4b69d..aa151a8d26 100644
--- a/backends/llama2_hf_local_api.py
+++ b/backends/llama2_hf_local_api.py
@@ -100,7 +100,7 @@ def generate_response(self, messages: List[Dict], model: str,
             prompt_tokens = self.tokenizer.apply_chat_template(messages, return_tensors="pt")
             prompt_tokens = prompt_tokens.to(self.device)
             # apply chat template for records:
-            prompt_text = self.tokenizer.apply_chat_template(messages, tokenize=False)
+            prompt_text = self.tokenizer.batch_decode(prompt_tokens)[0]
             prompt = {"inputs": prompt_text, "max_new_tokens": max_new_tokens,
                       "temperature": self.temperature}
 
@@ -119,8 +119,7 @@ def generate_response(self, messages: List[Dict], model: str,
                 max_new_tokens=max_new_tokens
             )
 
-            model_output = self.tokenizer.batch_decode(model_output_ids, skip_special_tokens=True,
-                                                       clean_up_tokenization_spaces=False)[0]
+            model_output = self.tokenizer.batch_decode(model_output_ids)[0]
 
             response = {
                 "role": "assistant",

From 79578f0b3158293314966381f33a00b647556086 Mon Sep 17 00:00:00 2001
From: Gnurro
Date: Tue, 14 Nov 2023 18:48:24 +0100
Subject: [PATCH 2/6] Applying decoding changes for non-chat as well

---
 backends/llama2_hf_local_api.py | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/backends/llama2_hf_local_api.py b/backends/llama2_hf_local_api.py
index aa151a8d26..304f4c31b9 100644
--- a/backends/llama2_hf_local_api.py
+++ b/backends/llama2_hf_local_api.py
@@ -152,8 +152,7 @@ def generate_response(self, messages: List[Dict], model: str,
                 max_new_tokens=max_new_tokens
             )
 
-            model_output = self.tokenizer.batch_decode(model_output_ids, skip_special_tokens=True,
-                                                       clean_up_tokenization_spaces=False)[0]
+            model_output = self.tokenizer.batch_decode(model_output_ids)[0]
 
             response_text = model_output.replace(prompt, '').strip()
 

From c559726a1ccf643a75da58a56f8be95ed0d4cbb4 Mon Sep 17 00:00:00 2001
From: Gnurro
Date: Wed, 15 Nov 2023 13:44:18 +0100
Subject: [PATCH 3/6] Added flattening of assistant/assistant message pairs

---
 backends/huggingface_local_api.py | 3 +++
 backends/llama2_hf_local_api.py   | 3 +++
 2 files changed, 6 insertions(+)

diff --git a/backends/huggingface_local_api.py b/backends/huggingface_local_api.py
index 731357ae16..ced0ad0760 100644
--- a/backends/huggingface_local_api.py
+++ b/backends/huggingface_local_api.py
@@ -158,6 +158,9 @@ def generate_response(self, messages: List[Dict], model: str,
             if msg_idx > 0 and message['role'] == "user" and messages[msg_idx - 1]['role'] == "user":
                 messages[msg_idx - 1]['content'] += f" {message['content']}"
                 del messages[msg_idx]
+            elif msg_idx > 0 and message['role'] == "assistant" and messages[msg_idx - 1]['role'] == "assistant":
+                messages[msg_idx - 1]['content'] += f" {message['content']}"
+                del messages[msg_idx]
 
         # apply chat template & tokenize:
         prompt_tokens = self.tokenizer.apply_chat_template(messages, return_tensors="pt")
diff --git a/backends/llama2_hf_local_api.py b/backends/llama2_hf_local_api.py
index 304f4c31b9..5a39b97cae 100644
--- a/backends/llama2_hf_local_api.py
+++ b/backends/llama2_hf_local_api.py
@@ -95,6 +95,9 @@ def generate_response(self, messages: List[Dict], model: str,
                 if msg_idx > 0 and message['role'] == "user" and messages[msg_idx - 1]['role'] == "user":
                     messages[msg_idx - 1]['content'] += f" {message['content']}"
                     del messages[msg_idx]
+                elif msg_idx > 0 and message['role'] == "assistant" and messages[msg_idx - 1]['role'] == "assistant":
+                    messages[msg_idx - 1]['content'] += f" {message['content']}"
+                    del messages[msg_idx]
 
             # apply chat template & tokenize
             prompt_tokens = self.tokenizer.apply_chat_template(messages, return_tensors="pt")

From 016f2c13a50d126e5a2d92c31e47532354ce0038 Mon Sep 17 00:00:00 2001
From: Gnurro
Date: Wed, 15 Nov 2023 13:54:55 +0100
Subject: [PATCH 4/6] Add removal of llama2 EOS token at the end of model outputs

---
 backends/huggingface_local_api.py | 5 ++++-
 backends/llama2_hf_local_api.py   | 9 +++++++--
 2 files changed, 11 insertions(+), 3 deletions(-)

diff --git a/backends/huggingface_local_api.py b/backends/huggingface_local_api.py
index ced0ad0760..c9b1a771fa 100644
--- a/backends/huggingface_local_api.py
+++ b/backends/huggingface_local_api.py
@@ -197,10 +197,13 @@ def generate_response(self, messages: List[Dict], model: str,
         # cull input context; equivalent to transformers.pipeline method:
         if not return_full_text:
             response_text = model_output.replace(prompt_text, '').strip()
+            # remove llama2 EOS token at the end of output:
+            if response_text[-4:len(response_text)] == "</s>":
+                response_text = response_text[:-4]
         else:
             response_text = model_output.strip()
 
-        response = {'response': model_output}
+        response = {'response': response_text}
         return prompt, response, response_text
 
     def supports(self, model_name: str):
diff --git a/backends/llama2_hf_local_api.py b/backends/llama2_hf_local_api.py
index 5a39b97cae..d0809970f4 100644
--- a/backends/llama2_hf_local_api.py
+++ b/backends/llama2_hf_local_api.py
@@ -123,13 +123,18 @@ def generate_response(self, messages: List[Dict], model: str,
             )
 
             model_output = self.tokenizer.batch_decode(model_output_ids)[0]
+            # cull prompt from output:
+            model_output = model_output.replace(prompt_text, "").strip()
+            # remove EOS token at the end of output:
+            if model_output[-4:len(model_output)] == "</s>":
+                model_output = model_output[:-4]
 
             response = {
                 "role": "assistant",
-                "content": model_output.replace(prompt_text, ''),
+                "content": model_output,
             }
 
-            response_text = model_output.replace(prompt_text, '').strip()
+            response_text = model_output
 
         else:  # default (text completion)
             prompt = "\n".join([message["content"] for message in messages])

From bdbc71a4ea788a5bfb27b4a4fa586e9970cabb0e Mon Sep 17 00:00:00 2001
From: Gnurro
Date: Thu, 16 Nov 2023 23:36:14 +0100
Subject: [PATCH 5/6] Add deepcopy of input messages to prevent reference issues

---
 backends/huggingface_local_api.py | 20 ++++++++++++--------
 backends/llama2_hf_local_api.py   | 22 +++++++++++++---------
 2 files changed, 25 insertions(+), 17 deletions(-)

diff --git a/backends/huggingface_local_api.py b/backends/huggingface_local_api.py
index c9b1a771fa..b00c4ab9bb 100644
--- a/backends/huggingface_local_api.py
+++ b/backends/huggingface_local_api.py
@@ -6,6 +6,7 @@
 import transformers
 from transformers import AutoTokenizer, AutoModelForCausalLM
 import os
+import copy
 
 logger = backends.get_logger(__name__)
 
@@ -153,17 +154,20 @@ def generate_response(self, messages: List[Dict], model: str,
         logger.info(f"Finished loading huggingface model: {model}")
         logger.info(f"Model device map: {self.model.hf_device_map}")
 
+        # deepcopy messages to prevent reference issues:
+        current_messages = copy.deepcopy(messages)
+
         # flatten consecutive user messages:
-        for msg_idx, message in enumerate(messages):
-            if msg_idx > 0 and message['role'] == "user" and messages[msg_idx - 1]['role'] == "user":
-                messages[msg_idx - 1]['content'] += f" {message['content']}"
-                del messages[msg_idx]
-            elif msg_idx > 0 and message['role'] == "assistant" and messages[msg_idx - 1]['role'] == "assistant":
-                messages[msg_idx - 1]['content'] += f" {message['content']}"
-                del messages[msg_idx]
+        for msg_idx, message in enumerate(current_messages):
+            if msg_idx > 0 and message['role'] == "user" and current_messages[msg_idx - 1]['role'] == "user":
+                current_messages[msg_idx - 1]['content'] += f" {message['content']}"
+                del current_messages[msg_idx]
+            elif msg_idx > 0 and message['role'] == "assistant" and current_messages[msg_idx - 1]['role'] == "assistant":
+                current_messages[msg_idx - 1]['content'] += f" {message['content']}"
+                del current_messages[msg_idx]
 
         # apply chat template & tokenize:
-        prompt_tokens = self.tokenizer.apply_chat_template(messages, return_tensors="pt")
+        prompt_tokens = self.tokenizer.apply_chat_template(current_messages, return_tensors="pt")
         prompt_tokens = prompt_tokens.to(self.device)
 
         prompt_text = self.tokenizer.batch_decode(prompt_tokens)[0]
diff --git a/backends/llama2_hf_local_api.py b/backends/llama2_hf_local_api.py
index d0809970f4..280082b984 100644
--- a/backends/llama2_hf_local_api.py
+++ b/backends/llama2_hf_local_api.py
@@ -6,6 +6,7 @@
 import transformers
 from transformers import AutoTokenizer, AutoModelForCausalLM
 import os
+import copy
 
 logger = backends.get_logger(__name__)
 
@@ -89,18 +90,21 @@ def generate_response(self, messages: List[Dict], model: str,
         # turn off redundant transformers warnings:
         transformers.logging.set_verbosity_error()
 
+        # deepcopy messages to prevent reference issues:
+        current_messages = copy.deepcopy(messages)
+
         if model in self.chat_models:  # chat completion
             # flatten consecutive user messages:
-            for msg_idx, message in enumerate(messages):
-                if msg_idx > 0 and message['role'] == "user" and messages[msg_idx - 1]['role'] == "user":
-                    messages[msg_idx - 1]['content'] += f" {message['content']}"
-                    del messages[msg_idx]
-                elif msg_idx > 0 and message['role'] == "assistant" and messages[msg_idx - 1]['role'] == "assistant":
-                    messages[msg_idx - 1]['content'] += f" {message['content']}"
-                    del messages[msg_idx]
+            for msg_idx, message in enumerate(current_messages):
+                if msg_idx > 0 and message['role'] == "user" and current_messages[msg_idx - 1]['role'] == "user":
+                    current_messages[msg_idx - 1]['content'] += f" {message['content']}"
+                    del current_messages[msg_idx]
+                elif msg_idx > 0 and message['role'] == "assistant" and current_messages[msg_idx - 1]['role'] == "assistant":
+                    current_messages[msg_idx - 1]['content'] += f" {message['content']}"
+                    del current_messages[msg_idx]
 
             # apply chat template & tokenize
-            prompt_tokens = self.tokenizer.apply_chat_template(messages, return_tensors="pt")
+            prompt_tokens = self.tokenizer.apply_chat_template(current_messages, return_tensors="pt")
             prompt_tokens = prompt_tokens.to(self.device)
             # apply chat template for records:
             prompt_text = self.tokenizer.batch_decode(prompt_tokens)[0]
@@ -137,7 +141,7 @@ def generate_response(self, messages: List[Dict], model: str,
             response_text = model_output
 
         else:  # default (text completion)
-            prompt = "\n".join([message["content"] for message in messages])
+            prompt = "\n".join([message["content"] for message in current_messages])
 
             prompt_tokens = self.tokenizer.encode(
                 prompt,

From a9cd314563f34725029fc21471e611151db0d87c Mon Sep 17 00:00:00 2001
From: Gnurro
Date: Tue, 21 Nov 2023 13:33:21 +0100
Subject: [PATCH 6/6] Change returned response dict to proper format containing complete model output

---
 backends/huggingface_local_api.py |  3 ++-
 backends/llama2_hf_local_api.py   | 14 ++++++--------
 2 files changed, 8 insertions(+), 9 deletions(-)

diff --git a/backends/huggingface_local_api.py b/backends/huggingface_local_api.py
index b00c4ab9bb..7c51b0d36e 100644
--- a/backends/huggingface_local_api.py
+++ b/backends/huggingface_local_api.py
@@ -198,6 +198,8 @@ def generate_response(self, messages: List[Dict], model: str,
 
         model_output = self.tokenizer.batch_decode(model_output_ids)[0]
 
+        response = {'response': model_output}
+
         # cull input context; equivalent to transformers.pipeline method:
         if not return_full_text:
             response_text = model_output.replace(prompt_text, '').strip()
@@ -207,7 +209,6 @@ def generate_response(self, messages: List[Dict], model: str,
         else:
             response_text = model_output.strip()
 
-        response = {'response': response_text}
         return prompt, response, response_text
 
     def supports(self, model_name: str):
diff --git a/backends/llama2_hf_local_api.py b/backends/llama2_hf_local_api.py
index 280082b984..0c164b2c3d 100644
--- a/backends/llama2_hf_local_api.py
+++ b/backends/llama2_hf_local_api.py
@@ -127,18 +127,16 @@ def generate_response(self, messages: List[Dict], model: str,
             )
 
             model_output = self.tokenizer.batch_decode(model_output_ids)[0]
+
+            response = {"response": model_output}
+
             # cull prompt from output:
-            model_output = model_output.replace(prompt_text, "").strip()
+            response_text = model_output.replace(prompt_text, "").strip()
             # remove EOS token at the end of output:
-            if model_output[-4:len(model_output)] == "</s>":
-                model_output = model_output[:-4]
+            if response_text[-4:len(response_text)] == "</s>":
+                response_text = response_text[:-4]
 
-            response = {
-                "role": "assistant",
-                "content": model_output,
-            }
 
-            response_text = model_output
 
         else:  # default (text completion)
             prompt = "\n".join([message["content"] for message in current_messages])