dumpmemory · pull · May 29, 2024 · May 29, 2024 · May 29, 2024 · May 29, 2024
diff --git a/codegen/generate.py b/codegen/generate.py
@@ -1,3 +1,4 @@
+import json
 import os
 from os import PathLike
 from typing import List
@@ -36,7 +37,7 @@ def construct_contract_prompt(prompt: str, contract_type: str, contract: str) ->
 
 
 def codegen(
-    workdir: PathLike,
+    target_path: PathLike,
     model: DecoderBase,
     dataset: str,
     greedy=False,
@@ -45,6 +46,15 @@ def codegen(
     version="default",
     resume=True,
 ):
+    task2nexist = {}
+    if resume and target_path.endswith(".jsonl") and os.path.isfile(target_path):
+        with open(target_path, "r") as f:
+            for line in f:
+                if not line.strip():
+                    continue
+                task_id = json.loads(line)["task_id"]
+                task2nexist[task_id] = task2nexist.get(task_id, 0) + 1
+
     with Progress(
         TextColumn(f"{dataset} •" + "[progress.percentage]{task.percentage:>3.0f}%"),
         BarColumn(),
@@ -69,26 +79,26 @@ def codegen(
                     p.console.print(f"Skipping {task_id} as it is not in {id_range}")
                     continue
 
-            p_name = task_id.replace("/", "_")
-            os.makedirs(os.path.join(workdir, p_name), exist_ok=True)
-            log = f"Codegen: {p_name} @ {model}"
-            n_existing = 0
-            if resume:
-                # count existing .py files
-                n_existing = len(
+            if not target_path.endswith(".jsonl"):
+                p_name = task_id.replace("/", "_")
+                os.makedirs(os.path.join(target_path, p_name), exist_ok=True)
+                task2nexist[task_id] = len(
                     [
                         f
-                        for f in os.listdir(os.path.join(workdir, p_name))
+                        for f in os.listdir(os.path.join(target_path, p_name))
                         if f.endswith(".py")
                     ]
                 )
-                if n_existing > 0:
-                    log += f" (resuming from {n_existing})"
 
-            nsamples = n_samples - n_existing
+            n_more_samples = n_samples
+            log = f"Codegen: {task_id} @ {model}"
+            if resume and task2nexist.get(task_id, 0) > 0:
+                log += f" (resuming from {task2nexist[task_id]})"
+                n_more_samples -= task2nexist[task_id]
+
             p.console.print(log)
 
-            sidx = n_samples - nsamples
+            sidx = n_samples - n_more_samples
             while sidx < n_samples:
                 outputs = model.codegen(
                     task["prompt"],
@@ -97,18 +107,22 @@ def codegen(
                 )
                 assert outputs, "No outputs from model!"
                 for impl in outputs:
-                    try:
+                    solution = (
+                        task["prompt"] + impl if model.is_direct_completion() else impl
+                    )
+                    if target_path.endswith(".jsonl"):
+                        with open(target_path, "a") as f:
+                            f.write(
+                                json.dumps({"task_id": task_id, "solution": solution})
+                                + "\n"
+                            )
+                    else:
                         with open(
-                            os.path.join(workdir, p_name, f"{sidx}.py"),
+                            os.path.join(target_path, p_name, f"{sidx}.py"),
                             "w",
                             encoding="utf-8",
                         ) as f:
-                            if model.is_direct_completion():
-                                f.write(task["prompt"] + impl)
-                            else:
-                                f.write(impl)
-                    except UnicodeEncodeError:
-                        continue
+                            f.write(solution)
                     sidx += 1
 
 
@@ -126,12 +140,19 @@ def main(
     backend: str = "vllm",
     base_url: str = None,
     tp: int = 1,
+    evalperf_type: str = None,  # EvalPerf prompt variant: None, "instruct", "perf-instruct", or "perf-CoT"
+    jsonl_fmt: bool = False,
 ):
     assert dataset in ["humaneval", "mbpp"], f"Invalid dataset {dataset}"
     assert backend in ["vllm", "hf", "openai"]
+    assert evalperf_type is None or evalperf_type in [
+        "instruct",
+        "perf-instruct",
+        "perf-CoT",
+    ]
 
     if greedy and (temperature != 0 or bs != 1 or n_samples != 1):
-        temperature = 0
+        temperature = 0.0
         bs = 1
         n_samples = 1
         print("Greedy decoding ON (--greedy): setting bs=1, n_samples=1, temperature=0")
@@ -145,7 +166,21 @@ def main(
     os.makedirs(root, exist_ok=True)
     # Make dataset dir
     os.makedirs(os.path.join(root, dataset), exist_ok=True)
-    # Make dir for codes generated by each model
+
+    # Model instructions
+    instruction_prefix = "Please provide a self-contained Python script that solves the following problem in a markdown code block:"
+    response_prefix = "Below is a Python script with a self-contained function that solves the problem and passes corresponding tests:"
+
+    if evalperf_type == "perf-instruct":
+        instruction_prefix = "Please provide an efficient and self-contained Python script that solves the following problem in a markdown code block:"
+        response_prefix = "Below is a Python script with a self-contained function that efficiently solves the problem and passes corresponding tests:"
+    elif evalperf_type == "perf-CoT":
+        instruction_prefix = "Think step by step: please provide an efficient and self-contained Python script that solves the following problem in a markdown code block:"
+        response_prefix = "Below is a Python script with a self-contained function that efficiently solves the problem and passes corresponding tests:"
+    elif evalperf_type is not None and evalperf_type != "instruct":
+        raise ValueError(f"Invalid evalperf_type: {evalperf_type}")
+
+    # Model creation
     model_runner = make_model(
         model=model,
         backend=backend,
@@ -154,12 +189,22 @@ def main(
         dataset=dataset,
         base_url=base_url,
         tp=tp,
+        instruction_prefix=instruction_prefix,
+        response_prefix=response_prefix,
     )
+
+    # Build the output target for each model's generated code (a directory, or a .jsonl file path)
     identifier = model.replace("/", "--") + f"_{backend}_temp_{temperature}"
-    workdir = os.path.join(root, dataset, identifier)
-    os.makedirs(workdir, exist_ok=True)
+    if evalperf_type:
+        identifier += f"-{evalperf_type}"
+
+    target_path = os.path.join(root, dataset, identifier)
+    if jsonl_fmt:
+        target_path += ".jsonl"
+    else:
+        os.makedirs(target_path, exist_ok=True)
     codegen(
-        workdir=workdir,
+        target_path=target_path,
         dataset=dataset,
         greedy=greedy,
         model=model_runner,

diff --git a/codegen/model.py b/codegen/model.py
@@ -53,31 +53,39 @@ def extra_eos_for_direct_completion(dataset) -> List[str]:
 _MAGIC_SPLITTER_ = "-[[]]-this-is-really-our-highest-priority-[[]]-"
 
 
-def make_chat_prompt(prompt: str, tokenizer: AutoTokenizer) -> str:
+def make_chat_prompt(
+    task_prompt: str,
+    instruction_prefix: str,
+    response_prefix: str,
+    tokenizer: AutoTokenizer,
+) -> str:
     # directly return prompt if it does not have a tokenizer.chat_template
     if tokenizer.chat_template is None:
-        return prompt
+        return task_prompt
 
-    prompt = f"""\
-Please provide a self-contained Python script that solves the following problem in a markdown code block:
+    assert instruction_prefix is not None, "Instruction prefix is required!"
+    assert response_prefix is not None, "Response prefix is required!"
+
+    task_prompt = f"""\
+{instruction_prefix}
 ```
-{prompt.strip()}
+{task_prompt.strip()}
 ```
 """
     response = f"""\
-Below is a Python script with a self-contained function that solves the problem and passes corresponding tests:
+{response_prefix}
 ```python
 {_MAGIC_SPLITTER_}
 ```
 """
-    prompt = tokenizer.apply_chat_template(
+    task_prompt = tokenizer.apply_chat_template(
         [
-            {"role": "user", "content": prompt},
+            {"role": "user", "content": task_prompt},
             {"role": "assistant", "content": response},
         ],
         tokenize=False,
     ).split(_MAGIC_SPLITTER_)[0]
-    return prompt
+    return task_prompt
 
 
 class DecoderBase(ABC):
@@ -89,6 +97,8 @@ def __init__(
         max_new_tokens: int = 512,
         dtype: str = "bfloat16",  # default
         trust_remote_code: bool = False,
+        instruction_prefix: str = None,
+        response_prefix: str = None,
     ) -> None:
         print("Initializing a decoder model: {} ...".format(name))
         self.name = name
@@ -99,6 +109,8 @@ def __init__(
         self.max_new_tokens = max_new_tokens
         self.dtype = dtype
         self.trust_remote_code = trust_remote_code
+        self.instruction_prefix = instruction_prefix
+        self.response_prefix = response_prefix
 
     @abstractmethod
     def codegen(
@@ -166,7 +178,9 @@ def __init__(self, name: str, **kwargs) -> None:
     def codegen(
         self, prompt: str, do_sample: bool = True, num_samples: int = 200
     ) -> List[str]:
-        prompt = make_chat_prompt(prompt, self.tokenizer)
+        prompt = make_chat_prompt(
+            prompt, self.instruction_prefix, self.response_prefix, self.tokenizer
+        )
         return VllmDecoder.codegen(self, prompt, do_sample, num_samples)
 
 
@@ -255,7 +269,9 @@ def __init__(self, name: str, **kwargs):
     def codegen(
         self, prompt: str, do_sample: bool = True, num_samples: int = 200
     ) -> List[str]:
-        prompt = make_chat_prompt(prompt, self.tokenizer)
+        prompt = make_chat_prompt(
+            prompt, self.instruction_prefix, self.response_prefix, self.tokenizer
+        )
         return HfTorchDecoder.codegen(self, prompt, do_sample, num_samples)
 
 
@@ -271,14 +287,15 @@ def codegen(
             assert self.temperature > 0, "Temperature must be positive for sampling"
         batch_size = min(self.batch_size, num_samples)
 
+        message = self.instruction_prefix
         # construct prompt
+        message += f"\n```python\n{prompt.strip()}\n```"
+
         fmt = "json_object" if self.name == "gpt-4-1106-preview" else "text"
         if fmt == "json_object":
-            message = r'Please complete the following code snippet by generating JSON like {"code": ""}'
-        else:
-            message = r"Please generate code to complete the following problem:"
-
-        message += f"\n```python\n{prompt.strip()}\n```"
+            # Newline first: keep the note off the closing ``` fence line.
+            note = 'Note: the output code should follow a JSON schema of {"code": ""}'
+            message += "\n" + note
 
         ret = openai_request.make_auto_request(
             self.client,
@@ -337,7 +354,7 @@ def codegen(
                 messages=[
                     ChatMessage(
                         role="user",
-                        content="Please generate code to solve the following problem in a Python markdown block:"
+                        content=self.instruction_prefix
                         + f"\n```python\n{prompt.strip()}\n```",
                     )
                 ],
@@ -381,7 +398,7 @@ def codegen(
                 messages=[
                     {
                         "role": "user",
-                        "content": "Please generate code to complete the following problem wrapped in a Python markdown block:"
+                        "content": self.instruction_prefix
                         + f"\n```python\n{prompt.strip()}\n```\n",
                     }
                 ],
@@ -402,6 +419,8 @@ def make_model(
     temperature: float = 0.0,
     tp=1,
     base_url=None,
+    instruction_prefix=None,
+    response_prefix=None,
 ):
     if backend == "vllm":
         return GeneralVllmDecoder(
@@ -410,30 +429,40 @@ def make_model(
             temperature=temperature,
             dataset=dataset,
             tp=tp,
+            instruction_prefix=instruction_prefix,
+            response_prefix=response_prefix,
         )
     elif backend == "hf":
         return GenenralHfTorchDecoder(
             name=model,
             batch_size=batch_size,
             temperature=temperature,
             dataset=dataset,
+            instruction_prefix=instruction_prefix,
+            response_prefix=response_prefix,
         )
     elif backend == "openai":
         return OpenAIChatDecoder(
             name=model,
             batch_size=batch_size,
             temperature=temperature,
             base_url=base_url,
+            instruction_prefix=instruction_prefix,
+            response_prefix=response_prefix,
         )
     elif backend == "mistral":
         return MistralChatDecoder(
             name=model,
             batch_size=batch_size,
             temperature=temperature,
+            instruction_prefix=instruction_prefix,
+            response_prefix=response_prefix,
         )
     elif backend == "anthropic":
         return AnthropicMessageDecoder(
             name=model,
             batch_size=batch_size,
             temperature=temperature,
+            instruction_prefix=instruction_prefix,
+            response_prefix=response_prefix,
         )