In [None]:
from aces.aces import ACES
from transformers import HfArgumentParser, TrainingArguments, set_seed, DataCollatorForSeq2Seq
from dataclasses import dataclass, field
from typing import Optional

@dataclass
class AcesArguments:
    """
    Arguments pertaining to which model/config/tokenizer we are going to fine-tune from.DataTrainingArguments
    """

    environement_name : str = field( default = "p3", metadata={"help": "environment name"})
    path_archive : str = field( default = "/home/flowers/work/aces/aces/environement/p3/preprocess_p3_emb_dedup_puzzles.json", metadata={"help": "path to the archive"})
    num_solutions: int = field( default = 10, metadata={"help": "number of solutions to generate to compute the difficulty score"})
    
@dataclass
class QdArguments:
    """
    Arguments pertaining to which model/config/tokenizer we are going to fine-tune from.DataTrainingArguments
    """

    a: str = field(
        default="/home/flowers/work/hf/Qwen2.5-0.5B-Instruct",
        metadata={"help": "Path to pretrained model or model identifier from huggingface.co/models"}
    )

@dataclass
class LLMArguments:
    """
    Arguments pertaining to which model/config/tokenizer we are going to fine-tune from.DataTrainingArguments
    """

    model_name_or_path: str = field(
        default="/home/flowers/work/hf/Qwen2.5-0.5B-Instruct",
        metadata={"help": "Path to pretrained model or model identifier from huggingface.co/models"}
    )
    online: Optional[bool] = field(
        default = False,
        metadata={
            "help": "use vllm server if True else use offline vllm"
        },
    )
    base_url: Optional[str] = field(
        default="http://localhost:8000",
        metadata={
            "help": "base url for vllm server"
        },
    )
    api_key: Optional[str] = field(
        default="",
        metadata={
            "help": "api key "
        },
    )
    gpu: Optional[bool] = field(
        default = 1,
        metadata={
            "help": "number of gpus to use (vllm)"
        },
    )
    cfg_generation : Optional[bool] = field(
        default = False,
        metadata={
            "help": "use cfg generation"
        },
    ),
    temperature: Optional[float] = field(
        default = 1.0,
        metadata={
            "help": "temperature"
        },
    )
    max_tokens: Optional[int] = field(
        default = 4000,
        metadata={
            "help": "max tokens"
        },
    )

# parser = HfArgumentParser((AcesArguments,QdArguments,LLMArguments))
# model_args, data_args, training_args = parser.parse_args_into_dataclasses()#["--output_dir", "/home/flowers/work/hf/trained/"])
aces_args, qd_args, llm_args = AcesArguments(), QdArguments(), LLMArguments()


In [2]:
aces= ACES(aces_args, qd_args, llm_args)

init LLM client
INFO 11-29 18:55:38 config.py:350] This model supports multiple tasks: {'generate', 'embedding'}. Defaulting to 'generate'.
INFO 11-29 18:55:38 llm_engine.py:249] Initializing an LLM engine (v0.6.4.post1) with config: model='/home/flowers/work/hf/Qwen2.5-0.5B-Instruct', speculative_config=None, tokenizer='/home/flowers/work/hf/Qwen2.5-0.5B-Instruct', skip_tokenizer_init=False, tokenizer_mode=auto, revision=None, override_neuron_config=None, tokenizer_revision=None, trust_remote_code=False, dtype=torch.bfloat16, max_seq_len=30000, download_dir=None, load_format=LoadFormat.AUTO, tensor_parallel_size=1, pipeline_parallel_size=1, disable_custom_all_reduce=False, quantization=None, enforce_eager=False, kv_cache_dtype=auto, quantization_param_path=None, device_config=cuda, decoding_config=DecodingConfig(guided_decoding_backend='outlines'), observability_config=ObservabilityConfig(otlp_traces_endpoint=None, collect_model_forward_time=False, collect_model_execute_time=False), s

Loading safetensors checkpoint shards:   0% Completed | 0/1 [00:00<?, ?it/s]
Loading safetensors checkpoint shards: 100% Completed | 1/1 [00:00<00:00,  1.59it/s]
Loading safetensors checkpoint shards: 100% Completed | 1/1 [00:00<00:00,  1.58it/s]


INFO 11-29 18:55:39 model_runner.py:1077] Loading model weights took 0.9276 GB





INFO 11-29 18:55:40 worker.py:232] Memory profiling results: total_gpu_memory=15.70GiB initial_memory_usage=1.32GiB peak_torch_memory=2.36GiB memory_usage_post_profile=1.36GiB non_torch_memory=0.42GiB kv_cache_size=11.34GiB gpu_memory_utilization=0.90
INFO 11-29 18:55:40 gpu_executor.py:113] # GPU blocks: 61940, # CPU blocks: 21845
INFO 11-29 18:55:40 gpu_executor.py:117] Maximum concurrency for 30000 tokens per request: 33.03x
INFO 11-29 18:55:41 model_runner.py:1400] Capturing cudagraphs for decoding. This may lead to unexpected consequences if the model is not static. To run the model in eager mode, set 'enforce_eager=True' or use '--enforce-eager' in the CLI.
INFO 11-29 18:55:41 model_runner.py:1404] If out-of-memory error occurs during cudagraph capture, consider decreasing `gpu_memory_utilization` or switching to eager mode. You can also reduce the `max_num_seqs` as needed to decrease memory usage.
INFO 11-29 18:55:48 model_runner.py:1518] Graph capturing finished in 6 secs, took

In [2]:
from aces.environement.p3.prompt import get_programming_puzzles_prompt
from aces.environement.p3.p3_genotype import P3


In [5]:
p3_1 = P3(program_str="puzzle test1", emb=[1,0,1,0,0],fitness=0.5 )
p3_2 = P3(program_str="puzzle test2", emb=[1,0,1,0,0],fitness=0.5 )
list_p3 = [p3_1, p3_2]
skill_targeted=[1,0,1,0,1]

In [7]:
print(get_programming_puzzles_prompt(list_p3,skill_targeted,n_fewshot_ex=2))

Consider Python Programming Puzzles (P3). P3 consists of two functions: a problem function `f` and its corresponding solution `g`. The challenge lies in constructing a SAT problem `f` and a function `g` such that `f(g())` evaluates to `True`

## Main Rules:
- Each puzzle includes two functions: `def f(...)` and `def g(...)`.
- The first argument of `f` is always the output from `g()`.
- Ensure `f` and `g` have matching argument signatures (e.g., `def f(solution, arg1=value1, arg2=value2, ...)` and `def g(arg1=value1, arg2=value2, ...)`). You also need to set the value of argument of f (arg1,arg2,...) and g when you define them.
- Avoid using `f` inside `g`, and `g` inside `f`.
- Include any necessary imports so your code runs smoothly.
- Give a clear Puzzle description that must be brief and diverse compared to the other puzzles.
- Make sure the puzzle is self-contained within these two functions.
- Make sure that that each puzzle have just all required skills (see below)

## P3 Format

In [1]:
import json
path="/home/flowers/work/aces/aces/environement/p3/preprocess_p3_emb_dedup_puzzles.json"
with open(path, 'r') as f:
    data = json.load(f) 

In [None]:
data[0]
# suppose we only have "program_str"

{'program_str': 'def f(n: int) -> bool:\n    return str(n * n).startswith(\'123456789\')\ndef g():\n    return int(int("123456789" + "0" * 9) ** 0.5) + 1\nassert f(g()) == True',
 'emb': [1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0],
 'explanation_emb': 'This puzzle involves string manipulation to check if the square of a number starts with a specific sequence of digits, and mathematical operations to calculate the square root of a number. The puzzle also requires an understanding of number theory, specifically the concept of square roots.\n\nThe list of skills used is: [0, 1, 16].',
 'description': "Find the solution: n (an integer) that should be squared and its result in string format starts with '123456789'.",
 'quality': 1,
 'fitness': -0.020000000000000018,
 'all_solution': ['def f(n: int) -> bool:\n    return str(n * n).startswith(\'123456789\')\ndef g():\n    return int(int("123456789" + "0" * 9) ** 0.5) + 1\nassert f(g()) == True\nassert f(g()) == True',
  "def 

In [20]:
from aces.code_sandbox import evaluate, pass_at_k
str_to_add=str(
            f"\ndef run_eval():\n"
            f"    return f(g()) == True"
        )

list_codes=["def f(x):\n    return x\ndef g():\n    return True", "def f(x):\n    return x\ndef g():\n    return False",
            "def f(x):\n    return not x\ndef g():\n    return False", "def f(x):\n    return not x\ndef g():\n    return True"]


list_task_id=[0,0,1,1]
for i in range(len(list_codes)):
    list_codes[i] = list_codes[i]+str_to_add

res = evaluate(list_codes, list_task_id,entry_point="run_eval")

24


100%|██████████| 4/4 [00:00<00:00, 394.90it/s]


In [None]:
res["raw_result"]

{'date': '2024-12-01 18:59',
 'eval': {0: [True, False], 1: [True, False]},
 'pass@k': {0: 0.5, 1: 0.5},
 'raw_result': defaultdict(list,
             {0: [{'completion_id': 0,
                'task_id': 0,
                '_identifier': 5,
                'code': 'def f(x):\n    return x\ndef g():\n    return True\ndef run_eval():\n    return f(g()) == True',
                'result': 'pass',
                'correct': True},
               {'completion_id': 1,
                'task_id': 0,
                '_identifier': 5,
                'code': 'def f(x):\n    return x\ndef g():\n    return False\ndef run_eval():\n    return f(g()) == True',
                'result': 'fail',
                'correct': False}],
              1: [{'completion_id': 2,
                'task_id': 1,
                '_identifier': 5,
                'code': 'def f(x):\n    return not x\ndef g():\n    return False\ndef run_eval():\n    return f(g()) == True',
                'result': 'pass',
            

In [17]:
list_task_id=[0,1]
for task_id in list_task_id:
    all_solution = []
    all_solution_correct = []
    for id_completion in range(len(res["raw_result"][task_id])):
        all_solution.append(res["raw_result"][task_id][id_completion]["code"].split(str_to_add)[0])
        all_solution_correct.append(res["raw_result"][task_id][id_completion]["correct"])


In [19]:
all_solution

['def f(x):\n    return not x\ndef g():\n    return False',
 'def f(x):\n    return not x\ndef g():\n    return True']

In [18]:
all_solution_correct

[True, False]

In [21]:
number_solution = len(all_solution)
c = sum(all_solution_correct)
k=1
pass_at_k(n=number_solution, c=c, k=k)

0.5

In [15]:
id_completion = 0
res["raw_result"][task_id][0]["code"].split(str_to_add)[0]


NameError: name 'task_id' is not defined

In [14]:
res["raw_result"][0][0]["correct"]

True