In [3]:
import asyncio
import itertools
import json
import logging
import os
import re

import nest_asyncio
import pandas as pd
import tqdm
from aiolimiter import AsyncLimiter
from g4f.client import AsyncClient
from googletrans import Translator
from tqdm.asyncio import tqdm_asyncio

nest_asyncio.apply()

## Refactor and save tasks dict

In [4]:
# Matches either an HTML tag (shortest "<...>" span on one line) or a
# character entity such as "&nbsp;", "&#160;" or "&#xa0;" (lowercase only).
CLEAN_TAGS_R = re.compile("<.*?>|&([a-z0-9]+|#[0-9]{1,6}|#x[0-9a-f]{1,6});")


def remove_html_tags(text: str) -> str:
    """Delete HTML tags and character entities from ``text``.

    Every match of ``CLEAN_TAGS_R`` is replaced by an empty string; all other
    characters are returned untouched.
    """
    cleaned = CLEAN_TAGS_R.sub("", text)
    return cleaned

In [5]:
def batch_list(strings: list[str], batch_size: int) -> list[list[str]]:
    """Split ``strings`` into consecutive chunks of at most ``batch_size`` items.

    Args:
        strings: Items to split; an empty list yields an empty result.
        batch_size: Maximum number of items per chunk, must be >= 1.

    Returns:
        The chunks in original order; only the last one may be shorter.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1.
    """
    # A negative step would make range() empty and silently drop every item,
    # so reject it explicitly (zero already raised ValueError from range()).
    if batch_size < 1:
        raise ValueError(f"batch_size must be >= 1, got {batch_size}")
    return [strings[i : i + batch_size] for i in range(0, len(strings), batch_size)]


async def translate_text(text: str, src: str = "ru", dest: str = "en") -> str:
    """Translate a single string with googletrans.

    Args:
        text: Text to translate.
        src: Source language code.
        dest: Target language code.

    Returns:
        The translated text.
    """
    # Use the translator as an async context manager so that it is closed
    # after the request instead of being left open.
    async with Translator() as translator:
        translation = await translator.translate(text, src=src, dest=dest)
    return translation.text


async def translate_texts_inner(
    texts: list[str], src: str = "ru", dest: str = "en"
) -> list[str]:
    """Translate one batch of strings with a single translator instance.

    Args:
        texts: Batch of texts to translate; may be empty.
        src: Source language code.
        dest: Target language code.

    Returns:
        The translated texts, in the same order as ``texts``.
    """
    # Nothing to translate: avoid indexing texts[0] on an empty batch.
    if not texts:
        return []

    # The context manager closes the translator once the batch is done.
    async with Translator() as translator:
        if len(texts) > 1:
            results = await translator.translate(texts, src=src, dest=dest)
            return [res.text for res in results]
        # A single text is sent as a plain string rather than a one-item list.
        res = await translator.translate(texts[0], src=src, dest=dest)
        return [res.text]


async def translate_texts(
    texts: list[str], src: str = "ru", dest: str = "en", batch_size: int = 5
) -> list[str]:
    """Translate many texts concurrently, batch by batch, with a progress bar.

    Args:
        texts: Texts to translate.
        src: Source language code forwarded to ``translate_texts_inner``.
        dest: Target language code forwarded to ``translate_texts_inner``.
        batch_size: Number of texts sent to the translator per request batch.

    Returns:
        Translated texts flattened back into one list, in input order.
    """
    batches = batch_list(texts, batch_size)
    # tqdm's gather keeps the order of the awaited batches while showing progress.
    translated_batches = await tqdm_asyncio.gather(
        *[translate_texts_inner(batch, src=src, dest=dest) for batch in batches]
    )
    return [text for batch in translated_batches for text in batch]

In [6]:
def save_dict(tasks_dict: dict, path: str) -> None:
    """Write ``tasks_dict`` to ``path`` as pretty-printed UTF-8 JSON.

    Missing parent directories are created first, so saving into a fresh
    output folder does not fail with ``FileNotFoundError``.

    Args:
        tasks_dict: JSON-serializable mapping to store.
        path: Destination file path; an existing file is overwritten.
    """
    parent = os.path.dirname(path)
    if parent:  # empty when path is a bare file name in the current directory
        os.makedirs(parent, exist_ok=True)
    with open(path, "w", encoding="utf8") as f:
        # ensure_ascii=False keeps non-ASCII text readable in the file.
        json.dump(tasks_dict, f, ensure_ascii=False, indent=4)


def build_task_str(task: dict) -> str:
    """Render one task record as a single plain-text statement.

    The sections are concatenated in a fixed order: title, tags, description,
    input format, output format, examples and, only when ``task["remark"]``
    is not an empty string, a trailing remark. The combined text is passed
    through ``remove_html_tags`` before being returned.
    """
    pieces = [
        f"{task['title']}\n\n",
        f"Теги:{task['tags']}\n\n",
        f"{task['description']}\n\n",
        f"Формат входных данных:\n{task['inputFormat']}\n\n",
        f"Формат выходных данных:\n{task['outputFormat']}\n\n",
        f"Примеры\n{task['examples']}",
    ]
    if task["remark"] != "":
        pieces.append(f"\n\nПримечание:\n{task['remark']}")

    raw_statement = "".join(pieces)
    return remove_html_tags(raw_statement)


def compile_tasks_dict(path: str = "../../data/db_tasks.csv") -> dict[str, str]:
    """Read the tasks CSV and build a ``spec -> task text`` mapping.

    Args:
        path: CSV file to read; defaults to the project's task dump.

    Returns:
        A dict keyed by ``str(spec)`` whose values are the texts produced by
        ``build_task_str`` for each row.
    """
    df = pd.read_csv(path)
    # Missing cells become empty strings so they render as empty sections.
    raw_dict = df.fillna("").set_index("spec").to_dict(orient="index")

    return {str(spec): build_task_str(task) for spec, task in raw_dict.items()}


async def save_dicts() -> None:
    """Build the task texts, save them, translate them and save the translation.

    Writes ``db_tasks_ru.json`` with the compiled texts and ``db_tasks_en.json``
    with the output of ``translate_texts``, both keyed by task spec.

    Raises:
        ValueError: If the number of translated texts differs from the number
            of tasks, which would otherwise silently drop tasks.
    """
    tasks_dict = compile_tasks_dict()
    save_dict(tasks_dict, "../../data/generated/db_tasks_ru.json")

    # Build the parallel lists explicitly: unpacking zip(*items) into two
    # names raises ValueError when the dict is empty.
    specs = list(tasks_dict.keys())
    values = list(tasks_dict.values())
    translated_values = await translate_texts(values)

    # zip() below would truncate to the shorter side without any warning.
    if len(translated_values) != len(specs):
        raise ValueError(
            f"Expected {len(specs)} translations, got {len(translated_values)}"
        )

    tasks_dict_en = dict(zip(specs, translated_values))
    save_dict(tasks_dict_en, "../../data/generated/db_tasks_en.json")


# await save_dicts()

## Load tasks dict

In [7]:
def load_dict(path: str) -> dict:
    """Load a JSON file from ``path``.

    Returns the parsed content, or an empty dict when the file does not exist.
    """
    loaded: dict = {}
    try:
        with open(path, "r", encoding="utf8") as source:
            loaded = json.load(source)
    except FileNotFoundError:
        # A missing file is treated as "nothing saved yet".
        pass
    return loaded


# Load the English task texts from disk; load_dict returns {} when the file
# is missing, so a count of 0 below means the translation has not been saved.
tasks_dict: dict[str, str] = load_dict("../../data/generated/db_tasks_en.json")
# Number of loaded tasks (displayed as the cell output).
len(tasks_dict)

553

## Generate solutions

In [1]:
# Model identifiers used as the default `models` argument of generate_solutions.
AVAILABLE_MODELS = [
    "gemini-1.5-pro",
    "mixtral-7b",
    "hermes-3",
]  # currently disabled: "gpt-4", "gpt-4o", "llama-3.1-8b"
# Prompt templates: each entry is a callable that takes the task text and
# returns the complete prompt string sent to a model.
# NOTE(review): the "Comments: Include a no comments." line in the active
# prompt looks garbled. It is left as-is because the rendered prompt text is
# stored in prompts.json and compared against later runs; confirm the intended
# wording before changing it.
PROMPTS = [
    # Earlier prompt variant, kept for reference:
    # lambda task: f"Please provide python code without any external imports to solve the task below. Provide pure code without any explanations and comments. Make sure that it looks like written by human. Produce correct input/output logic. Here is the task: \n{task}",
    lambda task: f"""Write a Python solution for the following task. The code should look like it was written by an intermediate human developer—natural, readable, and practical but not overly optimized or perfect. Follow these guidelines:
Variable Names: Use descriptive but not overly long names (e.g., res instead of result or final_output_value).
Comments: Include a no comments.
Structure: Use simple functions or straightforward logic unless the problem demands complexity.
Imperfections: Small stylistic quirks are okay (e.g., mixing single/double quotes, occasional redundant logic).
Error Handling: Basic checks (e.g., for edge cases) but no excessive validation unless required.
Formatting: Use standard PEP 8 style but allow minor deviations (e.g., inconsistent spacing in one place).
Task: \n{task}.""",
]

In [3]:
# Sanity check: render the first prompt template with a placeholder task text.
PROMPTS[0]("какой-то текс задачи")

'Write a Python solution for the following task. The code should look like it was written by an intermediate human developer—natural, readable, and practical but not overly optimized or perfect. Follow these guidelines:\nVariable Names: Use descriptive but not overly long names (e.g., res instead of result or final_output_value).\nComments: Include a no comments.\nStructure: Use simple functions or straightforward logic unless the problem demands complexity.\nImperfections: Small stylistic quirks are okay (e.g., mixing single/double quotes, occasional redundant logic).\nError Handling: Basic checks (e.g., for edge cases) but no excessive validation unless required.\nFormatting: Use standard PEP 8 style but allow minor deviations (e.g., inconsistent spacing in one place).\nTask: \nкакой-то текс задачи.'

In [8]:
def extract_python_code(res: str) -> str:
    """Return the content of the first ```python fenced block in ``res``.

    Args:
        res: Raw model response.

    Returns:
        The text between the first "```python" marker and the next closing
        "```" (surrounding newlines included), or ``res`` unchanged when no
        such block exists.
    """
    # Non-greedy match: stop at the first closing fence. A greedy ".*" with
    # DOTALL would run to the last fence in the response and drag along any
    # prose and additional code blocks in between.
    match = re.search(r"```python(.*?)```", res, re.DOTALL)
    if match is None:
        return res
    return match.group(1)


async def access_llm(model: str, prompt: str) -> str:
    """Send ``prompt`` to ``model`` and return the extracted Python code.

    This is a best-effort call: any failure is logged as a warning and an
    empty string is returned so that one failing request does not abort a
    whole generation run.

    Args:
        model: Model identifier passed to the chat completion request.
        prompt: Full prompt text sent as a single user message.

    Returns:
        The result of ``extract_python_code`` on the response text, or ``""``
        when the request failed or the response had no text.
    """
    try:
        client = AsyncClient()
        response = await client.chat.completions.create(
            model=model,
            messages=[{"role": "user", "content": prompt}],
            # web_search = False
        )
        content = response.choices[0].message.content
        # Guard against a missing message body before running the regex.
        return extract_python_code(content or "")
    except Exception as exc:
        # Keep the best-effort contract but leave a trace of what went wrong.
        logging.getLogger(__name__).warning(
            "Request to model %r failed: %r", model, exc
        )
        return ""


async def generate_solutions(
    task_dict: dict,
    prompts=PROMPTS,
    models: list[str] = AVAILABLE_MODELS,
    repeats: int = 1,
    existing_prompts: "dict | None" = None,
) -> tuple[list, dict]:
    """Query every model with every prompt for every task.

    Args:
        task_dict: Mapping ``spec -> task text``.
        prompts: Callables that turn a task text into a prompt string.
        models: Model identifiers to query.
        repeats: How many times each (task, model, prompt) combination is run.
        existing_prompts: Previously saved ``prompt key -> prompt text``
            mapping (texts rendered with the ``"$task$"`` placeholder).

    Returns:
        ``(solutions, prompts_mapping)`` where ``solutions`` is a list of
        ``(spec, prompt_key, model, solution)`` tuples and ``prompts_mapping``
        is ``existing_prompts`` extended with the prompts used in this run.
    """
    existing_prompts = existing_prompts or {}

    # A prompt whose rendered text is already stored keeps its stored key;
    # a new prompt gets the next unused number. This keeps every prompt key
    # in the returned solutions consistent with the returned mapping.
    # Prompts with identical rendered text share one key.
    key_by_text = {text: key for key, text in existing_prompts.items()}
    next_idx = (
        max((int(k) for k in existing_prompts if str(k).isdecimal()), default=-1) + 1
    )
    prompts_dict = {}
    for prompt in prompts:
        rendered = prompt("$task$")
        key = key_by_text.get(rendered)
        if key is None:
            key = str(next_idx)
            next_idx += 1
            key_by_text[rendered] = key
        prompts_dict[key] = prompt

    parameters = list(itertools.product(models, prompts_dict.keys()))

    # The limiter has to be acquired once per request to have any effect.
    limiter = AsyncLimiter(max_rate=10, time_period=1)  # 10 requests per second

    async def limited_request(model: str, prompt_text: str) -> str:
        async with limiter:
            return await access_llm(model, prompt_text)

    solutions = []
    for spec, task in tqdm.tqdm(task_dict.items()):
        for _ in range(repeats):  # Add some history in case of repeats>1
            results = await asyncio.gather(
                *[limited_request(m, prompts_dict[p](task)) for m, p in parameters]
            )
            # Extra pause between tasks, kept from the original pacing.
            await asyncio.sleep(1)

            solutions.extend(
                (spec, p, m, solution) for solution, (m, p) in zip(results, parameters)
            )

    new_prompts_dict = {k: v("$task$") for k, v in prompts_dict.items()}
    new_prompts_dict.update(existing_prompts)

    return solutions, new_prompts_dict


def update_generated(generated_solutions: list, new_prompts_dict: dict[str, str]) -> None:
    """Save the prompt mapping and append new solutions to the solutions CSV.

    Args:
        generated_solutions: Rows of ``(spec, prompt_idx, model, solution)``.
        new_prompts_dict: Prompt mapping written to ``prompts.json``.
    """
    solutions_path = "../../data/generated/gen_solutions.csv"
    save_dict(new_prompts_dict, "../../data/generated/prompts.json")

    df = pd.DataFrame(
        generated_solutions, columns=["spec", "prompt_idx", "model", "solution"]
    )
    # Only the read is guarded: a failing write must surface once instead of
    # being retried by the "file not found" fallback.
    try:
        existing_df = pd.read_csv(solutions_path)
    except (FileNotFoundError, pd.errors.EmptyDataError):
        # No previous results (missing or completely empty file).
        combined = df
    else:
        combined = pd.concat([existing_df, df], ignore_index=True)

    combined.to_csv(solutions_path, index=False)

In [9]:
# Optional smoke test on two tasks instead of the full set:
# test_dict = {k: tasks_dict[k] for k in ["c0df7d49-26f5-451c-b44a-1e0bca60bca5", "4e5b21c0-e86f-4eac-82b6-1a0d00ae4199"]}

# Previously saved prompt mapping ({} when prompts.json does not exist yet).
existing_prompts = load_dict("../../data/generated/prompts.json")
# generated_solutions, new_prompts_dict = await generate_solutions(test_dict, existing_prompts=existing_prompts)
# Full run over every loaded task with the default prompts and models.
generated_solutions, new_prompts_dict = await generate_solutions(
    tasks_dict, existing_prompts=existing_prompts
)

  0%|          | 0/553 [00:00<?, ?it/s]

  1%|          | 6/553 [00:28<41:38,  4.57s/it]  Unclosed connector
connections: ['deque([(<aiohttp.client_proto.ResponseHandler object at 0x0000022C7F62DAF0>, 44009.109)])']
connector: <aiohttp.connector.TCPConnector object at 0x0000022C6B10F920>
Unclosed client session
client_session: <aiohttp.client.ClientSession object at 0x0000022C7F60BC80>
Unclosed connector
connections: ['deque([(<aiohttp.client_proto.ResponseHandler object at 0x0000022C7F62F3B0>, 44013.046)])']
connector: <aiohttp.connector.TCPConnector object at 0x0000022C7F540AD0>
Unclosed client session
client_session: <aiohttp.client.ClientSession object at 0x0000022C7F540A10>
Unclosed connector
connections: ['deque([(<aiohttp.client_proto.ResponseHandler object at 0x0000022C7F62F170>, 44017.859)])']
connector: <aiohttp.connector.TCPConnector object at 0x0000022C7F5409E0>
  2%|▏         | 10/553 [00:48<43:00,  4.75s/it]Unclosed client session
client_session: <aiohttp.client.ClientSession object at 0x0000022C7F5E7560>
Unclos

In [10]:
# Write prompts.json and add the newly generated rows to gen_solutions.csv.
update_generated(generated_solutions, new_prompts_dict)