In [None]:
from human_eval.data import write_jsonl, read_problems
from transformers import AutoModelForCausalLM, AutoTokenizer
import torch
from tqdm import tqdm

from human_eval.evaluation import evaluate_functional_correctness
from molm.utils.utils import extract_generation_code, languge_settings



def build_qwen_instruction(question: str, languge: str = 'python', ):
    """Wrap a code snippet in the completion instruction prompt.

    Args:
        question: Source code to be completed. Surrounding whitespace is
            stripped before it is placed inside the fenced code block.
        languge: Language tag for the code fence; it is lower-cased.

    Returns:
        The instruction sentence, followed by ``question`` inside a fenced
        code block tagged with the lower-cased language. The result has no
        leading or trailing newline.
    """
    fence = "```"
    instruction = (
        "Please continue to complete the function. "
        "You are not allowed to modify the given code and do the completion only. "
        "Please return all completed function in a codeblock. "
        "Here is the given code to do completion:"
    )
    language_tag = languge.lower()
    snippet = question.strip()
    return f"{instruction}\n{fence}{language_tag}\n{snippet}\n{fence}"

def generate_one_completion(question, model, tokenizer, max_new_tokens=1024):
    """Generate a single completion for a given prompt.

    The raw ``question`` text is tokenized as-is (no instruction template is
    applied) and the decoded continuation is returned without any
    post-processing or code extraction.

    Args:
        question: Prompt text to continue.
        model: Causal language model exposing ``device`` and ``generate``.
        tokenizer: Tokenizer matching ``model``; called with
            ``return_tensors="pt"`` and used to decode the output.
        max_new_tokens: Upper bound on the number of generated tokens.

    Returns:
        The decoded text of the tokens that follow the prompt in the
        generated sequence, with special tokens skipped.
    """
    inputs = tokenizer(question, return_tensors="pt").to(model.device)

    # The generated sequence is sliced at the prompt length below, so read
    # that length from the input ids tensor itself rather than relying on
    # integer indexing of the tokenizer output (fast tokenizers only).
    prompt_length = inputs["input_ids"].shape[-1]

    # Pass the padding id explicitly so generate() does not have to pick a
    # fallback (and warn about it) on every call.
    pad_token_id = tokenizer.pad_token_id
    if pad_token_id is None:
        pad_token_id = tokenizer.eos_token_id

    with torch.no_grad():
        outputs = model.generate(
            **inputs,
            max_new_tokens=max_new_tokens,
            pad_token_id=pad_token_id,
        )

    # Drop the echoed prompt tokens; keep only the continuation.
    return tokenizer.decode(outputs[0][prompt_length:], skip_special_tokens=True)

def main():
    """Generate HumanEval completions with a Qwen model and evaluate them.

    Steps:
        1. Load the model (float16, placed on CUDA) and its tokenizer.
        2. Generate ``num_samples_per_task`` completions for every problem
           returned by ``read_problems()``.
        3. Write the samples with ``write_jsonl``.
        4. Run ``evaluate_functional_correctness`` on the written file and
           print whatever it returns.

    A failure while generating for one task is reported and skipped so the
    remaining tasks still run.
    """
    model_name = "Qwen/Qwen2.5-Coder-7B"
    print(f"Loading {model_name}...")

    model = AutoModelForCausalLM.from_pretrained(
        model_name,
        torch_dtype=torch.float16,
        device_map="cuda",
        trust_remote_code=True,
    )
    tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)

    problems = read_problems()
    num_samples_per_task = 1  # Increase this for pass@k metrics

    print(f"Generating {num_samples_per_task} samples per task...")
    samples = []

    for task_id, problem in tqdm(problems.items(), desc="Generating solutions"):
        for _ in range(num_samples_per_task):
            try:
                completion = generate_one_completion(problem["prompt"], model, tokenizer)
            except Exception as e:
                # Best effort: report the failure and continue with the
                # next sample instead of aborting the whole run.
                print(f"\nError on task {task_id}: {e}")
            else:
                samples.append({
                    "task_id": task_id,
                    "completion": completion,
                })

    # NOTE: the content is written with write_jsonl (one JSON object per
    # line) even though the file name ends in ".json".
    output_file = f"samples_{model_name.replace('/', '_')}.json"
    write_jsonl(output_file, samples)
    print(f"Samples saved to {output_file}")

    # The evaluation runs right here, so announce it as such and show the
    # result instead of discarding it.
    print("\nEvaluating functional correctness...")
    result = evaluate_functional_correctness(
        sample_file=output_file,
    )
    print(f"Evaluation result: {result}")

# Entry point: run the generate-and-evaluate pipeline only when this code is
# executed directly (script or notebook cell), not when it is imported.
if __name__ == "__main__":
    main()

Loading Qwen/Qwen2.5-Coder-7B...


Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

Generating 1 samples per task...


Generating solutions:   0%|          | 0/164 [00:00<?, ?it/s]Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Generating solutions:   1%|          | 1/164 [00:17<48:17, 17.78s/it]Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Generating solutions:   1%|          | 2/164 [00:27<35:55, 13.31s/it]Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Generating solutions:   2%|▏         | 3/164 [00:51<48:51, 18.21s/it]Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Generating solutions:   2%|▏         | 4/164 [00:57<35:00, 13.13s/it]Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Generating solutions:   3%|▎         | 5/164 [00:58<23:10,  8.74s/it]Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Generating solutions:   4%|▎         | 6/164 [00:59<16:14,  6.17s/it]Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Generating solutions:   4%|▍