# Подготовка датасета - 2 часть, генерация по файлам

### Импорты и сетап окружения

In [1]:
import datasets
import os
import pandas as pd
import subprocess
import re
from tqdm.notebook import tqdm
from pathlib import Path
import asyncio
import aiopath
import sys
import time
import transformers
import torch

# ROOT_DIR is the parent of the directory the kernel started in. The cell also
# changes the working directory to ROOT_DIR, so recomputing it from os.getcwd()
# on a re-run would climb one more level each time; compute it only once.
if 'ROOT_DIR' not in globals():
    ROOT_DIR = str(Path(os.getcwd()).parent)
os.chdir(ROOT_DIR)
DATA_DIR = ROOT_DIR + '/data'
REPOS_DIR = DATA_DIR + '/repos'
# Make packages located under ROOT_DIR importable.
if ROOT_DIR not in sys.path:
    sys.path.append(ROOT_DIR)
# Settings for the PyTorch CUDA caching allocator.
os.environ['PYTORCH_CUDA_ALLOC_CONF'] = 'expandable_segments:True,max_split_size_mb:128'

from evaluating.generator import make_model_with_tokenizer, generate_completions
from scoring.scoring import Scorer

# Put the gvm-managed Go toolchain directories at the front of PATH.
GO_VERSION = 'go1.24.2'
gvm_root = os.environ.get('GVM_ROOT')
if gvm_root is None:
    # Without GVM_ROOT the f-string would prepend literal "None/bin:..." entries.
    print('GVM_ROOT is not set; leaving PATH unchanged', file=sys.stderr)
else:
    go_path_prefix = ':'.join([
        f"{gvm_root}/bin",
        f"{gvm_root}/pkgsets/{GO_VERSION}/global/bin",
        f"{gvm_root}/gos/{GO_VERSION}/bin",
        f"{gvm_root}/pkgsets/{GO_VERSION}/global/overlay/bin",
    ])
    current_path = os.environ.get('PATH', '')
    # Skip the prepend when the prefix is already in place, so re-running the
    # cell does not keep growing PATH.
    if not current_path.startswith(go_path_prefix):
        os.environ['PATH'] = f"{go_path_prefix}:{current_path}" if current_path else go_path_prefix

В части 1 [dataset_0](./dataset_0.ipynb) подготовлен датасет test_candidates_ds с отфильтрованными файлами

In [2]:
# Load the filtered candidate files from DATA_DIR.
test_candidates_ds = datasets.load_from_disk(f'{DATA_DIR}/test_candidates_ds')

### Записываем промпты в датасет

In [3]:
# System prompt placed first in every conversation: it asks the model to return
# only a test file for the first file it is given and to wrap each subtest in t.Run.
system_message = """
You are an expert programmer. 
You should only return output test file containing working code.
The user is going to give you code and would like to have unit tests for the first file.
All the other files are just dependencies to give you context of all the possible test cases to produce.
Cover all possible inputs and their respective outputs using tests.
Each subtest must be wrapped into t.Run
"""

def get_prompt(row) -> list[dict[str, str]]:
    """Build the chat prompt for one dataset row.

    Reads the file at ``REPOS_DIR/<project_path><relative_project_path>`` and
    returns a two-message conversation: the shared system message followed by
    a user message whose content is the file text.

    Args:
        row: mapping with ``'project_path'`` and ``'relative_project_path'``
            string entries.

    Returns:
        A list of ``{"role": ..., "content": ...}`` dicts (system, then user).

    Raises:
        OSError: if the file cannot be opened.
        UnicodeDecodeError: if the file is not valid UTF-8.
    """
    # Use the absolute REPOS_DIR so the lookup does not depend on the current
    # working directory, which the setup cell changes with os.chdir.
    file_path = REPOS_DIR + '/' + row['project_path'] + row['relative_project_path']
    # The context manager closes the handle even when read() raises.
    with open(file_path, 'r', encoding='utf-8') as f:
        file_content = f.read()

    return [
        {"role": "system", "content": system_message},
        {"role": "user", "content": file_content},
    ]

def finalize_row(row) -> dict:
    """Map a candidate row onto the by-file columns and attach its prompt.

    Copies ``project_path`` as is, exposes ``relative_go_package`` as
    ``relative_package_path`` and ``relative_project_path`` as
    ``relative_file_path``, and adds the ``prompt`` built by ``get_prompt``.
    """
    source_column_for = {
        'project_path': 'project_path',
        'relative_package_path': 'relative_go_package',
        'relative_file_path': 'relative_project_path',
    }
    finalized = {target: row[source] for target, source in source_column_for.items()}
    finalized['prompt'] = get_prompt(row)
    return finalized

# Build the per-file dataset: one prompt per candidate file, keeping only the
# columns produced by finalize_row.
by_file_ds = test_candidates_ds.map(finalize_row, num_proc=32).select_columns(['project_path', 'relative_package_path', 'relative_file_path', 'prompt'])

# Save under DATA_DIR, the same location the dataset is later loaded from; a
# cwd-relative '../data' path points elsewhere once the working directory has
# been changed to ROOT_DIR.
by_file_ds.save_to_disk(DATA_DIR+'/by_file_ds')

print(by_file_ds)
by_file_ds[0]

Saving the dataset (0/1 shards):   0%|          | 0/33251 [00:00<?, ? examples/s]

Dataset({
    features: ['project_path', 'relative_package_path', 'relative_file_path', 'prompt'],
    num_rows: 33251
})


{'project_path': '766dc882d779f07821bde740ce49802f67ae42b3/backend/',
 'relative_package_path': 'controllers/',
 'relative_file_path': 'controllers/controllers.go',
 'prompt': [{'content': '\nYou are an expert programmer. \nYou should only return output test file containing working code.\nThe user is going to give you code and would like to have unit tests for the first file.\nAll the other files are just dependencies to give you context of all the possible test cases to produce.\nCover all possible inputs and their respective outputs using tests.\nEach subtest must be wrapped into t.Run\n',
   'role': 'system'},
  {'content': 'package controllers\n\nimport (\n\t"encoding/json"\n\t"fmt"\n\techo "github.com/labstack/echo/v4"\n\t"golang.org/x/net/websocket"\n\t"stream/models"\n)\n\n// Controller interface has two methods\ntype Controller interface {\n\t// Homecontroller renders initial home page\n\tHomeController(e echo.Context) error\n\n\t// StreamController responds with live cpu statu

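### Генерируем completions исходной моделью `deepseek-ai/deepseek-coder-1.3b-instruct`, а также моделями `Qwen/Qwen2.5-Coder-1.5B` и `cognitivecomputations/Dolphin3.0-Llama3.2-1B` для сравнения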

In [None]:
# Reload the per-file prompt dataset from DATA_DIR.
by_file_ds = datasets.load_from_disk(f'{DATA_DIR}/by_file_ds')

In [5]:
# Draw the 500-row evaluation sample. A fixed seed makes the sample identical
# on every run, so results for different models stay comparable.
SAMPLE_SEED = 42
by_file_test_ds = by_file_ds.shuffle(seed=SAMPLE_SEED).take(500)

# Save under DATA_DIR, the same location the sample is later loaded from.
by_file_test_ds.save_to_disk(DATA_DIR+'/by_file_test_ds')

print(by_file_test_ds)
by_file_test_ds[0]

Saving the dataset (0/1 shards):   0%|          | 0/500 [00:00<?, ? examples/s]

Dataset({
    features: ['project_path', 'relative_package_path', 'relative_file_path', 'prompt'],
    num_rows: 500
})


{'project_path': '8844bcf1e44c80eb6c94e96c1c0466177f3bba94/',
 'relative_package_path': 'packages/testutil/testchain/',
 'relative_file_path': 'packages/testutil/testchain/mock_nodeconn.go',
 'prompt': [{'content': '\nYou are an expert programmer. \nYou should only return output test file containing working code.\nThe user is going to give you code and would like to have unit tests for the first file.\nAll the other files are just dependencies to give you context of all the possible test cases to produce.\nCover all possible inputs and their respective outputs using tests.\nEach subtest must be wrapped into t.Run\n',
   'role': 'system'},
  {'content': 'package testchain\n\nimport (\n\t"github.com/iotaledger/goshimmer/packages/ledgerstate"\n)\n\ntype MockedNodeConn struct {\n\tid                              string\n\tonPullBacklog                   func(addr *ledgerstate.AliasAddress)\n\tonPullState                     func(addr *ledgerstate.AliasAddress)\n\tonPullConfirmedTransaction

In [2]:
# Reload the 500-row evaluation sample from DATA_DIR.
by_file_test_ds = datasets.load_from_disk(f'{DATA_DIR}/by_file_test_ds')

In [7]:
# Load the tokenizer/model pair registered under the name 'original'.
tokenizer, model = make_model_with_tokenizer('original')


def generate_completion_for_row(row: dict) -> dict:
    """Generate a completion for the row's prompt and record how long it took.

    Returns a dict with ``generate_time`` (seconds, wall clock) and
    ``completion`` (first element of what ``generate_completions`` returns for
    a one-prompt batch).
    """
    began_at = time.time()
    batch_output = generate_completions(tokenizer, model, [row['prompt']])
    first_completion = batch_output[0]
    elapsed = time.time() - began_at
    return {'generate_time': elapsed, 'completion': first_completion}


# Single process: every row is generated by the one model loaded above.
by_file_test_with_completions_ds = by_file_test_ds.map(
    generate_completion_for_row,
    num_proc=1,
)

by_file_test_with_completions_ds.save_to_disk(f'{DATA_DIR}/by_file_test_with_completions_ds')
# Drop the notebook-level references to the model and tokenizer.
del model, tokenizer

print(by_file_test_with_completions_ds)
by_file_test_with_completions_ds[0]

2025-05-19 03:26:59.243822: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:467] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1747614419.373649    7231 cuda_dnn.cc:8579] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1747614419.411928    7231 cuda_blas.cc:1407] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
W0000 00:00:1747614419.701020    7231 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.
W0000 00:00:1747614419.701056    7231 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.
W0000 00:00:1747614419.701060    7231 computation_placer.cc:177] computation placer alr

Map:   0%|          | 0/500 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/500 [00:00<?, ? examples/s]

Dataset({
    features: ['project_path', 'relative_package_path', 'relative_file_path', 'prompt', 'generate_time', 'completion'],
    num_rows: 500
})


{'project_path': '8844bcf1e44c80eb6c94e96c1c0466177f3bba94/',
 'relative_package_path': 'packages/testutil/testchain/',
 'relative_file_path': 'packages/testutil/testchain/mock_nodeconn.go',
 'prompt': [{'content': '\nYou are an expert programmer. \nYou should only return output test file containing working code.\nThe user is going to give you code and would like to have unit tests for the first file.\nAll the other files are just dependencies to give you context of all the possible test cases to produce.\nCover all possible inputs and their respective outputs using tests.\nEach subtest must be wrapped into t.Run\n',
   'role': 'system'},
  {'content': 'package testchain\n\nimport (\n\t"github.com/iotaledger/goshimmer/packages/ledgerstate"\n)\n\ntype MockedNodeConn struct {\n\tid                              string\n\tonPullBacklog                   func(addr *ledgerstate.AliasAddress)\n\tonPullState                     func(addr *ledgerstate.AliasAddress)\n\tonPullConfirmedTransaction

In [3]:
# Qwen: generate completions with Qwen/Qwen2.5-Coder-1.5B.

model_name = 'Qwen/Qwen2.5-Coder-1.5B'

tokenizer = transformers.AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)

model = transformers.AutoModelForCausalLM.from_pretrained(
    model_name,
    device_map="auto",
    torch_dtype=torch.bfloat16,
    trust_remote_code=True,
)

# NOTE: an earlier `compiled_model = torch.compile(model)` was removed here. It
# was never passed to generate_completions, and the leftover global kept the
# model referenced after the `del` below, so the model could not be released.


def generate_completion_for_row(row: dict) -> dict:
    """Generate a completion for the row's prompt and record how long it took.

    Returns a dict with ``generate_time`` (seconds, wall clock) and
    ``completion`` (first element of what ``generate_completions`` returns for
    a one-prompt batch).
    """
    start = time.time()
    completion = generate_completions(tokenizer, model, [row['prompt']])[0]
    return {
        'generate_time': time.time()-start,
        'completion': completion,
    }


by_file_test_with_completions_qwen_ds = by_file_test_ds.map(generate_completion_for_row, num_proc=1)

by_file_test_with_completions_qwen_ds.save_to_disk(DATA_DIR+'/by_file_test_with_completions_qwen_ds')
# Drop the notebook-level references to the model and tokenizer.
del model, tokenizer

print(by_file_test_with_completions_qwen_ds)
by_file_test_with_completions_qwen_ds[0]

2025-05-21 21:59:09.477740: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:467] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1747853949.493300 2101766 cuda_dnn.cc:8579] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1747853949.498069 2101766 cuda_blas.cc:1407] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
W0000 00:00:1747853949.512365 2101766 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.
W0000 00:00:1747853949.512383 2101766 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.
W0000 00:00:1747853949.512384 2101766 computation_placer.cc:177] computation placer alr

Map:   0%|          | 0/500 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/500 [00:00<?, ? examples/s]

Dataset({
    features: ['project_path', 'relative_package_path', 'relative_file_path', 'prompt', 'generate_time', 'completion'],
    num_rows: 500
})


{'project_path': '8844bcf1e44c80eb6c94e96c1c0466177f3bba94/',
 'relative_package_path': 'packages/testutil/testchain/',
 'relative_file_path': 'packages/testutil/testchain/mock_nodeconn.go',
 'prompt': [{'content': '\nYou are an expert programmer. \nYou should only return output test file containing working code.\nThe user is going to give you code and would like to have unit tests for the first file.\nAll the other files are just dependencies to give you context of all the possible test cases to produce.\nCover all possible inputs and their respective outputs using tests.\nEach subtest must be wrapped into t.Run\n',
   'role': 'system'},
  {'content': 'package testchain\n\nimport (\n\t"github.com/iotaledger/goshimmer/packages/ledgerstate"\n)\n\ntype MockedNodeConn struct {\n\tid                              string\n\tonPullBacklog                   func(addr *ledgerstate.AliasAddress)\n\tonPullState                     func(addr *ledgerstate.AliasAddress)\n\tonPullConfirmedTransaction

In [4]:
# Dolphin: generate completions with cognitivecomputations/Dolphin3.0-Llama3.2-1B.
# Variable and dataset names keep the existing "dophine" spelling because the
# scoring and analysis cells load the saved dataset under that name.

model_name = 'cognitivecomputations/Dolphin3.0-Llama3.2-1B'
# Alternative checkpoints:
#model_name = 'meta-llama/Llama-3.2-1B-Instruct'
#model_name = 'bigcode/starcoder2-3b'

tokenizer = transformers.AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)

model = transformers.AutoModelForCausalLM.from_pretrained(
    model_name,
    device_map="auto",
    torch_dtype=torch.bfloat16,
    trust_remote_code=True,
)

# NOTE: an earlier `compiled_model = torch.compile(model)` was removed here. It
# was never passed to generate_completions, and the leftover global kept the
# model referenced after the `del` below, so the model could not be released.


def generate_completion_for_row(row: dict) -> dict:
    """Generate a completion for the row's prompt and record how long it took.

    Returns a dict with ``generate_time`` (seconds, wall clock) and
    ``completion`` (first element of what ``generate_completions`` returns for
    a one-prompt batch).
    """
    start = time.time()
    completion = generate_completions(tokenizer, model, [row['prompt']])[0]
    return {
        'generate_time': time.time()-start,
        'completion': completion,
    }


by_file_test_with_completions_dophine_ds = by_file_test_ds.map(generate_completion_for_row, num_proc=1)

by_file_test_with_completions_dophine_ds.save_to_disk(DATA_DIR+'/by_file_test_with_completions_dophine_ds')
# Drop the notebook-level references to the model and tokenizer.
del model, tokenizer

print(by_file_test_with_completions_dophine_ds)
by_file_test_with_completions_dophine_ds[0]

tokenizer_config.json:   0%|          | 0.00/53.6k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/17.2M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/444 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/934 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/2.47G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/206 [00:00<?, ?B/s]

Map:   0%|          | 0/500 [00:00<?, ? examples/s]



Saving the dataset (0/1 shards):   0%|          | 0/500 [00:00<?, ? examples/s]

Dataset({
    features: ['project_path', 'relative_package_path', 'relative_file_path', 'prompt', 'generate_time', 'completion'],
    num_rows: 500
})


{'project_path': '8844bcf1e44c80eb6c94e96c1c0466177f3bba94/',
 'relative_package_path': 'packages/testutil/testchain/',
 'relative_file_path': 'packages/testutil/testchain/mock_nodeconn.go',
 'prompt': [{'content': '\nYou are an expert programmer. \nYou should only return output test file containing working code.\nThe user is going to give you code and would like to have unit tests for the first file.\nAll the other files are just dependencies to give you context of all the possible test cases to produce.\nCover all possible inputs and their respective outputs using tests.\nEach subtest must be wrapped into t.Run\n',
   'role': 'system'},
  {'content': 'package testchain\n\nimport (\n\t"github.com/iotaledger/goshimmer/packages/ledgerstate"\n)\n\ntype MockedNodeConn struct {\n\tid                              string\n\tonPullBacklog                   func(addr *ledgerstate.AliasAddress)\n\tonPullState                     func(addr *ledgerstate.AliasAddress)\n\tonPullConfirmedTransaction

### Скорим результат

In [2]:
async def score_row(row: dict) -> dict:
    """Score the completion stored in one row and time the scoring.

    Builds a ``Scorer`` from the row's project, package and file paths, awaits
    ``Scorer.score`` on the row's completion and turns the evaluation into a
    reward with ``Scorer.calculate_reward``.

    Returns:
        dict with ``score_time`` (seconds, wall clock), ``result`` (the
        evaluation returned by the scorer) and ``reward``.
    """
    began_at = time.time()
    row_scorer = Scorer(
        row['project_path'],
        row['relative_package_path'],
        None,
        relative_file_path=row['relative_file_path'],
    )

    outcome = await row_scorer.score(row['completion'])
    scored = {
        'result': outcome,
        'reward': row_scorer.calculate_reward(outcome),
    }

    # The timing covers both scoring and the reward calculation.
    return {'score_time': time.time() - began_at, **scored}

In [3]:
# Reload the completions generated by the 'original' model.
by_file_test_with_completions_ds = datasets.load_from_disk(f'{DATA_DIR}/by_file_test_with_completions_ds')

In [None]:
sresults = {}
for i, row in tqdm(enumerate(by_file_test_with_completions_ds), total=len(by_file_test_with_completions_ds)):
    results[(row['project_path'], row['relative_package_path'], row['relative_file_path'])] = await score_row(row)

by_file_test_scored_ds = by_file_test_with_completions_ds.map(lambda row: results[(row['project_path'], row['relative_package_path'], row['relative_file_path'])], num_proc=1)

by_file_test_scored_ds.save_to_disk(DATA_DIR+'/by_file_test_scored_ds')

print(by_file_test_scored_ds)
by_file_test_scored_ds[0]               

  0%|          | 0/500 [00:00<?, ?it/s]

Map:   0%|          | 0/500 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/500 [00:00<?, ? examples/s]

Dataset({
    features: ['project_path', 'relative_package_path', 'relative_file_path', 'prompt', 'generate_time', 'completion', 'score_time', 'result', 'reward'],
    num_rows: 500
})


{'project_path': '8844bcf1e44c80eb6c94e96c1c0466177f3bba94/',
 'relative_package_path': 'packages/testutil/testchain/',
 'relative_file_path': 'packages/testutil/testchain/mock_nodeconn.go',
 'prompt': [{'content': '\nYou are an expert programmer. \nYou should only return output test file containing working code.\nThe user is going to give you code and would like to have unit tests for the first file.\nAll the other files are just dependencies to give you context of all the possible test cases to produce.\nCover all possible inputs and their respective outputs using tests.\nEach subtest must be wrapped into t.Run\n',
   'role': 'system'},
  {'content': 'package testchain\n\nimport (\n\t"github.com/iotaledger/goshimmer/packages/ledgerstate"\n)\n\ntype MockedNodeConn struct {\n\tid                              string\n\tonPullBacklog                   func(addr *ledgerstate.AliasAddress)\n\tonPullState                     func(addr *ledgerstate.AliasAddress)\n\tonPullConfirmedTransaction

In [3]:
# Reload the completions generated by the Qwen model.
by_file_test_with_completions_qwen_ds = datasets.load_from_disk(f'{DATA_DIR}/by_file_test_with_completions_qwen_ds')

In [4]:
results = {}
for i, row in tqdm(enumerate(by_file_test_with_completions_qwen_ds), total=len(by_file_test_with_completions_qwen_ds)):
    results[(row['project_path'], row['relative_package_path'], row['relative_file_path'])] = await score_row(row)

by_file_test_scored_qwen_ds = by_file_test_with_completions_qwen_ds.map(lambda row: results[(row['project_path'], row['relative_package_path'], row['relative_file_path'])], num_proc=1)

by_file_test_scored_qwen_ds.save_to_disk(DATA_DIR+'/by_file_test_scored_qwen_ds')

print(by_file_test_scored_qwen_ds)
by_file_test_scored_qwen_ds[0]               

  0%|          | 0/500 [00:00<?, ?it/s]

Map:   0%|          | 0/500 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/500 [00:00<?, ? examples/s]

Dataset({
    features: ['project_path', 'relative_package_path', 'relative_file_path', 'prompt', 'generate_time', 'completion', 'score_time', 'result', 'reward'],
    num_rows: 500
})


{'project_path': '8844bcf1e44c80eb6c94e96c1c0466177f3bba94/',
 'relative_package_path': 'packages/testutil/testchain/',
 'relative_file_path': 'packages/testutil/testchain/mock_nodeconn.go',
 'prompt': [{'content': '\nYou are an expert programmer. \nYou should only return output test file containing working code.\nThe user is going to give you code and would like to have unit tests for the first file.\nAll the other files are just dependencies to give you context of all the possible test cases to produce.\nCover all possible inputs and their respective outputs using tests.\nEach subtest must be wrapped into t.Run\n',
   'role': 'system'},
  {'content': 'package testchain\n\nimport (\n\t"github.com/iotaledger/goshimmer/packages/ledgerstate"\n)\n\ntype MockedNodeConn struct {\n\tid                              string\n\tonPullBacklog                   func(addr *ledgerstate.AliasAddress)\n\tonPullState                     func(addr *ledgerstate.AliasAddress)\n\tonPullConfirmedTransaction

In [None]:
# Reload the completions generated by the Dolphin model.
by_file_test_with_completions_dophine_ds = datasets.load_from_disk(f'{DATA_DIR}/by_file_test_with_completions_dophine_ds')

In [8]:
results = {}
for i, row in tqdm(enumerate(by_file_test_with_completions_dophine_ds), total=len(by_file_test_with_completions_dophine_ds)):
    results[(row['project_path'], row['relative_package_path'], row['relative_file_path'])] = await score_row(row)

by_file_test_scored_dophine_ds = by_file_test_with_completions_dophine_ds.map(lambda row: results[(row['project_path'], row['relative_package_path'], row['relative_file_path'])], num_proc=1)

by_file_test_scored_dophine_ds.save_to_disk(DATA_DIR+'/by_file_test_scored_dophine_ds')

print(by_file_test_scored_dophine_ds)
by_file_test_scored_dophine_ds[0]               

  0%|          | 0/500 [00:00<?, ?it/s]

Map:   0%|          | 0/500 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/500 [00:00<?, ? examples/s]

Dataset({
    features: ['project_path', 'relative_package_path', 'relative_file_path', 'prompt', 'generate_time', 'completion', 'score_time', 'result', 'reward'],
    num_rows: 500
})


{'project_path': '8844bcf1e44c80eb6c94e96c1c0466177f3bba94/',
 'relative_package_path': 'packages/testutil/testchain/',
 'relative_file_path': 'packages/testutil/testchain/mock_nodeconn.go',
 'prompt': [{'content': '\nYou are an expert programmer. \nYou should only return output test file containing working code.\nThe user is going to give you code and would like to have unit tests for the first file.\nAll the other files are just dependencies to give you context of all the possible test cases to produce.\nCover all possible inputs and their respective outputs using tests.\nEach subtest must be wrapped into t.Run\n',
   'role': 'system'},
  {'content': 'package testchain\n\nimport (\n\t"github.com/iotaledger/goshimmer/packages/ledgerstate"\n)\n\ntype MockedNodeConn struct {\n\tid                              string\n\tonPullBacklog                   func(addr *ledgerstate.AliasAddress)\n\tonPullState                     func(addr *ledgerstate.AliasAddress)\n\tonPullConfirmedTransaction

### Анализ результатов

In [4]:
def _load_scored_frame(dataset_name, model_label):
    """Load a scored dataset from DATA_DIR as a DataFrame tagged with its model label."""
    frame = datasets.load_from_disk(DATA_DIR + '/' + dataset_name).to_pandas()
    frame['model'] = model_label
    return frame


df1 = _load_scored_frame('by_file_test_scored_ds', 'deepseek')
df2 = _load_scored_frame('by_file_test_scored_qwen_ds', 'qwen')
df3 = _load_scored_frame('by_file_test_scored_dophine_ds', 'dophine')

# Stack the three per-model frames; each frame keeps its own row index.
df = pd.concat([df1, df2, df3])

In [5]:
# Flatten the fields of the nested `result` column into plain columns.
result_column = df['result']
df['coverage'] = result_column.apply(lambda res: float(res['coverage']))
df['mutation_score'] = result_column.apply(lambda res: float(res['mutation_score']))
df['error_type'] = result_column.apply(lambda res: str(res['error_type']))
df['all_passed'] = result_column.apply(lambda res: int(res['test_results']['all_passed']))
# A row counts as an error when its error type is a non-empty string.
df['is_error'] = df['error_type'].ne('')

In [7]:
# Select the error rows of each model once, then report totals and the
# per-error-type breakdown.
model_names = ['deepseek', 'qwen', 'dophine']
error_rows_by_model = {
    name: df.query(f'model == "{name}" and is_error') for name in model_names
}
print('error count', {name: len(rows) for name, rows in error_rows_by_model.items()})

pd.DataFrame.from_dict({
    name: rows['error_type'].value_counts() for name, rows in error_rows_by_model.items()
})

error count {'deepseek': 370, 'qwen': 499, 'dophine': 412}


Unnamed: 0_level_0,deepseek,qwen,dophine
error_type,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
completion_parse,133,499.0,289
get_deps,7,,2
go_tool_cover,34,,16
goimports,5,,16
other,21,,13
test_build_failed,170,,76


In [9]:
def _summarize(frame):
    """Return the four headline metrics for a frame of scored rows.

    ``reward`` is averaged over all rows, ``coverage`` over rows without an
    error, and ``mutation_score`` over rows where all tests passed;
    ``all_passed_count`` is the number of those rows.
    """
    passed_rows = frame.query('all_passed == 1')
    return {
        'reward': frame['reward'].mean(),
        'coverage': frame.query('is_error == False')['coverage'].mean(),
        'mutation_score': passed_rows['mutation_score'].mean(),
        'all_passed_count': len(passed_rows),
    }


# Metrics over all models together.
for metric_name, metric_value in _summarize(df).items():
    print(metric_name, metric_value)

# The same metrics per model, one column per model.
pd.DataFrame.from_dict({
    name: _summarize(df.query(f'model == "{name}"'))
    for name in ['deepseek', 'qwen', 'dophine']
})

reward 0.062992721
coverage 13.200913242009133
mutation_score 0.2923137313432836
all_passed_count 67


Unnamed: 0,deepseek,qwen,dophine
reward,0.1772,0.0,0.011779
coverage,21.313846,0.0,1.365909
mutation_score,0.297437,,0.248403
all_passed_count,60.0,0.0,7.0
