# Подготовка датасета — часть 2: генерация по файлам

### Импорты и сетап окружения

In [2]:
import datasets
import os
import pandas as pd
import subprocess
import re
from tqdm.notebook import tqdm
from pathlib import Path
import asyncio
import aiopath
import sys
import time

# Project root is the parent of the current working directory; data and
# cloned repositories live underneath it.
ROOT_DIR = str(aiopath.AsyncPath.cwd().parent)
DATA_DIR = f'{ROOT_DIR}/data'
REPOS_DIR = f'{DATA_DIR}/repos'

# Make the project-local packages importable from this notebook.
if not any(entry == ROOT_DIR for entry in sys.path):
    sys.path.append(ROOT_DIR)

from evaluating.generator import make_model_with_tokenizer, generate_completions
from scoring.scoring import Scorer

# Prepend the go1.24.2 directories found under GVM_ROOT to PATH so they take
# precedence over anything already on PATH.
gvm_root = os.environ['GVM_ROOT']
go_bin_dirs = [
    f"{gvm_root}/bin",
    f"{gvm_root}/pkgsets/go1.24.2/global/bin",
    f"{gvm_root}/gos/go1.24.2/bin",
    f"{gvm_root}/pkgsets/go1.24.2/global/overlay/bin",
]
os.environ['PATH'] = ':'.join([*go_bin_dirs, os.environ['PATH']])

В части 1 [dataset_0](./dataset_0.ipynb) подготовлен датасет test_candidates_ds с отфильтрованными файлами

In [3]:
# Load the filtered candidate files prepared in part 1.
test_candidates_ds = datasets.load_from_disk(f'{DATA_DIR}/test_candidates_ds')

### Записываем промпты в датасет

In [4]:
# System message used as the first entry of every prompt built by get_prompt.
# It asks for a test file only, covering the first file the user supplies,
# with each subtest wrapped into t.Run.
system_message = """
You are an expert programmer. 
You should only return output test file containing working code.
The user is going to give you code and would like to have unit tests for the first file.
All the other files are just dependencies to give you context of all the possible test cases to produce.
Cover all possible inputs and their respective outputs using tests.
Each subtest must be wrapped into t.Run
"""

def get_prompt(row) -> list:
    """Build the chat prompt for one source file.

    Reads the file at ``../data/repos/<project_path><relative_project_path>``
    (relative to the current working directory) and returns a two-message
    conversation: the shared system message followed by a user message that
    holds the file content verbatim.

    Args:
        row: Mapping with ``project_path`` and ``relative_project_path`` keys.

    Returns:
        A list of two ``{"role": ..., "content": ...}`` dicts
        (``system`` first, then ``user``).

    Raises:
        OSError: If the source file cannot be opened.
        UnicodeDecodeError: If the file is not valid UTF-8.
    """
    file_path = '../data/repos/' + row['project_path'] + row['relative_project_path']

    # The context manager closes the handle even when reading fails; the
    # explicit encoding keeps the result independent of the machine's locale.
    with open(file_path, 'r', encoding='utf-8') as source_file:
        file_content = source_file.read()

    return [
        {"role": "system", "content": system_message},
        {"role": "user", "content": file_content},
    ]

def finalize_row(row) -> dict:
    """Map a candidate row onto the by-file schema and attach its prompt.

    Copies ``project_path``, renames ``relative_go_package`` to
    ``relative_package_path`` and ``relative_project_path`` to
    ``relative_file_path``, and stores the result of ``get_prompt(row)``
    under ``prompt``.
    """
    renamed_columns = {
        'relative_package_path': 'relative_go_package',
        'relative_file_path': 'relative_project_path',
    }

    finalized = {'project_path': row['project_path']}
    for new_name, old_name in renamed_columns.items():
        finalized[new_name] = row[old_name]
    finalized['prompt'] = get_prompt(row)
    return finalized

# Build the by-file dataset: one row per candidate file, keeping only the
# columns produced by finalize_row.
by_file_ds = test_candidates_ds.map(finalize_row, num_proc=32).select_columns(
    ['project_path', 'relative_package_path', 'relative_file_path', 'prompt']
)

# Save through DATA_DIR (not a cwd-relative '../data' path) so the location
# always matches the one the dataset is later reloaded from.
by_file_ds.save_to_disk(DATA_DIR+'/by_file_ds')

print(by_file_ds)
by_file_ds[0]

Map (num_proc=32):   0%|          | 0/33251 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/33251 [00:00<?, ? examples/s]

Dataset({
    features: ['project_path', 'relative_package_path', 'relative_file_path', 'prompt'],
    num_rows: 33251
})


{'project_path': '766dc882d779f07821bde740ce49802f67ae42b3/backend/',
 'relative_package_path': 'controllers/',
 'relative_file_path': 'controllers/controllers.go',
 'prompt': [{'content': '\nYou are an expert programmer. \nYou should only return output test file containing working code.\nThe user is going to give you code and would like to have unit tests for the first file.\nAll the other files are just dependencies to give you context of all the possible test cases to produce.\nCover all possible inputs and their respective outputs using tests.\nEach subtest must be wrapped into t.Run\n',
   'role': 'system'},
  {'content': 'package controllers\n\nimport (\n\t"encoding/json"\n\t"fmt"\n\techo "github.com/labstack/echo/v4"\n\t"golang.org/x/net/websocket"\n\t"stream/models"\n)\n\n// Controller interface has two methods\ntype Controller interface {\n\t// Homecontroller renders initial home page\n\tHomeController(e echo.Context) error\n\n\t// StreamController responds with live cpu statu

### Генерируем completions исходной моделью `deepseek-ai/deepseek-coder-1.3b-instruct`

In [5]:
# Reload the by-file dataset from disk.
by_file_ds = datasets.load_from_disk(f'{DATA_DIR}/by_file_ds')

In [6]:
# Draw a 500-row evaluation subset. The shuffle is seeded so that re-running
# the cell yields the same subset and results stay comparable between runs.
by_file_test_ds = by_file_ds.shuffle(seed=42).take(500)

# Save through DATA_DIR (not a cwd-relative '../data' path) so the location
# always matches the one the subset is later reloaded from.
by_file_test_ds.save_to_disk(DATA_DIR+'/by_file_test_ds')

print(by_file_test_ds)
by_file_test_ds[0]

Saving the dataset (0/1 shards):   0%|          | 0/500 [00:00<?, ? examples/s]

Dataset({
    features: ['project_path', 'relative_package_path', 'relative_file_path', 'prompt'],
    num_rows: 500
})


{'project_path': '36585158f195b346828ce1ab8ccd693e7fc576ef/',
 'relative_package_path': 'builder/filter/',
 'relative_file_path': 'builder/filter/javascript.go',
 'prompt': [{'content': '\nYou are an expert programmer. \nYou should only return output test file containing working code.\nThe user is going to give you code and would like to have unit tests for the first file.\nAll the other files are just dependencies to give you context of all the possible test cases to produce.\nCover all possible inputs and their respective outputs using tests.\nEach subtest must be wrapped into t.Run\n',
   'role': 'system'},
  {'content': 'package filter\n\nimport (\n\t"encoding/json"\n\n\t"github.com/grafadruid/go-druid/builder"\n\t"github.com/grafadruid/go-druid/builder/extractionfn"\n)\n\ntype Javascript struct {\n\tBase\n\tDimension    string               `json:"dimension,omitempty"`\n\tFunction     string               `json:"function,omitempty"`\n\tExtractionFn builder.ExtractionFn `json:"extr

In [7]:
# Reload the evaluation subset from disk.
by_file_test_ds = datasets.load_from_disk(f'{DATA_DIR}/by_file_test_ds')

In [10]:
# Load the tokenizer/model pair used for generation.
# NOTE(review): 'original' presumably selects the unmodified base model named
# in the section heading — confirm in evaluating.generator.
tokenizer, model = make_model_with_tokenizer('original')

def generate_completion_for_row(row: dict) -> dict:
    """Generate one completion for a row's prompt and time the call.

    Args:
        row: Dataset row with a ``prompt`` entry.

    Returns:
        Dict with ``generate_time`` (elapsed seconds) and ``completion``
        (the first element returned by ``generate_completions``).
    """
    # perf_counter is monotonic, so the measured duration cannot be skewed by
    # system clock adjustments the way time.time() differences can.
    start = time.perf_counter()
    # generate_completions takes a list of prompts; send a single-item batch.
    completion = generate_completions(tokenizer, model, [row['prompt']])[0]
    return {
        'generate_time': time.perf_counter() - start,
        'completion': completion,
    }

# Run generation row by row in a single process and persist the result.
by_file_test_with_completions_ds = by_file_test_ds.map(
    generate_completion_for_row,
    num_proc=1,
)
by_file_test_with_completions_ds.save_to_disk(f'{DATA_DIR}/by_file_test_with_completions_ds')

# Drop the notebook-level references to the model and tokenizer.
del model, tokenizer

print(by_file_test_with_completions_ds)
by_file_test_with_completions_ds[0]

Map:   0%|          | 0/500 [00:00<?, ? examples/s]

KeyboardInterrupt: 

### Оцениваем результат (скоринг)

In [None]:
# Reload the generated completions from disk.
by_file_test_with_completions_ds = datasets.load_from_disk(f'{DATA_DIR}/by_file_test_with_completions_ds')

In [None]:
async def score_row(row: dict) -> dict:
    """Score one generated completion and compute its reward.

    Works with by-file rows (``relative_package_path`` /
    ``relative_file_path`` columns, as produced by ``finalize_row``) and
    still accepts rows that carry the older ``relative_go_package`` /
    ``func_name`` columns.

    Args:
        row: Dataset row with ``project_path``, ``completion`` and one of
            the two column sets described above.

    Returns:
        Dict with ``score_time`` (elapsed seconds), ``result`` (the value
        returned by ``Scorer.score``) and ``reward``.
    """
    # perf_counter is monotonic, so the duration is immune to clock changes.
    start = time.perf_counter()

    # The by-file dataset renamed 'relative_go_package' to
    # 'relative_package_path'; reading the old key there raises KeyError.
    if 'relative_package_path' in row:
        package_path = row['relative_package_path']
    else:
        package_path = row['relative_go_package']

    # NOTE(review): by-file rows have no 'func_name' column. The file path is
    # passed as the scoring target instead — confirm this is what Scorer
    # expects for file-level scoring.
    if 'func_name' in row:
        target = row['func_name']
    else:
        target = row['relative_file_path']

    scorer = Scorer(row['project_path'], package_path, target)

    evaluation_result = await scorer.score(row['completion'])

    reward = scorer.calculate_reward(evaluation_result)

    return {
        'score_time': time.perf_counter() - start,
        'result': evaluation_result,
        'reward': reward,
    }

# Score every completion in a single process and persist the scored dataset.
by_file_test_scored_ds = by_file_test_with_completions_ds.map(
    score_row,
    num_proc=1,
)
by_file_test_scored_ds.save_to_disk(f'{DATA_DIR}/by_file_test_scored_ds')

print(by_file_test_scored_ds)
by_file_test_scored_ds[0]

Saving the dataset (0/1 shards):   0%|          | 0/208976 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/52245 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['project_path', 'relative_go_package', 'func_name', 'input_code', 'prompt'],
        num_rows: 208976
    })
    test: Dataset({
        features: ['project_path', 'relative_go_package', 'func_name', 'input_code', 'prompt'],
        num_rows: 52245
    })
})

### Анализ результатов

In [None]:
# Load the scored dataset and convert it to a pandas DataFrame for analysis.
df = datasets.load_from_disk(f'{DATA_DIR}/by_file_test_scored_ds').to_pandas()

In [None]:
# Flatten the nested 'result' entries into plain columns.
results = df['result']
df['coverage'] = results.apply(lambda r: float(r['coverage']))
df['mutation_score'] = results.apply(lambda r: float(r['mutation_score']))
# A missing error type (None) must count as "no error"; a bare str() would
# turn it into the non-empty string 'None' and flag the row as failed.
df['error_type'] = results.apply(
    lambda r: '' if r['error_type'] is None else str(r['error_type'])
)
df['all_passed'] = results.apply(lambda r: int(r['test_results']['all_passed']))
# A row is an error when its error type is non-empty.
df['is_error'] = df['error_type'] != ''

Dataset({
    features: ['project_path', 'relative_go_package', 'func_name', 'input_code', 'prompt'],
    num_rows: 261221
})

In [None]:
# Select the failed rows once, then report their total and per-type counts.
error_rows = df[df['is_error']]
print('error count', len(error_rows))

error_rows['error_type'].value_counts()

Unnamed: 0,project_path,relative_go_package,func_name,input_code,prompt
0,71f5e97beef472b2013d0a7f5f822ab9ee27957a/modul...,pkg/resource/statefulset/,GetStatefulSetPods,"package statefulset\n\nimport (\n\t""k8s.io/das...",[{'content': ' You are an expert programmer. ...
1,806f312c0ddddb3c1d9a418a3b88e78170b7bd5d/,pkg/sql/plan/,buildCreateView,package plan\n\nfunc buildCreateView(stmt *tre...,[{'content': ' You are an expert programmer. ...
2,02d5139081ebd4d150ec15b476d20543200e9d8e/,cmd/build-oss-fuzz-corpus/,findJSONFilesInDir,"package main\n\nimport (\n\t""encoding/json""\n\...",[{'content': ' You are an expert programmer. ...
3,959dce294c0a43b675f80419f7189393221613d4/pkg/i...,vendor/google.golang.org/grpc/,newPickfirstBuilder,"package grpc\n\nimport (\n\t""google.golang.org...",[{'content': ' You are an expert programmer. ...
4,e6be7abcae9500f8a51d9d601ad181d7f7b4bae9/,pkg/server/,NewCorruptionChecker,"package server\n\nimport (\n\t""github.com/code...",[{'content': ' You are an expert programmer. ...
5,b0d4f27aeddc5b075275fcad2c8da92187025a8b/,pkg/client/,parsePath,"package client\n\nimport (\n\t""runtime""\n\t""pa...",[{'content': ' You are an expert programmer. ...
6,e6be7abcae9500f8a51d9d601ad181d7f7b4bae9/,pkg/client/cache/,NewHistoryFileCache,"package cache\n\nimport (\n\t""github.com/coden...",[{'content': ' You are an expert programmer. ...
7,ececf7638e93170cf76966d22bc1ca8a8fdc518a/,example/,sleepHandler,"package main\n\nimport (\n\t""time""\n\t""net/htt...",[{'content': ' You are an expert programmer. ...
8,be85be2aa85821c6f67bd1001505061e007f187f/,pkg/controllers/deploy/,SetPrecheckFunc,"package deploy\n\nimport (\n\t""sigs.k8s.io/con...",[{'content': ' You are an expert programmer. ...
9,e32bdc053f64efc78a9f1fc3ac25cd8af6737425/,providers/softlayer/,init,"package softlayer\n\nimport (\n\t""github.com/S...",[{'content': ' You are an expert programmer. ...


In [None]:
# Mean reward over all rows, mean coverage over rows without an error, and
# mean mutation score over rows where all tests passed.
without_error = ~df['is_error']
tests_passed = df['all_passed'] == 1

print('reward', df['reward'].mean())
print('coverage', df.loc[without_error, 'coverage'].mean())
print('mutation_score', df.loc[tests_passed, 'mutation_score'].mean())