# Подготовка датасета — часть 3: генерация по функциям

### Импорты и сетап окружения

In [1]:
import asyncio
import logging
import os
import re
import subprocess
import sys
from pathlib import Path

import aiopath
import datasets
import pandas as pd
from tqdm.notebook import tqdm

# The parent of the current working directory is used as the project root.
# Resolving the working directory is a synchronous operation, so plain pathlib
# is sufficient here.
ROOT_DIR = str(Path.cwd().parent)
DATA_DIR = ROOT_DIR+'/data'
REPOS_DIR = DATA_DIR+'/repos'
# Make the project root importable so the local `dataset` package resolves.
if ROOT_DIR not in sys.path:
    sys.path.append(ROOT_DIR)

from dataset.prompt import get_package_rows

# Go toolchain installed through gvm; the version is pinned in one place.
GO_VERSION = 'go1.24.2'
gvm_root = os.environ['GVM_ROOT']  # KeyError here means GVM_ROOT is not set
gvm_path_prefix = ':'.join([
    f"{gvm_root}/bin",
    f"{gvm_root}/pkgsets/{GO_VERSION}/global/bin",
    f"{gvm_root}/gos/{GO_VERSION}/bin",
    f"{gvm_root}/pkgsets/{GO_VERSION}/global/overlay/bin",
])
# Prepend only once: re-running this cell must not keep growing PATH.
if not os.environ['PATH'].startswith(gvm_path_prefix):
    os.environ['PATH'] = f"{gvm_path_prefix}:{os.environ['PATH']}"

В части 1 ([dataset_0](./dataset_0.ipynb)) подготовлен датасет `test_candidates_ds` с отфильтрованными файлами.

В части 2 ([dataset_1](./dataset_1.ipynb), генерация по файлам) исследована генерация тестов на основе исходных файлов; этот подход показал себя плохо.

Принято решение идти по другому пути: генерировать тесты для отдельных функций и собирать промпт из тестируемой функции и её зависимостей.

### Разбор пакетов go на функции и пересборка датасета, где каждый пример - отдельная функция

In [2]:
test_candidates_ds = datasets.load_from_disk(f"{DATA_DIR}/test_candidates_ds")

# Collapse the file-level rows into the distinct (project, package) pairs.
package_candidates = {
    (candidate['project_path'], candidate['relative_go_package'])
    for candidate in test_candidates_ds
}

def package_candidates_ds_row_generator():
    """Yield one row dict per unique (project_path, relative_go_package) pair."""
    for pair in package_candidates:
        yield dict(zip(('project_path', 'relative_go_package'), pair))

package_candidates_ds = datasets.Dataset.from_generator(package_candidates_ds_row_generator)

print(package_candidates_ds)

package_candidates_ds[0]

Generating train split: 0 examples [00:00, ? examples/s]

Dataset({
    features: ['project_path', 'relative_go_package'],
    num_rows: 19804
})


{'project_path': '52bdf3787c11b0237f5e7864c98b1d0e75af9eb9/revision/',
 'relative_go_package': ''}

In [3]:
def transform_to_funcs(rows):
    """Expand a batch of package rows into one output row per item.

    Written for ``Dataset.map(..., batched=True)``: the batch arrives as a
    mapping of column name to list, and the returned mapping may hold a
    different number of rows than the input batch.

    Args:
        rows: Batch with parallel lists under ``'project_path'`` and
            ``'relative_go_package'``.

    Returns:
        A dict with the keys ``project_path``, ``relative_go_package``,
        ``func_name``, ``input_code`` and ``prompt``; each value is a list
        with one entry per item returned by ``get_package_rows``.

    Packages for which ``get_package_rows`` raises are skipped (best effort),
    but every failure is logged as a warning instead of being dropped silently.
    """
    columns = ['project_path', 'relative_go_package', 'func_name', 'input_code', 'prompt']
    res = {column: [] for column in columns}
    logger = logging.getLogger(__name__)

    for project_path, relative_go_package in zip(rows['project_path'], rows['relative_go_package']):
        try:
            items = get_package_rows(project_path, relative_go_package)
        except Exception as e:
            # One broken package must not abort the whole map, but leave a
            # trace so the amount of lost data can be assessed afterwards.
            logger.warning(
                "get_package_rows failed for project_path=%r relative_go_package=%r: %r",
                project_path, relative_go_package, e,
            )
            continue

        for item in items:
            for column in columns:
                res[column].append(item[column])

    return res

# Each batch holds a single package row (batch_size=1); transform_to_funcs
# returns the rows that replace it in the resulting dataset.
final_ds = package_candidates_ds.map(
    transform_to_funcs,
    batched=True,
    batch_size=1,
    num_proc=6,
)

final_ds.save_to_disk(f"{DATA_DIR}/final_ds")

print(final_ds)
final_ds[0]

Map (num_proc=6):   0%|          | 0/19804 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/108461 [00:00<?, ? examples/s]

Dataset({
    features: ['project_path', 'relative_go_package', 'func_name', 'input_code', 'prompt'],
    num_rows: 108461
})


{'project_path': '52bdf3787c11b0237f5e7864c98b1d0e75af9eb9/revision/',
 'relative_go_package': '',
 'func_name': 'NewStack',
 'input_code': 'package main\n\nfunc NewStack() *Stack {\n\treturn &Stack{\n\t\tnil,\n\t\t0,\n\t}\n}\n\ntype Stack struct {\n\thead   *Node\n\tLength int\n}\n\n',
 'prompt': [{'content': '\nYou are an expert programmer. \nYou should only return output test file containing working code.\nThe user is going to give you code and would like to have unit tests for for first function.\nAll the other functions are just dependencies to give you context of all the possible test cases to produce.\nCover all possible inputs and their respective outputs using tests.\nEach subtest must be wrapped into t.Run.\n',
   'role': 'system'},
  {'content': 'write unit tests for function NewStack:\n```go\npackage main\n\nfunc NewStack() *Stack {\n\treturn &Stack{\n\t\tnil,\n\t\t0,\n\t}\n}\n\ntype Stack struct {\n\thead   *Node\n\tLength int\n}\n\n```',
   'role': 'user'}]}

### Разделяем датасет на валидационную и тренировочную выборки

In [None]:
# Reload the per-function dataset saved under DATA_DIR/final_ds.
final_ds = datasets.load_from_disk(f"{DATA_DIR}/final_ds")

final_ds

In [9]:
# Fixed seed so the train/test partition is identical on every run.
SPLIT_SEED = 42
# NOTE(review): the saved output of this cell reports 208976 + 52245 rows,
# while final_ds above is shown with 108461 rows — the output looks stale
# (out-of-order execution); re-run the notebook top to bottom to refresh it.
splitted_ds = final_ds.train_test_split(test_size=0.2, seed=SPLIT_SEED)

splitted_ds.save_to_disk(DATA_DIR+'/splitted_ds')

splitted_ds

Saving the dataset (0/1 shards):   0%|          | 0/208976 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/52245 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['project_path', 'relative_go_package', 'func_name', 'input_code', 'prompt'],
        num_rows: 208976
    })
    test: Dataset({
        features: ['project_path', 'relative_go_package', 'func_name', 'input_code', 'prompt'],
        num_rows: 52245
    })
})

In [5]:
# Reload the saved split and preview the first ten training rows as a DataFrame.
splitted_ds = datasets.load_from_disk(f"{DATA_DIR}/splitted_ds")

train_preview = splitted_ds['train'].take(10)
train_preview.to_pandas()

Unnamed: 0,project_path,relative_go_package,func_name,input_code,prompt
0,71f5e97beef472b2013d0a7f5f822ab9ee27957a/modul...,pkg/resource/statefulset/,GetStatefulSetPods,"package statefulset\n\nimport (\n\t""k8s.io/das...",[{'content': ' You are an expert programmer. ...
1,806f312c0ddddb3c1d9a418a3b88e78170b7bd5d/,pkg/sql/plan/,buildCreateView,package plan\n\nfunc buildCreateView(stmt *tre...,[{'content': ' You are an expert programmer. ...
2,02d5139081ebd4d150ec15b476d20543200e9d8e/,cmd/build-oss-fuzz-corpus/,findJSONFilesInDir,"package main\n\nimport (\n\t""encoding/json""\n\...",[{'content': ' You are an expert programmer. ...
3,959dce294c0a43b675f80419f7189393221613d4/pkg/i...,vendor/google.golang.org/grpc/,newPickfirstBuilder,"package grpc\n\nimport (\n\t""google.golang.org...",[{'content': ' You are an expert programmer. ...
4,e6be7abcae9500f8a51d9d601ad181d7f7b4bae9/,pkg/server/,NewCorruptionChecker,"package server\n\nimport (\n\t""github.com/code...",[{'content': ' You are an expert programmer. ...
5,b0d4f27aeddc5b075275fcad2c8da92187025a8b/,pkg/client/,parsePath,"package client\n\nimport (\n\t""runtime""\n\t""pa...",[{'content': ' You are an expert programmer. ...
6,e6be7abcae9500f8a51d9d601ad181d7f7b4bae9/,pkg/client/cache/,NewHistoryFileCache,"package cache\n\nimport (\n\t""github.com/coden...",[{'content': ' You are an expert programmer. ...
7,ececf7638e93170cf76966d22bc1ca8a8fdc518a/,example/,sleepHandler,"package main\n\nimport (\n\t""time""\n\t""net/htt...",[{'content': ' You are an expert programmer. ...
8,be85be2aa85821c6f67bd1001505061e007f187f/,pkg/controllers/deploy/,SetPrecheckFunc,"package deploy\n\nimport (\n\t""sigs.k8s.io/con...",[{'content': ' You are an expert programmer. ...
9,e32bdc053f64efc78a9f1fc3ac25cd8af6737425/,providers/softlayer/,init,"package softlayer\n\nimport (\n\t""github.com/S...",[{'content': ' You are an expert programmer. ...
