# Подготовка датасета - 1 часть, скачивание и фильтрация файлов

### Импорты и сетап окружения

In [1]:
import datasets
import os
import pandas as pd
import subprocess
import re
from tqdm.notebook import tqdm
from pathlib import Path
import asyncio
import aiopath
import sys

ROOT_DIR = str(aiopath.AsyncPath.cwd().parent)
DATA_DIR = ROOT_DIR+'/data'
REPOS_DIR = DATA_DIR+'/repos'
if ROOT_DIR not in sys.path:
    sys.path.append(ROOT_DIR)

gvm_root = os.environ['GVM_ROOT']
os.environ['PATH'] = f"{gvm_root}/bin:{gvm_root}/pkgsets/go1.24.2/global/bin:{gvm_root}/gos/go1.24.2/bin:{gvm_root}/pkgsets/go1.24.2/global/overlay/bin:{os.environ['PATH']}"

### Скачиваем исходный dataset

В качестве индекса репозиториев гитхаба используется датасет `bigcode/the-stack-v2-dedup`, содержащий 8.25 млн записей для языка go (`.go`, `go.mod`, `go.sum`)

In [3]:
# Go 7.84M
# Go_Checksums 62.4K
# Go_Module 351K

if not os.path.isdir(DATA_DIR+'/source_dedup_ds'):
    go_ds = datasets.load_dataset("bigcode/the-stack-v2-dedup", "Go", split="train")
    go_checksums_ds = datasets.load_dataset("bigcode/the-stack-v2-dedup", "Go_Checksums", split="train")
    go_module_ds = datasets.load_dataset("bigcode/the-stack-v2-dedup", "Go_Module", split="train")
    source_ds = datasets.concatenate_datasets([
        go_ds,
        go_checksums_ds,
        go_module_ds,
    ])
    del go_ds, go_checksums_ds, go_module_ds

    source_ds = source_ds.remove_columns([
        'blob_id', 'directory_id', 'content_id', 'detected_licenses', 'license_type',
        'snapshot_id', 'branch_name', 'visit_date', 'revision_date', 'committer_date',
        'github_id', 'gha_license_id', 'gha_event_created_at', 'gha_created_at',
        'gha_language', 'src_encoding', 'language',
    ])
    source_ds = source_ds.sort(["repo_name", "path"])
    source_ds.save_to_disk(DATA_DIR+'/source_dedup_ds')

source_ds = datasets.load_from_disk(DATA_DIR+'/source_dedup_ds')

print(len(source_ds))
source_ds.take(5).to_pandas()

8248983


Unnamed: 0,path,repo_name,revision_id,star_events_count,fork_events_count,is_vendor,is_generated,length_bytes,extension,filename
0,/backend/cmd/main.go,0-10V/real-time-metrics,766dc882d779f07821bde740ce49802f67ae42b3,0,0,False,False,118,go,main.go
1,/backend/controllers/controllers.go,0-10V/real-time-metrics,766dc882d779f07821bde740ce49802f67ae42b3,0,0,False,False,1164,go,controllers.go
2,/backend/go.mod,0-10V/real-time-metrics,766dc882d779f07821bde740ce49802f67ae42b3,0,0,False,False,126,mod,go.mod
3,/backend/models/models.go,0-10V/real-time-metrics,766dc882d779f07821bde740ce49802f67ae42b3,0,0,False,False,767,go,models.go
4,/backend/routes/router.go,0-10V/real-time-metrics,766dc882d779f07821bde740ce49802f67ae42b3,0,0,False,False,233,go,router.go


### Подготовка и фильтрация данных

Берем 100000 файлов из самых популярных репозиториев

In [4]:
source_ds = source_ds.sort(['star_events_count', 'repo_name', 'path'])

source_ds = source_ds.take(500_000)

print(len(source_ds))
source_ds.take(5).to_pandas()

500000


Unnamed: 0,path,repo_name,revision_id,star_events_count,fork_events_count,is_vendor,is_generated,length_bytes,extension,filename
0,/backend/cmd/main.go,0-10V/real-time-metrics,766dc882d779f07821bde740ce49802f67ae42b3,0,0,False,False,118,go,main.go
1,/backend/controllers/controllers.go,0-10V/real-time-metrics,766dc882d779f07821bde740ce49802f67ae42b3,0,0,False,False,1164,go,controllers.go
2,/backend/go.mod,0-10V/real-time-metrics,766dc882d779f07821bde740ce49802f67ae42b3,0,0,False,False,126,mod,go.mod
3,/backend/models/models.go,0-10V/real-time-metrics,766dc882d779f07821bde740ce49802f67ae42b3,0,0,False,False,767,go,models.go
4,/backend/routes/router.go,0-10V/real-time-metrics,766dc882d779f07821bde740ce49802f67ae42b3,0,0,False,False,233,go,router.go


In [5]:
repo_names = set()

for row in source_ds:
    repo_names.add((row['repo_name'], row['revision_id']))

def repo_names_generator():
    for (repo_name, revision_id) in repo_names:
        yield {'repo_name': repo_name, 'revision_id': revision_id}

repo_names_ds = datasets.Dataset.from_generator(repo_names_generator)
del repo_names

repo_names_ds.save_to_disk(DATA_DIR+'/repo_names_ds')

print(len(repo_names_ds))
repo_names_ds.take(5).to_pandas()

Generating train split: 0 examples [00:00, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/66857 [00:00<?, ? examples/s]

66857


Unnamed: 0,repo_name,revision_id
0,JonCooperWorks/dnsd,811d12ea9e546ba6e2e17f34f4130f5876b88fbb
1,Avatarchik/assetview,5ad9636cc8e40b5495208185922c83fdf2d1e65f
2,BlueXDog/gochat2,f67ed2b90ae0395d1ed015633667dcbbe869e4b0
3,DasonYang/mmh3_128_string,03f97a17300bf1b183af6e1c67c3efbbd3e8a478
4,1Crazymoney/TestSafeHarborServer,e64c6d5a40cbcda6d4e24babd3785b93dd96ca69


### Выгрузка архивов репозиториев с github.com

Дальше нужно выкачать архивы с репозиториями github и распаковать их

Код для этого код вынесен в отдельный скрипт `scrap_dataset.py`.

Запускаем
`python scrap_dataset.py`

### Проверяем наличие файлов из индекса на диске

In [8]:
import hashlib

if not os.path.isdir(DATA_DIR+'/files_ds'):

    def check_file(row):
        download_path = f"{REPOS_DIR}/{row['revision_id']}{row['path']}"    

        try:
            with open(download_path, "rb") as f:
                digest = hashlib.file_digest(f, "sha256")
                hash = digest.hexdigest()

                return {'file_exists': True, "file_hash": hash}
        except OSError:
            return {'file_exists': False, "file_hash": ""}

    files_ds = source_ds.sort(['star_events_count', 'repo_name', 'path'], reverse=[True, False, False]).map(check_file, num_proc=32)

    files_ds.save_to_disk(DATA_DIR+'/files_ds')

files_ds = datasets.load_from_disk(DATA_DIR+'/files_ds')

print(files_ds)
files_ds[0]

Map (num_proc=32):   0%|          | 0/500000 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/500000 [00:00<?, ? examples/s]

Dataset({
    features: ['path', 'repo_name', 'revision_id', 'star_events_count', 'fork_events_count', 'is_vendor', 'is_generated', 'length_bytes', 'extension', 'filename', 'file_exists', 'file_hash'],
    num_rows: 500000
})


{'path': '/backend/cmd/main.go',
 'repo_name': '0-10V/real-time-metrics',
 'revision_id': '766dc882d779f07821bde740ce49802f67ae42b3',
 'star_events_count': 0,
 'fork_events_count': 0,
 'is_vendor': False,
 'is_generated': False,
 'length_bytes': 118,
 'extension': 'go',
 'filename': 'main.go',
 'file_exists': True,
 'file_hash': '7eecb166bd857daa233eff94b56109f2cef0a8d01c0e1438c698f53de3bf930c'}

### Очистим тестовые файлы в проектах

Такие файлы не нужны для выбранного метода обучения

In [None]:
revisions = files_ds.unique('revision_id')
print(len(revisions))
print(revisions[0])

for revision_id in tqdm(revisions):
    project_path = REPOS_DIR+'/'+revision_id
    for p in Path(project_path).glob("**/*_test.go"):
        if p.is_file():
            p.unlink()

### Отфильтруем из датасета отсутствующие файлы, с именем, отличающимися от (`*.go`, `go.mod`, `go.sum`)

In [10]:
filter_stats = {
    "not_exists_count": 0,
    "test_files_count": 0,
    "wrong_ext_count": 0,
}

def filter_files(row) -> bool:
    if not row['file_exists']:
        filter_stats['not_exists_count'] += 1
        return False

    lower_path = row['path'].lower()
    if lower_path.endswith("_test.go"):
        filter_stats['test_files_count'] += 1
        return False
    if not lower_path.endswith(".go") and not lower_path.endswith("/go.mod") and not lower_path.endswith("/go.sum"):
        filter_stats['wrong_ext_count'] += 1
        return False

    return True

ds = files_ds.filter(filter_files, num_proc=1)

print(ds)
filter_stats

Filter:   0%|          | 0/500000 [00:00<?, ? examples/s]

Dataset({
    features: ['path', 'repo_name', 'revision_id', 'star_events_count', 'fork_events_count', 'is_vendor', 'is_generated', 'length_bytes', 'extension', 'filename', 'file_exists', 'file_hash'],
    num_rows: 414960
})


{'not_exists_count': 47489, 'test_files_count': 37551, 'wrong_ext_count': 0}

### Проанализируем структуру репозиториев

Определим project (директория внутри репозитория с `go.mod`), в одном репозитории может быть несколько проектов.

Определим относительный путь файла в проекте

Определим относительный путь пакета файла (директория внутри проекта)

In [11]:
projects_by_repo: dict[str, set[str]] = {}

def get_go_package(relative_project_path: str) -> str:
    filename_slash_pos = relative_project_path.rfind('/')
    return relative_project_path[:filename_slash_pos+1]

def collect_projects(row):
    lower_path = row['path'].lower()
    if lower_path.endswith("/go.mod"):
        project = row['path'][:len(row['path'])-6]

        repo_projects = projects_by_repo.get(row['repo_name'], set())
        repo_projects.add(project)
        projects_by_repo[row['repo_name']] = repo_projects

ds.map(collect_projects)

def add_project_column(row):
    lower_path = row['path'].lower()
    if lower_path.endswith("/go.mod"):
        project = row['path'][:len(row['path'])-6]
        return {'project': project, 'relative_project_path': row['path'][len(row['path'])-7:], 'relative_go_package': ''}

    repo_projects = list(projects_by_repo.get(row["repo_name"], set()))
    repo_projects.sort(key=len, reverse=True)

    for repo_project in repo_projects:
        if ((lower_path.endswith('.go') and row['path'].startswith(repo_project)) or
            (lower_path == repo_project.lower()+"go.sum")):

            relative_project_path = row['path'][len(repo_project):]
            return {'project': repo_project, 'relative_project_path': relative_project_path, 'relative_go_package': get_go_package(relative_project_path)}

    return {'project': '', 'relative_project_path': '', 'relative_go_package': ''}

ds = ds.map(add_project_column, num_proc=32)

print(ds)
ds[0]

Map:   0%|          | 0/414960 [00:00<?, ? examples/s]

Map (num_proc=32):   0%|          | 0/414960 [00:00<?, ? examples/s]

Dataset({
    features: ['path', 'repo_name', 'revision_id', 'star_events_count', 'fork_events_count', 'is_vendor', 'is_generated', 'length_bytes', 'extension', 'filename', 'file_exists', 'file_hash', 'project', 'relative_project_path', 'relative_go_package'],
    num_rows: 414960
})


{'path': '/backend/cmd/main.go',
 'repo_name': '0-10V/real-time-metrics',
 'revision_id': '766dc882d779f07821bde740ce49802f67ae42b3',
 'star_events_count': 0,
 'fork_events_count': 0,
 'is_vendor': False,
 'is_generated': False,
 'length_bytes': 118,
 'extension': 'go',
 'filename': 'main.go',
 'file_exists': True,
 'file_hash': '7eecb166bd857daa233eff94b56109f2cef0a8d01c0e1438c698f53de3bf930c',
 'project': '/backend/',
 'relative_project_path': 'cmd/main.go',
 'relative_go_package': 'cmd/'}

### Отфильтруем файлы go без go module

Проекты без go.mod - это legacy проекты, который с большой вероятностью не получится запустить без ручной работы

In [12]:
ds = ds.filter(lambda row: row['project'] != '', num_proc=32)

print(ds)
ds.skip(100)[0]

Filter (num_proc=32):   0%|          | 0/414960 [00:00<?, ? examples/s]

Dataset({
    features: ['path', 'repo_name', 'revision_id', 'star_events_count', 'fork_events_count', 'is_vendor', 'is_generated', 'length_bytes', 'extension', 'filename', 'file_exists', 'file_hash', 'project', 'relative_project_path', 'relative_go_package'],
    num_rows: 170126
})


{'path': '/basic/api/prod/prods.pb.go',
 'repo_name': '00arthur00/micro-simple-study',
 'revision_id': '6f4a3acac7dbac1fb7c1377862da1abf3a989f2f',
 'star_events_count': 0,
 'fork_events_count': 0,
 'is_vendor': False,
 'is_generated': True,
 'length_bytes': 5956,
 'extension': 'go',
 'filename': 'prods.pb.go',
 'file_exists': True,
 'file_hash': 'ea093c07732747284b6468dc0dc3f12612d26716c95e377f479a9531f5232aa5',
 'project': '/basic/',
 'relative_project_path': 'api/prod/prods.pb.go',
 'relative_go_package': 'api/prod/'}

TODO: отфильтровать build теги `//go:build ...`

### Отфильтруем проекты с /generated/ в пути

Это с большой вероятностью означает, что они сгенерированы

In [None]:
ds = ds.filter(lambda row: '/generated/' not in '/'+row['relative_go_package'], num_proc=32)

print(ds)

Dataset({
    features: ['path', 'repo_name', 'revision_id', 'star_events_count', 'fork_events_count', 'is_vendor', 'is_generated', 'length_bytes', 'extension', 'filename', 'file_exists', 'file_hash', 'project', 'relative_project_path', 'relative_go_package'],
    num_rows: 2664663
})


In [None]:
if not os.path.isdir(DATA_DIR+'/before_files_funcs_ds'):
    ds.save_to_disk(DATA_DIR+'/before_files_funcs_ds')

ds = datasets.load_from_disk(DATA_DIR+'/before_files_funcs_ds')

print(ds)
ds[0]

Dataset({
    features: ['path', 'repo_name', 'revision_id', 'star_events_count', 'fork_events_count', 'is_vendor', 'is_generated', 'length_bytes', 'extension', 'filename', 'file_exists', 'file_hash', 'project', 'relative_project_path', 'relative_go_package'],
    num_rows: 2664663
})


{'path': '/go.mod',
 'repo_name': 'avelino/awesome-go',
 'revision_id': 'c3643eb9da5c673101f8fe15a6deb40bfc4a1c85',
 'star_events_count': 112752,
 'fork_events_count': 13739,
 'is_vendor': False,
 'is_generated': False,
 'length_bytes': 603,
 'extension': 'mod',
 'filename': 'go.mod',
 'file_exists': True,
 'file_hash': '0adeb56bf0ed5a68b4e0ac51da9f3d38e6623754e262f9e4cd1a9f7c4c5ce817',
 'project': '/',
 'relative_project_path': '/go.mod',
 'relative_go_package': ''}

### Отфильтруем файлы без функций

Это могут быть константы, структуры без методов, которые не нужно тестить

In [15]:
if not os.path.isdir(DATA_DIR+'/files_funcs_ds'):

    def extract_funcs(row):
        download_path = f"{REPOS_DIR}/{row['revision_id']}{row['path']}"

        if row['extension'] != 'go':
            return {'funcs': '', 'methods': ''}

        try:
            with open(download_path, "rb") as f:
                content = f.read().decode('utf-8')
                test_fns: list[str] = re.findall(r'\nfunc\W*(\w+)\W*\(', content)
                test_methods: list[str] = re.findall(r'\nfunc\W*\(\w+ +(\*?\w+)\)\W*(\w+)\W*\(', content)
                test_methods_joined = [t+'.'+m for (t, m) in test_methods]

                return {'funcs': ','.join(test_fns), 'methods': ','.join(test_methods_joined)}
        except Exception as e:
            return {'funcs': '', 'methods': ''}

    ds = ds.map(extract_funcs, num_proc=16)

    ds.save_to_disk(DATA_DIR+'/files_funcs_ds')

ds = datasets.load_from_disk(DATA_DIR+'/files_funcs_ds')

print(ds)
ds[0]

Dataset({
    features: ['path', 'repo_name', 'revision_id', 'star_events_count', 'fork_events_count', 'is_vendor', 'is_generated', 'length_bytes', 'extension', 'filename', 'file_exists', 'file_hash', 'project', 'relative_project_path', 'relative_go_package', 'funcs', 'methods'],
    num_rows: 169891
})


{'path': '/backend/cmd/main.go',
 'repo_name': '0-10V/real-time-metrics',
 'revision_id': '766dc882d779f07821bde740ce49802f67ae42b3',
 'star_events_count': 0,
 'fork_events_count': 0,
 'is_vendor': False,
 'is_generated': False,
 'length_bytes': 118,
 'extension': 'go',
 'filename': 'main.go',
 'file_exists': True,
 'file_hash': '7eecb166bd857daa233eff94b56109f2cef0a8d01c0e1438c698f53de3bf930c',
 'project': '/backend/',
 'relative_project_path': 'cmd/main.go',
 'relative_go_package': 'cmd/',
 'funcs': 'main',
 'methods': ''}

In [16]:
ds = ds.filter(lambda row: row['extension'] != 'go' or row['funcs'] != ''
    # or row['methods'] != '' # Отфильтруем файлы с методами типов. Маленькая вероятность генерации успешных тестов для таких методов, так как нужно обеспечить работу моков
)

print(ds)
ds[0]

Dataset({
    features: ['path', 'repo_name', 'revision_id', 'star_events_count', 'fork_events_count', 'is_vendor', 'is_generated', 'length_bytes', 'extension', 'filename', 'file_exists', 'file_hash', 'project', 'relative_project_path', 'relative_go_package', 'funcs', 'methods'],
    num_rows: 132771
})


{'path': '/backend/cmd/main.go',
 'repo_name': '0-10V/real-time-metrics',
 'revision_id': '766dc882d779f07821bde740ce49802f67ae42b3',
 'star_events_count': 0,
 'fork_events_count': 0,
 'is_vendor': False,
 'is_generated': False,
 'length_bytes': 118,
 'extension': 'go',
 'filename': 'main.go',
 'file_exists': True,
 'file_hash': '7eecb166bd857daa233eff94b56109f2cef0a8d01c0e1438c698f53de3bf930c',
 'project': '/backend/',
 'relative_project_path': 'cmd/main.go',
 'relative_go_package': 'cmd/',
 'funcs': 'main',
 'methods': ''}

### Скачивание зависимостей проектов

In [17]:
ds = ds.filter(lambda row: row['extension'] == 'go')
ds = ds.map(lambda row: {'project_path': row['revision_id']+row['project']})

Filter:   0%|          | 0/132771 [00:00<?, ? examples/s]

Map:   0%|          | 0/106864 [00:00<?, ? examples/s]

Зависимости слишком тяжелый для скачивания во всех репозиториях, возьмем 100k файлов из самых популярных репозиториев

In [18]:
ds = ds.take(50_000)

In [19]:
project_paths = ds.unique('project_path')

print(len(project_paths))
project_paths[0]

8243


'766dc882d779f07821bde740ce49802f67ae42b3/backend/'

In [20]:
project_download_deps_errors: dict[str, str] = {}

concurrency = 32
next_index = 0
last_error = ''
updated_count = 0

async def worker(pbar, i):
    global next_index
    global last_error
    global updated_count
    while next_index < len(project_paths):
        project_path = project_paths[next_index]
        next_index += 1

        go_mod_content = ''
        try:
            go_mod_content = await aiopath.AsyncPath(f"{REPOS_DIR}/{project_path}go.mod").read_text()
        except Exception:
            project_download_deps_errors[project_path] = 'go.mod not found'
            last_error = 'go.mod not found'
            pbar.update(1)
            pbar.set_postfix(errors_count=len(project_download_deps_errors), updated_count=updated_count, last_error=last_error)
            continue

        async for p in aiopath.AsyncPath(f"{REPOS_DIR}/{project_path}").glob("**/*_test.go"):
            await p.unlink()

        update_libs = ['github.com/!azure/azure-sdk-for-go', 'github.com/aws/aws-sdk-go-v2', 'github.com/aws/aws-sdk-go',
                       'google.golang.org/genproto', 'google.golang.org/api', 'google.golang.org/grpc', 'google.golang.org/protobuf']
        presented_libs = [lib for lib in update_libs if lib+' ' in go_mod_content]
        if len(presented_libs) > 0:
            libs_str = ' '.join([lib+'@latest' if lib != 'github.com/!azure/azure-sdk-for-go' else 'github.com/\!azure/azure-sdk-for-go' for lib in presented_libs])
            proc = await asyncio.create_subprocess_shell(
                "go get -u "+libs_str,
                cwd=f"{REPOS_DIR}/{project_path}",
                stdout=asyncio.subprocess.PIPE,
                stderr=asyncio.subprocess.PIPE)

            stdout, stderr = await proc.communicate()
            
            if proc.returncode != 0:
                project_download_deps_errors[project_path] = 'go get -u: '+stderr.decode()
                last_error = 'go get -u: '+stderr.decode()[:100]

                pbar.update(1)
                pbar.set_postfix(errors_count=len(project_download_deps_errors), updated_count=updated_count, last_error=last_error)

            updated_count += 1
        

        proc = await asyncio.create_subprocess_shell(
            "go mod tidy",
            cwd=f"{REPOS_DIR}/{project_path}",
            stdout=asyncio.subprocess.PIPE,
            stderr=asyncio.subprocess.PIPE)

        stdout, stderr = await proc.communicate()
        
        if proc.returncode != 0:
            project_download_deps_errors[project_path] = stderr
            last_error = stderr.decode()[:100]

        pbar.update(1)
        pbar.set_postfix(errors_count=len(project_download_deps_errors), updated_count=updated_count, last_error=last_error)



with tqdm(total=len(project_paths)) as pbar:
    features = [worker(pbar, i) for i in range(concurrency)]

    await asyncio.gather(*features)

len(project_download_deps_errors)

  0%|          | 0/8243 [00:00<?, ?it/s]

693

In [27]:
{k: project_download_deps_errors[k] for k in list(project_download_deps_errors)[:10]}

{'f431ca5434407961c5bd08c1c68a8b6a4b1b3f66/': b'go: github.com/go-exec/exec@v0.0.0-20190715174909-f3ac22ac3ec0 (replaced by ../exec): reading ../exec/go.mod: open /media/hdd_1/vkr/data/repos/exec/go.mod: no such file or directory\n',
 '6f4a3acac7dbac1fb7c1377862da1abf3a989f2f/micro/': b"go: github.com/micro/go-micro@v1.18.0 requires\n\tgithub.com/micro/protoc-gen-micro@v1.0.0: reading github.com/micro/protoc-gen-micro/go.mod at revision v1.0.0: git ls-remote -q origin in /home/djvue/.gvm/pkgsets/go1.24.2/global/pkg/mod/cache/vcs/28cfbca4f5e41691076780b2017d5a1000b007b1d3d37b7a9d3b562c7adc9377: exit status 128:\n\tfatal: could not read Username for 'https://github.com': terminal prompts disabled\nConfirm the import path was entered correctly.\nIf this is a private repository, see https://golang.org/doc/faq#git_https for additional information.\n",
 '6f4a3acac7dbac1fb7c1377862da1abf3a989f2f/basic/': b"go: github.com/micro/protoc-gen-micro/v2@v2.0.0: reading github.com/micro/protoc-gen-mi

### Проверка валидности исходных файлов по go пакетам

In [31]:
go_project_packages: set[(str, str)] = set()
for row in ds:
    if row['project_path'] in project_download_deps_errors:
        continue
    go_project_packages.add((row['project_path'], row['relative_go_package']))

go_project_packages = list(go_project_packages)

len(go_project_packages)

24169

In [None]:
compile_project_package_errors: dict[tuple[str, str], str] = {}
compile_project_package_no_error: set[tuple[str, str]] = set()

In [36]:
concurrency = 11
next_index = 0
last_error = ''

async def worker(pbar, i):
    global next_index
    global last_error
    while next_index < len(go_project_packages):
        (project_path, relative_go_package) = go_project_packages[next_index]
        next_index += 1

        if (project_path, relative_go_package) in compile_project_package_no_error:
            pbar.update(1)
            continue

        proc = await asyncio.create_subprocess_shell(
            "go build -o /dev/null ./"+relative_go_package,
            cwd=f"{REPOS_DIR}/{project_path}",
            stdout=asyncio.subprocess.PIPE,
            stderr=asyncio.subprocess.PIPE)

        stdout, stderr = await proc.communicate()
        
        if proc.returncode != 0:
            compile_project_package_errors[(project_path, relative_go_package)] = stderr
            last_error = stderr.decode()[:100]
        else:
            compile_project_package_no_error.add((project_path, relative_go_package))

        pbar.update(1)
        pbar.set_postfix(errors_count=len(compile_project_package_errors), last_error=last_error)


with tqdm(total=len(go_project_packages)) as pbar:
    features = [worker(pbar, i) for i in range(concurrency)]

    await asyncio.gather(*features)

len(compile_project_package_errors)
    

  0%|          | 0/24169 [00:00<?, ?it/s]

2606

In [37]:
ds = ds.filter(lambda row: row['project_path'] not in project_download_deps_errors and (row['project_path'], row['relative_go_package']) not in compile_project_package_errors)

print(len(ds))
ds[0]

Filter:   0%|          | 0/40285 [00:00<?, ? examples/s]

36687


{'path': '/backend/cmd/main.go',
 'repo_name': '0-10V/real-time-metrics',
 'revision_id': '766dc882d779f07821bde740ce49802f67ae42b3',
 'star_events_count': 0,
 'fork_events_count': 0,
 'is_vendor': False,
 'is_generated': False,
 'length_bytes': 118,
 'extension': 'go',
 'filename': 'main.go',
 'file_exists': True,
 'file_hash': '7eecb166bd857daa233eff94b56109f2cef0a8d01c0e1438c698f53de3bf930c',
 'project': '/backend/',
 'relative_project_path': 'cmd/main.go',
 'relative_go_package': 'cmd/',
 'funcs': 'main',
 'methods': '',
 'project_path': '766dc882d779f07821bde740ce49802f67ae42b3/backend/'}

In [38]:
list(compile_project_package_errors.items())[:10]

[(('61f270ea4ca54163f8507f526e20cae2fd1ae79d/src/go/k8s/',
   'apis/redpanda/v1alpha1/'),
  b'# sigs.k8s.io/controller-runtime/pkg/log\n/home/djvue/.gvm/pkgsets/go1.24.2/global/pkg/mod/sigs.k8s.io/controller-runtime@v0.7.2/pkg/log/log.go:67:16: cannot use NullLogger{} (value of struct type NullLogger) as logr.Logger value in argument to Log.Fulfill\n/home/djvue/.gvm/pkgsets/go1.24.2/global/pkg/mod/sigs.k8s.io/controller-runtime@v0.7.2/pkg/log/log.go:82:31: cannot use NullLogger{} (value of struct type NullLogger) as logr.Logger value in argument to NewDelegatingLogger\n/home/djvue/.gvm/pkgsets/go1.24.2/global/pkg/mod/sigs.k8s.io/controller-runtime@v0.7.2/pkg/log/log.go:86:24: cannot use Log (variable of type *DelegatingLogger) as logr.Logger value in variable declaration\n/home/djvue/.gvm/pkgsets/go1.24.2/global/pkg/mod/sigs.k8s.io/controller-runtime@v0.7.2/pkg/log/log.go:88:16: assignment mismatch: 1 variable but logr.FromContext returns 2 values\n/home/djvue/.gvm/pkgsets/go1.24.2/glo

In [39]:
test_candidates_ds = ds.filter(lambda row: not row['is_generated']
                               and not row['is_vendor']
                               and row['path'].lower().endswith('.go')
                               and (row['filename'] != 'main.go' or row['funcs'] != 'main' or row['methods'] != ''))

test_candidates_ds.save_to_disk(DATA_DIR+'/test_candidates_ds')

print(len(test_candidates_ds))
test_candidates_ds[0]

Filter:   0%|          | 0/36687 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/33251 [00:00<?, ? examples/s]

33251


{'path': '/backend/controllers/controllers.go',
 'repo_name': '0-10V/real-time-metrics',
 'revision_id': '766dc882d779f07821bde740ce49802f67ae42b3',
 'star_events_count': 0,
 'fork_events_count': 0,
 'is_vendor': False,
 'is_generated': False,
 'length_bytes': 1164,
 'extension': 'go',
 'filename': 'controllers.go',
 'file_exists': True,
 'file_hash': '13358f5485df15f54f7732b3248303449b611e207cb8ccb82f1638b8ecd7c2bb',
 'project': '/backend/',
 'relative_project_path': 'controllers/controllers.go',
 'relative_go_package': 'controllers/',
 'funcs': 'NewController,Init',
 'methods': '*controller.HomeController,*controller.StreamController',
 'project_path': '766dc882d779f07821bde740ce49802f67ae42b3/backend/'}