In [3]:
# Get list of fuzzing harnesses from Google
from pathlib import Path
import pandas as pd

google_dir = Path("/home/XXX/Code/trace-modeling/llm-fuzz/oss-fuzz-llm-targets-public").absolute()
dirs = [d for d in google_dir.glob("*/") if d.is_dir() and d.name not in ["benchmarks", "icons"]]
google_df = pd.DataFrame(data={"dir": dirs})
all_names = []
all_texts = []
for d in google_df["dir"]:
    harness_files = sorted(list(d.glob("targets/*.c")) + list(d.glob("targets/*.cc")))
    names = []
    texts = []
    for f in harness_files:
        names.append(f.name)
        texts.append(f.read_text())
    all_names.append(names)
    all_texts.append(texts)
google_df["harness_name"] = all_names
google_df["harness_text"] = all_texts
google_df["prompt"] = google_df["dir"].apply(lambda d: (d/"prompts.txt").read_text())
google_df["project"], _, google_df["function"] = zip(*google_df["dir"].apply(lambda d: d.name.partition("-")))
google_df

Unnamed: 0,dir,harness_name,harness_text,prompt,project,function
0,/home/XXX/Code/trace-modeling/llm-fuzz/oss-...,[],[],You are a security testing engineer who wants ...,xvid,xvid_encore
1,/home/XXX/Code/trace-modeling/llm-fuzz/oss-...,[],[],You are a security testing engineer who wants ...,tinyxml2,tinyxml2-xmldocument-print
2,/home/XXX/Code/trace-modeling/llm-fuzz/oss-...,[01.c],[#include <fuzzer/FuzzedDataProvider.h>\n#incl...,You are a security testing engineer who wants ...,expat,xml_externalentityparsercreate
3,/home/XXX/Code/trace-modeling/llm-fuzz/oss-...,"[01.c, 02.c]",[#include <stdio.h>\n#include <errno.h>\n#incl...,You are a security testing engineer who wants ...,libucl,ucl_object_merge
4,/home/XXX/Code/trace-modeling/llm-fuzz/oss-...,[],[],You are a security testing engineer who wants ...,tinyxml2,tinyxml2-xmlelement-insertnewunknown
...,...,...,...,...,...,...
65,/home/XXX/Code/trace-modeling/llm-fuzz/oss-...,"[01.c, 02.c]",[#include <fcntl.h> /* open() O_RDONLY */\n#in...,You are a security testing engineer who wants ...,libdwarf,dwarf_debug_addr_index_to_addr
66,/home/XXX/Code/trace-modeling/llm-fuzz/oss-...,"[01.c, 02.c, 03.c, 04.c]",[#include <fuzzer/FuzzedDataProvider.h>\n#incl...,You are a security testing engineer who wants ...,libucl,ucl_object_replace_key
67,/home/XXX/Code/trace-modeling/llm-fuzz/oss-...,"[01.c, 02.c, 03.c, 04.c, 05.c, 06.c, 07.c]",[#include <assert.h>\n#include <fcntl.h>\n#inc...,You are a security testing engineer who wants ...,elfutils,dwfl_module_relocate_address
68,/home/XXX/Code/trace-modeling/llm-fuzz/oss-...,[],[],You are a security testing engineer who wants ...,tinyxml2,tinyxml2-xmlelement-gettext


In [5]:
google_df[google_df["project"] == "libucl"].explode("harness_name").count()

dir             23
harness_name    23
harness_text    23
prompt          23
project         23
function        23
dtype: int64

In [2]:
# Copy each into a new dir under projects/
import yaml

projects_dir = Path("../../projects")

def load_language_from_yaml(project):
    """
    Load the 'language' key from a YAML file.
    """
    with open(projects_dir/project/"project.yaml", 'r') as file:
        data = yaml.safe_load(file)
        return data.get('language')

projects_written = []
n_harnesses_written = 0
for project, group in google_df.groupby("project"):
    this_project_dir = projects_dir/project
    if this_project_dir.exists() and load_language_from_yaml(project) == "c":
        harness_dir = this_project_dir/"llm-generated"
        harness_dir.mkdir(exist_ok=True)
        for i, row in group.iterrows():
            function = row['function']
            print(function, row["harness_name"])
            for name, text in zip(row["harness_name"], row["harness_text"]):
                name = f"harness_{function}_{name}"
                (harness_dir/name).write_text(text)
            n_harnesses_written += 1
        projects_written.append(project)
print(f"Wrote {len(projects_written)} projects, {n_harnesses_written} harnesses")
print("\n".join(projects_written))

dwarf_rnglists_get_rle_head ['01.c']
dwarf_find_die_given_sig8 ['01.c', '02.c', '03.c']
dwarf_debug_addr_index_to_addr ['01.c', '02.c']
sf_format_check ['01.cc']
sf_command ['01.cc', '02.cc']
ucl_object_merge ['01.c', '02.c']
ucl_parser_add_fd_priority ['01.c', '02.c']
ucl_parser_get_current_stack_object ['01.c', '02.c', '03.c', '04.c', '05.c']
ucl_comments_add ['01.c']
ucl_parser_insert_chunk ['01.c', '02.c']
ucl_comments_move ['01.c']
ucl_object_compare ['01.c', '02.c', '03.c', '04.c']
ucl_array_merge ['01.c', '02.c']
ucl_object_replace_key ['01.c', '02.c', '03.c', '04.c']
Wrote 3 projects, 14 harnesses
libdwarf
libsndfile
libucl


# Integrate into build script
1. Harnesses are saved in projects/<project>/llm-harnesses
2. Mount directory /llm-generated containing all the harnesses
3. Generate or write new build script which loops through the files, see libucl

```bash
$ for f in $(python infra/helper.py get_fuzz_targets libucl); do echo $f; done
harness_ucl_array_merge_01
harness_ucl_array_merge_02
harness_ucl_comments_move_01
harness_ucl_object_compare_01
harness_ucl_object_compare_02
harness_ucl_object_compare_03
harness_ucl_object_merge_01
harness_ucl_object_merge_02
harness_ucl_object_replace_key_03
harness_ucl_parser_add_fd_priority_01
harness_ucl_parser_add_fd_priority_02
harness_ucl_parser_get_current_stack_object_01
harness_ucl_parser_get_current_stack_object_02
harness_ucl_parser_get_current_stack_object_03
harness_ucl_parser_get_current_stack_object_04
harness_ucl_parser_get_current_stack_object_05
harness_ucl_parser_insert_chunk_01
harness_ucl_parser_insert_chunk_02
ucl_add_string_fuzzer
$ python infra/helper.py run_fuzzer --corpus-dir corpus_test/libucl/harness_ucl_array_merge_01 libucl harness_ucl_array_merge_01 max_total_time=60 seed=0 print_final_stats=1
```