In [1]:
from pathlib import Path
import json
import re
import os
from collections import defaultdict

In [2]:
def extract_from_brackets(content: str, brackets: str = "{}") -> str:
    """Return the text enclosed by the first balanced bracket pair in ``content``.

    Nested pairs of the same bracket type are kept inside the returned text.
    Brackets that occur inside string literals or comments of the scanned text
    are counted like any other bracket.

    Args:
        content: Text to scan.
        brackets: Two-character string: the opening bracket followed by the
            closing bracket.

    Returns:
        The text between the first opening bracket and its matching closing
        bracket, without the brackets and with surrounding whitespace stripped.
        An empty string if ``content`` holds no opening bracket. If the first
        opening bracket is never closed, everything after it (stripped).
    """
    left_bracket, right_bracket = brackets[0], brackets[1]

    # Start counting at the first opening bracket, so closing brackets that
    # appear earlier cannot push the depth below zero.
    start = content.find(left_bracket)
    if start == -1:
        return ""

    depth = 0
    for index in range(start, len(content)):
        char = content[index]
        if char == left_bracket:
            depth += 1
        elif char == right_bracket:
            depth -= 1
            if depth == 0:
                # Matching closer found: slice once instead of concatenating
                # the result one character at a time.
                return content[start + 1:index].strip()

    # Unbalanced input: keep everything after the opener rather than dropping
    # the last real character.
    return content[start + 1:].strip()

def _find_closing_bracket(text: str, open_index: int, brackets: str) -> int:
    """Return the index of the bracket that closes the one at ``open_index``, or -1 if unclosed."""
    left_bracket, right_bracket = brackets[0], brackets[1]
    depth = 0
    for index in range(open_index, len(text)):
        char = text[index]
        if char == left_bracket:
            depth += 1
        elif char == right_bracket:
            depth -= 1
            if depth == 0:
                return index
    return -1


def get_class_content(file_content: str, class_name: str) -> tuple[str, str]:
    """Extract the body and the constructor arguments of a class or object declaration.

    The declaration is located textually: ``class <class_name>`` is searched
    first, then ``object <class_name>``. The name must match as a whole word,
    so ``Foo`` does not match ``class FooBar``. No real parsing is done, so an
    occurrence inside a comment or string literal is matched as well.

    Args:
        file_content: Full text of the source file.
        class_name: Name of the class or object to look up.

    Returns:
        A ``(class_content, class_arguments)`` tuple. ``class_content`` is the
        stripped text between the first ``{`` after the declaration and its
        matching ``}`` (empty if there is no ``{``). ``class_arguments`` is the
        parenthesised argument list including the parentheses, or ``""`` when
        the name is not immediately followed by ``(``.

    Raises:
        IndexError: If neither ``class <class_name>`` nor ``object <class_name>``
            occurs in ``file_content``. The exception type matches the failure
            of the previous implementation.
    """
    declaration = None
    for keyword in ("class", "object"):
        declaration = re.search(
            rf"(?<!\w){keyword}\s+{re.escape(class_name)}(?!\w)", file_content
        )
        if declaration is not None:
            break
    if declaration is None:
        raise IndexError(f"no class or object named {class_name!r} found in file content")

    remainder = file_content[declaration.end():]

    class_arguments = ""
    if remainder.startswith("("):
        # Work with positions instead of splitting on the argument text: this
        # handles an empty argument list and argument text that reappears
        # later in the class body.
        close_index = _find_closing_bracket(remainder, 0, "()")
        if close_index == -1:
            close_index = len(remainder)
        class_arguments = "(" + remainder[1:close_index].strip() + ")"
        remainder = remainder[close_index + 1:]

    # NOTE: a declaration without a body picks up the next `{...}` block that
    # follows it in the file, because the first `{` after the declaration is used.
    open_index = remainder.find("{")
    if open_index == -1:
        return "", class_arguments
    close_index = _find_closing_bracket(remainder, open_index, "{}")
    if close_index == -1:
        close_index = len(remainder)
    class_content = remainder[open_index + 1:close_index].strip()

    return class_content, class_arguments

def extract_methods(class_content: str) -> list[dict[str, str]]:
    """List the functions declared with the ``fun`` keyword inside a class body.

    The body is split at every whole-word occurrence of ``fun``, so identifiers
    that merely contain those letters (``refund``, ``functionCount``) no longer
    produce bogus entries. Occurrences inside comments or string literals are
    still treated as declarations.

    Args:
        class_content: Text of the class body.

    Returns:
        One dict per ``fun`` occurrence, in source order. ``"method_name"`` is
        the stripped text between ``fun`` and the first ``(``, and
        ``"method_arguments"`` is the first balanced parenthesised group after
        ``fun``, including the parentheses.
    """
    methods = []
    # Element 0 is the text before the first `fun`, so it is skipped.
    method_contents = re.split(r"\bfun\b", class_content)[1:]
    for method_content in method_contents:
        method_name = method_content.split("(")[0].strip()
        method_arguments = "(" + extract_from_brackets(method_content, brackets="()") + ")"
        methods.append({"method_name": method_name, "method_arguments": method_arguments})

    return methods

def extract_imports(file_content: str) -> str:
    """Collect the ``import`` statements of a source file into one string.

    Every match of the pattern is joined with a newline and the result is
    stripped at both ends. Because the leading ``\\s*`` may also consume blank
    lines and indentation in front of an ``import``, that whitespace stays
    inside the joined text.

    Args:
        file_content: Full text of the source file.

    Returns:
        The newline-joined matches, or an empty string when nothing matches.
    """
    import_pattern = re.compile(r'^\s*import\s+[^\n]+', re.MULTILINE)
    matched_statements = [match.group(0) for match in import_pattern.finditer(file_content)]
    return "\n".join(matched_statements).strip()

def parse_class(file_content: str, class_name: str) -> dict[str, object]:
    """Gather the imports, methods, body and constructor arguments of one class.

    Args:
        file_content: Full text of the source file.
        class_name: Name of the class or object to look up in the file.

    Returns:
        A dict with the keys ``"file_imports"``, ``"class_methods"``,
        ``"class_content"`` and ``"class_arguments"``, filled from
        ``extract_imports``, ``extract_methods`` and ``get_class_content``.
    """
    # Locate the class first: a lookup failure surfaces before any other work.
    body, constructor_arguments = get_class_content(file_content, class_name)

    parsed: dict[str, object] = {"file_imports": extract_imports(file_content)}
    parsed["class_methods"] = extract_methods(body)
    parsed["class_content"] = body
    parsed["class_arguments"] = constructor_arguments
    return parsed

In [3]:
# Root folder of the benchmark data. Set the KOTLIN_TEST_BENCH_DIR environment
# variable to point somewhere else; the default is the original local path.
folder = Path(os.environ.get("KOTLIN_TEST_BENCH_DIR", "C:\\Timur\\Varios for job\\Data\\kotlin-test-based-bench"))
dataset_file = folder / "kotlin-test-dataset_with_descriptions_final.json"
dataset_file_edited = folder / "kotlin-test-dataset_with_descriptions_final_file_content.json"

# Read the JSON as UTF-8 explicitly: without `encoding`, open() uses the
# locale's preferred encoding, which is typically not UTF-8 on Windows.
with open(dataset_file, 'r', encoding="utf-8") as file:
    dataset = json.load(file)

In [4]:
# Repository name -> relative paths of test files that were not found on disk.
failed_repo_tests = defaultdict(list)
# Not updated by this cell; kept defined in case other cells read them.
# TODO confirm and remove.
num_not_class = 0
err = False

for repo_point in dataset:
    # Last URL segment without a trailing ".git"; presumably this matches the
    # clone directory name under `repos` (verify against how the repos were
    # cloned). Slicing off four characters would mangle URLs without ".git".
    repo_name = repo_point['repository']['url'].split('/')[-1].removesuffix(".git")
    for test_point in repo_point['tests']:
        relative_path = test_point["construct"]["path"]
        target_file_path = folder / "repos" / repo_name / relative_path
        if not target_file_path.is_file():
            # Record the miss instead of skipping it silently; the test point
            # itself is left untouched.
            failed_repo_tests[repo_name].append(relative_path)
            continue
        # Only the raw file text is stored. The earlier parse_class(...) call
        # was dropped: its result was never used, and a parse failure aborted
        # the whole run. To store parsed fields again, call
        # parse_class(file_content, test_point["construct"]["methodName"])
        # here and merge the result into test_point.
        # Decode as UTF-8 explicitly so the result does not depend on the
        # locale's default encoding.
        test_point["file_content"] = target_file_path.read_text(encoding="utf-8")

In [5]:
# Write the enriched dataset next to the input file, pretty-printed with a
# four-space indent.
with dataset_file_edited.open('w') as output_handle:
    json.dump(dataset, output_handle, indent=4)