In [3]:
import json,re

In [4]:
def extract_functions(code_lines):
    """Group numbered source lines into brace-delimited ``function`` blocks.

    Each usable element of *code_lines* has the form ``"<number>:<code>"``.
    A block starts on a line whose code contains the word ``function`` and
    ends either when the braces opened by its body are balanced again, or,
    for a declaration that has no body at all, on the line carrying the
    terminating ``;``.

    Args:
        code_lines: Iterable of strings, one per numbered source line.

    Returns:
        A list of ``(start_line, end_line, lines)`` tuples in order of
        appearance, where ``lines`` holds the original (still numbered)
        strings of the block. A block that is still open when the input
        ends is not reported.
    """
    function_keyword = re.compile(r'\bfunction\b')
    functions = []
    current_function = []
    inside_function = False
    body_opened = False
    brace_count = 0
    start_line = None

    for line in code_lines:
        prefix, separator, content = line.partition(':')
        if not separator:
            continue
        try:
            line_num = int(prefix.strip())
        except ValueError:
            # Not a "<number>: <code>" line; skip it like a line with no colon
            # instead of aborting the whole extraction.
            continue

        if not inside_function:
            if not function_keyword.search(content):
                continue
            inside_function = True
            body_opened = False
            brace_count = 0
            current_function = []
            start_line = line_num

        current_function.append(line)
        opened = content.count('{')
        brace_count += opened - content.count('}')
        if opened:
            body_opened = True

        # A zero balance only means "finished" once the body has been opened;
        # otherwise a signature spread over several lines would be cut short.
        # This check also runs on the starting line, so a one-line function
        # does not swallow the line that follows it.
        if body_opened:
            finished = brace_count <= 0
        else:
            # No body yet: a ';' marks a bodiless declaration.
            finished = ';' in content

        if finished:
            functions.append((start_line, line_num, current_function))
            inside_function = False
    return functions

def extract_vulnerable_functions(entry):
    """Split one dataset entry into one record per located vulnerability.

    The first line of ``entry["input"]`` is treated as the prompt and the
    remaining lines as numbered source code. For every reported
    vulnerability, the function enclosing its first vulnerable line is
    looked up and a new record is built from the prompt, that function's
    lines and the vulnerability's fields.

    Args:
        entry: Mapping with the keys ``"instruction"``, ``"input"`` and
            ``"output"``. ``"output"`` may be a JSON string, a dict with a
            ``"vulnerabilities"`` list, or a bare list of vulnerabilities.

    Returns:
        A list of records with the keys ``"instruction"``, ``"input"`` and
        ``"output"``. The list is empty when the output is not valid JSON,
        when the input is empty, or when no vulnerability falls inside an
        extracted function.

    Raises:
        KeyError: If *entry* lacks a required key, or a located
            vulnerability lacks one of its descriptive fields.
    """
    instruction = entry["instruction"]
    input_lines = entry["input"].splitlines()

    # Parse output as list or dict
    output = entry["output"]
    if isinstance(output, str):
        try:
            output = json.loads(output)
        except json.JSONDecodeError:
            return []

    if isinstance(output, dict):
        # A JSON null under "vulnerabilities" is treated as "none reported".
        vulnerabilities = output.get("vulnerabilities") or []
    elif isinstance(output, list):
        vulnerabilities = output
    else:
        vulnerabilities = []
    if not isinstance(vulnerabilities, list):
        vulnerabilities = []

    # Without a prompt line or any vulnerability there is nothing to emit;
    # returning here also avoids indexing into an empty input.
    if not input_lines or not vulnerabilities:
        return []

    base_prompt = input_lines[0] + "\n"
    functions = extract_functions(input_lines[1:])
    results = []

    for vuln in vulnerabilities:
        if not isinstance(vuln, dict):
            continue
        try:
            # "12-15" -> 12; a bare integer such as 12 is accepted as well.
            vuln_start = int(str(vuln["vulnerableLines"]).split('-')[0])
        except (KeyError, ValueError):
            # No usable line number means the vulnerability cannot be placed
            # in a function; drop it like one that lies outside every function.
            continue

        enclosing = next(
            (
                func_lines
                for start, end, func_lines in functions
                if start <= vuln_start <= end
            ),
            None,
        )
        if enclosing is None:
            continue

        results.append({
            "instruction": instruction,
            "input": base_prompt + "\n".join(enclosing),
            "output": {
                "vulnerabilities": [
                    {
                        "vulnerableLines": vuln["vulnerableLines"],
                        "vulnerableCode": vuln["vulnerableCode"],
                        "vulnerabilityReason": vuln["vulnerabilityReason"],
                        "potentialSecurityRisk": vuln["potentialSecurityRisk"],
                        "fixedCode": vuln["fixedCode"],
                    }
                ]
            },
        })
    return results


if __name__ == "__main__":
    # Convert the whole-contract dataset into a per-function dataset.
    # Both files are resolved relative to the current working directory.
    input_name = "vul_alpaca_test_data"
    input_dir, output_dir = f"{input_name}.json", f"functioned_{input_name}.json"

    with open(input_dir, "r", encoding="utf-8") as source_file:
        data = json.loads(source_file.read())

    # Every entry may yield zero or more per-function records.
    extracted_data = []
    for entry in data:
        extracted_data += extract_vulnerable_functions(entry)

    with open(output_dir, "w", encoding="utf-8") as target_file:
        json.dump(extracted_data, target_file, indent=2)

    print(f"✅ Saved extracted vulnerable functions to: {output_dir}")


FileNotFoundError: [Errno 2] No such file or directory: 'vul_alpaca_test_data.json'