In [8]:
import json
import os
from tqdm import tqdm

# Write a line-numbered copy of every selected contract in the dataset.
# Input : <project_root>/data/dataset/raw/<dataset_name>.json. The code slices the
#         loaded object and calls .get() on each element, so it must be a list of dicts
#         carrying "input" (contract source) and "output" (label) fields.
# Output: <project_root>/data/processed_data/<dataset_name>/vulnerable_numbering_contracts/<index>.sol

# Set up paths relative to project root (two directory levels above the working directory)
project_root = os.path.abspath(os.path.join(os.getcwd(), '..', '..'))
dataset_name = "smartbugs_reentrancy"

# Define input JSON file location; derived from dataset_name so the input file and the
# output directory cannot drift apart when the dataset is switched
raw_dir = os.path.join(project_root, "data", "dataset", "raw")
json_file_path = os.path.join(raw_dir, f"{dataset_name}.json")

# Define output directory for numbered Solidity files
output_dir = os.path.join(project_root, "data", "processed_data", dataset_name, "vulnerable_numbering_contracts")
os.makedirs(output_dir, exist_ok=True)

# Load the JSON data
with open(json_file_path, "r", encoding="utf-8") as f:
    data = json.load(f)

# Process all contracts; change this if you want to limit the number processed
end_contract = len(data)
selected_records = data[:end_contract]

# For each contract, create a .sol file with numbered lines if the "output" field starts with "1"
# (presumably the label for a vulnerable contract - TODO confirm against the dataset description).
# total is taken from the slice itself so the bar stays correct when end_contract exceeds len(data).
for contract_index, record in tqdm(enumerate(selected_records),
                                   total=len(selected_records),
                                   desc="Processing Contracts"):
    # A missing, null or non-string label must not crash the run: coerce it to text first.
    output_field = str(record.get("output") or "").strip()
    if not output_field.startswith("1"):
        # tqdm.write prints the message without corrupting the active progress bar
        tqdm.write(f"Skipping contract {contract_index} because its output does not start with '1'")
        continue

    # Files are named after the record's index in the JSON list; existing files are never overwritten
    sol_file_path = os.path.join(output_dir, f"{contract_index}.sol")
    if os.path.exists(sol_file_path):
        tqdm.write(f"Contract {contract_index} is already processed. Skipping...")
        continue

    # Get the contract code from the 'input' field
    contract = record.get("input", "")

    # Add 1-based line numbers. split("\n") is kept on purpose (not splitlines()) so the
    # numbering stays identical to files produced by earlier runs of this cell.
    lines = contract.split("\n")
    numbered_lines = [f"{i+1}: {line}" for i, line in enumerate(lines)]
    numbered_contract = "\n".join(numbered_lines)

    # Write the numbered contract into a .sol file
    with open(sol_file_path, 'w', encoding='utf-8') as f:
        f.write(numbered_contract)

    tqdm.write(f"Processed contract {contract_index}.")

Processing Contracts:   7%|██████▎                                                                                        | 108/1635 [00:00<00:01, 1062.05it/s]

Processed contract 0.
Processed contract 1.
Processed contract 2.
Skipping contract 3 because its output does not start with '1'
Processed contract 4.
Processed contract 5.
Processed contract 6.
Skipping contract 7 because its output does not start with '1'
Skipping contract 8 because its output does not start with '1'
Processed contract 9.
Skipping contract 10 because its output does not start with '1'
Processed contract 11.
Skipping contract 12 because its output does not start with '1'
Skipping contract 13 because its output does not start with '1'
Processed contract 14.
Skipping contract 15 because its output does not start with '1'
Processed contract 16.
Processed contract 17.
Skipping contract 18 because its output does not start with '1'
Skipping contract 19 because its output does not start with '1'
Processed contract 20.
Processed contract 21.
Processed contract 22.
Processed contract 23.
Skipping contract 24 because its output does not start with '1'
Processed contract 25.
Pr

Processing Contracts:  25%|███████████████████████▎                                                                       | 401/1635 [00:00<00:00, 1357.03it/s]

Processed contract 253.
Processed contract 254.
Processed contract 255.
Processed contract 256.
Skipping contract 257 because its output does not start with '1'
Processed contract 258.
Processed contract 259.
Processed contract 260.
Skipping contract 261 because its output does not start with '1'
Processed contract 262.
Processed contract 263.
Processed contract 264.
Processed contract 265.
Skipping contract 266 because its output does not start with '1'
Processed contract 267.
Processed contract 268.
Processed contract 269.
Processed contract 270.
Skipping contract 271 because its output does not start with '1'
Skipping contract 272 because its output does not start with '1'
Skipping contract 273 because its output does not start with '1'
Processed contract 274.
Processed contract 275.
Processed contract 276.
Processed contract 277.
Processed contract 278.
Skipping contract 279 because its output does not start with '1'
Skipping contract 280 because its output does not start with '1'


Processing Contracts:  57%|██████████████████████████████████████████████████████                                         | 931/1635 [00:00<00:00, 1640.97it/s]

Processed contract 564.
Processed contract 565.
Skipping contract 566 because its output does not start with '1'
Processed contract 567.
Processed contract 568.
Processed contract 569.
Skipping contract 570 because its output does not start with '1'
Processed contract 571.
Skipping contract 572 because its output does not start with '1'
Processed contract 573.
Skipping contract 574 because its output does not start with '1'
Processed contract 575.
Processed contract 576.
Processed contract 577.
Processed contract 578.
Processed contract 579.
Processed contract 580.
Processed contract 581.
Processed contract 582.
Processed contract 583.
Processed contract 584.
Processed contract 585.
Skipping contract 586 because its output does not start with '1'
Skipping contract 587 because its output does not start with '1'
Processed contract 588.
Processed contract 589.
Processed contract 590.
Skipping contract 591 because its output does not start with '1'
Processed contract 592.
Processed contrac

Processing Contracts:  79%|██████████████████████████████████████████████████████████████████████████▏                   | 1290/1635 [00:00<00:00, 1727.87it/s]

Processed contract 931.
Processed contract 932.
Processed contract 933.
Processed contract 934.
Processed contract 935.
Processed contract 936.
Processed contract 937.
Skipping contract 938 because its output does not start with '1'
Processed contract 939.
Processed contract 940.
Processed contract 941.
Processed contract 942.
Processed contract 943.
Processed contract 944.
Processed contract 945.
Processed contract 946.
Skipping contract 947 because its output does not start with '1'
Processed contract 948.
Processed contract 949.
Processed contract 950.
Processed contract 951.
Processed contract 952.
Skipping contract 953 because its output does not start with '1'
Processed contract 954.
Processed contract 955.
Skipping contract 956 because its output does not start with '1'
Processed contract 957.
Processed contract 958.
Skipping contract 959 because its output does not start with '1'
Processed contract 960.
Processed contract 961.
Processed contract 962.
Skipping contract 963 becau

Processing Contracts: 100%|██████████████████████████████████████████████████████████████████████████████████████████████| 1635/1635 [00:00<00:00, 1640.35it/s]

Processed contract 1290.
Skipping contract 1291 because its output does not start with '1'
Skipping contract 1292 because its output does not start with '1'
Processed contract 1293.
Processed contract 1294.
Processed contract 1295.
Processed contract 1296.
Skipping contract 1297 because its output does not start with '1'
Processed contract 1298.
Processed contract 1299.
Processed contract 1300.
Processed contract 1301.
Processed contract 1302.
Processed contract 1303.
Processed contract 1304.
Processed contract 1305.
Processed contract 1306.
Processed contract 1307.
Processed contract 1308.
Processed contract 1309.
Processed contract 1310.
Processed contract 1311.
Processed contract 1312.
Skipping contract 1313 because its output does not start with '1'
Processed contract 1314.
Processed contract 1315.
Processed contract 1316.
Processed contract 1317.
Processed contract 1318.
Processed contract 1319.
Processed contract 1320.
Processed contract 1321.
Skipping contract 1322 because its o




In [None]:
### The following code is used for the three sub-datasets: IoU/TD/RE_FTSmartAudit_datasets

In [4]:
import json
import os
from tqdm import tqdm

# Write a line-numbered copy of every contract in one FTSmartAudit sub-dataset.
# Unlike a label-filtered run, this cell numbers EVERY record: the "output" field is not read.
# Input : <project_root>/data/dataset/raw/<dataset_name>.json. The code slices the loaded
#         object and calls .get() on each element, so it must be a list of dicts with an
#         "input" field holding the contract source.
# Output: <project_root>/data/processed_data/<dataset_name>/vulnerable_numbering_contracts/<index>.sol

# Set up paths relative to project root (two directory levels above the working directory)
project_root = os.path.abspath(os.path.join(os.getcwd(), '..', '..'))
dataset_name = "IoU_FTSmartAudit_datasets"

# Alternative input (a single few-shot file); uncomment to use it instead of the raw dataset
# raw_dir = os.path.join(project_root, "data", "dataset", "few_shots", "few_shots_IoU_FTSmartAudit_datasets")
# json_file_path = os.path.join(raw_dir, "11.json")

# Define input JSON file location; derived from dataset_name so that switching the
# sub-dataset is a one-line change and input/output can never point at different datasets
# (assumes the raw files are named "<dataset_name>.json" - TODO confirm for the other sub-datasets)
raw_dir = os.path.join(project_root, "data", "dataset", "raw")
json_file_path = os.path.join(raw_dir, f"{dataset_name}.json")


# Define output directory for numbered Solidity files
output_dir = os.path.join(project_root, "data", "processed_data", dataset_name, "vulnerable_numbering_contracts")
os.makedirs(output_dir, exist_ok=True)

# Load the JSON data
with open(json_file_path, "r", encoding="utf-8") as f:
    data = json.load(f)

# Process all contracts; change this if you want to limit the number processed
end_contract = len(data)
selected_records = data[:end_contract]

# For each contract, create a .sol file with numbered lines.
# total is taken from the slice itself so the bar stays correct when end_contract exceeds len(data).
for contract_index, record in tqdm(enumerate(selected_records),
                                   total=len(selected_records),
                                   desc="Processing Contracts"):
    # Files are named after the record's index in the JSON list; existing files are never overwritten
    sol_file_path = os.path.join(output_dir, f"{contract_index}.sol")
    if os.path.exists(sol_file_path):
        # tqdm.write prints the message without corrupting the active progress bar
        tqdm.write(f"Contract {contract_index} is already processed. Skipping...")
        continue

    # Get the contract code from the 'input' field
    contract = record.get("input", "")

    # Add 1-based line numbers. split("\n") is kept on purpose (not splitlines()) so the
    # numbering stays identical to files produced by earlier runs of this cell.
    lines = contract.split("\n")
    numbered_lines = [f"{i+1}: {line}" for i, line in enumerate(lines)]
    numbered_contract = "\n".join(numbered_lines)

    # Write the numbered contract into a .sol file
    with open(sol_file_path, 'w', encoding='utf-8') as f:
        f.write(numbered_contract)

    tqdm.write(f"Processed contract {contract_index}.")

Processing Contracts: 100%|██████████████████████████████████████████████████████████████████████████████████████████████████| 15/15 [00:00<00:00, 1507.73it/s]

Processed contract 0.
Processed contract 1.
Processed contract 2.
Processed contract 3.
Processed contract 4.
Processed contract 5.
Processed contract 6.
Processed contract 7.
Processed contract 8.
Processed contract 9.
Processed contract 10.
Contract 11 is already processed. Skipping...
Processed contract 12.
Processed contract 13.
Processed contract 14.





In [None]:
### Here, I extract the "input" field of a few-shot JSON file so that its numbering can be checked against
#####  the .sol file saved in vulnerable_numbering_contracts under processed_data

In [21]:
import json
import os

# Resolve both files against the directory the notebook is running in
current_dir = os.getcwd()
json_file_path, sol_file_path = (
    os.path.join(current_dir, file_name) for file_name in ("1.json", "1.sol")
)

# Parse 1.json
with open(json_file_path, encoding="utf-8") as json_file:
    data = json.load(json_file)

# The Solidity source is the 'input' field of the first record (empty string if the key is absent)
first_record = data[0]
solidity_code = first_record.get("input", "")

# Dump the source, unchanged, into 1.sol
with open(sol_file_path, mode="w", encoding="utf-8") as sol_file:
    sol_file.write(solidity_code)

print("Solidity code saved in 1.sol.")


Solidity code saved in 1.sol.
