In [4]:
import pandas as pd
import json
import os

# Convert the raw CSV dataset into a JSON file (a list of per-row records)
# stored under the processed-data directory for this dataset.
dataset_name = "FTSmartAudit_datasets"
output_dir = f"../../data/processed_data/{dataset_name}/"
raw_dir = "../../data/dataset/raw"

os.makedirs(output_dir, exist_ok=True)

# output_dir already ends with "/", so gluing another "/" on with an f-string
# produced ".../FTSmartAudit_datasets//FTSmartAudit_datasets.json".
# os.path.join inserts a separator only when one is missing.
csv_file = os.path.join(raw_dir, f"{dataset_name}.csv")
json_file = os.path.join(output_dir, f"{dataset_name}.json")

df = pd.read_csv(csv_file)

# orient="records" serialises each CSV row as one JSON object in a top-level list.
df.to_json(json_file, orient="records", indent=4)

print(f"CSV file '{csv_file}' successfully converted to JSON file '{json_file}'.")

# Reload from the file just written so `df` reflects the serialised JSON form.
df = pd.read_json(json_file)

CSV file '../../data/dataset/raw/FTSmartAudit_datasets.csv' successfully converted to JSON file '../../data/processed_data/FTSmartAudit_datasets//FTSmartAudit_datasets.json'.


In [9]:
import os
import json
import re
from tqdm import tqdm

# === Configuration ===
dataset_name = "FTSmartAudit_datasets"
raw_dir = "../../data/dataset/raw"
# NOTE(review): the JSON is read from raw_dir here, whereas the CSV->JSON
# conversion cell writes its output under ../../data/processed_data/<dataset>/.
# Confirm that the raw-directory JSON is the intended input.
json_file_path = os.path.join(raw_dir, f"{dataset_name}.json")
base_output_dir = f"../../data/processed_data/{dataset_name}/"

# Only records labelled with one of these vulnerability types are processed.
desired_vuln_types = {"reentrancy", "time_manipulation", "arithmetic"}

# Each vulnerability type gets its own "contracts" and "LOCs" output folders.
for vuln in desired_vuln_types:
    for subfolder in ("contracts", "LOCs"):
        os.makedirs(os.path.join(base_output_dir, vuln, subfolder), exist_ok=True)

# --- Load dataset ---
# Read the whole JSON document into memory; `data` is indexed and sliced below.
with open(json_file_path, "r", encoding="utf-8") as dataset_fh:
    data = json.loads(dataset_fh.read())

print(f"Total contracts in dataset: {len(data)}")

# Upper bound on how many records to process; lower it to run on a subset.
end_contract = len(data)

def _parse_line_numbers(spec, contract_idx):
    """Expand a line specification such as "54" or "23-25, 30" into a list of ints.

    Args:
        spec: Comma-separated text of single line numbers and "start-end" ranges.
        contract_idx: Index of the contract being processed; used only in
            diagnostic messages.

    Returns:
        The line numbers in the order they were listed, with ranges expanded
        inclusively. Tokens that cannot be parsed are reported and skipped.
    """
    numbers = []
    for part in spec.split(","):
        part = part.strip()
        if not part:
            # Trailing commas / stray whitespace yield empty tokens; not an error.
            continue
        is_range = "-" in part
        try:
            if is_range:
                first, last = map(int, part.split("-"))
                # Accept reversed ranges ("25-23") instead of silently yielding nothing.
                low, high = sorted((first, last))
                numbers.extend(range(low, high + 1))
            else:
                numbers.append(int(part))
        except ValueError as e:
            label = "range" if is_range else "line"
            tqdm.write(f"Error parsing {label} '{part}' in contract {contract_idx}: {e}")
    return numbers


# Slice once so the progress-bar total always matches what is actually iterated.
selected_records = data[:end_contract]

for idx, record in tqdm(enumerate(selected_records), total=len(selected_records), desc="Processing Contracts"):
    # Normalize the vulnerability type. The field may be null/non-string in the
    # JSON (e.g. an empty CSV cell), in which case the record is simply skipped.
    raw_vuln_type = record.get("Vulnerability Type")
    vuln_type = raw_vuln_type.strip().lower() if isinstance(raw_vuln_type, str) else ""
    if vuln_type not in desired_vuln_types:
        continue

    # The output folder name is the normalized vulnerability type.
    contracts_dir = os.path.join(base_output_dir, vuln_type, "contracts")
    locs_dir = os.path.join(base_output_dir, vuln_type, "LOCs")

    source_code = record.get("Source Code")
    if not isinstance(source_code, str) or not source_code:
        tqdm.write(f"Skipping contract {idx}: No source code found.")
        continue

    # Save the entire contract as a .sol file. This happens before the
    # line-number checks, so a contract file is written even when no LOC file
    # ends up being produced for it.
    contract_path = os.path.join(contracts_dir, f"{idx}.sol")
    with open(contract_path, "w", encoding="utf-8") as contract_fh:
        contract_fh.write(source_code)

    # Expected annotation inside the source: "@vulnerable_at_lines: 54" or "23-25, 30".
    match = re.search(r"@vulnerable_at_lines:\s*([\d,\-\s]+)", source_code)
    if not match:
        tqdm.write(f"Skipping contract {idx}: No vulnerable line numbers found in metadata.")
        continue

    line_numbers = _parse_line_numbers(match.group(1), idx)

    # The annotation uses 1-indexed line numbers. Keep only those that exist in
    # the source so start_line/end_line always agree with the extracted snippet.
    source_lines = source_code.split("\n")
    valid_lines = [n for n in line_numbers if 1 <= n <= len(source_lines)]
    if not valid_lines:
        tqdm.write(f"Skipping contract {idx}: No valid line numbers extracted.")
        continue

    vulnerability_info = [
        {
            "start_line": min(valid_lines),
            "end_line": max(valid_lines),
            "code": [source_lines[n - 1] for n in valid_lines],
        }
    ]

    # Save the vulnerability location JSON. The handle is deliberately not named
    # `json_file`, which would clobber the path variable of that name defined
    # in the CSV->JSON conversion cell.
    vuln_json_path = os.path.join(locs_dir, f"{idx}.json")
    with open(vuln_json_path, "w", encoding="utf-8") as loc_fh:
        json.dump(vulnerability_info, loc_fh, indent=4)

    # tqdm.write prints without breaking the progress bar rendering.
    tqdm.write(f"Processed contract {idx} ({vuln_type}): Saved contract and LOC info.")

print("Processing complete!")


Total contracts in dataset: 143


Processing Contracts: 100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████| 143/143 [00:00<00:00, 1045.82it/s]

Processed contract 8 (reentrancy): Saved contract and LOC info.
Processed contract 9 (reentrancy): Saved contract and LOC info.
Processed contract 10 (reentrancy): Saved contract and LOC info.
Processed contract 11 (reentrancy): Saved contract and LOC info.
Processed contract 12 (reentrancy): Saved contract and LOC info.
Processed contract 13 (reentrancy): Saved contract and LOC info.
Processed contract 14 (reentrancy): Saved contract and LOC info.
Processed contract 15 (reentrancy): Saved contract and LOC info.
Processed contract 16 (reentrancy): Saved contract and LOC info.
Processed contract 17 (reentrancy): Saved contract and LOC info.
Processed contract 18 (reentrancy): Saved contract and LOC info.
Processed contract 19 (reentrancy): Saved contract and LOC info.
Processed contract 20 (reentrancy): Saved contract and LOC info.
Processed contract 21 (reentrancy): Saved contract and LOC info.
Processed contract 22 (reentrancy): Saved contract and LOC info.
Processed contract 23 (reen


