In [16]:
import json
import os, sys
from openai import OpenAI
from tqdm.notebook import tqdm
import time
project_root = os.path.abspath(os.path.join(os.getcwd(), '..', '..'))
sys.path.append(project_root)
from config.keys import OPENAI_API_KEY

In [17]:
import os
import requests

class Generator:
    """Build and send a prompt asking a chat model for vulnerable line ranges.

    The instance holds an OpenAI client, a fixed system message, and the text
    templates that are combined with a detection guideline and a smart
    contract to form the user prompt.

    Attributes:
        client: OpenAI client created with ``OPENAI_API_KEY``.
        json_formatter: Extra JSON instruction string; not referenced by any
            method of this class.
        message: Chat history sent to the API (system message followed by the
            current user prompt once ``create_prompt`` has been called).
        output_formatter: Response-schema section appended to the user prompt.
        user_prefix: Task description placed at the top of the user prompt.
    """

    def __init__(self):
        """Create the API client and the static prompt templates."""
        self.client = OpenAI(api_key=OPENAI_API_KEY)

        self.json_formatter = "Return the response in RFC8259 compliant JSON according to the ResponseFormat schema with no other text."
        self.message = [{
            "role": "system",
            "content":
                "You are a cyber-security programmer that can detect line numbers from the contract based on the instruction."
        }]
        # NOTE(review): the first sentence of this template reads like an
        # authoring note rather than an instruction for the model; it is sent
        # verbatim as part of the prompt - confirm whether that is intended.
        self.output_formatter = """

This should instruct the model to output exactly the vulnerability lines, ensuring it doesn't output extra lines or large ranges that contain unrelated code.

Response Schema:
 [
    {
      "start_line": <exact_start_line_number>,
      "end_line": <exact_end_line_number>,
      "code": [
        "vulnerable line 1",
        "vulnerable line 2",
        "... (and so on)"
      ]
    }
  ]


** Do not use ```json or any other extra texts in the output. Include only the list of detected lines as the schema.
"""
        self.user_prefix = """You are given a smart contract code snippet and an explanation document on how to detect vulnerabilities. Your task is to identify and extract the exact lines of code where a vulnerability occurs—only the specific lines that are vulnerable, not any extra context or surrounding code.
Instructions:
1. Input Data:
    - Explanation: A detailed document containing guidelines for detecting vulnerabilities.
    - Smart Contract Code: The smart contract code is provided under the <Smart contract> tag.

2. Task Requirements:
    - Use the explanation guidelines to precisely locate all of the vulnerabilities in the code.
    - Extract only the exact lines of code that are vulnerable.
    - Do not provide a broad range of line numbers that include additional non-vulnerable lines. Instead, Be precise for the vulnerability lines and pinpoint the start and end lines where the vulnerability occurs, ensuring the extraction is minimal and exact.

3. Output Requirements:
    - Return your output as RFC8259 compliant JSON with no additional text.
    - The output should include:
        -- The exact start line number of the vulnerable code segment.
        -- The exact end line number of the vulnerable code segment.
        -- An array containing each exact line of vulnerable code.
    ** Do not include any lines of code that are not directly related to the vulnerability. **

"""

    def get_user_message(self, dataset_output, contract):
        """Render the user prompt and store it on the instance.

        Args:
            dataset_output: Guideline text inserted under "Guideline:".
            contract: Contract text inserted under "Smart contract code:".

        Side effects:
            Sets ``self.user_content`` (the rendered text) and
            ``self.user_message`` (the ``{"role": "user", ...}`` dict).
        """
        self.user_content = f"""
{self.user_prefix}


This is the helping document to find the lines of vulnerable codes.
Guideline:
{dataset_output}

Smart contract code:
{contract}

Additional Note:

Be precise: Be noticed that most of the vulnerabilities occur in few lines. if the vulnerability is only on a few lines (for example, lines 215 to 218), only output those lines. Avoid outputting large ranges that include non-vulnerable lines.
Do not include any commentary or extraneous information outside of the JSON output.
Do not return the entire function or big code snippet. Specifically return small snippets with vulnerability.
---
{self.output_formatter}

###
"""
        self.user_message = {"role": "user", "content": self.user_content}

    def create_prompt(self, dataset_output, contract):
        """Prepare ``self.message`` for a single request about ``contract``.

        The history is rebuilt as the system message(s) plus the freshly
        rendered user prompt, so calling this repeatedly on one instance does
        not resend the prompts of previously analysed contracts.

        Args:
            dataset_output: Guideline text forwarded to ``get_user_message``.
            contract: Contract text forwarded to ``get_user_message``.
        """
        self.get_user_message(dataset_output, contract)
        # Drop user prompts left over from earlier calls; keep system entries.
        system_messages = [m for m in self.message if m.get("role") == "system"]
        self.message = system_messages + [self.user_message]

    def generate(self, model="gpt-4o-mini", temperature=0.1, max_tokens=3200):
        """Send ``self.message`` to the chat completions endpoint.

        Args:
            model: Model name passed to the API.
            temperature: Sampling temperature passed to the API.
            max_tokens: Completion token limit passed to the API.

        Returns:
            A tuple ``(answer, completion)`` where ``answer`` is the content
            of the first choice and ``completion`` is the full API response.
        """
        completion = self.client.chat.completions.create(
            model=model,
            messages=self.message,
            temperature=temperature,
            max_tokens=max_tokens,
            top_p=1.,
            frequency_penalty=0,
            presence_penalty=0,
            stop=None
        )
        answer = completion.choices[0].message.content
        return answer, completion


In [18]:
# Dataset selection and the on-disk layout used by the later cells.
dataset_name = "smartbugs_reentrancy"
raw_dir = "../../data/dataset/raw"

# Processed artefacts live under one folder per dataset:
#   LOCs/      -> model answers (one JSON file per contract)
#   contracts/ -> the raw contract sources (one .sol file per contract)
output_dir = f"../../data/processed_data/{dataset_name}/"
locs_dir = os.path.join(output_dir, "LOCs")
contracts_dir = os.path.join(output_dir, "contracts")

# Create every output folder up front; existing folders are left untouched.
for directory in (output_dir, locs_dir, contracts_dir):
    os.makedirs(directory, exist_ok=True)

In [19]:
# Upper bound on how many records from the start of the dataset get processed.
end_contract = 500

# Read the whole raw dataset for the selected benchmark into memory.
dataset_path = f"{raw_dir}/{dataset_name}.json"
with open(dataset_path, mode="r", encoding="utf-8") as dataset_file:
    data = json.load(dataset_file)
len(data)

1635

In [20]:
# For each of the first `end_contract` records: save the raw contract source,
# skip records whose guideline starts with "0" (treated as safe), and for the
# rest ask the model for the vulnerable line ranges and store its answer.
contracts_to_process = data[:end_contract]
for contract_index, record in tqdm(enumerate(contracts_to_process),
                                   total=len(contracts_to_process),  # exact even if the dataset is shorter than end_contract
                                   desc="Processing Contracts"):

    # An answer file from a previous run means this contract is done.
    json_filename = os.path.join(locs_dir, f"{contract_index}.json")
    if os.path.exists(json_filename):
        print(f"    Contarct {contract_index} is already processed. Going to the next...")
        continue

    contract = record['input']
    # The dataset is read as UTF-8, so write the source back the same way.
    with open(os.path.join(contracts_dir, f"{contract_index}.sol"), 'w', encoding="utf-8") as f:
        f.write(contract)

    # Number the lines in a local copy only. Overwriting record['input'] would
    # make a re-run of this cell save and re-number already-numbered text.
    numbered_contract = "\n".join(
        f"{line_number}: {line}"
        for line_number, line in enumerate(contract.split("\n"), start=1)
    )
    print(f"Checking contract {contract_index}...")

    if record["output"][0] == "0":
        print(f"    Contarct {contract_index} is safe. Going to the next...")
        continue

    generator = Generator()
    generator.create_prompt(record['output'], numbered_contract)

    print(f"Generating vulnerability lines for Contract {contract_index}...")
    start_time = time.time()
    answer, completion = generator.generate()
    print(f"Json output generated in {round(time.time() - start_time, 2)}")

    try:
        json_answer = json.loads(answer)
    except (json.JSONDecodeError, TypeError):
        # Unparseable (or missing) answer: keep the raw text for inspection
        # under a "___" suffix so it is not mistaken for a valid result.
        json_answer = answer
        json_filename = os.path.join(locs_dir, f"{contract_index}___.json")
    with open(json_filename, "w", encoding="utf-8") as json_file:
        json.dump(json_answer, json_file, indent=4)

    print(f"Saved: {json_filename}")
    print(50*"-")

print("Process done!")

Processing Contracts:   0%|          | 0/500 [00:00<?, ?it/s]

    Contarct 0 is already processed. Going to the next...
    Contarct 1 is already processed. Going to the next...
    Contarct 2 is already processed. Going to the next...
Checking contract 3...
    Contarct 3 is safe. Going to the next...
    Contarct 4 is already processed. Going to the next...
    Contarct 5 is already processed. Going to the next...
    Contarct 6 is already processed. Going to the next...
Checking contract 7...
    Contarct 7 is safe. Going to the next...
Checking contract 8...
    Contarct 8 is safe. Going to the next...
    Contarct 9 is already processed. Going to the next...
Checking contract 10...
    Contarct 10 is safe. Going to the next...
    Contarct 11 is already processed. Going to the next...
Checking contract 12...
    Contarct 12 is safe. Going to the next...
Checking contract 13...
    Contarct 13 is safe. Going to the next...
    Contarct 14 is already processed. Going to the next...
Checking contract 15...
    Contarct 15 is safe. Going to the 

In [42]:
# Show the 'output' field of the first record; the processing loop passes this
# field to the prompt as the detection guideline.
data[0]['output']

"1. The contract contains reentrancy vulnerabilities due to the use of low-level call instructions to transfer Ether without ensuring that the state is updated before the Ether is sent. Specifically, in the function __callback(), lines containing the low-level call instruction are susceptible. If an attacker controls the destination address, they can execute a reentrant call back into the contract before the contract's state is updated to reflect the withdrawal. This allows the attacker to withdraw funds repeatedly without the contract's balance being reduced. Additionally, the function playerWithdrawPendingTransactions() uses a low-level call to send Ether to the player's address but does not check the success of the transaction immediately after updating the player's pending withdrawal balance. This creates another opportunity for a reentrant attack where an attacker could attempt to withdraw funds multiple times before the contract updates its state."

In [11]:
# Length of `json_answer` as left behind by the processing loop (its last
# assigned value). NOTE(review): this relies on leftover kernel state and the
# value may be a raw string when parsing failed - confirm before relying on it.
len(json_answer)

6