In [48]:
import json
import os, sys
from openai import OpenAI

# Put the directory two levels above the current working directory on sys.path
# so the project-local `config` package imported just below can be resolved.
project_root = os.path.abspath(os.path.join(os.getcwd(), '..', '..'))
sys.path.append(project_root)
from config.keys import OPENAI_API_KEY

In [98]:
import os
import requests

class Generator:
    """Builds a line-extraction prompt for a smart contract and sends it to the chat API.

    Typical use: ``create_prompt(...)`` followed by ``generate()``.

    Attributes set in ``__init__``:
        client: OpenAI client created with ``OPENAI_API_KEY``.
        model, temperature, max_tokens: request settings used by ``generate``.
        message: chat history sent to the API; starts with the system message.
        json_formatter, output_formatter, user_prefix: fixed prompt fragments.
    """

    def __init__(self, *, model="gpt-4o-mini", temperature=0.5, max_tokens=3200):
        """Create the API client and the static prompt fragments.

        Args:
            model: chat model name passed to the completions endpoint.
            temperature: sampling temperature for the request.
            max_tokens: upper bound on generated tokens.
        """
        self.client = OpenAI(api_key=OPENAI_API_KEY)
        self.model = model
        self.temperature = temperature
        self.max_tokens = max_tokens

        self.json_formatter = "Return the response in RFC8259 compliant JSON according to the ResponseFormat schema with no other text."
        self.message = [{
            "role": "system",
            "content": "You are a cyber-security programmer that can detect line numbers from the contract based on the instruction.",
        }]
        self.output_formatter = """
Response Schema:
[
  {"start_line": <los1>, 
   "end_line": <loe1>, 
   "code": [
        "vulnerable line 1",
        "vulnerable line 2",
        "... (and so on)"
      ]},
  {"start_line": <los2>, 
   "end_line": <loe2>, 
   "code": [
        "vulnerable line 1",
        "vulnerable line 2",
        "... (and so on)"
      ]},
...
]

** Do not use ```json or any other extra texts in the output. Include only the list of detected lines as the schema.
"""
        self.user_prefix = """You are given a smart contract code snippet and an explanation document on how to detect vulnerabilities. Your task is to identify and extract all of the exact lines of code where a vulnerability occurs—only the specific lines that are vulnerable, not any extra context or surrounding code. You should follow the instructions and report all vulnerable line as output. 
What I want is to have the exact lines of vulnerable code, the start and the end lines of vulnerable pieces of code for the given code.
Instructions:
1. Input Data:
    - Explanation: A detailed document containing guidelines for detecting vulnerabilities under the <This is the helping document to find the lines of vulnerable codes.> tag.
    - Smart Contract Code: The smart contract code is provided under the <Smart contract code> tag.

2. Task Requirements:
    - Use the explanation guidelines to precisely locate all of the vulnerabilities in the code.
    - Do NOT leave any vulnerable lines behind.
    - Extract only the exact lines of code that are vulnerable.
    - Do not provide a broad range of line numbers that include additional non-vulnerable lines. Instead, pinpoint the start and end lines where the vulnerability occurs, ensuring the extraction is exact.
    - write all of the vulnerable lines.
    - If multiple vulnerabilities exist within a single function (or code block), list each vulnerability as a separate entry with its own start and end line numbers. Do not merge them into one broad range.
    - Only output the exact vulnerable lines.

3. Output Requirements:
    - Return your output as RFC8259 compliant JSON with no additional text.
    - The output should include:
        -- The exact start line number of the vulnerable code segment.
        -- The exact end line number of the vulnerable code segment.
        -- An array containing each exact line of vulnerable code.
"""

    def get_user_message(self, dataset_output, contract):
        """Render the user prompt and store it on the instance.

        Sets ``self.user_content`` (the prompt text) and ``self.user_message``
        (the ``{"role": "user", ...}`` dict). Returns nothing.

        Args:
            dataset_output: explanation text inserted under the helping-document heading.
            contract: contract source inserted under the "Smart contract code" heading.
        """
        self.user_content = f"""
{self.user_prefix}

Additional Note:

Be precise: if the vulnerability is only on a few lines (for example, lines 215 to 218), only output those lines. Avoid outputting large ranges that include non-vulnerable lines.
Do not include any commentary or extraneous information outside of the JSON output.

This is the helping document to find the lines of vulnerable codes.
{dataset_output}

Smart contract code:
{contract}

---
{self.output_formatter}

###
"""
        self.user_message = {"role": "user", "content": self.user_content}

    def create_prompt(self, dataset_output, contract):
        """Set ``self.message`` to the system message(s) plus one fresh user prompt.

        Calling this again replaces the previous user prompt instead of
        stacking a second one, so a reused instance never sends an earlier
        contract along with the current one.
        """
        self.get_user_message(dataset_output, contract)
        system_messages = [m for m in self.message if m.get("role") == "system"]
        # Slice-assign so any outside reference to the list sees the new prompt.
        self.message[:] = system_messages + [self.user_message]

    def generate(self):
        """Send ``self.message`` to the chat completions endpoint.

        Returns:
            tuple: ``(answer, completion)`` where ``answer`` is the content of
            the first choice and ``completion`` is the full API response.
        """
        completion = self.client.chat.completions.create(
            model=self.model,
            messages=self.message,
            temperature=self.temperature,
            max_tokens=self.max_tokens,
            top_p=1.,
            frequency_penalty=0,
            presence_penalty=0,
            stop=None,
        )
        answer = completion.choices[0].message.content
        return answer, completion


In [102]:
# Batch extraction: ask the model for vulnerable line ranges, one JSON file per contract.

# Directory for storing the JSON outputs
output_dir = "../../data/dataset/lines/"
os.makedirs(output_dir, exist_ok=True)

# 1-based range of dataset records to visit.
start_contract = 1  # Change this to start from contract X
end_contract = 1

# Number of result files already on disk (resume tracking).
vulnerable_count = len([f for f in os.listdir(output_dir) if f.endswith(".json")])

with open("../../data/dataset/smartbugs_reentrancy.json", "r", encoding="utf-8") as file:
    data = json.load(file)


def _parse_model_json(raw_answer):
    """Parse the model's reply as JSON, tolerating a surrounding Markdown code fence.

    Raises:
        ValueError: if the reply is empty or is not valid JSON
            (``json.JSONDecodeError`` is a ``ValueError`` subclass).
    """
    if not raw_answer:
        raise ValueError("empty response")
    text = raw_answer.strip()
    if text.startswith("```"):
        # Drop the opening fence line (``` or ```json) and the closing fence, if any.
        text = text.split("\n", 1)[1] if "\n" in text else ""
        text = text.rsplit("```", 1)[0]
    return json.loads(text)


for contract_index, record in enumerate(data, start=1):  # 1-based so indices match file names
    if contract_index < start_contract:
        continue

    # Prefix every source line with its 1-based number so the model can cite lines.
    contract = record['input']
    lines = contract.split("\n")
    numbered_lines = [f"{i+1}: {line}" for i, line in enumerate(lines)]
    record['input'] = "\n".join(numbered_lines)
    print(f"Checking contract {contract_index}...")

    # Only records whose output starts with "1" are sent to the model
    # (presumably the "vulnerable" label; confirm against the dataset).
    if record["output"][0] == "1":
        generator = Generator()
        generator.create_prompt(record['output'], record['input'])
        print(generator.message[1]['content'])

        print(f"Generating vulnerability lines for Contract {contract_index}...")
        answer, completion = generator.generate()
        print(answer)

        try:
            json_answer = _parse_model_json(answer)
        except ValueError as exc:
            # One malformed or truncated reply must not abort the whole batch.
            print(f"Skipping contract {contract_index}: response is not valid JSON ({exc})")
        else:
            # Save the extracted vulnerability lines, named by dataset index.
            json_filename = os.path.join(output_dir, f"{contract_index}.json")
            with open(json_filename, "w", encoding="utf-8") as json_file:
                json.dump(json_answer, json_file, indent=4)

            print(f"Saved: {json_filename}")

    # Stop after the last requested contract.
    if contract_index >= end_contract:
        break

Checking contract 51...

You are given a smart contract code snippet and an explanation document on how to detect vulnerabilities. Your task is to identify and extract all of the exact lines of code where a vulnerability occurs—only the specific lines that are vulnerable, not any extra context or surrounding code. You should follow the instructions and report all vulnerable line as output. 
What I want is to have the exact lines of vulnerable code, the start and the end lines of vulnerable pieces of code for the given code.
Instructions:
1. Input Data:
    - Explanation: A detailed document containing guidelines for detecting vulnerabilities under the <This is the helping document to find the lines of vulnerable codes.> tag.
    - Smart Contract Code: The smart contract code is provided under the <Smart contract code> tag.

2. Task Requirements:
    - Use the explanation guidelines to precisely locate all of the vulnerabilities in the code.
    - Do NOT leave any vulnerable lines behin

In [95]:
# Show the raw model reply; `answer` is whatever the extraction loop last assigned,
# so this cell only works after that loop has run in the same kernel session.
print(answer)

[
    {"start_line": 263, "end_line": 267},
    {"start_line": 299, "end_line": 303},
    {"start_line": 351, "end_line": 351},
    {"start_line": 467, "end_line": 467}
]


In [68]:
# Start line of the first reported range; `json_answer` is the parsed reply left
# behind by the extraction loop, so the value depends on which run last set it.
json_answer[0]['start_line']

338