# This auxiliary notebook uses gpt-4o to generate cleaned versions of the vulnerable code

In [None]:
import json
import os

from dotenv import load_dotenv
# Read the variables defined in `api_key.env` into the process environment.
load_dotenv('api_key.env')
# os.getenv returns None when API_KEY is not set.
api_key = os.getenv('API_KEY')

from openai import OpenAI
# Client object that is passed to get_response() for every request.
client = OpenAI(api_key=api_key)

## Customized functions

In [None]:
def file_parser(file_path):
    """Load every top-level JSON object stored back-to-back in a text file.

    The file is expected to hold a sequence of JSON objects rather than a
    single JSON document -- the layout produced by calling
    ``json.dump(obj, file, indent=4)`` followed by a newline for each record.
    A record begins on a line starting with ``{`` and ends on a line starting
    with ``}``; nested braces are indented by the pretty-printer and are
    therefore not mistaken for record boundaries.  Records that sit entirely
    on one line (for example ``{}`` or JSON Lines input) are decoded as well
    instead of being dropped.

    Args:
        file_path: Path of the file to read (decoded as UTF-8).

    Returns:
        A list with the decoded objects, in file order.

    Raises:
        json.JSONDecodeError: If the text collected up to a closing ``}``
            line is not valid JSON.
    """
    json_objects = []
    pending = []  # lines of the record currently being collected
    with open(file_path, 'r', encoding='utf-8') as file:
        for line in file:
            if line.startswith('{'):
                # A new record starts here; anything collected so far that
                # never reached a closing brace is discarded.
                pending = [line]
                if line.rstrip().endswith('}'):
                    try:
                        single_line_object = json.loads(line)
                    except json.JSONDecodeError:
                        # Only the first line of a longer record: keep
                        # collecting until the closing brace shows up.
                        continue
                    json_objects.append(single_line_object)
                    pending = []
                continue

            pending.append(line)
            if line.startswith('}'):
                json_objects.append(json.loads(''.join(pending)))
                pending = []
    return json_objects

In [None]:
def get_prompt(json_object):
    """Build the user prompt that pairs a function with its review comment.

    Reads ``json_object['func']`` and ``json_object['message']``.  Each value
    is placed under a short caption and enclosed on both sides by a literal
    backslash followed by a double quote.
    """
    quote = '\\"'
    pieces = [
        "Here is the original code:\n",
        quote, json_object['func'], quote,
        "\nHere is the comment:\n",
        quote, json_object['message'], quote,
    ]
    return "".join(pieces)

def get_message(header_json, prompt):
    """Return the two-entry message list for one request.

    The first entry is ``header_json`` itself (not a copy); the second is a
    ``user`` message whose content is a single text part holding ``prompt``.
    """
    text_part = {"type": "text", "text": prompt}
    user_turn = {"role": "user", "content": [text_part]}
    return [header_json, user_turn]

In [None]:
def get_response(client, message):
    """Request one chat completion from ``gpt-4o`` and return the result.

    Args:
        client: Object exposing ``chat.completions.create``.
        message: Message list passed through as the ``messages`` argument.

    Returns:
        Whatever ``client.chat.completions.create`` returns.
    """
    sampling_options = {
        "temperature": 1,
        "max_tokens": 4095,
        "top_p": 1,
        "frequency_penalty": 0,
        "presence_penalty": 0,
    }
    completions_api = client.chat.completions
    return completions_api.create(
        model="gpt-4o",
        messages=message,
        **sampling_options,
    )

In [None]:
def save_response(response):
    """Append ``response`` to ``response.json`` as indented JSON.

    The object is converted with its ``to_dict()`` method, written with a
    four-space indent and followed by a newline, so successive calls leave
    one pretty-printed object after another in the file.
    """
    with open('response.json', 'a+') as log_file:
        payload = response.to_dict()
        json.dump(payload, log_file, indent=4)
        log_file.write('\n')

In [None]:
def response_parser(response_object):
    """Split a saved chat completion into its code block and trailing comment.

    The reply text is read from
    ``response_object['choices'][0]['message']['content']``.

    A fence tagged ``C``, ``c++``, ``cpp`` or ``c`` (checked in that order) is
    preferred.  The code is everything between that opening fence and the next
    line that starts with three backticks; the comment is the text following
    that closing fence, up to the next fenced block if there is one.

    When none of those tags is present, the first fenced block of any kind is
    used instead.  Its info line (whatever follows the opening backticks on
    the same line, e.g. a language name) and the final newline before the
    closing fence are removed, so the returned code has the same shape as in
    the tagged case.

    A block whose closing fence is missing -- for instance because the reply
    was cut off at the token limit -- is returned in full with an empty
    comment rather than failing.

    Args:
        response_object: Dict form of one chat completion.

    Returns:
        A ``(code, comment)`` tuple of strings.

    Raises:
        IndexError: If the reply contains no fenced code block at all.
    """
    text = response_object['choices'][0]['message']['content']

    c_fences = ('```C\n', '```c++\n', '```cpp\n', '```c\n')
    opener = next((fence for fence in c_fences if fence in text), None)

    if opener is not None:
        after_opener = text.split(opener)[1]
        # partition() leaves the remainder empty when the closing fence is
        # missing, which yields an empty comment instead of an IndexError.
        code, _closing, remainder = after_opener.partition('\n```')
        comment = remainder.split('\n```')[0]
        return code, comment

    pieces = text.split('```')
    if len(pieces) < 2:
        raise IndexError('no fenced code block found in the response text')

    block = pieces[1]
    comment = pieces[2] if len(pieces) > 2 else ''

    # The rest of the opening-fence line is the info string, not code.  An
    # inline fence without any newline has no info line and is kept whole.
    _info, newline, body = block.partition('\n')
    if newline:
        block = body
    if block.endswith('\n'):
        block = block[:-1]
    return block, comment

## Initialization

In [None]:
# Paths of all files ending in `.json` found by walking the current directory.
json_files = [
    os.path.join(folder, name)
    for folder, _subdirs, names in os.walk('.')
    for name in names
    if name.endswith('.json')
]

In [None]:
# Load the input records, then drop the listed keys from each of them.
json_file = 'DiverseVul_input.json'
json_objects = file_parser(json_file)

keys_to_remove = ['target', 'project', 'commit_id', 'hash', 'size', 'cwe']
for record in json_objects:
    for unwanted_key in keys_to_remove:
        # pop() with a default is a no-op when the key is absent.
        record.pop(unwanted_key, None)

In [None]:
# System message placed in front of every user prompt by get_message().
system_instruction = "You will get original code and comments from user message. You need to modify the original code based on comments. Make sure to generate the full code and a brief introduction of what are modified."
header_json = {
    "role": "system",
    "content": [{"type": "text", "text": system_instruction}],
}

## Get Raw Responses (Clean Code Generation)

In [None]:
# Send one request per input record and append each raw reply to disk.
total = len(json_objects)
for count, json_object in enumerate(json_objects, start=1):
    print(f"Processing {count} of {total}")
    message = get_message(header_json, get_prompt(json_object))
    save_response(get_response(client, message))

## Parse Raw Responses and generate the output file

In [None]:
# Debugging aid: parse one saved response in isolation.
# The responses are loaded here so the cell does not depend on the
# batch-parsing cell below having been run first (which is a NameError when
# the file is executed top to bottom).
response_objects = file_parser('response.json')
response_object = response_objects[508]  # position of the record under inspection
response = response_parser(response_object)

In [None]:
# Turn every saved raw response into a {'func', 'message'} record and append
# it to the output file as indented JSON followed by a newline.
response_objects = file_parser('response.json')
total = len(response_objects)
output_file = 'clean_code.json'
for count, response_object in enumerate(response_objects, start=1):
    print(f"Processing {count} of {total}")
    code, comment = response_parser(response_object)
    tmp_result = {'func': code, 'message': comment}
    with open(output_file, 'a+') as file:
        json.dump(tmp_result, file, indent=4)
        file.write('\n')
    

## Process the clean-code JSON: remove the header line from each comment

In [None]:
# Re-read the parsed records, drop the first line of each comment (the
# header) and append the result to `processed_clean_code.json`.
clean_code_objects = file_parser('clean_code.json')
total = len(clean_code_objects)
processed_clean_code_objects = []
for count, clean_code_object in enumerate(clean_code_objects, start=1):
    print(f"Processing {count} of {total}")

    comment = clean_code_object['message'].strip()
    # A comment consisting of a single line has nothing after its first line,
    # so it is kept whole.  partition() makes that case explicit instead of
    # relying on a bare `except` to swallow the IndexError.
    _header, newline, remainder = comment.partition('\n')
    if newline:
        comment = remainder.strip()

    code = clean_code_object['func']
    processed = {'func': code, 'message': comment}
    processed_clean_code_objects.append(processed)
    with open('processed_clean_code.json', 'a+') as file:
        json.dump(processed, file, indent=4)
        file.write('\n')
    

        

In [None]:
# Debug: show the stripped comment of the record currently held in
# `clean_code_object`.
message_text = clean_code_object['message']
comment = message_text.strip()
print(comment)

In [None]:
# Debug: show the comment of `clean_code_object` with its first line removed,
# before and after trimming the surrounding whitespace.
comment = clean_code_object['message'].strip()
without_header = comment.split('\n', 1)[1]  # IndexError for a one-line comment
print(without_header)
comment = without_header.strip()
print(comment)