# ECS 289L Term Project - Preprocessing Pipeline

## 1. Downloading the Raw Vulnerability Dataset (DiverseVul)

In [1]:
#installing the required package(s)
%pip install gdown

Note: you may need to restart the kernel to use updated packages.


In [2]:
# File name of the raw DiverseVul dataset; the filtering step reads it from the working directory (./<name>).
raw_dataset_name = 'DiverseVul_EntireDataset.json'

In [3]:
### IMPORTANT ###
# Because the full DiverseVul raw dataset (736.9 MB) is too large to include in the submission, the download code below is commented out.
# Interested readers can uncomment it to download the entire raw dataset.
# The following steps still run without the raw dataset, because each one is skipped when its output file already exists.


# import os

# # checks if raw dataset is already there
# if os.path.exists(f'./{raw_dataset_name}'):
#     print('DiverseVul dataset already exists.')
# else:
#     !gdown --id 12IWKhmLhq7qn5B_iXgn5YerOQtkH-6RG -O $raw_dataset_name


## 2. Filtering Dataset for Sidechannel-Related Entries

In [4]:
import json
import os


def parse_and_write_to_file(input_file, output_file, condition_func):
    """Copy the JSON-lines records of ``input_file`` that satisfy a predicate.

    Each non-blank line of ``input_file`` is parsed as one JSON value. Values
    for which ``condition_func`` returns a truthy result are re-serialized
    with ``json.dumps`` and written to ``output_file``, one per line.

    Processing is best-effort: a line that cannot be parsed, or on which
    ``condition_func`` raises, is reported on stdout and skipped. Interrupts
    (``KeyboardInterrupt``, ``SystemExit``) are *not* swallowed.

    Args:
        input_file: Path of the JSON-lines file to read.
        output_file: Path of the file to write; it is created or truncated.
        condition_func: Callable taking the parsed JSON value and returning
            whether the record should be kept.
    """
    # JSON text is UTF-8 by specification; do not depend on the platform default.
    with open(input_file, "r", encoding="utf-8") as infile, open(
        output_file, "w", encoding="utf-8"
    ) as outfile:
        for raw_line in infile:
            line = raw_line.strip()
            if not line:
                # Blank lines carry no record; skip them instead of reporting an error.
                continue
            try:
                json_object = json.loads(line)
                # Keep only the records accepted by the caller-supplied predicate.
                if condition_func(json_object):
                    outfile.write(json.dumps(json_object) + "\n")
            except Exception as error:
                # `Exception` (not a bare except) so Ctrl-C / kernel interrupt still works.
                print(f"Error reading line '{line}': {error!r}")


# DiverseVul Condition
def condition_DiverseVul(json_object):
    """Return whether a record's ``"message"`` field mentions a side channel.

    The match is case-insensitive and accepts the spellings "side-channel",
    "sidechannel" and "side channel".

    Args:
        json_object: Mapping that may contain a ``"message"`` entry.

    Returns:
        True if the message contains one of the spellings; False otherwise,
        including when the message is missing or is not a string.
    """
    message = json_object.get("message")
    if not isinstance(message, str):
        # A missing/None message cannot match; avoid AttributeError on .lower().
        return False

    lowered = message.lower()  # lower-case once instead of once per keyword
    return any(
        keyword in lowered
        for keyword in ("side-channel", "sidechannel", "side channel")
    )


# Stage 2 driver: raw dataset in, side-channel-only subset out.
input_file_path = f"./{raw_dataset_name}"
sca_filtered_dataset_name = "SCA-filtered_Dataset.json"
output_file_path = f"./{sca_filtered_dataset_name}"

# Run the filter only when its output is absent; an existing file is reused as-is.
if not os.path.exists(output_file_path):
    parse_and_write_to_file(input_file_path, output_file_path, condition_DiverseVul)
else:
    print('SCA-filtered dataset already exists.')


SCA-filtered dataset already exists.


## 3. Converting the Filtered Dataset into OpenAI Format

In [5]:
import json
import re


# Function to read JSON objects from a file
def read_json_objects_from_file(file_path):
    """Read every JSON value stored back-to-back in a text file.

    The file may hold one value per line (JSON Lines) or values that span
    several lines; values only need to be separated by whitespace. Unlike a
    textual split on ``"}\\n{"``, this also handles a file containing a single
    object, an empty file, and CRLF line endings.

    Args:
        file_path: Path of the file to read.

    Returns:
        A list of the decoded values, in file order (empty for an empty file).

    Raises:
        json.JSONDecodeError: If the file contains malformed JSON.
    """
    with open(file_path, "r", encoding="utf-8") as file:
        data = file.read()

    json_arr = []
    decoder = json.JSONDecoder()
    position, length = 0, len(data)

    while position < length:
        # raw_decode does not skip leading whitespace, so step over separators.
        if data[position].isspace():
            position += 1
            continue
        # Decode one value and continue from the index just past it.
        json_object, position = decoder.raw_decode(data, position)
        json_arr.append(json_object)

    return json_arr


def filter_msg(text):
    """Return ``text`` with every ``<word>-by: <name> <...>`` span removed.

    The pattern matches spans such as ``Signed-off-by: Jane Doe <jane@x.org>``;
    all occurrences are deleted and the rest of the text is left untouched.
    """
    trailer_regex = re.compile(r"([^\s]+)-by:\s+([^\s]+)([^<]+)<[^>]+>")
    return trailer_regex.sub("", text)


def transform_json_objects(input_file, output_file):
    """Convert filtered dataset records into chat-format JSONL.

    Every record read from ``input_file`` becomes one line of ``output_file``:
    a JSON object with a ``"messages"`` list holding a system prompt, a user
    prompt that embeds the record's ``"func"`` value, and an assistant reply
    built from the record's ``"cwe"`` list and its ``"message"`` text after
    ``filter_msg`` has been applied.

    Args:
        input_file: Path of the file read via ``read_json_objects_from_file``.
        output_file: Path of the JSONL file to write; created or truncated.

    Raises:
        KeyError: If a record lacks ``"func"``, ``"cwe"`` or ``"message"``.
    """
    # Load the input BEFORE opening the output: opening with "w" truncates or
    # creates the file, and a failed read would otherwise leave an empty
    # output behind that an "already exists" check treats as finished.
    in_data = read_json_objects_from_file(input_file)

    with open(output_file, "w", encoding="utf-8") as o_json:
        for in_obj in in_data:
            user_content = (
                "Would you please review the following C code, list any CWE(s) found (if any), and provide a descriptive explanation: "
                f"\"{in_obj['func']}\"?"
            )
            # NOTE(review): the trailing "?" on the assistant reply looks like a
            # copy-paste from the user prompt; kept as-is to preserve the
            # existing output format - confirm before changing.
            assistant_content = (
                "The code contains the following CWE(s): "
                f"{', '.join(in_obj['cwe'])}. {filter_msg(in_obj['message'])}?"
            )

            filtered_obj = {
                "messages": [
                    {
                        "role": "system",
                        "content": "You're a smart assistant, aiding a cybersecurity researcher in pinpointing sidechannel-related CWEs.",
                    },
                    {"role": "user", "content": user_content},
                    {"role": "assistant", "content": assistant_content},
                ]
            }

            o_json.write(json.dumps(filtered_obj) + "\n")


# Stage 3 driver: turn the side-channel subset into chat-format JSONL.
input_file_path = f"./{sca_filtered_dataset_name}"  # input JSON file
preprocessing_product_name = "Formatted_SCA-filtered_Dataset.jsonl" # output JSONL file
output_file_path = f"./{preprocessing_product_name}"

# Run the conversion only when its output is absent; an existing file is reused as-is.
if not os.path.exists(output_file_path):
    transform_json_objects(input_file_path, output_file_path)
else:
    print('Formatted_SCA-filtered dataset already exists.')


Formatted_SCA-filtered dataset already exists.


In [6]:
import random

def train_test_split(input_file, train_file, test_file, split_ratio=0.8, seed=None):
    """Shuffle the lines of a JSONL file and split them into two files.

    Args:
        input_file: Path of the line-oriented file to split.
        train_file: Path that receives the first ``split_ratio`` share of the
            shuffled lines; created or truncated.
        test_file: Path that receives the remaining lines; created or truncated.
        split_ratio: Fraction of lines (between 0 and 1) written to
            ``train_file``; the count is ``int(len(lines) * split_ratio)``.
        seed: Seed for the shuffle. With a fixed seed the split is
            reproducible; with ``None`` it differs between runs.

    Raises:
        ValueError: If ``split_ratio`` is outside the range [0, 1].
    """
    if not 0 <= split_ratio <= 1:
        raise ValueError(f"split_ratio must be between 0 and 1, got {split_ratio!r}")

    with open(input_file, 'r') as f:
        # Skip blank lines and force exactly one trailing newline per record,
        # so a final line without "\n" cannot be glued to another record once
        # the order has been shuffled.
        entries = [line.rstrip("\n") + "\n" for line in f if line.strip()]

    # A private generator yields the same order as seeding the module-level
    # one, without clobbering the global random state used by other cells.
    rng = random.Random(seed)
    rng.shuffle(entries)

    split_idx = int(len(entries) * split_ratio)

    with open(train_file, 'w') as f:
        f.writelines(entries[:split_idx])

    with open(test_file, 'w') as f:
        f.writelines(entries[split_idx:])

# Stage 4 driver: split the formatted dataset into training and validation files.
input_file_path = f"{preprocessing_product_name}"
train_file = "train.jsonl"
val_file = "val.jsonl"
split_seed = 42  # fixed seed so a fresh run regenerates the same split

# checks if the train/test files are already there
if os.path.exists(f'./{train_file}') and os.path.exists(f'./{val_file}'):
    print('Train/Test splits already exist.')
else:
    # Both files are rewritten together, so they always come from one shuffle.
    train_test_split(input_file_path, f"./{train_file}", f"./{val_file}", split_ratio=0.8, seed=split_seed)


Train/Test splits already exist.
