### Data Preprocessing
Convert a dataset generated by _process_line_level_data.py_ from a _per-function_ format to a _per-line_ format.

The resulting data entries are comprised of the following columns:
`func_id`, `line`, `vul`, `prev_line`, `next_line`.

Access to the complete function code can be joined using `func_id`.
Including the whole function code per line entry is not feasible due to memory considerations (size increase by ~3.5).

In [1]:
# input must be in format output by process_line_level_data.py
input_file = './big-vul_dataset/processed_data.csv'

output_file_lines = './big-vul_dataset/line_all.csv'
# enable to also save function code to separate file
save_functions_file = True
output_file_functions = './big-vul_dataset/functions_only_all.csv'

# enable to remove all whitespaces from the parsed code
# can be useful since further processing (i.e. CodeBERT) might be sensitve to whitespaces 
remove_whitespaces = False
# enable to remove comments from generated lines of code
remove_comments = True
# lines matched with these regex patterns are marked as non-vulnerable
non_vul_lines = ["^{$", "^}$", "^\($", "^\)$", "^\[$", "^\]$"]

In [2]:
import pandas as pd
import numpy as np
import math
import csv
import re
pattern_block_comment = re.compile("(\/\*(.|\n)*?\*\/)")
pattern_line_comment = re.compile("(\/\/.*)")

In [7]:
def remove_comments(code):
    nl_count_before = code.count('\n')
    
    for match in re.finditer(pattern_block_comment, code):
        # find block comment and count new lines
        num_lines = match.group().count('\n')
        # replace block with blank lines
        code = re.sub(pattern_block_comment, '\n' * num_lines, code, 1)
    # replace line comments
    code = re.sub(pattern_line_comment, '', code)
    
    # ensure replacing did not fail
    assert nl_count_before == code.count('\n')
    
    return code

def strip_string(s):
    if remove_whitespaces:
        return ''.join(s.split())
    else:
        return s.strip() 

# return an ordered list of all function lines (split by '\n')
def get_func_lines(entry):
    # remove comments (while retaining lines)
    code = entry['processed_func']
    if remove_comments:
        code = remove_comments(code)

    lines = code.split('\n')
    return list(map(strip_string, lines))

# check if line fits any regex pattern for non-vulnerable lines
def is_line_non_vulnerable(line_str):
    try:
        if non_vul_lines is not None:
            # remove all lines containing matching regex
            for regex in non_vul_lines:
                if re.match(regex, line_str):
                    return True
    except NameError:
        pass # no handling required
    
    return False

# return an ordered list of boolean labels indicating the vulnerability of a line
def get_line_vul_labels(entry):
    vul_indexes_raw = entry['flaw_line_index']
    flaw_indexes = [int(i) for i in vul_indexes_raw.split(',')] if type(vul_indexes_raw) == str else []
    # list of 0 ('False') for every line
    line_vul = [0] * len(entry['line'])
    # for each line of function set value to 1 ('True')
    for i in flaw_indexes:
        line_vul[i] = 1
    
    # update vul-labels according to regex definition above
    for i in range(0, len(line_vul)):
        if is_line_non_vulnerable(entry['line'][i]):
            line_vul[i] = 0
    
    # remove entries for empty lines (will be removed)
    for i in reversed(range(0, len(line_vul))):
        if len(entry['line'][i]) == 0:
            line_vul.pop(i)
    
    return line_vul

def remove_empty_lines(entry):
    return [l for l in entry['line'] if len(l) > 0]

# return an ordered list of all previous lines
def get_previous_lines(entry):
    prev_lines = [""] + entry['line']
    # remove last element
    prev_lines.pop()
    return prev_lines

# return an ordered list of all previous lines
def get_next_lines(entry):
    next_lines = entry['line'] + [""]
    # remove element element
    next_lines.pop(0)
    return next_lines

# remove comments from function code
def clean_function_code(entry):
    func_code = entry['processed_func']
    comments_replaced = remove_comments(func_code)
    comments_removed = "\n".join([s for s in comments_replaced.split("\n") if s])
    return comments_removed

In [4]:
# load dataset
cols = ['index', 'processed_func', 'flaw_line_index', 'target']
df = pd.read_csv(input_file, skipinitialspace=True, usecols=cols, low_memory = True)

  df = pd.read_csv(input_file, skipinitialspace=True, usecols=cols, low_memory = True)


In [5]:
# apply transformations to every row
df['line'] = df.apply(get_func_lines, axis = 1)
df['vul'] = df.apply(get_line_vul_labels, axis = 1)
df['line'] = df.apply(remove_empty_lines, axis = 1)
df['prev_line'] = df.apply(get_previous_lines, axis = 1)
df['next_line'] = df.apply(get_next_lines, axis = 1)

In [6]:
# explode df to per-line format
# courtesy of https://stackoverflow.com/a/59330040
filtered_df = df[['index', 'line', 'vul', 'prev_line', 'next_line']]
filtered_df.rename(columns={'index': 'func_id'}, inplace=True)

per_line_df = filtered_df.set_index(['func_id']).apply(pd.Series.explode).reset_index()

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filtered_df.rename(columns={'index': 'func_id'}, inplace=True)


In [None]:
# save resulting dataset
per_line_df[['func_id', 'line', 'vul', 'prev_line', 'next_line']] \
.to_csv(output_file_lines, encoding='utf-8', quoting=csv.QUOTE_ALL)

In [8]:
# export function-only dataset
if save_functions_file:
    df['processed_func'] = df.apply(clean_function_code, axis = 1)
    func_only_df = df[['index', 'processed_func', 'target']]
    func_only_df.rename(columns={'index': 'func_id'}, inplace=True)
    func_only_df.to_csv(output_file_functions, encoding='utf-8', quoting=csv.QUOTE_ALL)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  func_only_df.rename(columns={'index': 'func_id'}, inplace=True)
