### Data Preprocessing
Convert a dataset generated by _process_line_level_data.py_ from a _per-function_ format to a _per-line_ format.

Each resulting data entry consists of the following columns:
`func_id`, `line`, `vul`, `idx_in_func`.

The complete function code can be retrieved by joining on `func_id`.
Including the whole function code in every line entry is not feasible due to memory considerations (the size would increase by a factor of ~3.5).

In [1]:
# Source CSV; it has to be in the layout produced by process_line_level_data.py.
input_file = './big-vul_dataset/processed_data.csv'

# Destination of the per-line dataset.
output_file_lines = './big-vul_dataset/line_all.csv'
# Destination of the function-code dataset; it is only written when
# save_functions_file is enabled.
output_file_functions = './big-vul_dataset/functions_only_all.csv'
save_functions_file = True

In [2]:
import pandas as pd
import numpy as np
import math
import csv

In [3]:
def strip_string(s):
    """Return ``s`` without its leading and trailing whitespace."""
    trimmed = s.strip()
    return trimmed

# split a function into its ordered, whitespace-trimmed lines
def get_func_lines(entry):
    """Return the lines of ``entry['processed_func']`` in their original order.

    The function code is split at every newline character and each resulting
    line is passed through ``strip_string``.
    """
    raw_lines = entry['processed_func'].split('\n')
    # TODO: Is stripping necessary when using CodeBert?
    return [strip_string(raw_line) for raw_line in raw_lines]

# return an ordered list of 0/1 labels indicating the vulnerability of each line
def get_line_vul_labels(entry):
    """Build one 0/1 vulnerability label per line of a function.

    Args:
        entry: Row-like mapping providing ``'lines'`` (the list of function
            lines) and ``'flaw_line_index'`` (the indices of the flawed
            lines). ``'flaw_line_index'`` may be a comma-separated string
            such as ``"3,7"``, a single number (pandas parses a lone index as
            a number whenever it infers the column type), or a missing value
            (None/NaN) for a function without flawed lines.

    Returns:
        A list as long as ``entry['lines']`` holding 1 at every flawed index
        and 0 everywhere else.

    Raises:
        IndexError: If a flaw index does not refer to an existing line.
        ValueError: If a non-blank token of the string is not an integer.
    """
    vul_indexes_raw = entry['flaw_line_index']
    if isinstance(vul_indexes_raw, str):
        # tolerate padding and stray separators, e.g. "3, 7,"
        flaw_indexes = [int(token) for token in vul_indexes_raw.split(',') if token.strip()]
    elif pd.isna(vul_indexes_raw):
        # None / NaN: the function has no flawed lines
        flaw_indexes = []
    else:
        # a lone index that was parsed numerically (5 or 5.0) must not be
        # mistaken for "no flaw"
        flaw_indexes = [int(vul_indexes_raw)]

    line_count = len(entry['lines'])
    # list of 0 ('False') for every line
    line_vul = [0] * line_count
    for i in flaw_indexes:
        # reject negatives explicitly: plain list indexing would wrap them
        # around and silently label the wrong line
        if not 0 <= i < line_count:
            raise IndexError(
                f"flaw line index {i} is out of range for a function with {line_count} lines"
            )
        line_vul[i] = 1

    return line_vul

# enumerate the positions of a function's lines (needed when "exploding")
def get_line_indices(entry):
    """Return ``[0, 1, ..., n - 1]`` where ``n`` is ``len(entry['lines'])``."""
    line_count = len(entry['lines'])
    return [*range(line_count)]

In [5]:
# load dataset
# only the columns needed for the per-line conversion are read
cols = ['index', 'processed_func', 'flaw_line_index']
df = pd.read_csv(
    input_file,
    skipinitialspace=True,
    usecols=cols,
    # Pin flaw_line_index to str: with low_memory=True the file is parsed in
    # chunks and the type is inferred per chunk, so a lone index such as "5"
    # could come back as the number 5.0 instead of a string and would then not
    # be recognised as a flaw index. Missing values are still read as NaN.
    dtype={'flaw_line_index': str},
    low_memory=True,
)

Unnamed: 0.1,Unnamed: 0,func_id,line,vul,idx_in_func
0,0,0,static bool check_rodc_critical_attribute(stru...,0,0
1,1,0,{,0,1


In [None]:
# derive the per-function list columns, one row at a time
# 'lines' has to exist first because the other two builders read it
df['lines'] = df.apply(get_func_lines, axis=1)
for column_name, row_builder in (
    ('vul', get_line_vul_labels),
    ('idx_in_func', get_line_indices),
):
    df[column_name] = df.apply(row_builder, axis=1)

In [None]:
# explode df to per-line format
# courtesy of https://stackoverflow.com/a/59330040
# rename() returns a new frame, so the column selection taken from df is never
# modified in place (doing that triggers pandas' SettingWithCopyWarning)
filtered_df = df[['index', 'lines', 'vul', 'idx_in_func']].rename(
    columns={'lines': 'line', 'index': 'func_id'}
)

# explode all list columns in lockstep: every function row becomes one row per
# line, each of them carrying the func_id of its function
per_line_df = filtered_df.set_index(['func_id']).apply(pd.Series.explode).reset_index()

In [None]:
# drop rows whose line is empty, as they add no valuable information
has_content = per_line_df['line'].str.len() > 0
per_line_df = per_line_df.loc[has_content]

In [None]:
# save resulting dataset
# index=False: the row index carries no information (it is not even contiguous
# after the empty lines were filtered out) and would otherwise be written as an
# extra unnamed column that shows up as 'Unnamed: 0' when the file is read back
per_line_df[['func_id', 'line', 'vul', 'idx_in_func']].to_csv(
    output_file_lines,
    encoding='utf-8',
    quoting=csv.QUOTE_ALL,
    index=False,
)

In [None]:
# export function-only dataset (func_id plus the complete function code)
if save_functions_file:
    # rename() returns a new frame instead of modifying a selection of df in
    # place, which avoids pandas' SettingWithCopyWarning
    func_only_df = df[['index', 'processed_func']].rename(columns={'index': 'func_id'})
    # index=False keeps the meaningless row index out of the file; func_id is
    # the key to join on
    func_only_df.to_csv(
        output_file_functions,
        encoding='utf-8',
        quoting=csv.QUOTE_ALL,
        index=False,
    )