Notebook to generate adjacency matrices of our scripts in the juliet dataset to be used as input for our neural network model.

In [None]:
import ast
import pickle
import numpy as np
import pandas as pd
from scipy import sparse
import networkx as nx
from preprocess_code import *

In [None]:
data = pd.read_csv("../data/buffer_overflow_data.csv.gz")

In [None]:
with open("../data/adj.pickle",'rb') as f:
    adj = pickle.load(f)

In [None]:
labels = data.copy()
del labels['Unnamed: 0']
del labels['Unnamed: 0.1']
del labels['filename']
del labels['code']
del labels['flaw']
del labels['flaw_loc']
labels = labels.drop_duplicates().sort_values('testcase_ID').reset_index(drop=True)

In [None]:
adj = adj.rename(columns={0: 'testcase_ID', 1: 'matrix'})

In [None]:
adj_df = pd.merge(labels, adj, on='testcase_ID')

In [None]:
adj_df = adj_df[['testcase_ID', 'matrix', 'bug']]

In [None]:
adj_df['matrix_size'] = adj_df.matrix.apply(lambda x: x.shape[0])

In [None]:
matrix_size = 614
adj_df = adj_df[adj_df['matrix_size'] <= matrix_size]

In [None]:
dat = data[data.set_index(['testcase_ID']).index.isin(adj_df.set_index(['testcase_ID']).index)]

In [None]:
np.random.seed(1248)

In [None]:
def generate_edge_list1(testcase, **kwargs):
    """
    Takes in a list of files/datapoints from juliet.csv.zip 
    or (as loaded with pandas) matching one particular testcase, 
    and returns an edge list of its graph representation.
    """
    parse_list = [
        (datapoint.filename, datapoint.code)
        for datapoint in testcase.itertuples()
    ]

    primary = find_primary_source_file(testcase)

    # Parse the source code with clang, and get out an ast:
    index = clang.cindex.Index.create()
    translation_unit = index.parse(
        path=primary.filename,
        unsaved_files=parse_list,
    )
    ast_root = translation_unit.cursor

    # Memoise/concretise the ast so that we can consistently
    # modify it, then number each node in the tree uniquely.
    concretise_ast(ast_root)
    number_ast_nodes(ast_root)

    # Next, construct an edge list for the graph2vec input:
    edgelist = generate_edgelist(ast_root)
    
    edgelist_representation = {
        "edges": edgelist,
    }

    # Explicitly delete clang objects
    del translation_unit
    del ast_root
    del index

    return json.dumps(edgelist_representation)

In [None]:
# dask_data = dd.from_pandas(data, npartitions=20)

# generate the graphs for all the testcases in the dataset 

graphs = data.groupby(['testcase_ID']).apply(
        generate_edge_list1,
        axis='columns',
        meta=('generate_edge_list', 'unicode'),
    )

In [None]:
def gen_adj_matrix1(testcase):
    
    """
    Takes in a list of files/datapoints from buffer_overflow_data.csv.gz 
    matching one particular testcase, and generates an adjacency matrix 
    from the edgelist created.
    """
    
    # extracting the list of edges 

    x = testcase.split('edges": ')
    x = x[1].split('}')
    x = ast.literal_eval(x[0])
    
#     return x

    # generating the matrix
    
    G = nx.Graph()

    G.add_edges_from(x)

    A = nx.adjacency_matrix(G)

    B = A.todense()

    return B

In [16]:
# create a dataframe containing the testcase ID and its adjacency matrix 
adjacency_df = pd.DataFrame()

In [17]:
adjacency_df['testcase_ID'] = data.testcase_ID.drop_duplicates()

In [18]:
# kernel dies when there are more than 200 datapoints

# adj_matrices = graphs.apply(gen_adj_matrix1, meta = ('generate_adj_matrices', 'O'))
adj_matrices = graphs.apply(gen_adj_matrix1)

NameError: name 'graphs' is not defined

In [19]:
# adj_matrices = pd.DataFrame(adj_matrices)
adj_matrices = adj_matrices.to_frame()

NameError: name 'adj_matrices' is not defined

In [20]:
## TODO: in a DASK framework reset_index is not a recognized function like pandas, fix this bug

# adj_matrices = adj_matrices.compute()
adj_matrices = adj_matrices.reset_index(level='testcase_ID')

NameError: name 'adj_matrices' is not defined

In [12]:
adjacency_df['adj_matrix'] = adj_matrices[0]

In [13]:
adj_df = adjacency_df.dropna()

In [14]:
adj_df.to_csv("../data/adj_df.csv.gz")

## Feature Matrix

In [21]:
def concretise_ast(node):
    """
    Everytime you run .get_children() on a clang ast node, it
    gives you new objects. So if you want to modify those objects
    they will lose their changes everytime you walk the tree again.
    To avoid this problem, concretise_ast walks the tree once,
    saving the resulting list from .get_children() into a a concrete
    list inside the .children.
    You can then use .children to consistently walk over tree, and
    it will give you the same objects each time.
    """
    node.children = list(node.get_children())

    for child in node.children:
        counter = concretise_ast(child)

def number_ast_nodes(node, counter=1):
    """
    Given a concretised clang ast, assign each node with a unique
    numerical identifier. This will be accessible via the .identifier
    attribute of each node.
    """
    node.identifier = counter
    counter += 1

    node.children = list(node.get_children())
    for child in node.children:
        counter = number_ast_nodes(child, counter)

    return counter


def generate_ast_roots(testcase, **kwargs):
    """
    Takes in a list of files/datapoints from juliet.csv.zip (as loaded with pandas) matching one particular
    testcase, and preprocesses it ready for the feature matrix.
    """
    
    parse_list = [
        (datapoint.filename, datapoint.code)
        for datapoint in testcase.itertuples()
    ]

    primary = find_primary_source_file(testcase)

    # Parse the source code with clang, and get out an ast:
    index = clang.cindex.Index.create()
    translation_unit = index.parse(
        path=primary.filename,
        unsaved_files=parse_list,
    )
    ast_root = translation_unit.cursor
    
    concretise_ast(ast_root)
    number_ast_nodes(ast_root)
    
    return ast_root

In [None]:
# Sam's code

In [21]:
ast_roots = dat.groupby(['testcase_ID']).apply(generate_ast_roots)

In [None]:
# try some other code

In [None]:
ast_roots = dat.groupby(['testcase_ID']).apply(generate_ast_roots,axis='columns',
        meta=('generate_ast_roots', 'unicode'),)

In [None]:
for index, row in ast_roots.iteritems():
    import pdb;pdb.set_trace()
    print(row)
        # do matrix append maybe if kernel dies

ERROR:root:Internal Python error in the inspect module.
Below is the traceback from this internal error.

ERROR:root:Internal Python error in the inspect module.
Below is the traceback from this internal error.

Error in atexit._run_exitfuncs:
Traceback (most recent call last):
  File "/usr/lib/python3.7/shutil.py", line 501, in rmtree
    onerror(os.path.islink, path, sys.exc_info())
  File "/usr/lib/python3.7/shutil.py", line 499, in rmtree
    raise OSError("Cannot call rmtree on a symbolic link")
OSError: Cannot call rmtree on a symbolic link


In [259]:
# example_node = ast_roots.iloc[0].children[19]
# dir(example_node)

Getting the columns for the feature matrix:

In [None]:
def generate_colnames(ast_root):
    """
    Given a concretised & numbered clang ast, returns a set of node kinds to be used as columns in feature matrix
    """
    features =  set()


    def walk_tree_and_set_features(node):
        out_degree = len(node.children)
        in_degree = 1
        degree = out_degree + in_degree
        
        features.add(str(node.kind))


        for child in node.children:
            walk_tree_and_set_features(child)

    walk_tree_and_set_features(ast_root)

    return features


def generate_spelling(ast_root):
    """
    Given a concretised & numbered clang ast, returns a set of node spellings to be used later
    in constructing the columns in feature matrix
    """
    spelling =  set()


    def walk_tree_and_set_features(node):
        out_degree = len(node.children)
        in_degree = 1
        degree = out_degree + in_degree
        
        spelling.add(node.spelling)

        for child in node.children:
            walk_tree_and_set_features(child)

    walk_tree_and_set_features(ast_root)

    return spelling

Creating unique set of node kinds and node spellings

In [None]:
colnames = ast_roots.apply(generate_colnames)
spelling = ast_roots.apply(generate_spelling)

Obtaining a set of all final columns in feature matrix to be one-hot encoded

In [None]:
final_colnames = set()
final_colnames.update(['Identifier', 'WriteToPointer', 'SizeOf', 'Alloc'])
for i in range(len(colnames)):
    final_colnames.update(colnames.iloc[i])

Set of all node spellings to fish out the important ones

In [None]:
final_spelling = set()
for i in range(len(spelling)):
    final_spelling.update(spelling.iloc[i])

In [None]:
final_colnames = pd.Series(list(final_colnames))

In [None]:
final_colnames

In [427]:
# for i in range(len(final_colnames)):
#     final_colnames.iloc[i] = 'kind_' + final_colnames.iloc[i]

Manually pick out important node spellings 

In [273]:
# [feature for feature in final_spelling if 'Alloc' in feature]

In [None]:
alloc_list = ['__builtin_alloca', 
              '__alloc', 
              'malloc', 
              'valloc', 
              '__alloc_on_copy', 
              '__alloc_on_move', 
              'calloc', 
              'realloc', 
              'alloca',
              'ALLOCA'
             ]

sizeOf_list = ['std::aligned_storage<sizeof(_Tp), __alignof(_Tp)>'
              ]

writeToPointer_list = ['__builtin_memmove', 
                       '__builtin_memcpy', 
                       'wmempcpy', 
                       'wmemmove'
                      ]

In [None]:
final_df = pd.DataFrame(columns = final_colnames)

In [None]:
final_df

In [325]:
# final_df = pd.get_dummies(final_colnames)

Creating the feature matrix:

In [None]:
def generate_features_matrix(ast_root):
    """
    Given a concretised & numbered clang ast, returns a matrix of one hot encoded features of node names kind and 
    whether it's alloc/writeToPointer/sizeOf/other, i.e. our feature matrix
    """
    index = []
    kind = {}
    spelling = {}
    
    matrix_df = final_df.copy()

    def walk_tree_and_set_properties(node):
        out_degree = len(node.children)
        in_degree = 1
        degree = out_degree + in_degree
        
        index.append(node.identifier)
        
        kind[node.identifier] = node.kind
        spelling[node.identifier] = node.spelling
        
        if str(node.spelling) in writeToPointer_list:
            spelling[node.identifier] = 'WriteToPointer'
        
        elif str(node.spelling) in sizeOf_list:
            spelling[node.identifier] = 'SizeOf'
            
        elif str(node.spelling) in alloc_list:
            spelling[node.identifier] = 'Alloca'
        
        else:
            spelling[node.identifier] = ''
        

        for child in node.children:
            walk_tree_and_set_properties(child)

    walk_tree_and_set_properties(ast_root)
    
#     return index
    
    d = {'Identifier': index, 'kind': list(kind.values()), 'spelling': list(spelling.values())}
        
    ast_df = pd.DataFrame(data = d)
    ast_df = ast_df.set_index('Identifier')
    
    dum_df = pd.get_dummies(ast_df, prefix=['kind', 'spelling'])
    
    dum_df = dum_df.drop('spelling_', axis=1)
    
    for col in dum_df.filter(regex='kind_*').columns:
        dum_df = dum_df.rename(columns = {col: col.replace('kind_', '')})
    
    matrix_df['Identifier'] = range(1,len(dum_df)+1)
    matrix_df = matrix_df.set_index('Identifier')
    matrix_df = matrix_df.fillna(0)
    
#     df_merge_col = pd.merge(dum_df, matrix_df, on='Identifier')
    for i in matrix_df.columns:
        if i not in dum_df.columns:
            dum_df[i]=0
# #     for col in ['SizeOf', 'Alloc', 'WriteToPointer']:
# #         if df_merge_col[col].isna().any():
# #             df_merge_col[col].fillna(0)
            
# #     df_merge_col = df_merge_col.dropna(axis='columns')
    
#     df_merge_col = df_merge_col.set_index('Identifier')
    
#     return df_merge_col

    return dum_df.values

In [None]:
eg = generate_features_matrix(ast_roots.iloc[0])

In [None]:
eg

In [59]:
mat = ast_roots.apply(generate_features_matrix)

In [60]:
mat

testcase_ID
62550    [[1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...
62562    [[1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...
62563    [[1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...
62564    [[1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...
62565    [[1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...
62566    [[1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...
62567    [[1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...
62568    [[1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...
62569    [[1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...
62570    [[1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...
62571    [[1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...
62572    [[1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...
62573    [[1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...
62574    [[1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...
62575    [[1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...
62576    [[1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...
62577    [[1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 

In [61]:
mat.iloc[1]

array([[1, 0, 0, ..., 0, 0, 0],
       [0, 1, 0, ..., 0, 0, 0],
       [0, 0, 1, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]])