Notebook to generate adjacency matrices of our scripts in the juliet dataset to be used as input for our neural network model.

In [253]:
import ast
import pickle
import numpy as np
import pandas as pd
from scipy import sparse
import networkx as nx
from preprocess_code import *

In [255]:
data = pd.read_csv("../data/buffer_overflow_data.csv.gz")

In [256]:
data = data.iloc[0:500]
# data = data.iloc[0:100]

In [4]:
def generate_edge_list1(testcase, **kwargs):
    """
    Takes in a list of files/datapoints from juliet.csv.zip 
    or (as loaded with pandas) matching one particular testcase, 
    and returns an edge list of its graph representation.
    """
    parse_list = [
        (datapoint.filename, datapoint.code)
        for datapoint in testcase.itertuples()
    ]

    primary = find_primary_source_file(testcase)

    # Parse the source code with clang, and get out an ast:
    index = clang.cindex.Index.create()
    translation_unit = index.parse(
        path=primary.filename,
        unsaved_files=parse_list,
    )
    ast_root = translation_unit.cursor

    # Memoise/concretise the ast so that we can consistently
    # modify it, then number each node in the tree uniquely.
    concretise_ast(ast_root)
    number_ast_nodes(ast_root)

    # Next, construct an edge list for the graph2vec input:
    edgelist = generate_edgelist(ast_root)
    
    edgelist_representation = {
        "edges": edgelist,
    }

    # Explicitly delete clang objects
    del translation_unit
    del ast_root
    del index

    return json.dumps(edgelist_representation)

In [None]:
# dask_data = dd.from_pandas(data, npartitions=20)

# generate the graphs for all the testcases in the dataset 

graphs = data.groupby(['testcase_ID']).apply(
        generate_edge_list1,
        axis='columns',
        meta=('generate_edge_list', 'unicode'),
    )

NameError: name 'generate_edge_list1' is not defined

In [6]:
def gen_adj_matrix1(testcase):
    
    """
    Takes in a list of files/datapoints from buffer_overflow_data.csv.gz 
    matching one particular testcase, and generates an adjacency matrix 
    from the edgelist created.
    """
    
    # extracting the list of edges 

    x = testcase.split('edges": ')
    x = x[1].split('}')
    x = ast.literal_eval(x[0])
    
#     return x

    # generating the matrix
    
    G = nx.Graph()

    G.add_edges_from(x)

    A = nx.adjacency_matrix(G)

    B = A.todense()

    return B

In [7]:
# create a dataframe containing the testcase ID and its adjacency matrix 
adjacency_df = pd.DataFrame()

In [8]:
adjacency_df['testcase_ID'] = data.testcase_ID.drop_duplicates()

In [9]:
# kernel dies when there are more than 200 datapoints

# adj_matrices = graphs.apply(gen_adj_matrix1, meta = ('generate_adj_matrices', 'O'))
adj_matrices = graphs.apply(gen_adj_matrix1)

In [10]:
# adj_matrices = pd.DataFrame(adj_matrices)
adj_matrices = adj_matrices.to_frame()

In [11]:
## TODO: in a DASK framework reset_index is not a recognized function like pandas, fix this bug

# adj_matrices = adj_matrices.compute()
adj_matrices = adj_matrices.reset_index(level='testcase_ID')

In [12]:
adjacency_df['adj_matrix'] = adj_matrices[0]

In [13]:
adj_df = adjacency_df.dropna()

In [14]:
adj_df.to_csv("../data/adj_df.csv.gz")

## Feature Matrix

In [463]:
def concretise_ast(node):
    """
    Everytime you run .get_children() on a clang ast node, it
    gives you new objects. So if you want to modify those objects
    they will lose their changes everytime you walk the tree again.
    To avoid this problem, concretise_ast walks the tree once,
    saving the resulting list from .get_children() into a a concrete
    list inside the .children.
    You can then use .children to consistently walk over tree, and
    it will give you the same objects each time.
    """
    node.children = list(node.get_children())

    for child in node.children:
        counter = concretise_ast(child)

def number_ast_nodes(node, counter=1):
    """
    Given a concretised clang ast, assign each node with a unique
    numerical identifier. This will be accessible via the .identifier
    attribute of each node.
    """
    node.identifier = counter
    counter += 1

    node.children = list(node.get_children())
    for child in node.children:
        counter = number_ast_nodes(child, counter)

    return counter


def generate_ast_roots(testcase, **kwargs):
    """
    Takes in a list of files/datapoints from juliet.csv.zip (as loaded with pandas) matching one particular
    testcase, and preprocesses it ready for the feature matrix.
    """
    
    parse_list = [
        (datapoint.filename, datapoint.code)
        for datapoint in testcase.itertuples()
    ]

    primary = find_primary_source_file(testcase)

    # Parse the source code with clang, and get out an ast:
    index = clang.cindex.Index.create()
    translation_unit = index.parse(
        path=primary.filename,
        unsaved_files=parse_list,
    )
    ast_root = translation_unit.cursor
    
    concretise_ast(ast_root)
    number_ast_nodes(ast_root)
    
    return ast_root

In [464]:
ast_roots = data.groupby(['testcase_ID']).apply(generate_ast_roots)

In [259]:
# example_node = ast_roots.iloc[0].children[19]
# dir(example_node)

Getting the columns for the feature matrix:

In [465]:
def generate_colnames(ast_root):
    """
    Given a concretised & numbered clang ast, returns a set of node kinds to be used as columns in feature matrix
    """
    features =  set()


    def walk_tree_and_set_features(node):
        out_degree = len(node.children)
        in_degree = 1
        degree = out_degree + in_degree
        
        features.add(str(node.kind))


        for child in node.children:
            walk_tree_and_set_features(child)

    walk_tree_and_set_features(ast_root)

    return features


def generate_spelling(ast_root):
    """
    Given a concretised & numbered clang ast, returns a set of node spellings to be used later
    in constructing the columns in feature matrix
    """
    spelling =  set()


    def walk_tree_and_set_features(node):
        out_degree = len(node.children)
        in_degree = 1
        degree = out_degree + in_degree
        
        spelling.add(node.spelling)

        for child in node.children:
            walk_tree_and_set_features(child)

    walk_tree_and_set_features(ast_root)

    return spelling

Creating unique set of node kinds and node spellings

In [466]:
colnames = ast_roots.apply(generate_colnames)
spelling = ast_roots.apply(generate_spelling)

Obtaining a set of all final columns in feature matrix to be one-hot encoded

In [467]:
final_colnames = set()
final_colnames.update(['Identifier', 'WriteToPointer', 'SizeOf', 'Alloc'])
for i in range(len(feature_sets)):
    final_colnames.update(feature_sets.iloc[i])

Set of all node spellings to fish out the important ones

In [468]:
final_spelling = set()
for i in range(len(spelling)):
    final_spelling.update(spelling.iloc[i])

In [469]:
final_colnames = pd.Series(list(final_colnames))

In [427]:
# for i in range(len(final_colnames)):
#     final_colnames.iloc[i] = 'kind_' + final_colnames.iloc[i]

Manually pick out important node spellings 

In [273]:
# [feature for feature in final_spelling if 'Alloc' in feature]

In [470]:
alloc_list = ['__builtin_alloca', 
              '__alloc', 
              'malloc', 
              'valloc', 
              '__alloc_on_copy', 
              '__alloc_on_move', 
              'calloc', 
              'realloc', 
              'alloca',
              'ALLOCA'
             ]

sizeOf_list = ['std::aligned_storage<sizeof(_Tp), __alignof(_Tp)>'
              ]

writeToPointer_list = ['__builtin_memmove', 
                       '__builtin_memcpy', 
                       'wmempcpy', 
                       'wmemmove'
                      ]

In [471]:
final_df = pd.DataFrame(columns = final_colnames)

In [472]:
final_df

Unnamed: 0,CursorKind.PACK_EXPANSION_EXPR,CursorKind.DECL_STMT,CursorKind.IF_STMT,CursorKind.TYPEDEF_DECL,CursorKind.CXX_STATIC_CAST_EXPR,CursorKind.CXX_NULL_PTR_LITERAL_EXPR,CursorKind.UNEXPOSED_EXPR,CursorKind.OVERLOADED_DECL_REF,CursorKind.CLASS_TEMPLATE,CursorKind.STRING_LITERAL,...,Identifier,CursorKind.NAMESPACE_REF,SizeOf,CursorKind.SIZE_OF_PACK_EXPR,CursorKind.MEMBER_REF,CursorKind.INIT_LIST_EXPR,CursorKind.LAMBDA_EXPR,CursorKind.CXX_TYPEID_EXPR,CursorKind.CONST_ATTR,CursorKind.COMPOUND_ASSIGNMENT_OPERATOR


In [325]:
# final_df = pd.get_dummies(final_colnames)

Creating the feature matrix:

In [499]:
def generate_features_matrix(ast_root, final_df):
    """
    Given a concretised & numbered clang ast, returns a matrix of one hot encoded features of node names kind and 
    whether it's alloc/writeToPointer/sizeOf/other, i.e. our feature matrix
    """
    index = []
    kind = {}
    spelling = {}
    
    matrix_df = final_df.copy()

    def walk_tree_and_set_properties(node):
        out_degree = len(node.children)
        in_degree = 1
        degree = out_degree + in_degree
        
        index.append(node.identifier)
        
        kind[node.identifier] = node.kind
        spelling[node.identifier] = node.spelling
        
        if str(node.spelling) in writeToPointer_list:
            spelling[node.identifier] = 'WriteToPointer'
        
        elif str(node.spelling) in sizeOf_list:
            spelling[node.identifier] = 'SizeOf'
            
        elif str(node.spelling) in alloc_list:
            spelling[node.identifier] = 'Alloca'
        
        else:
            spelling[node.identifier] = ''
        

        for child in node.children:
            walk_tree_and_set_properties(child)

    walk_tree_and_set_properties(ast_root)
    
#     return index
    
    d = {'Identifier': index, 'kind': list(kind.values()), 'spelling': list(spelling.values())}
        
    ast_df = pd.DataFrame(data = d)
    ast_df = ast_df.set_index('Identifier')
    
    dum_df = pd.get_dummies(ast_df, prefix=['kind', 'spelling'])
    
    dum_df = dum_df.drop('spelling_', axis=1)
    
    for col in dum_df.filter(regex='kind_*').columns:
        dum_df = dum_df.rename(columns = {col: col.replace('kind_', '')})
    
    matrix_df['Identifier'] = range(1,len(dum_df)+1)
    
    df_merge_col = pd.merge(dum_df, matrix_df, on='Identifier')
    
#     TODO: fix here need to drop the NA columns besides the ['SizeOf', 'WriteToPointer', 'Alloc']
    
    
#     for col in df_merge_col.columns:
#         if df_merge_col[col].isnull().any() and df_merge_col[col] not in ['SizeOf', 'WriteToPointer', 'Alloc']:
#             df_merge_col[col] = df_merge_col.dropna(axis=1, how='all')
    
    df_merge_col = df_merge_col.set_index('Identifier')
    
    return df_merge_col

#     return dum_df.values

In [500]:
eg = generate_features_matrix(ast_roots.iloc[0], final_df)

ValueError: The truth value of a Series is ambiguous. Use a.empty, a.bool(), a.item(), a.any() or a.all().

In [498]:
eg.isnull().any()

CursorKind.TRANSLATION_UNIT_x                       False
CursorKind.TYPEDEF_DECL_x                           False
CursorKind.TYPE_REF_x                               False
CursorKind.STRUCT_DECL_x                            False
CursorKind.FIELD_DECL_x                             False
CursorKind.INTEGER_LITERAL_x                        False
CursorKind.UNEXPOSED_ATTR_x                         False
CursorKind.FUNCTION_DECL_x                          False
CursorKind.PARM_DECL_x                              False
CursorKind.COMPOUND_STMT_x                          False
CursorKind.RETURN_STMT_x                            False
CursorKind.PAREN_EXPR_x                             False
CursorKind.CSTYLE_CAST_EXPR_x                       False
CursorKind.BINARY_OPERATOR_x                        False
CursorKind.UNEXPOSED_EXPR_x                         False
CursorKind.DECL_REF_EXPR_x                          False
CursorKind.CXX_UNARY_EXPR_x                         False
CursorKind.UNI

In [495]:
eg['SizeOf']

Identifier
1       0
2       0
3       0
4       0
5       0
6       0
7       0
8       0
9       0
10      0
11      0
12      0
13      0
14      0
15      0
16      0
17      0
18      0
19      0
20      0
21      0
22      0
23      0
24      0
25      0
26      0
27      0
28      0
29      0
30      0
       ..
2212    0
2213    0
2214    0
2215    0
2216    0
2217    0
2218    0
2219    0
2220    0
2221    0
2222    0
2223    0
2224    0
2225    0
2226    0
2227    0
2228    0
2229    0
2230    0
2231    0
2232    0
2233    0
2234    0
2235    0
2236    0
2237    0
2238    0
2239    0
2240    0
2241    0
Name: SizeOf, Length: 2241, dtype: int64

In [489]:
eg.filter(regex='CursorKind*').dropna(axis=1, how='all')

Unnamed: 0_level_0,CursorKind.TRANSLATION_UNIT_x,CursorKind.TYPEDEF_DECL_x,CursorKind.TYPE_REF_x,CursorKind.STRUCT_DECL_x,CursorKind.FIELD_DECL_x,CursorKind.INTEGER_LITERAL_x,CursorKind.UNEXPOSED_ATTR_x,CursorKind.FUNCTION_DECL_x,CursorKind.PARM_DECL_x,CursorKind.COMPOUND_STMT_x,...,CursorKind.INIT_LIST_EXPR_x,CursorKind.IF_STMT_x,CursorKind.ARRAY_SUBSCRIPT_EXPR_x,CursorKind.FOR_STMT_x,CursorKind.CALL_EXPR_x,CursorKind.STRING_LITERAL_x,CursorKind.DO_STMT_x,CursorKind.BREAK_STMT_x,CursorKind.MEMBER_REF_EXPR_x,CursorKind.CHARACTER_LITERAL_x
Identifier,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
6,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
7,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
8,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
9,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
10,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [370]:
def attempt(ast_root, final_df):
    """
    Given a concretised & numbered clang ast, returns a matrix of one hot encoded features of node names kind and 
    whether it's alloc/writeToPointer/sizeOf/other, i.e. our feature matrix
    """
    matrix_df = pd.DataFrame([len(final_df.columns)*[0]], columns=final_df.columns)
    
    def walk_tree_and_set_properties(node):
        out_degree = len(node.children)
        in_degree = 1
        degree = out_degree + in_degree
        
        index = node.identifier
        
        matrix_df['Identifier'].iloc[index] = node.identifier
        
        for colname in matrix_df.columns:
            if node.kind == colname:
                matrix_df[colname].iloc[index] = 1
#             else:
#                 matrix_df[colname].iloc[int(node.identifier)] = 0
                
        if node.spelling in writeToPointer_list:
            matrix_df['WriteToPointer'].iloc[index] = 1
        elif node.spelling in sizeOf_list:
            matrix_df['SizeOf'].iloc[index] = 1
        elif node.spelling in alloc_list:
            matrix_df['Alloc'].iloc[index] = 1
        
        for child in node.children:
            walk_tree_and_set_properties(child)

    walk_tree_and_set_properties(ast_root)
        
    matrix_df = matrix_df.set_index('Identifier')

    return matrix_df.values

In [316]:
ast_roots.iloc[0]

<clang.cindex.Cursor at 0x7fc9912ae0d0>

In [371]:
attempt(ast_roots.iloc[0], final_df)

IndexError: single positional indexer is out-of-bounds

In [318]:
mat = ast_roots.apply(generate_features_matrix)

In [378]:
generate_features_matrix(ast_roots.iloc[0])

Unnamed: 0_level_0,kind_CursorKind.TRANSLATION_UNIT,kind_CursorKind.TYPEDEF_DECL,kind_CursorKind.TYPE_REF,kind_CursorKind.STRUCT_DECL,kind_CursorKind.FIELD_DECL,kind_CursorKind.INTEGER_LITERAL,kind_CursorKind.UNEXPOSED_ATTR,kind_CursorKind.FUNCTION_DECL,kind_CursorKind.PARM_DECL,kind_CursorKind.COMPOUND_STMT,...,kind_CursorKind.INIT_LIST_EXPR,kind_CursorKind.IF_STMT,kind_CursorKind.ARRAY_SUBSCRIPT_EXPR,kind_CursorKind.FOR_STMT,kind_CursorKind.CALL_EXPR,kind_CursorKind.STRING_LITERAL,kind_CursorKind.DO_STMT,kind_CursorKind.BREAK_STMT,kind_CursorKind.MEMBER_REF_EXPR,kind_CursorKind.CHARACTER_LITERAL
identifier,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
6,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
7,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
8,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
9,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
10,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [288]:
mat.iloc[1]

array([[1, 0, 0, ..., 0, 0, 1],
       [0, 1, 0, ..., 0, 0, 1],
       [0, 1, 0, ..., 0, 0, 1],
       ...,
       [0, 0, 0, ..., 0, 0, 1],
       [0, 0, 0, ..., 0, 0, 1],
       [0, 0, 0, ..., 0, 0, 1]], dtype=uint8)

Sam: I think in order to get a dataframe of the feature matrices where the matrices are all the same size we need to append each testcase_ID in ast_roots to an exisiting dataframe that has all the 'node.kind' names as well as 'alloc', 'writeToPointer', 'sizeOf' for columns which already exists under final_colnames. To convert this into a matrix just use .values to each dataframe and store this into a 'master' dataframe with all the matrices. This still needs to be applied to all the datapoints, as I have currently done this with 500 datapoints. 