This notebook generates adjacency matrices for the source-code test cases in the Juliet dataset, to be used as input for our neural network model.

In [1]:
import ast
import pickle
import numpy as np
import pandas as pd
from scipy import sparse
import networkx as nx

In [3]:
# NOTE(review): this wildcard import is presumably what brings `clang`,
# `find_primary_source_file` and `generate_edgelist` (and possibly `json`) into
# scope for the cells below — TODO confirm and replace with explicit imports.
from preprocess_code import *
# Load the gzip-compressed CSV of datapoints; later cells read its
# `testcase_ID`, `filename` and `code` columns.
data = pd.read_csv("../data/buffer_overflow_data.csv.gz")



In [8]:
# Point the clang Python bindings at the system libclang shared library.
# set_library_file() raises once libclang has already been loaded, so the call
# is guarded to keep this cell safe to re-run in a live kernel.
if not clang.cindex.Config.loaded:
    clang.cindex.Config.set_library_file('/lib/x86_64-linux-gnu/libclang-8.so.1')

In [4]:
# Work on only the first 100 rows of the dataset.
# NOTE(review): this slices by row, not by testcase, so the last testcase in
# the subset may be missing some of its files — confirm this is acceptable.
data = data.iloc[0:100]

In [4]:
def generate_edge_list1(testcase, **kwargs):
    """
    Build the JSON-encoded edge list of one testcase's AST.

    Takes in the files/datapoints from juliet.csv.zip (as loaded with
    pandas) matching one particular testcase, parses them with clang and
    returns an edge list of the resulting graph representation.

    Parameters
    ----------
    testcase : pandas.DataFrame
        Rows belonging to a single testcase. Every row must expose
        ``filename`` and ``code`` attributes through ``itertuples()``.
    **kwargs
        Accepted and ignored, so that the function can be handed to
        ``groupby(...).apply`` together with extra keyword arguments.

    Returns
    -------
    str
        A JSON document of the form ``{"edges": <edge list>}``.
    """
    # The notebook never imports `json` itself, so import it here instead of
    # relying on the wildcard import happening to expose it.
    import json

    # (filename, source code) pairs handed to clang as in-memory files.
    parse_list = [
        (datapoint.filename, datapoint.code)
        for datapoint in testcase.itertuples()
    ]

    primary = find_primary_source_file(testcase)

    # Parse the source code with clang, and get out an ast:
    index = clang.cindex.Index.create()
    translation_unit = index.parse(
        path=primary.filename,
        unsaved_files=parse_list,
    )
    ast_root = translation_unit.cursor

    # Memoise/concretise the ast so that we can consistently
    # modify it, then number each node in the tree uniquely.
    concretise_ast(ast_root)
    number_ast_nodes(ast_root)

    # Next, construct an edge list for the graph2vec input:
    edgelist = generate_edgelist(ast_root)

    edgelist_representation = {
        "edges": edgelist,
    }

    # Explicitly delete clang objects
    del translation_unit
    del ast_root
    del index

    return json.dumps(edgelist_representation)

In [None]:
# Generate the JSON edge-list string for every testcase in the dataset.
#
# `data` is a plain pandas DataFrame here, so no dask-style `meta`/`axis`
# hints are passed: pandas would only forward them into the function's
# ignored **kwargs. (If this is ever moved to a dask DataFrame, `meta` has to
# be supplied again.)
graphs = data.groupby(['testcase_ID']).apply(generate_edge_list1)

NameError: name 'generate_edge_list1' is not defined

In [6]:
def gen_adj_matrix1(testcase):
    """
    Turn the JSON edge list of one testcase into a dense adjacency matrix.

    Parameters
    ----------
    testcase : str
        JSON document containing an ``"edges"`` entry that holds a list of
        node pairs, as produced from the datapoints of
        buffer_overflow_data.csv.gz for one particular testcase.

    Returns
    -------
    The dense form (``.todense()``) of the adjacency matrix that networkx
    builds for the undirected graph described by the edge list.

    Raises
    ------
    json.JSONDecodeError
        If ``testcase`` is not valid JSON.
    KeyError
        If the document has no ``"edges"`` entry.
    """
    import json

    # Decode the document properly instead of slicing the text around
    # 'edges": ' and '}', which breaks as soon as the JSON contains another
    # key, a nested object or a non-Python literal such as null/true.
    edges = json.loads(testcase)["edges"]

    # generating the matrix
    G = nx.Graph()
    G.add_edges_from(edges)

    # NOTE: the dense matrix grows quadratically with the number of nodes.
    return nx.adjacency_matrix(G).todense()

In [7]:
# Start from an empty frame; later cells fill it with one row per testcase
# holding the testcase ID and its adjacency matrix.
adjacency_df = pd.DataFrame()

In [8]:
# One row per distinct testcase ID. drop_duplicates() keeps the row labels of
# `data`, so this frame is NOT labelled 0..n-1.
adjacency_df['testcase_ID'] = data.testcase_ID.drop_duplicates()

In [9]:
# NOTE: the kernel dies when there are more than 200 datapoints
# (presumably because every matrix is materialised densely — TODO confirm).

# Convert each testcase's JSON edge list into its adjacency matrix.
# (A dask run would additionally pass meta=('generate_adj_matrices', 'O').)
adj_matrices = graphs.apply(gen_adj_matrix1)

In [10]:
# Turn the Series of matrices into a single-column DataFrame; a later cell
# reads that column back as `adj_matrices[0]`.
adj_matrices = adj_matrices.to_frame()

In [11]:
# TODO: under dask, reset_index() does not behave like the pandas version used
# here; the dask path (calling .compute() first) still needs fixing.

# Move `testcase_ID` out of the index into an ordinary column.
adj_matrices = adj_matrices.reset_index(level='testcase_ID')

In [12]:
# Attach each matrix to its testcase by ID rather than by row label: after
# reset_index() `adj_matrices` is labelled 0..n-1, whereas `adjacency_df`
# still carries the row labels of `data`, so a plain column assignment would
# pair rows up through unrelated labels (or leave NaN behind).
adjacency_df['adj_matrix'] = adjacency_df['testcase_ID'].map(
    adj_matrices.set_index('testcase_ID')[0]
)

In [13]:
# Keep only the rows without missing values, i.e. testcases that actually
# received an adjacency matrix.
adj_df = adjacency_df.dropna()

In [14]:
# Persist the result as a gzip-compressed CSV.
# NOTE(review): the matrices are written through their string form, which
# presumably cannot be loaded back losslessly — confirm, or switch to a
# binary format.
adj_df.to_csv("../data/adj_df.csv.gz")

## Feature Matrix

In [9]:
def concretise_ast(node):
    """
    Freeze the children of every node of a clang ast into plain lists.

    Every time you run .get_children() on a clang ast node, it gives you
    new objects, so modifications made to those objects are lost the next
    time the tree is walked. To avoid this, concretise_ast walks the tree
    once and stores the result of .get_children() as a concrete list in the
    .children attribute of each node. Walking the tree through .children
    afterwards yields the same objects every time.

    Parameters
    ----------
    node
        Root of the (sub)tree to concretise; must provide .get_children().

    Returns
    -------
    None
        The tree is modified in place.
    """
    node.children = list(node.get_children())

    for child in node.children:
        concretise_ast(child)

def number_ast_nodes(node, counter=1):
    """
    Given a concretised clang ast, assign each node a unique numerical
    identifier in pre-order. The number is accessible via the .identifier
    attribute of each node.

    Parameters
    ----------
    node
        Root of the (sub)tree to number.
    counter : int, optional
        Identifier given to ``node``; its descendants get the following
        numbers.

    Returns
    -------
    int
        The next unused identifier.
    """
    node.identifier = counter
    counter += 1

    # Reuse the children stored by concretise_ast: asking clang again would
    # hand back brand-new objects and throw the concretised ones away. Only
    # fall back to .get_children() for a node that was never concretised.
    if getattr(node, "children", None) is None:
        node.children = list(node.get_children())

    for child in node.children:
        counter = number_ast_nodes(child, counter)

    return counter


def generate_ast_roots(testcase, **kwargs):
    """
    Parse one testcase with clang and hand back its prepared AST root.

    ``testcase`` holds the files/datapoints from juliet.csv.zip (as loaded
    with pandas) that belong to a single testcase. The returned root cursor
    has been concretised and numbered, ready for building the feature
    matrix. Extra keyword arguments are accepted and ignored.
    """
    # In-memory (filename, source) pairs for clang.
    unsaved = []
    for row in testcase.itertuples():
        unsaved.append((row.filename, row.code))

    main_file = find_primary_source_file(testcase)

    # Run clang over the primary file and take the root cursor of the result.
    clang_index = clang.cindex.Index.create()
    unit = clang_index.parse(path=main_file.filename, unsaved_files=unsaved)
    root = unit.cursor

    # Freeze the children lists, then give every node its identifier.
    concretise_ast(root)
    number_ast_nodes(root)

    return root

In [10]:
# Parse every testcase into a concretised, numbered clang AST root
# (one entry per testcase_ID).
ast_roots = data.groupby(['testcase_ID']).apply(generate_ast_roots)

In [32]:
# Exploratory: take one child of the first testcase's AST root and list the
# attributes it offers. NOTE(review): index 19 looks arbitrary — confirm.
example_node = ast_roots.iloc[0].children[19]
dir(example_node)

['__class__',
 '__ctypes_from_outparam__',
 '__delattr__',
 '__dict__',
 '__dir__',
 '__doc__',
 '__eq__',
 '__format__',
 '__ge__',
 '__getattribute__',
 '__gt__',
 '__hash__',
 '__init__',
 '__init_subclass__',
 '__le__',
 '__lt__',
 '__module__',
 '__ne__',
 '__new__',
 '__reduce__',
 '__reduce_ex__',
 '__repr__',
 '__setattr__',
 '__setstate__',
 '__sizeof__',
 '__str__',
 '__subclasshook__',
 '__weakref__',
 '_b_base_',
 '_b_needsfree_',
 '_displayname',
 '_fields_',
 '_kind_id',
 '_objects',
 '_tu',
 'access_specifier',
 'availability',
 'brief_comment',
 'canonical',
 'children',
 'data',
 'displayname',
 'enum_type',
 'enum_value',
 'exception_specification_kind',
 'extent',
 'from_cursor_result',
 'from_location',
 'from_result',
 'get_arguments',
 'get_bitfield_width',
 'get_children',
 'get_definition',
 'get_field_offsetof',
 'get_included_file',
 'get_num_template_arguments',
 'get_template_argument_kind',
 'get_template_argument_type',
 'get_template_argument_unsigned_val

In [46]:
# Exploratory: look at the `brief_comment` attribute of the example node.
example_node.brief_comment

In [15]:
def generate_features(ast_root):
    """
    Given a concretised & numbered clang ast, return a dictionary of
    features in the form:
        {
            <node_id>: [<kind>, <displayname>],
            ...
        }

    Parameters
    ----------
    ast_root
        Root node of the tree. Every node must provide the attributes
        .identifier, .kind, .displayname and .children.

    Returns
    -------
    dict
        Maps each node's identifier to ``[str(node.kind), node.displayname]``;
        entries are inserted in pre-order.
    """
    features = {}

    def walk_tree_and_set_features(node):
        # Record this node, then descend into its children.
        features[node.identifier] = [str(node.kind), node.displayname]

        for child in node.children:
            walk_tree_and_set_features(child)

    walk_tree_and_set_features(ast_root)

    return features

In [16]:
# Build the {node_id: [kind, displayname]} feature dictionary for the AST of
# every testcase.
feature_sets = ast_roots.apply(generate_features)

In [20]:
# Peek at the feature dictionary of the first testcase.
feature_sets.iloc[0]

{1: ['CursorKind.TRANSLATION_UNIT',
  '000/062/516/CWE121_Stack_Based_Buffer_Overflow__CWE129_connect_socket_01.c'],
 2: ['CursorKind.TYPEDEF_DECL', '__u_char'],
 3: ['CursorKind.TYPEDEF_DECL', '__u_short'],
 4: ['CursorKind.TYPEDEF_DECL', '__u_int'],
 5: ['CursorKind.TYPEDEF_DECL', '__u_long'],
 6: ['CursorKind.TYPEDEF_DECL', '__int8_t'],
 7: ['CursorKind.TYPEDEF_DECL', '__uint8_t'],
 8: ['CursorKind.TYPEDEF_DECL', '__int16_t'],
 9: ['CursorKind.TYPEDEF_DECL', '__uint16_t'],
 10: ['CursorKind.TYPEDEF_DECL', '__int32_t'],
 11: ['CursorKind.TYPEDEF_DECL', '__uint32_t'],
 12: ['CursorKind.TYPEDEF_DECL', '__int64_t'],
 13: ['CursorKind.TYPEDEF_DECL', '__uint64_t'],
 14: ['CursorKind.TYPEDEF_DECL', '__int_least8_t'],
 15: ['CursorKind.TYPE_REF', '__int8_t'],
 16: ['CursorKind.TYPEDEF_DECL', '__uint_least8_t'],
 17: ['CursorKind.TYPE_REF', '__uint8_t'],
 18: ['CursorKind.TYPEDEF_DECL', '__int_least16_t'],
 19: ['CursorKind.TYPE_REF', '__int16_t'],
 20: ['CursorKind.TYPEDEF_DECL', '__uint_le

In [42]:
# Collect the distinct node kinds seen across all testcases.
# Each entry of `feature_sets` is a dict {node_id: [kind, displayname]};
# handing the dict itself to set.update() would add its keys (the node ids)
# rather than the features, so the kinds are pulled out explicitly.
final_features = set()
for features in feature_sets:
    final_features.update(kind for kind, _displayname in features.values())

AttributeError: 'Series' object has no attribute 'children'

In [None]:
# Turn the collected set into a Series. Sets have no stable iteration order,
# so sort first to get the same row order on every run.
final_features = pd.Series(sorted(final_features))

In [None]:
# One-hot encode the collected values: one indicator column per distinct value.
pd.get_dummies(final_features)