This notebook generates an adjacency matrix for each testcase in the Juliet dataset; the matrices are used as input to our neural network model.

In [None]:
import ast
import pickle
import numpy as np
import pandas as pd
from scipy import sparse
import networkx as nx

In [2]:
from preprocess_code import *
# Read the buffer-overflow dataset (a ".csv.gz" file) into a DataFrame.
data = pd.read_csv(filepath_or_buffer="../data/buffer_overflow_data.csv.gz")



In [None]:
# Restrict this run to the first 100 rows of the dataset.
data = data.head(100)

In [4]:
def generate_edge_list1(testcase, **kwargs):
    """
    Build the JSON-encoded edge list of one testcase's clang AST.

    Args:
        testcase: pandas DataFrame holding the rows (files) that belong to a
            single testcase; every row must expose ``filename`` and ``code``.
        **kwargs: accepted and ignored, so that extra keyword arguments
            forwarded by ``groupby(...).apply`` do not raise.

    Returns:
        A JSON string of the form ``{"edges": <edge list>}``, where the edge
        list is whatever ``generate_edgelist`` returns for the parsed AST.
    """
    # Imported locally: the notebook never imports json itself, so without
    # this the function only works if the star import happens to leak it.
    import json

    # clang receives the in-memory sources as (filename, contents) pairs.
    parse_list = [
        (datapoint.filename, datapoint.code)
        for datapoint in testcase.itertuples()
    ]

    primary = find_primary_source_file(testcase)

    # Parse the source code with clang, and get out an ast:
    index = clang.cindex.Index.create()
    translation_unit = index.parse(
        path=primary.filename,
        unsaved_files=parse_list,
    )
    ast_root = translation_unit.cursor

    # Memoise/concretise the ast so that we can consistently
    # modify it, then number each node in the tree uniquely.
    concretise_ast(ast_root)
    number_ast_nodes(ast_root)

    # Next, construct the edge list of the numbered tree:
    edgelist = generate_edgelist(ast_root)

    edgelist_representation = {
        "edges": edgelist,
    }

    # Explicitly drop our references to the clang objects before returning.
    del translation_unit
    del ast_root
    del index

    return json.dumps(edgelist_representation)

In [None]:
# Generate the graph (JSON edge list) for every testcase in the dataset.
#
# `data` is a plain pandas DataFrame here, so the dask-style `axis=` / `meta=`
# arguments are not passed: pandas would only forward them to
# generate_edge_list1, which ignores extra keyword arguments.
# (A dask version would start from `dd.from_pandas(data, npartitions=20)`.)

graphs = data.groupby(['testcase_ID']).apply(generate_edge_list1)

NameError: name 'generate_edge_list1' is not defined

In [6]:
def gen_adj_matrix1(testcase):
    """
    Build a dense adjacency matrix from one testcase's serialised edge list.

    Args:
        testcase: string of the form ``{"edges": [[u, v], ...]}``, i.e. the
            JSON text returned by ``generate_edge_list1``.

    Returns:
        The dense form (``.todense()``) of the adjacency matrix of the
        undirected graph built from the edges. Rows and columns follow the
        node order of the networkx graph, not necessarily the numeric order
        of the node ids.
    """
    import json

    # Decode the edge list properly instead of slicing it out of the text.
    try:
        edges = json.loads(testcase)["edges"]
    except (ValueError, KeyError, TypeError):
        # Not a JSON object with an "edges" key: fall back to the historical
        # behaviour of cutting the list literal out of the string.
        literal = testcase.split('edges": ')[1].split('}')[0]
        edges = ast.literal_eval(literal)

    # Build the undirected graph and take its adjacency matrix.
    G = nx.Graph()
    G.add_edges_from(edges)

    A = nx.adjacency_matrix(G)

    # Densify: callers store and print the full matrix.
    B = A.todense()

    return B

In [7]:
# Start from an empty DataFrame; the following cells add the testcase ID
# column and the adjacency matrix column to it.
adjacency_df = pd.DataFrame()

In [8]:
# One entry per distinct testcase ID; the surviving rows keep the index
# labels they had in `data`.
adjacency_df['testcase_ID'] = data['testcase_ID'].drop_duplicates()

In [9]:
# Convert every serialised edge list in `graphs` into its adjacency matrix.
# NOTE: the kernel dies when there are more than 200 datapoints.

# Dask variant (disabled): graphs.apply(gen_adj_matrix1, meta = ('generate_adj_matrices', 'O'))
adj_matrices = graphs.apply(gen_adj_matrix1)

In [10]:
# Wrap the matrices in a single-column DataFrame.
# NOTE(review): this rebinds `adj_matrices`; re-running only this cell would
# call to_frame() on the DataFrame it already produced.
adj_matrices = adj_matrices.to_frame()

In [11]:
## TODO: under Dask, reset_index is not available the way it is in pandas; fix this.

# Dask variant (disabled): adj_matrices = adj_matrices.compute()
# Move the 'testcase_ID' index level out into an ordinary column.
adj_matrices = adj_matrices.reset_index(level='testcase_ID')

In [12]:
# Attach each matrix to its testcase by ID. A plain column assignment would
# align on index labels, and `adjacency_df` still carries the row labels of
# `data` while `adj_matrices` was re-indexed from 0, so matrices could end up
# on the wrong testcase (or be lost as NaN).
adjacency_df['adj_matrix'] = adjacency_df['testcase_ID'].map(
    adj_matrices.set_index('testcase_ID')[0]
)

In [13]:
# Drop every row that contains a missing value (e.g. no matrix was attached).
adj_df = adjacency_df.dropna()

In [14]:
# Persist the table to a ".csv.gz" file.
# NOTE(review): the adj_matrix column holds dense matrix objects, so the CSV
# receives their printed text; numpy abbreviates large arrays when printing,
# so confirm that the saved matrices can actually be read back in full.
adj_df.to_csv("../data/adj_df.csv.gz")

## Feature Matrix

In [39]:
def generate_ast_roots(testcase, **kwargs):
    """
    Parse one testcase with clang and return the root cursor of its AST.

    `testcase` is the group of rows (as loaded with pandas) that belong to a
    single testcase; each row supplies a `filename` and its `code`. Extra
    keyword arguments are accepted and ignored.
    """
    # clang is handed the in-memory sources as (filename, contents) pairs.
    sources = []
    for row in testcase.itertuples():
        sources.append((row.filename, row.code))

    main_file = find_primary_source_file(testcase)

    # Parse starting from the primary file; the other files are resolved
    # from the unsaved (in-memory) sources.
    clang_index = clang.cindex.Index.create()
    unit = clang_index.parse(path=main_file.filename, unsaved_files=sources)

    return unit.cursor

In [40]:
# Parse every testcase and collect its AST root cursor, keyed by testcase_ID.
ast_roots = data.groupby(['testcase_ID']).apply(generate_ast_roots)

In [41]:
# Show the Series of root cursors (one entry per testcase_ID).
ast_roots

testcase_ID
62516    <clang.cindex.Cursor object at 0x7f79106d8ae8>
62517    <clang.cindex.Cursor object at 0x7f791061d158>
62518    <clang.cindex.Cursor object at 0x7f79104cda60>
62519    <clang.cindex.Cursor object at 0x7f7910392ae8>
62520    <clang.cindex.Cursor object at 0x7f791026dae8>
62521    <clang.cindex.Cursor object at 0x7f790393eae8>
62522    <clang.cindex.Cursor object at 0x7f790378a2f0>
62523    <clang.cindex.Cursor object at 0x7f790363ca60>
62524    <clang.cindex.Cursor object at 0x7f79034e6158>
62525    <clang.cindex.Cursor object at 0x7f7903384a60>
62526    <clang.cindex.Cursor object at 0x7f7903228400>
62527    <clang.cindex.Cursor object at 0x7f790308e620>
62528    <clang.cindex.Cursor object at 0x7f7902f2ac80>
62529    <clang.cindex.Cursor object at 0x7f7902dd0620>
62530    <clang.cindex.Cursor object at 0x7f7902c71f28>
62531    <clang.cindex.Cursor object at 0x7f7902b5ce18>
62532    <clang.cindex.Cursor object at 0x7f79029bad08>
62533    <clang.cindex.Cursor object

In [20]:
# features = {ast_roots.iloc[i].kind for i in range(len(ast_roots))}

In [42]:
# NOTE(review): `ast_roots` is a pandas Series, which has no `children`
# attribute, so this cell raises the AttributeError recorded in its output.
ast_roots.children

AttributeError: 'Series' object has no attribute 'children'

In [34]:
def generate_features(ast_root):
    """
    Collect per-node features from a concretised & numbered clang ast.

    The tree is visited in pre-order starting at `ast_root`; every node must
    provide `children`, `identifier`, `kind` and `displayname`.

    Returns a dictionary keyed by node identifier:
        {
            <node_id>: [<degree>, <type>, <identifier>],
            ...
        }
    where <degree> is the number of children plus one (a single incoming
    edge is counted for every node), <type> is `str(node.kind)` and
    <identifier> is the node's `displayname`.
    """
    collected = {}

    # Explicit stack instead of recursion; children are pushed in reverse so
    # that they are popped -- and therefore recorded -- left to right.
    pending = [ast_root]
    while pending:
        current = pending.pop()
        child_count = len(current.children)
        collected[current.identifier] = [
            child_count + 1,
            str(current.kind),
            current.displayname,
        ]
        pending.extend(reversed(list(current.children)))

    return collected

In [35]:
# NOTE(review): `generate_features_matrix` is not defined in this notebook
# (presumably it comes from `preprocess_code` -- confirm). Called on the whole
# `ast_roots` Series it fails with the AttributeError recorded in the output.
generate_features_matrix(ast_roots)

AttributeError: 'Series' object has no attribute 'children'

There is also a sparse representation of the above data in "../data/adj.pickle".  This includes all buffer overflow data points.

In [18]:
# Load the pickled sparse adjacency data.
# NOTE: pickle.load can execute arbitrary code while loading; only use it on
# a file from a trusted source.
adj_pickle = None
with open("../data/adj.pickle", 'rb') as f:
    adj_pickle = pickle.load(f)

In [15]:
# Show the table of testcase IDs and their dense adjacency matrices.
adj_df

Unnamed: 0,testcase_ID,adj_matrix
0,62516,[[[[[0 1 1 1 1 1 1 1 1 1 1 1 1 1 0 1 0 1 0 1 0...
1,62517,[[[[[0 1 1 1 1 1 1 1 1 1 1 1 1 1 0 1 0 1 0 1 0...
2,62518,[[[[[0 1 1 1 1 1 1 1 1 1 1 1 1 1 0 1 0 1 0 1 0...
3,62519,[[[[[0 1 1 1 1 1 1 1 1 1 1 1 1 1 0 1 0 1 0 1 0...
4,62520,[[[[[0 1 1 1 1 1 1 1 1 1 1 1 1 1 0 1 0 1 0 1 0...
5,62521,[[[[[0 1 1 1 1 1 1 1 1 1 1 1 1 1 0 1 0 1 0 1 0...
6,62522,[[[[[0 1 1 1 1 1 1 1 1 1 1 1 1 1 0 1 0 1 0 1 0...
7,62523,[[[[[0 1 1 1 1 1 1 1 1 1 1 1 1 1 0 1 0 1 0 1 0...
8,62524,[[[[[0 1 1 1 1 1 1 1 1 1 1 1 1 1 0 1 0 1 0 1 0...
9,62525,[[[[[0 1 1 1 1 1 1 1 1 1 1 1 1 1 0 1 0 1 0 1 0...


In [24]:
# Show the object loaded from "../data/adj.pickle".
adj_pickle

Unnamed: 0,0,1
0,-234271,"(0, 1)\t1\n (0, 2)\t1\n (0, 3)\t1\n (0, 4..."
1,-234259,"(0, 1)\t1\n (0, 2)\t1\n (0, 3)\t1\n (0, 4..."
2,-234243,"(0, 1)\t1\n (0, 2)\t1\n (0, 3)\t1\n (0, 4..."
3,-234233,"(0, 1)\t1\n (0, 2)\t1\n (1, 0)\t1\n (2, 0..."
4,-234213,"(0, 1)\t1\n (1, 0)\t1\n (1, 2)\t1\n (2, 1..."
5,-234132,"(0, 1)\t1\n (1, 0)\t1\n (1, 2)\t1\n (1, 5..."
6,-234110,"(0, 1)\t1\n (1, 0)\t1\n (1, 2)\t1\n (2, 1..."
7,-234101,"(0, 1)\t1\n (0, 2)\t1\n (0, 3)\t1\n (0, 4..."
8,-234084,"(0, 1)\t1\n (0, 2)\t1\n (0, 3)\t1\n (0, 4..."
9,-234063,"(0, 1)\t1\n (0, 2)\t1\n (0, 3)\t1\n (0, 4..."
