In [2]:
import ast
import json

import networkx as nx
import numpy as np
import pandas as pd
from scipy import sparse

In [3]:
# NOTE(review): this wildcard import presumably supplies the helpers used in
# later cells that are not imported explicitly in this notebook (e.g. `clang`,
# `dd`, `find_primary_source_file`, `concretise_ast`, `number_ast_nodes`,
# `generate_edgelist`) — confirm against preprocess_code and prefer explicit
# imports once the list is known.
from preprocess_code import *
# Load the gzip-compressed CSV of testcases. Later cells read its
# `testcase_ID`, `filename` and `code` columns.
data = pd.read_csv("../data/buffer_overflow_data.csv.gz")

In [4]:
def generate_edge_list(testcase, **kwargs):
    """
    Build the JSON-encoded edge list of the AST for a single testcase.

    `testcase` holds the rows (files/datapoints) that belong to one
    particular testcase; each row must expose `filename` and `code`
    attributes when iterated with `itertuples()`. Any extra keyword
    arguments are accepted and ignored.

    Returns a JSON string of the form `{"edges": <edge list>}`.
    """
    # Hand every file of the testcase to clang as an in-memory
    # (filename, source text) pair.
    unsaved_sources = []
    for row in testcase.itertuples():
        unsaved_sources.append((row.filename, row.code))

    main_file = find_primary_source_file(testcase)

    # Parse the primary file with clang; the root cursor of the
    # resulting translation unit is the root of the AST.
    clang_index = clang.cindex.Index.create()
    unit = clang_index.parse(
        path=main_file.filename,
        unsaved_files=unsaved_sources,
    )
    root_cursor = unit.cursor

    # Memoise/concretise the AST so it can be modified consistently,
    # then give every node in the tree a unique number.
    concretise_ast(root_cursor)
    number_ast_nodes(root_cursor)

    # Edge list used as the graph2vec input.
    edges = generate_edgelist(root_cursor)
    payload = {"edges": edges}

    # Explicitly delete clang objects
    del unit
    del root_cursor
    del clang_index

    return json.dumps(payload)

In [5]:
# Split the pandas frame into 20 partitions via `dd.from_pandas`
# (presumably dask.dataframe — confirm where `dd` is imported).
dask_data = dd.from_pandas(data, npartitions=20)

# Generate the graphs for all the testcases in the dataset: the rows are
# grouped by testcase_ID and generate_edge_list is applied to each group.
# NOTE(review): `axis='columns'` looks like it is forwarded to
# generate_edge_list, which swallows it through **kwargs — confirm whether
# it is needed at all.

graphs = dask_data.groupby(['testcase_ID']).apply(
        generate_edge_list,
        axis='columns',
        meta=('generate_edge_list', 'unicode'),
    )

In [6]:
def gen_adj_matrix(testcase):
    """
    Build the dense adjacency matrix of one testcase's graph.

    Parameters
    ----------
    testcase : str
        JSON document of the form ``{"edges": [[u, v], ...]}``, i.e. the
        string returned by ``generate_edge_list`` for a single testcase.

    Returns
    -------
    Dense adjacency matrix of the undirected graph built from the edge
    list, as produced by ``nx.adjacency_matrix(...).todense()``.

    Raises
    ------
    ValueError
        If `testcase` is not valid JSON (``json.JSONDecodeError``).
    KeyError
        If the decoded document has no ``"edges"`` entry.
    """
    # Decode the payload with the JSON parser instead of slicing the string
    # around 'edges": ' and '}': the slicing breaks as soon as the payload
    # contains another '}' or a JSON-only literal such as null/true/false.
    edges = json.loads(testcase)["edges"]

    # Undirected graph; add_edges_from accepts the two-element lists that
    # JSON decoding yields.
    graph = nx.Graph()
    graph.add_edges_from(edges)

    return nx.adjacency_matrix(graph).todense()

In [7]:
# create a dataframe containing the testcase ID and its adjacency matrix 

adjacency_df = pd.DataFrame()

In [8]:
adjacency_df['testcase_ID'] = data.testcase_ID.drop_duplicates()

In [9]:
adj_matrices = graphs.apply(gen_adj_matrix, meta = ('generate_adj_matrices', 'O'))

In [13]:
# adj_matrices = pd.DataFrame(adj_matrices)
adj_matrices = adj_matrices.to_frame()

In [17]:
## TODO: fix this bug

adj_matrices = adj_matrices.reset_index(level='testcase_ID')

TypeError: reset_index() got an unexpected keyword argument 'level'

In [15]:
adjacency_df['adj_matrix'] = adj_matrices[0]

KeyError: 0

In [None]:
## TODO: export to csv