In [69]:
import ast
import numpy as np
import pandas as pd
from scipy import sparse
import networkx as nx

In [70]:
# NOTE(review): this star import presumably supplies the names used further
# down without an explicit import (find_primary_source_file, concretise_ast,
# number_ast_nodes, generate_edgelist, and apparently the `clang` and `json`
# modules) — confirm against preprocess_code.
from preprocess_code import *
# Load the buffer-overflow dataset from the compressed CSV into a DataFrame.
data = pd.read_csv('../data/buffer_overflow_data.csv.gz')
# Uncomment to experiment on the first 100 rows only.
# data = data.iloc[0:100]

In [71]:
def generate_edge_list(testcase, **kwargs):
    """
    Parse one testcase with clang and serialise its AST edge list as JSON.

    Takes in the files/datapoints from juliet.csv.zip or vdisc_*.csv.gz
    (as loaded with pandas) matching one particular testcase.

    Parameters
    ----------
    testcase : pandas.DataFrame
        Rows belonging to a single testcase. Every row must expose a
        ``filename`` and a ``code`` attribute.
    **kwargs
        Accepted and ignored, so that callers may pass extra keyword
        arguments (e.g. through ``groupby(...).apply``) without error.

    Returns
    -------
    str
        JSON text of the form ``{"edges": <edgelist>}``, where the edge
        list is whatever ``generate_edgelist`` returns for the AST root.
    """
    # Imported locally so the function does not rely on `json` leaking into
    # the notebook namespace through `from preprocess_code import *`.
    import json

    # (filename, contents) pairs handed to clang as in-memory files.
    parse_list = [
        (datapoint.filename, datapoint.code)
        for datapoint in testcase.itertuples()
    ]

    primary = find_primary_source_file(testcase)

    # Parse the source code with clang, and get out an ast:
    index = clang.cindex.Index.create()
    translation_unit = index.parse(
        path=primary.filename,
        unsaved_files=parse_list,
    )
    ast_root = translation_unit.cursor

    # Memoise/concretise the ast so that we can consistently
    # modify it, then number each node in the tree uniquely.
    concretise_ast(ast_root)
    number_ast_nodes(ast_root)

    # Next, construct an edge list for the graph2vec input:
    edgelist = generate_edgelist(ast_root)

    # Per-node features are currently disabled:
    #     features = generate_features(ast_root)

    graph2vec_representation = {
        "edges": edgelist,
        # "features": features,
    }

    # Explicitly delete clang objects
    del translation_unit
    del ast_root
    del index

    return json.dumps(graph2vec_representation)

In [73]:
# data = dd.from_pandas(data, npartitions=20)

# Build one serialised graph (JSON string) per testcase_ID group.
# NOTE(review): `axis` and `meta` look like leftovers from the dask variant
# commented out above; with plain pandas they are presumably forwarded to
# generate_edge_list, which swallows them via **kwargs — confirm.
graphs = data.groupby(['testcase_ID']).apply(
        generate_edge_list,
        axis='columns',
        meta=('generate_edge_list', 'unicode'),
    )

In [87]:
def gen_adj_matrix(testcase):
    '''
    Build a dense adjacency matrix from one serialised testcase graph.

    Parameters
    ----------
    testcase : str
        JSON text holding an "edges" entry (a list of node pairs), i.e.
        the string produced by generate_edge_list for one testcase.

    Returns
    -------
    The dense form (``.todense()``) of ``nx.adjacency_matrix`` for the
    undirected graph built from those edges.

    Raises
    ------
    json.JSONDecodeError
        If ``testcase`` is not valid JSON.
    KeyError
        If the decoded object has no "edges" entry.
    '''
    # Local import keeps this function self-contained; the notebook's import
    # cell does not import json explicitly.
    import json

    # Decode the payload properly instead of slicing the text between
    # 'edges": ' and the first '}', which breaks as soon as the JSON carries
    # another key (e.g. "features") or a '}' after the edge list.
    edges = json.loads(testcase)["edges"]

    graph = nx.Graph()
    graph.add_edges_from(edges)

    return nx.adjacency_matrix(graph).todense()

In [88]:
# Start from an empty frame; its columns are attached in the cells below.
adjacency_df = pd.DataFrame()

In [89]:
# One row per distinct testcase_ID found in `data`.
# NOTE(review): drop_duplicates keeps the row labels of the surviving rows,
# so this frame's index is presumably not a clean 0..n-1 range — confirm.
adjacency_df['testcase_ID'] = data.testcase_ID.drop_duplicates()

In [None]:
# Turn every serialised graph in `graphs` into its adjacency matrix.
adj_matrices = graphs.apply(gen_adj_matrix)

In [9]:
# Wrap the result in a DataFrame.
# NOTE(review): assuming adj_matrices is an unnamed Series at this point, the
# single resulting column is labelled 0 — confirm.
adj_matrices = pd.DataFrame(adj_matrices)

In [10]:
# Move the 'testcase_ID' index level into an ordinary column.
# NOTE(review): the name is rebound in place, so re-running this cell alone
# would presumably fail because the index level is gone — confirm.
adj_matrices = adj_matrices.reset_index(level='testcase_ID')

In [11]:
# Attach each matrix to its testcase by ID, not by row label: adjacency_df
# still carries the row labels of `data`, whereas adj_matrices has a fresh
# 0..n-1 index in groupby order, so a plain column assignment would align
# the wrong rows (or yield NaN).
adjacency_df['adj_matrix'] = adjacency_df['testcase_ID'].map(
    adj_matrices.set_index('testcase_ID')[0]
)