In [1]:
import pickle
from collections import Counter

from igraph import *

In [2]:
# Identifier of the batch to process. It is substituted into every input and
# output file name used by the cells below (e.g. "pdg_db_dir014.pkl").
batch_id = "014"

In [3]:
# Load the two PDG databases of this batch:
#   dt0 - read from the plain file; later used to index every node by name
#   dt  - read from the "_1hop" file; later used as the graphs that get sliced
# NOTE(review): pickle.load can execute arbitrary code while deserialising, so
# these files must come from a trusted source (i.e. this project's own pipeline).
with open("../SARD_pkl/pdg_db_dir{}.pkl".format(batch_id), "rb") as f:
    dt0 = pickle.load(f)
with open("../SARD_pkl/pdg_db_dir{}_1hop.pkl".format(batch_id), "rb") as f:
    dt = pickle.load(f)

In [4]:
# Load the four kinds of slice points (array use, sensitive function, integer
# overflow, pointer use) for this batch. Later cells iterate each object by
# key and read element [0] of every entry as a list of node ids.
# NOTE(review): pickle.load can execute arbitrary code; only load trusted files.
with open("../SARD_raw/points_dir{}/arrayuse_slice_points.pkl".format(batch_id), "rb") as f:
    dt_arrayuse = pickle.load(f)

# data format of the value stored under each key: [
#     ([<node_id>, ...], <root_node_id>, <api_name>),
#     ...
# ]
# e.g., (['407684'], '407677', 'fclose')
# NOTE(review): the example is a sensitive-function entry; presumably the other
# three files share this layout since they are consumed the same way - confirm.
with open("../SARD_raw/points_dir{}/sensifunc_slice_points.pkl".format(batch_id), "rb") as f:
    dt_sensitive = pickle.load(f)
    
with open("../SARD_raw/points_dir{}/integeroverflow_slice_points_new.pkl".format(batch_id), "rb") as f:
    dt_integer = pickle.load(f)
    
with open("../SARD_raw/points_dir{}/pointuse_slice_points.pkl".format(batch_id), "rb") as f:
    dt_pointuse = pickle.load(f)

### First extract all interesting subgraphs (subgraph slicing, intra-procedural)
- Note that all projects will be flattened.

In [5]:
# Build a lookup over the whole dataset:
#   <node name> -> (first-level key, second-level key, vertex index)
#
# Important: the lookup is built from dt0 (the 0-hop data), because it records
# the primary location of each node.
dt_node2kk = {}
for progress, (outer_key, graphs_by_key) in enumerate(dt0.items(), start=1):
    print("\r# processing {}/{}".format(progress, len(dt0)), end="")
    for inner_key, graph in graphs_by_key.items():
        for vertex in graph.vs:
            node_name = vertex["name"]
            # every node name must occur exactly once across the dataset
            assert node_name not in dt_node2kk
            dt_node2kk[node_name] = (outer_key, inner_key, vertex.index)

# processing 1000/1000

In [6]:
def worklist_add_preds(g, me):
    """Collect ``me`` and all of its transitive predecessors in ``g``.

    The graph is walked backwards, breadth-first, starting at ``me``.

    Args:
        g: graph whose ``g.vs[i]`` yields the vertex with index ``i``; each
            vertex must provide ``predecessors()`` and an ``index`` attribute.
        me: the starting vertex (only its ``index`` is read).

    Returns:
        A list of vertex indices in discovery order. The first element is
        ``me.index`` and every index appears exactly once, also when the
        graph contains cycles or self-loops.
    """
    start = me.index
    order = [start]
    # Companion set for O(1) "already queued?" checks; scanning the list
    # itself made the traversal quadratic in the size of the slice.
    queued = {start}
    nth = 0
    # `order` doubles as the work queue: entries before `nth` are done.
    while nth < len(order):
        for pred in g.vs[order[nth]].predecessors():
            pred_index = pred.index
            if pred_index not in queued:
                queued.add(pred_index)
                order.append(pred_index)
        nth += 1
    return order

In [7]:
# Array-use slice points: for every listed node that is present in the index,
# cut out the subgraph made of that node plus all of its transitive predecessors.
arrayuse_subgraphs = []
for progress, slice_points in enumerate(dt_arrayuse.values(), start=1):
    print("\r# processing {}/{}, valid: {}".format(progress, len(dt_arrayuse), len(arrayuse_subgraphs)), end="")
    for point in slice_points:
        # point[0] is the list of node ids belonging to this slice point
        for node_id in point[0]:
            if node_id not in dt_node2kk:
                # node was never indexed, so there is nothing to slice from
                continue
            outer_key, inner_key, vertex_index = dt_node2kk[node_id]
            host_graph = dt[outer_key][inner_key]
            # NOTE(review): vertex_index was recorded from dt0 but is applied to
            # the graph in dt; this assumes both share vertex indices - confirm.
            anchor = host_graph.vs[vertex_index]
            slice_indices = worklist_add_preds(host_graph, anchor)
            # Stored as (subgraph, name of the interesting node). The name is
            # kept instead of the index because indices may differ between
            # graphs, whereas 'name' is globally unique.
            arrayuse_subgraphs.append((host_graph.subgraph(slice_indices), anchor['name']))

# processing 993/993, valid: 5288

In [8]:
# Sensitive-function slice points: for every listed node that is present in the
# index, cut out the subgraph made of that node plus all of its transitive
# predecessors.
sensitive_subgraphs = []
for progress, slice_points in enumerate(dt_sensitive.values(), start=1):
    print("\r# processing {}/{}, valid: {}".format(progress, len(dt_sensitive), len(sensitive_subgraphs)), end="")
    for point in slice_points:
        # point[0] is the list of node ids belonging to this slice point
        for node_id in point[0]:
            if node_id not in dt_node2kk:
                # node was never indexed, so there is nothing to slice from
                continue
            outer_key, inner_key, vertex_index = dt_node2kk[node_id]
            host_graph = dt[outer_key][inner_key]
            # NOTE(review): vertex_index was recorded from dt0 but is applied to
            # the graph in dt; this assumes both share vertex indices - confirm.
            anchor = host_graph.vs[vertex_index]
            slice_indices = worklist_add_preds(host_graph, anchor)
            # Stored as (subgraph, name of the interesting node). The name is
            # kept instead of the index because indices may differ between
            # graphs, whereas 'name' is globally unique.
            sensitive_subgraphs.append((host_graph.subgraph(slice_indices), anchor['name']))

# processing 976/976, valid: 11918

In [9]:
# Integer-overflow slice points: for every listed node that is present in the
# index, cut out the subgraph made of that node plus all of its transitive
# predecessors.
integer_subgraphs = []
for progress, slice_points in enumerate(dt_integer.values(), start=1):
    print("\r# processing {}/{}, valid: {}".format(progress, len(dt_integer), len(integer_subgraphs)), end="")
    for point in slice_points:
        # point[0] is the list of node ids belonging to this slice point
        for node_id in point[0]:
            if node_id not in dt_node2kk:
                # node was never indexed, so there is nothing to slice from
                continue
            outer_key, inner_key, vertex_index = dt_node2kk[node_id]
            host_graph = dt[outer_key][inner_key]
            # NOTE(review): vertex_index was recorded from dt0 but is applied to
            # the graph in dt; this assumes both share vertex indices - confirm.
            anchor = host_graph.vs[vertex_index]
            slice_indices = worklist_add_preds(host_graph, anchor)
            # Stored as (subgraph, name of the interesting node). The name is
            # kept instead of the index because indices may differ between
            # graphs, whereas 'name' is globally unique.
            integer_subgraphs.append((host_graph.subgraph(slice_indices), anchor['name']))

# processing 153/153, valid: 363

In [10]:
# Pointer-use slice points: for every listed node that is present in the index,
# cut out the subgraph made of that node plus all of its transitive predecessors.
pointuse_subgraphs = []
for progress, slice_points in enumerate(dt_pointuse.values(), start=1):
    print("\r# processing {}/{}, valid: {}".format(progress, len(dt_pointuse), len(pointuse_subgraphs)), end="")
    for point in slice_points:
        # point[0] is the list of node ids belonging to this slice point
        for node_id in point[0]:
            if node_id not in dt_node2kk:
                # node was never indexed, so there is nothing to slice from
                continue
            outer_key, inner_key, vertex_index = dt_node2kk[node_id]
            host_graph = dt[outer_key][inner_key]
            # NOTE(review): vertex_index was recorded from dt0 but is applied to
            # the graph in dt; this assumes both share vertex indices - confirm.
            anchor = host_graph.vs[vertex_index]
            slice_indices = worklist_add_preds(host_graph, anchor)
            # Stored as (subgraph, name of the interesting node). The name is
            # kept instead of the index because indices may differ between
            # graphs, whereas 'name' is globally unique.
            pointuse_subgraphs.append((host_graph.subgraph(slice_indices), anchor['name']))

# processing 976/976, valid: 11717

### then infer labels for every subgraph

In [11]:
def get_function_nodes(g):
    """Return the vertices of ``g`` whose "type" attribute is "Function".

    Args:
        g: graph exposing its vertices through ``g.vs``; every vertex must
            support ``vertex["type"]``.

    Returns:
        A list of the matching vertices, in the order ``g.vs`` yields them.
    """
    function_nodes = []
    for vertex in g.vs:
        if vertex["type"] == "Function":
            function_nodes.append(vertex)
    return function_nodes

In [12]:
# Label the array-use subgraphs and save them.
# `all_subgraphs` selects the category handled by this cell; it may also be set
# to the concatenation of all four *_subgraphs lists (adjust the output file
# name below accordingly).
all_subgraphs = arrayuse_subgraphs

all_subgraphs_labeled = []
for visited, pp in enumerate(all_subgraphs, start=1):
    # Progress counts visited subgraphs, so it reaches N/N at the end and keeps
    # advancing even when a subgraph is skipped.
    print("\r# processing {}/{}".format(visited, len(all_subgraphs)), end="")
    # pp is (subgraph, name of interesting node); index instead of unpacking
    # into `_`, which would shadow IPython's last-output variable.
    sg = pp[0]
    tmp_func_list = get_function_nodes(sg)
    if not tmp_func_list:
        # no Function node in this slice, so no label can be inferred: skip it
        continue
    # label is True when the code of any Function node contains "cwe"
    # (compared case-insensitively)
    tmp_label = any("cwe" in p["code"].lower() for p in tmp_func_list)
    # this will be (label, (graph, name of interesting node))
    all_subgraphs_labeled.append((tmp_label, pp))

# show some statistics
tmp_labels = [p[0] for p in all_subgraphs_labeled]
display(Counter(tmp_labels))

with open("../SARD_ready/arrayuse_subgraphs_dir{}.pkl".format(batch_id), "wb") as f:
    pickle.dump(all_subgraphs_labeled, f)

# processing 5289/5290

Counter({False: 4038, True: 1252})

In [13]:
# Label the sensitive-function subgraphs and save them.
# `all_subgraphs` selects the category handled by this cell; it may also be set
# to the concatenation of all four *_subgraphs lists (adjust the output file
# name below accordingly).
all_subgraphs = sensitive_subgraphs

all_subgraphs_labeled = []
for visited, pp in enumerate(all_subgraphs, start=1):
    # Progress counts visited subgraphs, so it reaches N/N at the end and keeps
    # advancing even when a subgraph is skipped.
    print("\r# processing {}/{}".format(visited, len(all_subgraphs)), end="")
    # pp is (subgraph, name of interesting node); index instead of unpacking
    # into `_`, which would shadow IPython's last-output variable.
    sg = pp[0]
    tmp_func_list = get_function_nodes(sg)
    if not tmp_func_list:
        # no Function node in this slice, so no label can be inferred: skip it
        continue
    # label is True when the code of any Function node contains "cwe"
    # (compared case-insensitively)
    tmp_label = any("cwe" in p["code"].lower() for p in tmp_func_list)
    # this will be (label, (graph, name of interesting node))
    all_subgraphs_labeled.append((tmp_label, pp))

# show some statistics
tmp_labels = [p[0] for p in all_subgraphs_labeled]
display(Counter(tmp_labels))

with open("../SARD_ready/sensitive_subgraphs_dir{}.pkl".format(batch_id), "wb") as f:
    pickle.dump(all_subgraphs_labeled, f)

# processing 11923/11924

Counter({True: 3786, False: 8138})

In [14]:
# Label the integer-overflow subgraphs and save them.
# `all_subgraphs` selects the category handled by this cell; it may also be set
# to the concatenation of all four *_subgraphs lists (adjust the output file
# name below accordingly).
all_subgraphs = integer_subgraphs

all_subgraphs_labeled = []
for visited, pp in enumerate(all_subgraphs, start=1):
    # Progress counts visited subgraphs, so it reaches N/N at the end and keeps
    # advancing even when a subgraph is skipped.
    print("\r# processing {}/{}".format(visited, len(all_subgraphs)), end="")
    # pp is (subgraph, name of interesting node); index instead of unpacking
    # into `_`, which would shadow IPython's last-output variable.
    sg = pp[0]
    tmp_func_list = get_function_nodes(sg)
    if not tmp_func_list:
        # no Function node in this slice, so no label can be inferred: skip it
        continue
    # label is True when the code of any Function node contains "cwe"
    # (compared case-insensitively)
    tmp_label = any("cwe" in p["code"].lower() for p in tmp_func_list)
    # this will be (label, (graph, name of interesting node))
    all_subgraphs_labeled.append((tmp_label, pp))

# show some statistics
tmp_labels = [p[0] for p in all_subgraphs_labeled]
display(Counter(tmp_labels))

with open("../SARD_ready/integer_subgraphs_dir{}.pkl".format(batch_id), "wb") as f:
    pickle.dump(all_subgraphs_labeled, f)

# processing 364/365

Counter({True: 109, False: 256})

In [15]:
# Label the pointer-use subgraphs and save them.
# `all_subgraphs` selects the category handled by this cell; it may also be set
# to the concatenation of all four *_subgraphs lists (adjust the output file
# name below accordingly).
all_subgraphs = pointuse_subgraphs

all_subgraphs_labeled = []
for visited, pp in enumerate(all_subgraphs, start=1):
    # Progress counts visited subgraphs, so it reaches N/N at the end and keeps
    # advancing even when a subgraph is skipped.
    print("\r# processing {}/{}".format(visited, len(all_subgraphs)), end="")
    # pp is (subgraph, name of interesting node); index instead of unpacking
    # into `_`, which would shadow IPython's last-output variable.
    sg = pp[0]
    tmp_func_list = get_function_nodes(sg)
    if not tmp_func_list:
        # no Function node in this slice, so no label can be inferred: skip it
        continue
    # label is True when the code of any Function node contains "cwe"
    # (compared case-insensitively)
    tmp_label = any("cwe" in p["code"].lower() for p in tmp_func_list)
    # this will be (label, (graph, name of interesting node))
    all_subgraphs_labeled.append((tmp_label, pp))

# show some statistics
tmp_labels = [p[0] for p in all_subgraphs_labeled]
display(Counter(tmp_labels))

with open("../SARD_ready/pointuse_subgraphs_dir{}.pkl".format(batch_id), "wb") as f:
    pickle.dump(all_subgraphs_labeled, f)

# processing 11722/11723

Counter({False: 10207, True: 1516})

#### view

In [16]:
# tmp_graph, tmp_n = arrayuse_subgraphs[3]
# # tmp_graph = dt[k0][k1]
# for p in tmp_graph.vs:
#     p["label_size"]=10
# visual_style = {}
# visual_style["margin"]=40
# visual_style["bbox"]=(400,400)
# visual_style["vertex_label"] = [
#     "{} \n {}".format(tmp_graph.vs["name"][i], tmp_graph.vs["code"][i]) 
#     for i in range(len(tmp_graph.vs["code"]))
# ]
# plot(tmp_graph, **visual_style)

In [17]:
# tmp_graph.vs[0]