In [1]:
import pickle
from igraph import *

with open("./SARD_ready/all_subgraphs_all_e3.pkl", "rb") as f:
    dt = pickle.load(f)

display(len(dt))

413462

In [2]:
dt[0]

(False, (<igraph.Graph at 0x7f8a58da54f0>, '411764'))

In [3]:
dlabel, (dgraph, dtarget) = dt[0]

In [4]:
dgraph.vs['name'].index(dtarget)

0

### collect a vocabulary, and determine the max code length for every node

In [5]:
tmp_tokens = []
tmp_lengths = []
for p in dt:
    dlabel, (dgraph, dtarget) = p
    for q in dgraph.vs["code"]:
        tmp_tokens += q.split(" ")
        tmp_lengths.append(len(q.split(" ")))

In [6]:
len(tmp_tokens), len(tmp_lengths)

(23338272, 3402810)

In [7]:
from collections import Counter
counter_tokens = Counter(tmp_tokens)
counter_lengths = Counter(tmp_lengths)

In [8]:
counter_tokens.most_common()[-1]

('CWE252_Unchecked_Return_Value__char_fputs_09_bad', 1)

In [9]:
token_vocab = []
sorted_tokens = counter_tokens.most_common()
for p in sorted_tokens:
    # don't include CWE keywords
    if "CWE" in p[0]:
        continue
    # frequency threshold
    # potentially eliminate variable names
    if p[1] <= 5:
        continue
    token_vocab.append(p[0])

len(token_vocab)

11742

In [10]:
token_vocab[:20]

['(',
 ')',
 '=',
 ',',
 '*',
 ';',
 ']',
 '[',
 'data',
 'char',
 '0',
 '1',
 'numWritten',
 '->',
 '2',
 '100',
 '&',
 '==',
 'service',
 'wchar_t']

In [11]:
import statistics
statistics.median(counter_lengths.most_common())

(65, 1123)

In [12]:
counter_lengths.most_common()

[(3, 605609),
 (1, 520353),
 (4, 462528),
 (6, 258473),
 (8, 243558),
 (10, 207853),
 (5, 203237),
 (2, 181914),
 (7, 125999),
 (9, 75585),
 (15, 65791),
 (14, 65604),
 (25, 56071),
 (19, 53416),
 (12, 51240),
 (13, 34602),
 (11, 28514),
 (18, 27770),
 (16, 26299),
 (31, 17605),
 (17, 15828),
 (20, 9954),
 (22, 7956),
 (27, 5599),
 (24, 5535),
 (28, 4976),
 (21, 4327),
 (32, 3462),
 (23, 3278),
 (37, 2012),
 (30, 1738),
 (41, 1679),
 (26, 1504),
 (52, 1366),
 (34, 1185),
 (65, 1123),
 (29, 1105),
 (36, 1094),
 (43, 1053),
 (40, 892),
 (44, 809),
 (33, 805),
 (42, 782),
 (68, 774),
 (76, 770),
 (392, 756),
 (116, 756),
 (104, 756),
 (47, 615),
 (51, 590),
 (35, 589),
 (39, 535),
 (38, 461),
 (45, 457),
 (87, 449),
 (50, 403),
 (48, 399),
 (63, 375),
 (61, 351),
 (57, 301),
 (86, 252),
 (58, 235),
 (62, 231),
 (60, 227),
 (67, 213),
 (46, 153),
 (69, 132),
 (55, 118),
 (49, 116),
 (122, 110),
 (764, 100),
 (64, 97),
 (1785, 95),
 (66, 94),
 (75, 94),
 (83, 90),
 (72, 80),
 (1099, 80),
 (

In [13]:
MAX_CODE_LEN = 120
token_list = ["<PAD>", "<UNK>"] + token_vocab
token_dict = {token_list[i]:i for i in range(len(token_list))}

### start processing the dataset to fit PyTorch Geometric Data
- train: 70%
- valid: 10%
- test: 20%

In [17]:
import torch
import random
from torch_geometric.data import Data
mask_pool = [0,0,0,0,0,0,0,1,2,2] # 0: train, 1: valid, 2: test

In [18]:
data_list = []
for p in dt:
    print("\r# processing {}/{}".format(len(data_list), len(dt)), end="")
    dlabel, (dgraph, dtarget) = p
    dtarget_index = dgraph.vs["name"].index(dtarget)
    # construct edge matrix
    edge_index_from = []
    edge_index_to = []
    for q in dgraph.es:
        edge_index_from.append(q.source)
        edge_index_to.append(q.target)
    # construct feature matrix (x)
    feature_x = []
    for q in dgraph.vs['code']:
        tmp_s = q.split(' ') + ["<PAD>" for _ in range(MAX_CODE_LEN)]
        tmp_i = [
            token_dict[r] if r in token_dict.keys() else token_dict["<UNK>"]
            for r in tmp_s[:MAX_CODE_LEN]
        ]
        feature_x.append(tmp_i)
    # construct mask
    # only include the target node, others will be masked out for all mask types
    mtype = random.choice(mask_pool)
    train_mask = [False for _ in range(len(dgraph.vs))]
    val_mask = [False for _ in range(len(dgraph.vs))]
    test_mask = [False for _ in range(len(dgraph.vs))]
    # literally all nodes from one graph follows the label of the target node
    label_y = [1 if dlabel else 0 for _ in range(len(dgraph.vs))]
    if mtype==0:
        # train
        train_mask[dtarget_index] = True
    elif mtype==1:
        # valid
        val_mask[dtarget_index] = True
    elif mtype==2:
        # test
        test_mask[dtarget_index] = True
    else:
        # how did you get here?
        raise Exception("How did you get here?")
    # then construct Data
    tmp_data = Data(
        # x=torch.tensor(feature_x, dtype=torch.long),
        x=torch.tensor(feature_x, dtype=torch.float),
        y=torch.tensor(label_y, dtype=torch.long),
        edge_index=torch.tensor([edge_index_from, edge_index_to], dtype=torch.long),
        train_mask=torch.tensor(train_mask, dtype=torch.bool),
        val_mask=torch.tensor(val_mask, dtype=torch.bool),
        test_mask=torch.tensor(test_mask, dtype=torch.bool),
    )
    data_list.append(tmp_data)
    

# processing 413461/413462

In [19]:
with open("./SySeVR_GraphDataset_e3.pkl", "wb") as f:
    pickle.dump(data_list, f)