# Joern-ey into ILP

In [1]:
import pandas as pd

In [2]:
ilp_data = pd.read_csv("../data/ilp_dataset.csv.gz")
ilp_data = ilp_data.drop('Unnamed: 0', axis='columns')

In [3]:
ilp_data

Unnamed: 0,testcase_ID,filename,code,flaw,flaw_loc,bug,code_length
0,62804,000/062/804/CWE121_Stack_Based_Buffer_Overflow...,/* TEMPLATE GENERATED TESTCASE FILE\nFilename:...,CWE-121,33,False,1722
1,62852,000/062/852/CWE121_Stack_Based_Buffer_Overflow...,/* TEMPLATE GENERATED TESTCASE FILE\nFilename:...,CWE-121,30,False,1674
2,62869,000/062/869/CWE121_Stack_Based_Buffer_Overflow...,/* TEMPLATE GENERATED TESTCASE FILE\nFilename:...,CWE-121,32,False,1760
3,62900,000/062/900/CWE121_Stack_Based_Buffer_Overflow...,/* TEMPLATE GENERATED TESTCASE FILE\nFilename:...,CWE-121,30,False,1680
4,232012,000/232/012/CWE122_Heap_Based_Buffer_Overflow_...,/* TEMPLATE GENERATED TESTCASE FILE\nFilename:...,CWE-122,31,False,1723
5,-62804,000/062/804/CWE121_Stack_Based_Buffer_Overflow...,/* TEMPLATE GENERATED TESTCASE FILE\nFilename:...,CWE-121,33,True,1618
6,-62852,000/062/852/CWE121_Stack_Based_Buffer_Overflow...,/* TEMPLATE GENERATED TESTCASE FILE\nFilename:...,CWE-121,30,True,1570
7,-62869,000/062/869/CWE121_Stack_Based_Buffer_Overflow...,/* TEMPLATE GENERATED TESTCASE FILE\nFilename:...,CWE-121,32,True,1616
8,-62900,000/062/900/CWE121_Stack_Based_Buffer_Overflow...,/* TEMPLATE GENERATED TESTCASE FILE\nFilename:...,CWE-121,30,True,1576
9,-232012,000/232/012/CWE122_Heap_Based_Buffer_Overflow_...,/* TEMPLATE GENERATED TESTCASE FILE\nFilename:...,CWE-122,31,True,1619


Now do it for all our examples:

In [4]:
import os
import subprocess
import tempfile

In [5]:
testcase_IDs = []
flaws = []
bugs = []
code_lengths = []
trees = []

In [6]:
def generate_prolog(testcase):
    tmp_dir = tempfile.TemporaryDirectory()

    for file in testcase.itertuples():
        short_filename = file.filename.split("/")[-1]
        with open(tmp_dir.name + "/" + short_filename, 'w') as f:
            f.write(file.code)

    subprocess.check_call(["/joern/joern-parse", "--out", tmp_dir.name + "/cpg.bin.zip", tmp_dir.name])

    tree = subprocess.check_output(
        "cd /joern && /joern/joern-query --cpg "+tmp_dir.name + "/cpg.bin.zip -f /project/code/joern_cfg_to_prolog.scala",
        shell=True,
        universal_newlines=True,
    )

    testcase_IDs.append(file.testcase_ID)
    flaws.append( file.flaw)
    bugs.append(file.bug)
    code_lengths.append(file.code_length)
    trees.append(tree)

    tmp_dir.cleanup()

In [7]:
ilp_data.groupby('testcase_ID').apply(generate_prolog)

In [8]:
prolog = pd.DataFrame({
    'testcase_ID': testcase_IDs,
    'flaw': flaws,
    'bug': bugs,
    'code_length': code_lengths,
    'tree': trees,
})
prolog = prolog[1:11]  # when we run apply it duplicates the first group
prolog

Unnamed: 0,testcase_ID,flaw,bug,code_length,tree
1,-232012,CWE-122,True,1619,% START: Generated Prolog\n% NODE PROPERTIES \...
2,-62900,CWE-121,True,1576,% START: Generated Prolog\n% NODE PROPERTIES \...
3,-62869,CWE-121,True,1616,% START: Generated Prolog\n% NODE PROPERTIES \...
4,-62852,CWE-121,True,1570,% START: Generated Prolog\n% NODE PROPERTIES \...
5,-62804,CWE-121,True,1618,% START: Generated Prolog\n% NODE PROPERTIES \...
6,62804,CWE-121,False,1722,% START: Generated Prolog\n% NODE PROPERTIES \...
7,62852,CWE-121,False,1674,% START: Generated Prolog\n% NODE PROPERTIES \...
8,62869,CWE-121,False,1760,% START: Generated Prolog\n% NODE PROPERTIES \...
9,62900,CWE-121,False,1680,% START: Generated Prolog\n% NODE PROPERTIES \...
10,232012,CWE-122,False,1723,% START: Generated Prolog\n% NODE PROPERTIES \...


In [10]:
prolog.to_csv('../data/ilp_prolog_data_withREF.csv.gz')

In [34]:
import re

In [35]:
def fix_single_rules(testcase):
    find_node_ids = re.compile('\((\w+)\)')
    replacement_node_ids = '({bug}_{testcase_id}_\\1)'.format(
        bug='bad' if testcase.bug else 'good',
        testcase_id=abs(testcase.testcase_ID),
    )
    return find_node_ids.sub(replacement_node_ids, testcase['tree'])

def fix_tree_rules(testcase):
    find_node_ids = re.compile('\((\w+), (\w+)\)')
    replacement_node_ids = '({bug}_{testcase_id}_\\1, {bug}_{testcase_id}_\\2)'.format(
        bug='bad' if testcase.bug else 'good',
        testcase_id=abs(testcase.testcase_ID),
    )
    return find_node_ids.sub(replacement_node_ids, testcase['tree'])

def fix_code_rules(testcase):
    find_node_ids = re.compile('\((\w+), "(.*)"\)')
    replacement_node_ids = '({bug}_{testcase_id}_\\1, "\\2")'.format(
        bug='bad' if testcase.bug else 'good',
        testcase_id=abs(testcase.testcase_ID),
    )
    return find_node_ids.sub(replacement_node_ids, testcase['tree'])

In [36]:
prolog['tree'] = prolog.apply(fix_single_rules, axis='columns')
prolog['tree'] = prolog.apply(fix_tree_rules, axis='columns')
prolog['tree'] = prolog.apply(fix_code_rules, axis='columns')

In [37]:
prolog

Unnamed: 0,testcase_ID,flaw,bug,code_length,tree
1,-232012,CWE-122,True,1619,% START: Generated Prolog\n% NODE PROPERTIES \...
2,-62900,CWE-121,True,1576,% START: Generated Prolog\n% NODE PROPERTIES \...
3,-62869,CWE-121,True,1616,% START: Generated Prolog\n% NODE PROPERTIES \...
4,-62852,CWE-121,True,1570,% START: Generated Prolog\n% NODE PROPERTIES \...
5,-62804,CWE-121,True,1618,% START: Generated Prolog\n% NODE PROPERTIES \...
6,62804,CWE-121,False,1722,% START: Generated Prolog\n% NODE PROPERTIES \...
7,62852,CWE-121,False,1674,% START: Generated Prolog\n% NODE PROPERTIES \...
8,62869,CWE-121,False,1760,% START: Generated Prolog\n% NODE PROPERTIES \...
9,62900,CWE-121,False,1680,% START: Generated Prolog\n% NODE PROPERTIES \...
10,232012,CWE-122,False,1723,% START: Generated Prolog\n% NODE PROPERTIES \...


In [38]:
def extract_source_map(prolog_src):
    output_lines = []
    
    in_source_code_section = False
    
    for line in prolog_src.split("\n"):
        if line == "% CODE":
            in_source_code_section = True
            
        if line == "% AST":
            in_source_code_section = False
        
        if in_source_code_section:
            output_lines.append(line)
            
    return '\n'.join(output_lines)
            
    
def remove_source_map(prolog_src):
    output_lines = []
    
    in_source_code_section = False
    
    for line in prolog_src.split("\n"):
        if line == "% CODE":
            in_source_code_section = True
            
        if line == "% AST":
            in_source_code_section = False
        
        if not in_source_code_section:
            output_lines.append(line)
            
    return '\n'.join(output_lines)

In [39]:
prolog['source_map'] = prolog['tree'].apply(extract_source_map)
prolog['tree'] = prolog['tree'].apply(remove_source_map)

In [40]:
prolog.to_csv("../data/ilp_prolog_data.csv.gz")

In [46]:
print(prolog.iloc[0].tree)

% START: Generated Prolog
% NODE PROPERTIES 
assignment(bad_232012_id_46_f_l_c_).
compMemberAccess(bad_232012_id_48_f_l_c_).
sizeOf(bad_232012_id_50_f_l_c_).
malloc(bad_232012_id_56_f_l_c_).
compMemberAccess(bad_232012_id_133_f_l_c_).
compMemberAccess(bad_232012_id_134_f_memcpy_01_c_l_31_c_21_).
sizeOf(bad_232012_id_138_f_l_c_).
sizeOf(bad_232012_id_139_f_memcpy_01_c_l_30_c_32_).
assignment(bad_232012_id_148_f_memcpy_01_c_l_28_c_12_).
malloc(bad_232012_id_163_f_l_c_).
malloc(bad_232012_id_164_f_memcpy_01_c_l_25_c_18_).
assignment(bad_232012_id_168_f_memcpy_01_c_l_25_c_4_).
assignment(bad_232012_id_171_f_l_c_).
assignment(bad_232012_id_172_f_memcpy_01_c_l_23_c_4_).
% AST
ast(bad_232012_id_184_f_l_c_, bad_232012_id_177_f_l_50_c_).
 ast(bad_232012_id_184_f_l_c_, bad_232012_id_178_f_l_40_c_).
 ast(bad_232012_id_184_f_l_c_, bad_232012_id_179_f_l_30_c_).
 ast(bad_232012_id_184_f_l_c_, bad_232012_id_180_f_l_25_c_).
 ast(bad_232012_id_184_f_l_c_, bad_232012_id_181_f_l_6_c_).
 ast(bad_232012_id

Old, broken metagol generation script below:

In [41]:
good_example = prolog[prolog['testcase_ID'] == 62852].iloc[0]
bad_example = prolog[prolog['testcase_ID'] == -62852].iloc[0]

In [42]:
def extract_node_ids(tree):
    nodes = set()
    
    for line in tree.split('\n'):
        line = line.strip()
        if not line.startswith("%") and line:
            parent, child = line[4:-2].split(", ")
            nodes.add(parent)
            nodes.add(child)
            
    return nodes

In [43]:
good_nodes = extract_node_ids(good_example.tree)
bad_nodes = extract_node_ids(bad_example.tree)

ValueError: not enough values to unpack (expected 2, got 1)

In [None]:
positive_examples = [
    'bug('+node_id+')' for node_id in bad_nodes
]

negative_examples = [
    'bug('+node_id+')' for node_id in good_nodes
]

base_rules = [
]

meta_rules = [
    'metarule([P,Q],[P,A,B],[[Q,A,B]]).',
    'metarule([P,Q,R],[P,A,B],[[Q,A,B],[R,A,B]]).',
    'metarule([P,Q,R],[P,A,B],[[Q,A,C],[R,C,B]]).',
]

In [None]:
script_template = """
:- use_module(library(metagol)).

%% metagol settings
{settings}

%% background knowledge
{background_knowledge}

%% metarules
{metarules}

%% learning task
:-
  %% positive examples
  Pos = [
    {positive_examples}
  ],
  %% negative examples
  Neg = [
    {negative_examples}
  ],
  learn(Pos,Neg).
"""

script = script_template.format(
    background_knowledge='\n'.join([good_example.tree, bad_example.tree] + base_rules),
    settings="""
    body_pred(ast/2).
    body_pred(cfg/2).
    """,
    metarules='\n'.join(meta_rules),
    positive_examples=',\n'.join(positive_examples),
    negative_examples=',\n'.join(negative_examples),
)
print(script)

In [None]:
with open("test.pl", 'w') as f:
    f.write(script)