# Joern-ey into ILP

In [16]:
import pandas as pd

In [17]:
ilp_data = pd.read_csv("../data/ilp_dataset.csv.gz")
ilp_data = ilp_data.drop('Unnamed: 0', axis='columns')

In [18]:
ilp_data

Unnamed: 0,testcase_ID,filename,code,flaw,flaw_loc,bug,code_length
0,62804,000/062/804/CWE121_Stack_Based_Buffer_Overflow...,/* TEMPLATE GENERATED TESTCASE FILE\nFilename:...,CWE-121,33,False,1722
1,62852,000/062/852/CWE121_Stack_Based_Buffer_Overflow...,/* TEMPLATE GENERATED TESTCASE FILE\nFilename:...,CWE-121,30,False,1674
2,62869,000/062/869/CWE121_Stack_Based_Buffer_Overflow...,/* TEMPLATE GENERATED TESTCASE FILE\nFilename:...,CWE-121,32,False,1760
3,62900,000/062/900/CWE121_Stack_Based_Buffer_Overflow...,/* TEMPLATE GENERATED TESTCASE FILE\nFilename:...,CWE-121,30,False,1680
4,232012,000/232/012/CWE122_Heap_Based_Buffer_Overflow_...,/* TEMPLATE GENERATED TESTCASE FILE\nFilename:...,CWE-122,31,False,1723
5,-62804,000/062/804/CWE121_Stack_Based_Buffer_Overflow...,/* TEMPLATE GENERATED TESTCASE FILE\nFilename:...,CWE-121,33,True,1618
6,-62852,000/062/852/CWE121_Stack_Based_Buffer_Overflow...,/* TEMPLATE GENERATED TESTCASE FILE\nFilename:...,CWE-121,30,True,1570
7,-62869,000/062/869/CWE121_Stack_Based_Buffer_Overflow...,/* TEMPLATE GENERATED TESTCASE FILE\nFilename:...,CWE-121,32,True,1616
8,-62900,000/062/900/CWE121_Stack_Based_Buffer_Overflow...,/* TEMPLATE GENERATED TESTCASE FILE\nFilename:...,CWE-121,30,True,1576
9,-232012,000/232/012/CWE122_Heap_Based_Buffer_Overflow_...,/* TEMPLATE GENERATED TESTCASE FILE\nFilename:...,CWE-122,31,True,1619


Now do it for all our examples:

In [19]:
import os
import subprocess
import tempfile

In [20]:
testcase_IDs = []
flaws = []
bugs = []
code_lengths = []
trees = []

In [21]:
def generate_prolog(testcase):
    tmp_dir = tempfile.TemporaryDirectory()

    for file in testcase.itertuples():
        short_filename = file.filename.split("/")[-1]
        with open(tmp_dir.name + "/" + short_filename, 'w') as f:
            f.write(file.code)

    subprocess.check_call(["/joern/joern-parse", "--out", tmp_dir.name + "/cpg.bin.zip", tmp_dir.name])

    tree = subprocess.check_output(
        "cd /joern && /joern/joern-query --cpg "+tmp_dir.name + "/cpg.bin.zip -f /project/code/joern_cfg_to_prolog.scala",
        shell=True,
        universal_newlines=True,
    )

    testcase_IDs.append(file.testcase_ID)
    flaws.append( file.flaw)
    bugs.append(file.bug)
    code_lengths.append(file.code_length)
    trees.append(tree)

    tmp_dir.cleanup()

In [22]:
ilp_data.groupby('testcase_ID').apply(generate_prolog)

In [23]:
prolog = pd.DataFrame({
    'testcase_ID': testcase_IDs,
    'flaw': flaws,
    'bug': bugs,
    'code_length': code_lengths,
    'tree': trees,
})
prolog = prolog[1:11]  # when we run apply it duplicates the first group
prolog

Unnamed: 0,testcase_ID,flaw,bug,code_length,tree
1,-232012,CWE-122,True,1619,% START: Generated Prolog\n% CODE\nsource_code...
2,-62900,CWE-121,True,1576,% START: Generated Prolog\n% CODE\nsource_code...
3,-62869,CWE-121,True,1616,% START: Generated Prolog\n% CODE\nsource_code...
4,-62852,CWE-121,True,1570,% START: Generated Prolog\n% CODE\nsource_code...
5,-62804,CWE-121,True,1618,% START: Generated Prolog\n% CODE\nsource_code...
6,62804,CWE-121,False,1722,% START: Generated Prolog\n% CODE\nsource_code...
7,62852,CWE-121,False,1674,% START: Generated Prolog\n% CODE\nsource_code...
8,62869,CWE-121,False,1760,% START: Generated Prolog\n% CODE\nsource_code...
9,62900,CWE-121,False,1680,% START: Generated Prolog\n% CODE\nsource_code...
10,232012,CWE-122,False,1723,% START: Generated Prolog\n% CODE\nsource_code...


In [24]:
import re

In [25]:
def fix_tree_rules(testcase):
    find_node_ids = re.compile('\((\w+), (\w+)\)')
    replacement_node_ids = '({bug}_{testcase_id}_\\1, {bug}_{testcase_id}_\\2)'.format(
        bug='bad' if testcase.bug else 'good',
        testcase_id=abs(testcase.testcase_ID),
    )
    return find_node_ids.sub(replacement_node_ids, testcase['tree'])

def fix_code_rules(testcase):
    find_node_ids = re.compile('\((\w+), "(\w+)"\)')
    replacement_node_ids = '({bug}_{testcase_id}_\\1, "\\2")'.format(
        bug='bad' if testcase.bug else 'good',
        testcase_id=abs(testcase.testcase_ID),
    )
    return find_node_ids.sub(replacement_node_ids, testcase['tree'])

In [26]:
prolog['tree'] = prolog.apply(fix_tree_rules, axis='columns')
prolog['tree'] = prolog.apply(fix_code_rules, axis='columns')

In [27]:
prolog

Unnamed: 0,testcase_ID,flaw,bug,code_length,tree
1,-232012,CWE-122,True,1619,% START: Generated Prolog\n% CODE\nsource_code...
2,-62900,CWE-121,True,1576,% START: Generated Prolog\n% CODE\nsource_code...
3,-62869,CWE-121,True,1616,% START: Generated Prolog\n% CODE\nsource_code...
4,-62852,CWE-121,True,1570,% START: Generated Prolog\n% CODE\nsource_code...
5,-62804,CWE-121,True,1618,% START: Generated Prolog\n% CODE\nsource_code...
6,62804,CWE-121,False,1722,% START: Generated Prolog\n% CODE\nsource_code...
7,62852,CWE-121,False,1674,% START: Generated Prolog\n% CODE\nsource_code...
8,62869,CWE-121,False,1760,% START: Generated Prolog\n% CODE\nsource_code...
9,62900,CWE-121,False,1680,% START: Generated Prolog\n% CODE\nsource_code...
10,232012,CWE-122,False,1723,% START: Generated Prolog\n% CODE\nsource_code...


In [38]:
def extract_source_map(prolog_src):
    output_lines = []
    
    in_source_code_section = False
    
    for line in prolog_src.split("\n"):
        if line == "% CODE":
            in_source_code_section = True
        if line == "% AST":
            in_source_code_section = False
        
        if in_source_code_section:
            output_lines.append(line)
            
    return '\n'.join(output_lines)
            
    
def remove_source_map(prolog_src):
    output_lines = []
    
    in_source_code_section = False
    
    for line in prolog_src.split("\n"):
        if line == "% CODE":
            in_source_code_section = True
        if line == "% AST":
            in_source_code_section = False
        
        if not in_source_code_section:
            output_lines.append(line)
            
    return '\n'.join(output_lines)

In [39]:
prolog['source_map'] = prolog['tree'].apply(extract_source_map)
prolog['tree'] = prolog['tree'].apply(remove_source_map)

In [43]:
prolog.to_csv("../data/ilp_prolog_data.csv.gz")

In [54]:
print(prolog.iloc[0].source_map)

% CODE
source_code(id_1_f_l_48_c_19_, "char * argv[]").
source_code(bad_232012_id_2_f_l_c_, "p1").
source_code(bad_232012_id_3_f_l_c_, "p1").
source_code(bad_232012_id_4_f_l_c_, "p3").
source_code(bad_232012_id_5_f_l_c_, "p1").
source_code(bad_232012_id_6_f_l_c_, "p2").
source_code(bad_232012_id_7_f_l_c_, "p1").
source_code(bad_232012_id_8_f_l_c_, "p1").
source_code(bad_232012_id_9_f_l_c_, "p1").
source_code(bad_232012_id_10_f_l_c_, "p1").
source_code(bad_232012_id_11_f_l_c_, "p1").
source_code(bad_232012_id_12_f_l_c_, "p1").
source_code(bad_232012_id_13_f_l_c_, "p1").
source_code(bad_232012_id_14_f_l_c_, "p1").
source_code(bad_232012_id_15_f_l_c_, "p2").
source_code(bad_232012_id_16_f_l_c_, "p2").
source_code(bad_232012_id_17_f_l_c_, "p1").
source_code(bad_232012_id_18_f_l_c_, "p1").
source_code(bad_232012_id_19_f_l_c_, "p2").
source_code(id_20_f_l_48_c_9_, "int argc").
source_code(bad_232012_id_21_f_l_c_, "p2").
source_code(bad_232012_id_22_f_l_c_, "p1").
source_code(bad_232012_id_23

In [44]:
good_example = prolog[prolog['testcase_ID'] == 62852].iloc[0]
bad_example = prolog[prolog['testcase_ID'] == -62852].iloc[0]

In [45]:
def extract_node_ids(tree):
    nodes = set()
    
    for line in tree.split('\n'):
        line = line.strip()
        if not line.startswith("%") and line:
            parent, child = line[4:-2].split(", ")
            nodes.add(parent)
            nodes.add(child)
            
    return nodes

In [46]:
good_nodes = extract_node_ids(good_example.tree)
bad_nodes = extract_node_ids(bad_example.tree)

In [47]:
positive_examples = [
    'bug('+node_id+')' for node_id in bad_nodes
]

negative_examples = [
    'bug('+node_id+')' for node_id in good_nodes
]

base_rules = [
]

meta_rules = [
    'metarule([P,Q],[P,A,B],[[Q,A,B]]).',
    'metarule([P,Q,R],[P,A,B],[[Q,A,B],[R,A,B]]).',
    'metarule([P,Q,R],[P,A,B],[[Q,A,C],[R,C,B]]).',
]

In [48]:
script_template = """
:- use_module(library(metagol)).

%% metagol settings
{settings}

%% background knowledge
{background_knowledge}

%% metarules
{metarules}

%% learning task
:-
  %% positive examples
  Pos = [
    {positive_examples}
  ],
  %% negative examples
  Neg = [
    {negative_examples}
  ],
  learn(Pos,Neg).
"""

script = script_template.format(
    background_knowledge='\n'.join([good_example.tree, bad_example.tree] + base_rules),
    settings="""
    body_pred(ast/2).
    body_pred(cfg/2).
    """,
    metarules='\n'.join(meta_rules),
    positive_examples=',\n'.join(positive_examples),
    negative_examples=',\n'.join(negative_examples),
)
print(script)


:- use_module(library(metagol)).

%% metagol settings

    body_pred(ast/2).
    body_pred(cfg/2).
    

%% background knowledge
% START: Generated Prolog
% AST
ast(good_62852_id_81_f_memcpy_01_c_l_60_c_4_, good_62852_id_80_f_memcpy_01_c_l_60_c_11_).
 ast(good_62852_id_96_f_memcpy_01_c_l_53_c_0_, good_62852_id_81_f_memcpy_01_c_l_60_c_4_).
 ast(good_62852_id_83_f_memcpy_01_c_l_58_c_4_, good_62852_id_82_f_memcpy_01_c_l_58_c_14_).
 ast(good_62852_id_96_f_memcpy_01_c_l_53_c_0_, good_62852_id_83_f_memcpy_01_c_l_58_c_4_).
 ast(good_62852_id_96_f_memcpy_01_c_l_53_c_0_, good_62852_id_85_f_memcpy_01_c_l_57_c_4_).
 ast(good_62852_id_88_f_memcpy_01_c_l_56_c_4_, good_62852_id_86_f_memcpy_01_c_l_56_c_14_).
 ast(good_62852_id_96_f_memcpy_01_c_l_53_c_0_, good_62852_id_88_f_memcpy_01_c_l_56_c_4_).
 ast(good_62852_id_91_f_memcpy_01_c_l_54_c_21_, good_62852_id_89_f_memcpy_01_c_l_54_c_26_).
 ast(good_62852_id_93_f_memcpy_01_c_l_54_c_11_, good_62852_id_91_f_memcpy_01_c_l_54_c_21_).
 ast(good_62852_id_93_

In [49]:
with open("test.pl", 'w') as f:
    f.write(script)