# Joern-ey into ILP

In [1]:
import pandas as pd

In [2]:
ilp_data = pd.read_csv("../data/ilp_dataset.csv.gz")
ilp_data = ilp_data.drop('Unnamed: 0', axis='columns')

In [3]:
ilp_data

Unnamed: 0,testcase_ID,filename,code,flaw,flaw_loc,bug,code_length
0,62804,000/062/804/CWE121_Stack_Based_Buffer_Overflow...,/* TEMPLATE GENERATED TESTCASE FILE\nFilename:...,CWE-121,33,False,1722
1,62852,000/062/852/CWE121_Stack_Based_Buffer_Overflow...,/* TEMPLATE GENERATED TESTCASE FILE\nFilename:...,CWE-121,30,False,1674
2,62869,000/062/869/CWE121_Stack_Based_Buffer_Overflow...,/* TEMPLATE GENERATED TESTCASE FILE\nFilename:...,CWE-121,32,False,1760
3,62900,000/062/900/CWE121_Stack_Based_Buffer_Overflow...,/* TEMPLATE GENERATED TESTCASE FILE\nFilename:...,CWE-121,30,False,1680
4,232012,000/232/012/CWE122_Heap_Based_Buffer_Overflow_...,/* TEMPLATE GENERATED TESTCASE FILE\nFilename:...,CWE-122,31,False,1723
5,-62804,000/062/804/CWE121_Stack_Based_Buffer_Overflow...,/* TEMPLATE GENERATED TESTCASE FILE\nFilename:...,CWE-121,33,True,1618
6,-62852,000/062/852/CWE121_Stack_Based_Buffer_Overflow...,/* TEMPLATE GENERATED TESTCASE FILE\nFilename:...,CWE-121,30,True,1570
7,-62869,000/062/869/CWE121_Stack_Based_Buffer_Overflow...,/* TEMPLATE GENERATED TESTCASE FILE\nFilename:...,CWE-121,32,True,1616
8,-62900,000/062/900/CWE121_Stack_Based_Buffer_Overflow...,/* TEMPLATE GENERATED TESTCASE FILE\nFilename:...,CWE-121,30,True,1576
9,-232012,000/232/012/CWE122_Heap_Based_Buffer_Overflow_...,/* TEMPLATE GENERATED TESTCASE FILE\nFilename:...,CWE-122,31,True,1619


Now do it for all our examples:

In [4]:
import os
import subprocess
import tempfile

In [5]:
testcase_IDs = []
flaws = []
bugs = []
code_lengths = []
trees = []

In [6]:
def generate_prolog(testcase):
    tmp_dir = tempfile.TemporaryDirectory()

    for file in testcase.itertuples():
        short_filename = file.filename.split("/")[-1]
        with open(tmp_dir.name + "/" + short_filename, 'w') as f:
            f.write(file.code)

    subprocess.check_call(["/joern/joern-parse", "--out", tmp_dir.name + "/cpg.bin.zip", tmp_dir.name])

    tree = subprocess.check_output(
        "cd /joern && /joern/joern-query --cpg "+tmp_dir.name + "/cpg.bin.zip -f /project/code/joern_cfg_to_prolog.scala",
        shell=True,
        universal_newlines=True,
    )

    testcase_IDs.append(file.testcase_ID)
    flaws.append( file.flaw)
    bugs.append(file.bug)
    code_lengths.append(file.code_length)
    trees.append(tree)

    tmp_dir.cleanup()

In [7]:
ilp_data.groupby('testcase_ID').apply(generate_prolog)

In [9]:
prolog = pd.DataFrame({
    'testcase_ID': testcase_IDs,
    'flaw': flaws,
    'bug': bugs,
    'code_length': code_lengths,
    'tree': trees,
})
prolog = prolog[1:10]  # when we run apply it duplicates the first group
prolog

Unnamed: 0,testcase_ID,flaw,bug,code_length,tree
1,-232012,CWE-122,True,1619,"% START: Generated Prolog\n% AST\nast(79, 39)...."
2,-62900,CWE-121,True,1576,"% START: Generated Prolog\n% AST\nast(60, 50)...."
3,-62869,CWE-121,True,1616,"% START: Generated Prolog\n% AST\nast(60, 56)...."
4,-62852,CWE-121,True,1570,"% START: Generated Prolog\n% AST\nast(23, 29)...."
5,-62804,CWE-121,True,1618,"% START: Generated Prolog\n% AST\nast(58, 57)...."
6,62804,CWE-121,False,1722,"% START: Generated Prolog\n% AST\nast(71, 25)...."
7,62852,CWE-121,False,1674,"% START: Generated Prolog\n% AST\nast(41, 59)...."
8,62869,CWE-121,False,1760,"% START: Generated Prolog\n% AST\nast(52, 23)...."
9,62900,CWE-121,False,1680,"% START: Generated Prolog\n% AST\nast(53, 54)...."


In [11]:
prolog = pd.read_csv("../data/ilp_prolog_data.csv.gz")

In [14]:
import re

In [18]:
def fixprolog(testcase):
    find_node_ids = re.compile('\((\w+), (\w+)\)')
    replacement_node_ids = '({bug}_{testcase_id}_\\1, {bug}_{testcase_id}_\\2)'.format(
        bug='bad' if testcase.bug else 'good',
        testcase_id=abs(testcase.testcase_ID),
    )
    return find_node_ids.sub(replacement_node_ids, testcase['tree'])

In [19]:
list(map(fixprolog,prolog['tree']))

AttributeError: 'str' object has no attribute 'bug'

In [21]:
prolog['tree'] = prolog.apply(fixprolog, axis='columns')

In [104]:
good_example = prolog[prolog['bug'] == False]
bad_example = prolog[prolog['bug'] == True]

In [39]:
def extract_node_ids(tree):
    nodes = set()
    
    for line in tree.split('\n'):
        line = line.strip()
        if not line.startswith("%") and line:
            parent, child = line[4:-2].split(", ")
            nodes.add(parent)
            nodes.add(child)
            
    return nodes

In [65]:
# good nodes_g
nodes_g = set()

for i in range(len(good_example)):
    good_nodes = extract_node_ids(good_example.iloc[i].tree)
    nodes_g = set.union(nodes_g,good_nodes)

# bad nodes_b
nodes_b = set()


for i in range(len(bad_example)):
    bad_nodes = extract_node_ids(bad_example.iloc[i].tree)
    nodes_b = set.union(nodes_b,bad_nodes)

nodes = set.union(nodes_g,nodes_b)


In [115]:
types = [
    'node('+node_id+')' for node_id in nodes
]
positive_examples = [
    'bug('+node_id+')' for node_id in nodes_b
]

negative_examples = [
    'bug('+node_id+')' for node_id in nodes_g
]

base_rules = [
    'bug(A) :- bug(B), ast(A,B).',
    'bug(A) :- bug(B), ast(B,A).',
    'bug(A) :- bug(B), cfg(A,B).',
    'bug(A) :- bug(B), cfg(B,A).',
]


In [105]:
good_examples = [good_example.iloc[i].tree for i in range(4)]
bad_examples = [bad_example.iloc[i].tree for i in range(4)]

In [114]:
script_template = """
% Settings
{settings}


% Mode declarations
:- modeh(*,bug(+node))?
:- modeb(*,ast(+node,-node))?
:- modeb(*,ast(-node,+node))?
:- modeb(*,cfg(+node,-node))?
:- modeb(*,cfg(-node,+node))?

%% Types
{types}

%% background knowledge
{base_rules}
{bg_knowledge}



%% learning task
%% positive examples
{positive_examples}

%% negative examples
:-{negative_examples}

"""

script = script_template.format(
    settings="""
    %set(posonly)
    """,
    types = '.\n'.join(types),
    base_rules =base_rules,
    bg_knowledge = '.\n'.join([good_examples[0],bad_examples[0]]),
    #metarules='\n'.join(meta_rules),
    positive_examples='.\n'.join(positive_examples),
    negative_examples='.\n :-'.join(negative_examples),
)
print(script)


% Settings

    %set(posonly)
    


% Mode declarations
:- modeh(*,bug(+node))?
:- modeb(*,ast(+node,-node))?
:- modeb(*,ast(-node,+node))?
:- modeb(*,cfg(+node,-node))?
:- modeb(*,cfg(-node,+node))?

%% Types
node(bad_62869_24).
node(bad_62869_134_memcpy_18_c_24_4).
node(good_62804_38).
node(bad_62869_131_memcpy_18_c_27_4).
node(bad_62852_132_memcpy_01_c_25_4).
node(good_62804_147_loop_01_c_27_35).
node(bad_62804_36).
node(bad_62900_43).
node(good_62900_31).
node(good_62869_150_memcpy_18_c_24_0).
node(good_62804_85_loop_01_c_64_4).
node(bad_62804_80_loop_01_c_54_14).
node(bad_62804_105_loop_01_c_32_22).
node(bad_62869_149).
node(good_62900_113_memmove_01_c_32_21).
node(bad_232012_18).
node(good_62900_133_memmove_01_c_27_25).
node(good_62869_158).
node(good_62900_152).
node(bad_62900_55).
node(bad_232012_34).
node(bad_232012_69).
node(bad_62900_46).
node(bad_62852_106_memcpy_01_c_30_21).
node(bad_62900_32).
node(good_62869_87_memcpy_18_c_58_14).
node(good_62869_109).
node(good_62804_