# Joern-ey into ILP

In [1]:
import pandas as pd

In [2]:
ilp_data = pd.read_csv("../data/ilp_dataset.csv.gz")
ilp_data = ilp_data.drop('Unnamed: 0', axis='columns')

In [3]:
ilp_data

Unnamed: 0,testcase_ID,filename,code,flaw,flaw_loc,bug,code_length
0,62804,000/062/804/CWE121_Stack_Based_Buffer_Overflow...,/* TEMPLATE GENERATED TESTCASE FILE\nFilename:...,CWE-121,33,False,1722
1,62852,000/062/852/CWE121_Stack_Based_Buffer_Overflow...,/* TEMPLATE GENERATED TESTCASE FILE\nFilename:...,CWE-121,30,False,1674
2,62869,000/062/869/CWE121_Stack_Based_Buffer_Overflow...,/* TEMPLATE GENERATED TESTCASE FILE\nFilename:...,CWE-121,32,False,1760
3,62900,000/062/900/CWE121_Stack_Based_Buffer_Overflow...,/* TEMPLATE GENERATED TESTCASE FILE\nFilename:...,CWE-121,30,False,1680
4,232012,000/232/012/CWE122_Heap_Based_Buffer_Overflow_...,/* TEMPLATE GENERATED TESTCASE FILE\nFilename:...,CWE-122,31,False,1723
5,-62804,000/062/804/CWE121_Stack_Based_Buffer_Overflow...,/* TEMPLATE GENERATED TESTCASE FILE\nFilename:...,CWE-121,33,True,1618
6,-62852,000/062/852/CWE121_Stack_Based_Buffer_Overflow...,/* TEMPLATE GENERATED TESTCASE FILE\nFilename:...,CWE-121,30,True,1570
7,-62869,000/062/869/CWE121_Stack_Based_Buffer_Overflow...,/* TEMPLATE GENERATED TESTCASE FILE\nFilename:...,CWE-121,32,True,1616
8,-62900,000/062/900/CWE121_Stack_Based_Buffer_Overflow...,/* TEMPLATE GENERATED TESTCASE FILE\nFilename:...,CWE-121,30,True,1576
9,-232012,000/232/012/CWE122_Heap_Based_Buffer_Overflow_...,/* TEMPLATE GENERATED TESTCASE FILE\nFilename:...,CWE-122,31,True,1619


Now do it for all our examples:

In [4]:
import os
import subprocess
import tempfile

In [5]:
testcase_IDs = []
flaws = []
bugs = []
code_lengths = []
trees = []

In [6]:
def generate_prolog(testcase):
    tmp_dir = tempfile.TemporaryDirectory()

    for file in testcase.itertuples():
        short_filename = file.filename.split("/")[-1]
        with open(tmp_dir.name + "/" + short_filename, 'w') as f:
            f.write(file.code)

    subprocess.check_call(["/joern/joern-parse", "--out", tmp_dir.name + "/cpg.bin.zip", tmp_dir.name])

    tree = subprocess.check_output(
        "cd /joern && /joern/joern-query --cpg "+tmp_dir.name + "/cpg.bin.zip -f /project/code/joern_cfg_to_prolog.scala",
        shell=True,
        universal_newlines=True,
    )

    testcase_IDs.append(file.testcase_ID)
    flaws.append( file.flaw)
    bugs.append(file.bug)
    code_lengths.append(file.code_length)
    trees.append(tree)

    tmp_dir.cleanup()

In [7]:
ilp_data.groupby('testcase_ID').apply(generate_prolog)

In [8]:
prolog = pd.DataFrame({
    'testcase_ID': testcase_IDs,
    'flaw': flaws,
    'bug': bugs,
    'code_length': code_lengths,
    'tree': trees,
})
prolog = prolog[1:10]  # when we run apply it duplicates the first group
prolog

Unnamed: 0,testcase_ID,flaw,bug,code_length,tree
1,-232012,CWE-122,True,1619,% START: Generated Prolog\n% AST\nast(130_memc...
2,-62900,CWE-121,True,1576,"% START: Generated Prolog\n% AST\nast(149, 68)..."
3,-62869,CWE-121,True,1616,"% START: Generated Prolog\n% AST\nast(150, 143..."
4,-62852,CWE-121,True,1570,% START: Generated Prolog\n% AST\nast(80_memcp...
5,-62804,CWE-121,True,1618,"% START: Generated Prolog\n% AST\nast(152, 58)..."
6,62804,CWE-121,False,1722,"% START: Generated Prolog\n% AST\nast(176, 168..."
7,62852,CWE-121,False,1674,"% START: Generated Prolog\n% AST\nast(160, 63)..."
8,62869,CWE-121,False,1760,% START: Generated Prolog\n% AST\nast(81_memcp...
9,62900,CWE-121,False,1680,"% START: Generated Prolog\n% AST\nast(160, 67)..."


In [9]:
import re

In [10]:
def fixprolog(testcase):
    find_node_ids = re.compile('\((\w+), (\w+)\)')
    replacement_node_ids = '({bug}_{testcase_id}_\\1, {bug}_{testcase_id}_\\2)'.format(
        bug='bad' if testcase.bug else 'good',
        testcase_id=abs(testcase.testcase_ID),
    )
    return find_node_ids.sub(replacement_node_ids, testcase['tree'])

In [11]:
prolog['tree'] = prolog.apply(fixprolog, axis='columns')

In [14]:
prolog

Unnamed: 0,testcase_ID,flaw,bug,code_length,tree
1,-232012,CWE-122,True,1619,% START: Generated Prolog\n% AST\nast(bad_2320...
2,-62900,CWE-121,True,1576,% START: Generated Prolog\n% AST\nast(bad_6290...
3,-62869,CWE-121,True,1616,% START: Generated Prolog\n% AST\nast(bad_6286...
4,-62852,CWE-121,True,1570,% START: Generated Prolog\n% AST\nast(bad_6285...
5,-62804,CWE-121,True,1618,% START: Generated Prolog\n% AST\nast(bad_6280...
6,62804,CWE-121,False,1722,% START: Generated Prolog\n% AST\nast(good_628...
7,62852,CWE-121,False,1674,% START: Generated Prolog\n% AST\nast(good_628...
8,62869,CWE-121,False,1760,% START: Generated Prolog\n% AST\nast(good_628...
9,62900,CWE-121,False,1680,% START: Generated Prolog\n% AST\nast(good_629...


In [15]:
prolog.to_csv("../data/ilp_prolog_data.csv.gz")

In [16]:
good_example = prolog[prolog['testcase_ID'] == 62852].iloc[0]
bad_example = prolog[prolog['testcase_ID'] == -62852].iloc[0]

In [17]:
def extract_node_ids(tree):
    nodes = set()
    
    for line in tree.split('\n'):
        line = line.strip()
        if not line.startswith("%") and line:
            parent, child = line[4:-2].split(", ")
            nodes.add(parent)
            nodes.add(child)
            
    return nodes

In [18]:
good_nodes = extract_node_ids(good_example.tree)
bad_nodes = extract_node_ids(bad_example.tree)

In [19]:
positive_examples = [
    'bug('+node_id+')' for node_id in bad_nodes
]

negative_examples = [
    'bug('+node_id+')' for node_id in good_nodes
]

base_rules = [
]

meta_rules = [
    'metarule([P,Q],[P,A,B],[[Q,A,B]]).',
    'metarule([P,Q,R],[P,A,B],[[Q,A,B],[R,A,B]]).',
    'metarule([P,Q,R],[P,A,B],[[Q,A,C],[R,C,B]]).',
]

In [20]:
script_template = """
:- use_module(library(metagol)).

%% metagol settings
{settings}

%% background knowledge
{background_knowledge}

%% metarules
{metarules}

%% learning task
:-
  %% positive examples
  Pos = [
    {positive_examples}
  ],
  %% negative examples
  Neg = [
    {negative_examples}
  ],
  learn(Pos,Neg).
"""

script = script_template.format(
    background_knowledge='\n'.join([good_example.tree, bad_example.tree] + base_rules),
    settings="""
    body_pred(ast/2).
    body_pred(cfg/2).
    """,
    metarules='\n'.join(meta_rules),
    positive_examples=',\n'.join(positive_examples),
    negative_examples=',\n'.join(negative_examples),
)
print(script)


:- use_module(library(metagol)).

%% metagol settings

    body_pred(ast/2).
    body_pred(cfg/2).
    

%% background knowledge
% START: Generated Prolog
% AST
ast(good_62852_160, good_62852_63).
 ast(good_62852_160, good_62852_59).
 ast(good_62852_160, good_62852_53).
 ast(good_62852_160, good_62852_49).
 ast(good_62852_160, good_62852_47).
 ast(good_62852_160, good_62852_43).
 ast(good_62852_160, good_62852_35).
 ast(good_62852_160, good_62852_33).
 ast(good_62852_160, good_62852_30).
 ast(good_62852_160, good_62852_27).
 ast(good_62852_160, good_62852_19).
 ast(good_62852_160, good_62852_76).
 ast(good_62852_160, good_62852_75).
 ast(good_62852_160, good_62852_74).
 ast(good_62852_160, good_62852_73).
 ast(good_62852_160, good_62852_72).
 ast(good_62852_160, good_62852_71).
 ast(good_62852_160, good_62852_70).
 ast(good_62852_160, good_62852_69).
 ast(good_62852_100, good_62852_10).
 ast(good_62852_100, good_62852_17).
 ast(good_62852_53, good_62852_16).
 ast(good_62852_59, good_6

In [21]:
with open("test.pl", 'w') as f:
    f.write(script)