# Joern-ey into ILP

In [1]:
import pandas as pd

In [2]:
ilp_data = pd.read_csv("../data/ilp_dataset.csv.gz")
ilp_data = ilp_data.drop("Unnamed: 0", axis=1)
ilp_data

Unnamed: 0,testcase_ID,filename,code,flaw,flaw_loc,bug,code_length
0,62804,000/062/804/CWE121_Stack_Based_Buffer_Overflow...,/* TEMPLATE GENERATED TESTCASE FILE\nFilename:...,CWE-121,33,False,1722
1,62852,000/062/852/CWE121_Stack_Based_Buffer_Overflow...,/* TEMPLATE GENERATED TESTCASE FILE\nFilename:...,CWE-121,30,False,1674
2,62869,000/062/869/CWE121_Stack_Based_Buffer_Overflow...,/* TEMPLATE GENERATED TESTCASE FILE\nFilename:...,CWE-121,32,False,1760
3,62900,000/062/900/CWE121_Stack_Based_Buffer_Overflow...,/* TEMPLATE GENERATED TESTCASE FILE\nFilename:...,CWE-121,30,False,1680
4,232012,000/232/012/CWE122_Heap_Based_Buffer_Overflow_...,/* TEMPLATE GENERATED TESTCASE FILE\nFilename:...,CWE-122,31,False,1723
5,-62804,000/062/804/CWE121_Stack_Based_Buffer_Overflow...,/* TEMPLATE GENERATED TESTCASE FILE\nFilename:...,CWE-121,33,True,1618
6,-62852,000/062/852/CWE121_Stack_Based_Buffer_Overflow...,/* TEMPLATE GENERATED TESTCASE FILE\nFilename:...,CWE-121,30,True,1570
7,-62869,000/062/869/CWE121_Stack_Based_Buffer_Overflow...,/* TEMPLATE GENERATED TESTCASE FILE\nFilename:...,CWE-121,32,True,1616
8,-62900,000/062/900/CWE121_Stack_Based_Buffer_Overflow...,/* TEMPLATE GENERATED TESTCASE FILE\nFilename:...,CWE-121,30,True,1576
9,-232012,000/232/012/CWE122_Heap_Based_Buffer_Overflow_...,/* TEMPLATE GENERATED TESTCASE FILE\nFilename:...,CWE-122,31,True,1619


In [3]:
prolog = pd.read_csv("../data/ilp_prolog_data_withREF.csv.gz")
prolog = prolog.drop("Unnamed: 0", axis=1)
prolog

Unnamed: 0,testcase_ID,flaw,bug,code_length,tree
0,-232012,CWE-122,True,1619,% START: Generated Prolog\n% NODE PROPERTIES \...
1,-62900,CWE-121,True,1576,% START: Generated Prolog\n% NODE PROPERTIES \...
2,-62869,CWE-121,True,1616,% START: Generated Prolog\n% NODE PROPERTIES \...
3,-62852,CWE-121,True,1570,% START: Generated Prolog\n% NODE PROPERTIES \...
4,-62804,CWE-121,True,1618,% START: Generated Prolog\n% NODE PROPERTIES \...
5,62804,CWE-121,False,1722,% START: Generated Prolog\n% NODE PROPERTIES \...
6,62852,CWE-121,False,1674,% START: Generated Prolog\n% NODE PROPERTIES \...
7,62869,CWE-121,False,1760,% START: Generated Prolog\n% NODE PROPERTIES \...
8,62900,CWE-121,False,1680,% START: Generated Prolog\n% NODE PROPERTIES \...
9,232012,CWE-122,False,1723,% START: Generated Prolog\n% NODE PROPERTIES \...


In [4]:
good_examples = prolog[prolog['bug'] == False]
bad_examples = prolog[prolog['bug'] == True]

In [5]:
def extract_node_ids(tree):
    nodes = set()
    
    in_ast_section = False
    
    for line in tree.split('\n'):
        line = line.strip()
        
        if line == '% AST':
            in_ast_section = True
        
        if line == '% CFG':
            in_ast_section = False
            
        if line == '% REF':
            in_ast_section = False
        
        if in_ast_section and not line.startswith("%"):
            parent, child = line[4:-2].split(", ")
            nodes.add(parent)
            nodes.add(child)
            
    return nodes

In [6]:
# good nodes_g
nodes_g = set()

for i in range(len(good_examples)):
    good_nodes = extract_node_ids(good_examples.iloc[i].tree)
    nodes_g = set.union(nodes_g,good_nodes)

    
# bad nodes_b
nodes_b = set()

for i in range(len(bad_examples)):
    bad_nodes = extract_node_ids(bad_examples.iloc[i].tree)
    nodes_b = set.union(nodes_b,bad_nodes)

nodes = set.union(nodes_g,nodes_b)


Manually find the "error" node for each testcase, and tag those as the positive or  negative examples.

In [7]:
types = [
    'node('+node_id+').' for node_id in nodes
]

positive_examples = [
    """
bug(bad_62900_id_120_f_memmove_01_c_l_29_c_8_).
bug(bad_232012_id_146_f_memcpy_01_c_l_30_c_8_).
bug(bad_62869_id_121_f_memcpy_18_c_l_31_c_8_).
bug(bad_62852_id_120_f_memcpy_01_c_l_29_c_8_).
bug(bad_62804_id_110_f_loop_01_c_l_32_c_12_).
    """,
]

negative_examples = [
    """
:- bug(good_62804_id_129_f_loop_01_c_l_34_c_12_).
:- bug(good_62852_id_125_f_memcpy_01_c_l_31_c_8_).
:- bug(good_62869_id_125_f_memcpy_18_c_l_33_c_8_).
:- bug(good_62900_id_125_f_memmove_01_c_l_31_c_8_).
:- bug(good_232012_id_150_f_memcpy_01_c_l_32_c_8_).
    """,
]


In [8]:
script_template = """
% Settings
:- set(c,100)?
:- set(i,100)?
:- set(h,100)?
:- set(noise,0.5)?


% Mode declarations
:- modeh(*, bug(+node))?

:- modeb(*, ast(+node,-node))?
:- modeb(*, ast(-node,+node))?

:- modeb(*, cfg(+node,-node))?
:- modeb(*, cfg(-node,+node))?

:- modeb(*, ref(+node,-node))?
:- modeb(*, ref(-node,+node))?

:- modeb(*, assignment(+node))?
:- modeb(*, compMemberAccess(+node))?
:- modeb(*, sizeOf(+node))?
:- modeb(*, malloc(+node))?
:- modeb(*, alloc(+node))?
:- modeb(*, writeToPointer(+node))?

:- modeb(*, assignment(-node))?
:- modeb(*, compMemberAccess(-node))?
:- modeb(*, sizeOf(-node))?
:- modeb(*, malloc(-node))?
:- modeb(*, alloc(-node))?
:- modeb(*, writeToPointer(-node))?


%% Types
{types}

%% background knowledge
{bg_knowledge}

%% learning task
%% positive examples
{positive_examples}

%% negative examples
{negative_examples}

"""

script = script_template.format(
    types = '\n'.join(types),
    bg_knowledge = '\n'.join(prolog['tree']),
    positive_examples='\n'.join(positive_examples),
    negative_examples='\n'.join(negative_examples),
)

with open("test.pl", "w") as f:
    f.write(script)
    
print(script)


% Settings
:- set(c,100)?
:- set(i,100)?
:- set(h,100)?
:- set(noise,0.5)?


% Mode declarations
:- modeh(*, bug(+node))?

:- modeb(*, ast(+node,-node))?
:- modeb(*, ast(-node,+node))?

:- modeb(*, cfg(+node,-node))?
:- modeb(*, cfg(-node,+node))?

:- modeb(*, ref(+node,-node))?
:- modeb(*, ref(-node,+node))?

:- modeb(*, assignment(+node))?
:- modeb(*, compMemberAccess(+node))?
:- modeb(*, sizeOf(+node))?
:- modeb(*, malloc(+node))?
:- modeb(*, alloc(+node))?
:- modeb(*, writeToPointer(+node))?

:- modeb(*, assignment(-node))?
:- modeb(*, compMemberAccess(-node))?
:- modeb(*, sizeOf(-node))?
:- modeb(*, malloc(-node))?
:- modeb(*, alloc(-node))?
:- modeb(*, writeToPointer(-node))?


%% Types
node(id_176_f_l_c_).
node(id_161_f_memcpy_01_c_l_25_c_25_).
node(id_127_f_memcpy_18_c_l_27_c_18_).
node(id_40_f_l_c_).
node(id_178_f_l_40_c_).
node(id_120_f_memcpy_18_c_l_33_c_32_).
node(id_5_f_l_c_).
node(id_135_f_memcpy_01_c_l_34_c_13_).
node(id_158_f_memcpy_01_c_l_28_c_28_).
node(id_31_f_l_c_)