In [13]:
import os
import torch
import torch.nn as nn
import torch.optim as optim

from stacksyms import parseELF, parseDWARF, getAllFunctions, collectFrameInfo, \
                      assign_frames, processRegisterRuleExpressions, propagateTypeInfo, \
                      collectOpcodes, collectDisassemblyObjdump, checkLabels

# Binary analysed by the cells below; the commented-out paths are alternative
# inputs that can be swapped in by moving the comment marker.
#filepath = 'data/binary/x86/gcc-32-O1-utillinux-unshare'
#filepath = 'data/cross-compile-dataset/bin/static/gcc/o1/pee'
#filepath = 'data/cross-compile-dataset/bin/static/gcc/og/parallel'
#filepath = 'data/cross-compile-dataset/bin/dynamic/clang/o3/xrdb'
#filepath = 'data/cross-compile-dataset/bin/static/gcc/og/sum'
filepath = 'data/spec2006/bzip2'

In [14]:
# Parse the binary at `filepath` and show how many functions parseELF returned.
functions = parseELF(filepath)
print(len(functions))

05/16/2021 21:19:39 INFO:parseELF:Trying to parse data/spec2006/bzip2 as ELF
05/16/2021 21:19:39 INFO:parseELF:ELF file is for architecture x86.
05/16/2021 21:19:39 INFO:parseDWARF:ELF file says it has some frame info..
05/16/2021 21:19:39 INFO:getFunctionsFromSymtab:Trying to obtain symbol information.
05/16/2021 21:19:39 INFO:getFunctionsFromSymtab:Found 127 functions according to symbol table
05/16/2021 21:19:39 INFO:getFunctionsFromDWARFInfo:Searching for subroutines with explicit DWARF info..
05/16/2021 21:19:39 INFO:collectFrameInfo:has .eh_frames
05/16/2021 21:19:39 CRITICAL:assign_frames:No frame table for deregister_tm_clones@0x80495a0!
05/16/2021 21:19:39 CRITICAL:assign_frames:No frame table for register_tm_clones@0x80495e0!
05/16/2021 21:19:39 CRITICAL:assign_frames:No frame table for __do_global_dtors_aux@0x8049620!
05/16/2021 21:19:39 CRITICAL:assign_frames:No frame table for frame_dummy@0x8049650!
05/16/2021 21:19:39 CRITICAL:assign_frames:No frame table for strstr@@GLIB



127


In [15]:
#import logging; logging.getLogger().setLevel(logging.DEBUG)
from stacksyms import getMaxFrameSize, getMaxFrameSizeCFA, generateDebugLabel, getStackElements, getStackLocations, getMaxStackOff
# Compare three views of every function's frame size (CFA-based, offset-based,
# and the sum of the debug label) and remember the functions where the label
# sum and the offset-based size disagree.
doesntwork = []
for func in functions:
    cfa = getMaxFrameSizeCFA(func)
    frame = getMaxFrameSize(func)
    label = generateDebugLabel(func)

    if None in label:
        # A label entry is None: dump the stack elements for inspection and stop.
        print("BUG!!!!!!!!", func.name)
        print([(element, element.type.qualified_name, element.type.byte_size)
               for element in getStackElements(func)])
        break

    if frame != sum(label):
        doesntwork.append(func)
    print(f"{func.name} (frame starts at {func.frame_base}, size is {cfa}/{frame}/{sum(label)} bytes by cfa/offset/sum) => {label}")

    inlined_functions = func.inlined_functions
    if len(inlined_functions) == 0:
        continue

    # First function with inlined callees: show them, then stop the whole loop.
    # NOTE(review): functions after this one are never examined, so
    # `doesntwork` only covers the functions seen up to this point.
    print("    inlines ", inlined_functions)
    for inlined in inlined_functions:
        print("        ", inlined.name, getMaxFrameSize(inlined), generateDebugLabel(inlined))
    break

05/16/2021 21:19:45 CRITICAL:getMaxFrameSizeCFA:No canonical frame address for deregister_tm_clones.. returning zero!
05/16/2021 21:19:45 CRITICAL:getMaxFrameSizeCFA:No canonical frame address for register_tm_clones.. returning zero!
05/16/2021 21:19:45 CRITICAL:getMaxFrameSizeCFA:No canonical frame address for __do_global_dtors_aux.. returning zero!
05/16/2021 21:19:45 CRITICAL:getMaxFrameSizeCFA:No canonical frame address for frame_dummy.. returning zero!
05/16/2021 21:19:45 INFO:generateDebugLabel:Function generateMTFValues has 14 stack elements out of 16 total.
05/16/2021 21:19:45 INFO:generateDebugLabel:Function sendMTFValues has 39 stack elements out of 43 total.
05/16/2021 21:19:45 CRITICAL:getMaxFrameSizeCFA:No canonical frame address for strstr@@GLIBC_2.0.. returning zero!
05/16/2021 21:19:45 CRITICAL:getMaxFrameSizeCFA:No canonical frame address for __x86.get_pc_thunk.bx.. returning zero!
05/16/2021 21:19:45 CRITICAL:getMaxFrameSizeCFA:No canonical frame address for _fini.. r

deregister_tm_clones (frame starts at None, size is 0/0/0 bytes by cfa/offset/sum) => []
register_tm_clones (frame starts at None, size is 0/0/0 bytes by cfa/offset/sum) => []
__do_global_dtors_aux (frame starts at None, size is 0/0/0 bytes by cfa/offset/sum) => []
frame_dummy (frame starts at None, size is 0/0/0 bytes by cfa/offset/sum) => []
uInt64_from_UInt32s (frame starts at None, size is 8/8/24 bytes by cfa/offset/sum) => [8, 4, 4, 4, 4]
uInt64_to_double (frame starts at None, size is 8/28/36 bytes by cfa/offset/sum) => [8, 4, 4, 8, 8, 4]
uInt64_isZero (frame starts at None, size is 8/12/20 bytes by cfa/offset/sum) => [8, 4, 4, 4]
uInt64_qrm10 (frame starts at None, size is 8/20/28 bytes by cfa/offset/sum) => [8, 4, 4, 4, 4, 4]
uInt64_toAscii (frame starts at None, size is 8/88/69 bytes by cfa/offset/sum) => [4, 4, 32, 8, 4, 4, 4, 1, 8]
myfeof (frame starts at None, size is 8/20/164 bytes by cfa/offset/sum) => [148, 4, 4, 4, 4]
compressStream (frame starts at None, size is 8/5176

In [None]:
# In principle, on x86 we can read out the stack size of a function from the
# .eh_frame section as follows:
# 1) "nm elf_binary | grep function_name" -> note address
# 2) "readelf --debug-dump=frames-interp | grep -A10 'address..'"
#    -> read off maximum stack offset from last entry under CFA column
# The problem is that this doesn't work on architectures that do not store the
# return address on the stack and even on x86 some compiler passes may result in
# code that will try to hold the return address in a register, even though it is
# stored on the stack (e.g., "gcc-32-O1-utillinux-unshare")
from elftools.elf.elffile import ELFFile

# Keep the underlying file object in a named variable instead of passing an
# anonymous `open(...)` result to ELFFile, so the handle can be released with
# `elf_stream.close()` once the analysis is finished.
# The stream is deliberately NOT closed here: `elf` is still handed to
# collectFrameInfo and collectOpcodes in the next cell.
# NOTE(review): pyelftools reads from the stream on demand, so closing it
# earlier would presumably break those calls - confirm before adding a `with`.
elf_stream = open(filepath, 'rb')
elf = ELFFile(elf_stream)

# Build the DWARF module/importer pair and collect all functions of the binary.
module, importer = parseDWARF(elf)
func_dict = getAllFunctions(module, importer, elf)

In [None]:
# Run the remaining analysis stages in order; `func_dict` is rebound to the
# result of every stage, so re-running this cell feeds it its own output.
frame_tables = collectFrameInfo(func_dict, elf)
func_dict = assign_frames(frame_tables, func_dict)

# These two stages share the same (func_dict, importer) call shape.
for importer_stage in (processRegisterRuleExpressions, propagateTypeInfo):
    func_dict = importer_stage(func_dict, importer)

func_dict = collectOpcodes(func_dict, elf)
#labels = checkLabels(functions)

In [None]:
# Every parsed function that was not recorded in `doesntwork`.
# NOTE(review): the loop that fills `doesntwork` can `break` early, so functions
# it never reached are listed here as well.
works = list(filter(lambda candidate: candidate not in doesntwork, functions))
works

In [None]:
# Inspect the stack layout of a single function, looked up by name.
target_name = '_dl_sort_fini'
matches = [candidate for candidate in functions if candidate.name == target_name]
if not matches:
    # Fail with a readable message instead of the bare StopIteration that
    # next(filter(...)) produces when the current binary lacks the function.
    raise LookupError(f"no function named {target_name!r} among the parsed functions")
func = matches[0]

# Stack elements ordered by the key returned from getMaxStackOff.
func_stack = sorted(getStackElements(func), key=getMaxStackOff)

# Compute the debug label once and reuse it for both the label and its sum.
debug_label = generateDebugLabel(func)
print(func_stack, debug_label, getMaxFrameSize(func), sum(debug_label))
print(func.registers)
#quotearg_n_style_stack[4]


In [None]:
# Inspect the stack elements of `quotearg_n_style`, looked up by name.
target_name = 'quotearg_n_style'
matches = [candidate for candidate in functions if candidate.name == target_name]
if not matches:
    # Fail with a readable message instead of the bare StopIteration that
    # next(filter(...)) produces when the current binary lacks the function.
    raise LookupError(f"no function named {target_name!r} among the parsed functions")
quotearg_n_style = matches[0]

# Stack elements ordered by the key returned from getMaxStackOff.
quotearg_n_style_stack = sorted(getStackElements(quotearg_n_style), key=getMaxStackOff)
print(quotearg_n_style_stack)
#getStackLocations(quotearg_n_style_stack[0])
# Displayed as the cell result; requires at least five stack elements.
quotearg_n_style_stack[4]


In [None]:
import os
import json
from stacksyms import parseELF, checkLabels
# Parse one binary from the cross-compile dataset and run checkLabels on it.
# NOTE: this rebinds `filepath` and `functions`, so any cell executed after
# this one works on this binary instead of the one parsed earlier.
dataroot  = "data/cross-compile-dataset/bin/static/gcc/o2"
outputdir = "data/cross-compile-dataset/labels/static/gcc/o2"
f = 'yes'  # file name of the binary inside `dataroot`
filepath  = os.path.join(dataroot, f)
functions = parseELF(filepath)
allLabels = checkLabels(functions)
# Writing the labels to `outputdir` as JSON is currently disabled:
#if not os.path.exists(outputdir):
#    os.makedirs(outputdir, exist_ok=True)
#with open(os.path.join(outputdir, f), 'w') as cf:
#    json.dump(allLabels, cf)

In [None]:
# Show a decimal value in hexadecimal (evaluates to '0x4035ad').
# NOTE(review): presumably a code address taken from earlier output - confirm.
hex(4208045)

In [None]:
# Tokenisation parameters.
# A one-byte padding token was considered and rejected:
#PAD_TOK8  = 0xa1 # 1 byte isn't super reliable.. might actually appear in instructions
TOK_LEN = 2  # bytes per token
PAD_TOK16 = 0x0f78  # opcode of vmread (0x0f79 is vmwrite; 'UD2', 0x0f0b, would also work)
PAD_TOK = PAD_TOK16
# Upper bound on the instruction length in bytes. The architectural limit on
# x86 is 15 bytes, so 16 is a safe round number.
MAX_INST_LEN = 16


def tokenize(features):
    """Split hexadecimal strings into integer tokens of TOK_LEN bytes each.

    Parameters:
        features: iterable of strings made of hex digits (no separators).

    Returns:
        One list of ints per input string. Every int is parsed from
        2 * TOK_LEN consecutive hex digits; a shorter trailing chunk is
        parsed as-is, no padding is applied.
    """
    digits_per_token = 2 * TOK_LEN
    tokenized = []
    for hex_string in features:
        tokens = []
        for offset in range(0, len(hex_string), digits_per_token):
            chunk = hex_string[offset:offset + digits_per_token]
            tokens.append(int(chunk, 16))
        tokenized.append(tokens)
    return tokenized

def funcFeatures(function):
    """Return the tokenised instructions of `function`.

    The second item of every entry in `function.disas` has its spaces removed
    and is then passed through `tokenize`.
    NOTE(review): presumably that item is the instruction's bytes as a
    space-separated hex string - confirm against the disassembly collector.
    """
    hex_strings = (entry[1].replace(' ', '') for entry in function.disas)
    return tokenize(hex_strings)

def generatePositionalEncodings(function):
    """Return the first 18 items of the first field of every `function.disas` entry.

    NOTE(review): the first field is presumably the code address rendered as
    text, which would make 18 characters '0x' plus 16 hex digits - confirm.
    """
    return [entry[0][:18] for entry in function.disas]

# Emit one empty line per function that has a non-empty disassembly.
# NOTE(review): this looks like a placeholder for printing the features.
for func in functions:
    if len(func.disas) == 0:
        continue
    print()

In [None]:
# Dump every function's local variables together with their location ranges.
X, Y, Z = [], [], []
for func in functions:
    print('////////////////////////')
    print(func.name, hex(func.start), func.frame_base)
    for lvar in func.variables:
        size_note = "(bytesize = %d)" % lvar.type.byte_size
        print(lvar.name, lvar.type, size_note)
        for loc in lvar.locations:
            address_range = hex(loc.begin) + " to " + hex(loc.end) + ": "
            # The first 13 characters of str(loc.type) are dropped.
            # NOTE(review): presumably an enum prefix - confirm.
            location_kind = str(loc.type)[13:]
            print(address_range + location_kind + str(loc.expr))
        print('')
#    X += [func] #generateFeature(func, functions)]
#    Y += [generateLabel(func, functions)]
#    Z += [generateDebugLabel(func, functions)]
#print([x+" => "+ str(y) for x,y in zip(X,Z)])

#func = 'quotearg_n_style_colon'

In [None]:
def handleLineprogram(dwarfInfo):
    """Print each compilation unit's file name and selected line-program entries.

    Doesn't appear to be super useful for stack symbolization, as compilers
    don't seem to emit the relevant information.

    For every CU returned by ``dwarfInfo.iter_CUs()`` the line program is
    fetched with ``dwarfInfo.line_program_for_CU``. CUs without a line program
    are reported and skipped. Otherwise the name of the first file entry is
    printed (prefixed with its include directory when the program lists any),
    followed by the repr of every entry whose command is one of
    DW_LNS_set_basic_block, DW_LNS_set_prologue_end or DW_LNS_set_isa.

    Parameters:
        dwarfInfo: object providing ``iter_CUs()`` and
            ``line_program_for_CU(cu)`` (e.g. pyelftools' DWARFInfo).

    Returns:
        None; all results are written to stdout.
    """
    # The opcode names were never imported anywhere, so the comparison below
    # raised NameError on the first entry. Import them at function scope to
    # keep the function self-contained.
    from elftools.dwarf.constants import (
        DW_LNS_set_basic_block, DW_LNS_set_prologue_end, DW_LNS_set_isa)

    reported_commands = (DW_LNS_set_basic_block, DW_LNS_set_prologue_end, DW_LNS_set_isa)

    print('looping through compilation units..')
    for cu in dwarfInfo.iter_CUs():
        lp = dwarfInfo.line_program_for_CU(cu)
        if lp is None:
            print('DWARF info is missing a line program for this CU')
            # NOTE(review): assumes the CU object offers keys() - confirm.
            print(cu.keys())
            continue

        # The first file entry names the CU.
        first_file = lp['file_entry'][0]
        cu_filename = first_file.name.decode('latin-1')
        if len(lp['include_directory']) > 0:
            # dir_index is 1-based; 0 stands for the current directory.
            dir_index = first_file.dir_index
            if dir_index > 0:
                idir = lp['include_directory'][dir_index - 1]
            else:
                idir = b'.'
            cu_filename = '%s/%s' % (idir.decode('latin-1'), cu_filename)
        print(f'CU: {cu_filename}')
        #print(f'File name                            Line number    Starting address')

        # A readelf-style dump of every entry used to live here as a dead
        # string literal; only the three opcodes of interest are reported.
        for entry in lp.get_entries():
            if entry.command in reported_commands:
                print('=============================')
                print(repr(entry))
                print('=============================')
        #print(dir(lp))
        #line_entry_mapping(lp)

def line_entry_mapping(line_program):
    """Print, for each source file name, how many line-program entries use it.

    Entries without a state, or whose state refers to file 0, are skipped.
    File names are resolved through `lpe_filename`.
    """
    from collections import Counter

    entries_per_file = Counter(
        lpe_filename(line_program, lpe.state.file)
        for lpe in line_program.get_entries()
        # TODO: the skipped entries don't correspond to src
        if lpe.state and lpe.state.file != 0
    )
    for filename, lpe_count in entries_per_file.items():
        print('%s -> %d entries' % (filename, lpe_count))

def lpe_filename(line_program, file_index):
    """Resolve a 1-based line-program file index to a file name string.

    Parameters:
        line_program: object whose `header` holds the "file_entry" and
            "include_directory" tables.
        file_index: 1-based index into the "file_entry" table.

    Returns:
        The decoded file name, joined with its include directory when the
        entry's dir_index is non-zero.
    """
    header = line_program.header
    # NOTE(review): the whole header is printed on every call.
    print(header)

    entry = header["file_entry"][file_index - 1]
    dir_index = entry["dir_index"]
    if dir_index != 0:
        # Directory indices are 1-based as well.
        directory = header["include_directory"][dir_index - 1]
        return os.path.join(directory, entry.name).decode()
    return entry.name.decode()

    

#print(elf.get_section_by_name('.text').data())