In [1]:
import os
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.tensorboard import SummaryWriter

from transformer import Transformer, TOK_LEN, PAD_TOK, MAX_INST_LEN
from stacksyms import generateDebugLabel, getMaxFrameSize, getFunctions, parseELF

file2funcs = dict() # = parseDirectory(debugFiledir)

#debugFilepath = 'data/cross-compile-dataset/bin/static/gcc/og/parallel'
debugFiledir = 'data/cross-compile-dataset/bin/static/gcc/og'
#debugFiledir = '.'
#debugFilepath = 'data/cross-compile-dataset/bin/static/gcc/og/sum'
debugFilepath = 'data/spec2006/bzip2'

In [None]:
# TODO get frame size from .eh_frame section and compare func.variables against GDB's 'info scope func.name' output
for fname in next(os.walk(debugFiledir))[2]:
    debugFilepath = debugFiledir + os.sep + fname
    with open(debugFilepath, 'rb') as fstream:
        if fstream.peek(1)[:4].hex() == b'\x7fELF'.hex():
            functions = parseELF(debugFilepath, validateWithGDB=False)
            for func in functions:
                frameSize = getMaxFrameSize(func)
                #paramDescrs = [f'{var} (bytesize = {var.type.byte_size})' for var in func.parameters]
                #localDescrs = [f'{var} (bytesize = {var.type.byte_size})' for var in func.variables]
                print(f'{func.name} max frame size is {frameSize} bytes')#, params are {paramDescrs} locals are {localDescrs}')
                print()

In [2]:
functions = parseELF(debugFilepath)

02/14/2021 22:29:06 INFO:parseELF:Trying to parse data/spec2006/bzip2 as ELF
02/14/2021 22:29:06 INFO:parseDWARF:File has debug info..
02/14/2021 22:29:06 INFO:parseELF:Found 108 functions.
02/14/2021 22:29:06 INFO:collectFrameInfo:has .eh_frames
02/14/2021 22:29:06 INFO:collectFrameInfo:Found frame info for function uInt64_from_UInt32s.
02/14/2021 22:29:06 INFO:collectFrameInfo:Found frame info for function uInt64_to_double.
02/14/2021 22:29:06 INFO:collectFrameInfo:Found frame info for function uInt64_isZero.
02/14/2021 22:29:06 INFO:collectFrameInfo:Found frame info for function uInt64_qrm10.
02/14/2021 22:29:06 INFO:collectFrameInfo:Found frame info for function uInt64_toAscii.
02/14/2021 22:29:06 INFO:collectFrameInfo:Found frame info for function myfeof.
02/14/2021 22:29:06 INFO:collectFrameInfo:Found frame info for function compressStream.
02/14/2021 22:29:06 INFO:collectFrameInfo:Found frame info for function uncompressStream.
02/14/2021 22:29:06 INFO:collectFrameInfo:Found fra

02/14/2021 22:29:06 INFO:collectFrameInfo:Found frame info for function generateMTFValues.
02/14/2021 22:29:06 INFO:collectFrameInfo:Found frame info for function sendMTFValues.
02/14/2021 22:29:06 INFO:collectFrameInfo:Found frame info for function BZ2_compressBlock.
02/14/2021 22:29:06 INFO:collectFrameInfo:Found frame info for function makeMaps_d.
02/14/2021 22:29:06 INFO:collectFrameInfo:Found frame info for function BZ2_decompress.
02/14/2021 22:29:06 INFO:collectFrameInfo:Found frame info for function fallbackSimpleSort.
02/14/2021 22:29:06 INFO:collectFrameInfo:Found frame info for function fallbackQSort3.
02/14/2021 22:29:06 INFO:collectFrameInfo:Found frame info for function fallbackSort.
02/14/2021 22:29:06 INFO:collectFrameInfo:Found frame info for function mainGtU.
02/14/2021 22:29:06 INFO:collectFrameInfo:Found frame info for function mainSimpleSort.
02/14/2021 22:29:06 INFO:collectFrameInfo:Found frame info for function mmed3.
02/14/2021 22:29:06 INFO:collectFrameInfo:Fou

In [5]:
for func in functions:
    frame = getMaxFrameSize(func)
    label = generateDebugLabel(func)
    print(f"{func.name} ({frame} by offset / {sum(label)} by size) => {label}")

02/14/2021 22:30:31 INFO:generateDebugLabel:Function sendMTFValues has 39 stack elements out of 43 total.
02/14/2021 22:30:31 INFO:generateDebugLabel:Function generateMTFValues has 14 stack elements out of 16 total.


main (60 by offset / 49 by size) => [4, 1, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4]
addFlagsFromEnvVar (36 by offset / 40 by size) => [4, 4, 4, 4, 4, 4, 4, 4, 4, 4]
snocString (24 by offset / 28 by size) => [4, 4, 4, 4, 4, 4, 4]
mkCell (20 by offset / 12 by size) => [4, 4, 4]
myMalloc (20 by offset / 20 by size) => [4, 4, 4, 4, 4]
redundant (12 by offset / 16 by size) => [4, 4, 4, 4]
usage (12 by offset / 16 by size) => [4, 4, 4, 4]
license (12 by offset / 12 by size) => [4, 4, 4]
testf (132 by offset / 117 by size) => [4, 1, 4, 96, 4, 4, 4]
uncompress (148 by offset / 142 by size) => [4, 1, 1, 4, 4, 4, 4, 4, 4, 96, 4, 4, 4, 4]
compress (148 by offset / 136 by size) => [4, 4, 4, 4, 4, 4, 96, 4, 4, 4, 4]
mapSuffix (16 by offset / 28 by size) => [4, 4, 4, 4, 4, 4, 4]
hasSuffix (24 by offset / 28 by size) => [4, 4, 4, 4, 4, 4, 4]
containsDubiousChars (8 by offset / 12 by size) => [4, 4, 4]
applySavedFileAttrToOutputFile (20 by offset / 20 by size) => [4, 4, 4, 4, 4]
applySavedTimeInfoToOutputFile

In [42]:
from stacksyms import locExprHasOffset


def getMinOffLoc(stkElm):
    return min(loc.expr[1] for loc in stkElm.locations)

def getStackLocation(stkElm):
    return [loc.expr for loc in filter(locExprHasOffset, stkElm.locations)]

def getStackElements(func):
    potentialStkElms = func.parameters + func.variables
    stkElms = [stkElm for stkElm in potentialStkElms if any(filter(locExprHasOffset, stkElm.locations))]
    return sorted(stkElms, key=getMinOffLoc)

def generateDebugLabel(stackElements):
    #byOffset = sorted(stackElements, key=lambda stkElm :
    return list(map(getStackLocation, stackElements))

for func in functions:
    stkElms = getStackElements(func)
    if len(stkElms) != lenPotStkElms:
        print(f"{len(stkElms)} != {lenPotStkElms} for {stkElms}")
    #labels  = generateDebugLabel(stkElms)
    #print(func.name+"\n", "\n".join([f"{stkElm.name} => {str(label)}" for stkElm, label in zip(stkElms, labels)]))
    print(f"{func.name} => [{', '.join(stkElm.name+'@ebp%+d'%getMinOffLoc(stkElm) for stkElm in stkElms)}]")

9 != 38 for [<Parameter: argv: Char**>, <LocalVariable: decode: Bool>, <LocalVariable: argList: Cell*>, <LocalVariable: i: Int32>, <LocalVariable: j: Int32>, <LocalVariable: tmp: Char*>, <LocalVariable: aa: Cell*>, <LocalVariable: aa2: Cell*>, <Parameter: argc: typedef int IntNative>]
main => [argv@ebp-60, decode@ebp-53, argList@ebp-52, i@ebp-48, j@ebp-44, tmp@ebp-40, aa@ebp-36, aa2@ebp-32, argc@ebp+0]
7 != 38 for [<LocalVariable: i: Int32>, <LocalVariable: j: Int32>, <LocalVariable: k: Int32>, <LocalVariable: p: Char*>, <LocalVariable: envbase: Char*>, <Parameter: argList: Cell**>, <Parameter: varName: Char*>]
addFlagsFromEnvVar => [i@ebp-36, j@ebp-32, k@ebp-28, p@ebp-24, envbase@ebp-20, argList@ebp+0, varName@ebp+4]
4 != 38 for [<LocalVariable: tmp: Cell*>, <LocalVariable: tmp: Cell*>, <Parameter: root: Cell*>, <Parameter: name: Char*>]
snocString => [tmp@ebp-24, tmp@ebp-20, root@ebp+0, name@ebp+4]
1 != 38 for [<LocalVariable: c: Cell*>]
mkCell => [c@ebp-20]
2 != 38 for [<LocalVariab

In [27]:
types = set()
count = 0
for func in functions:
    print(func.name)
    for stkElm in func.variables:
        if not stkElm.type.is_qualified_type and stkElm.type.byte_size is None:
            print(stkElm.type, stkElm.type.byte_size, stkElm.type.array_count, stkElm.type.element, stkElm.type.is)
        types |= {stkElm.type}
        count += 1
    print()
    #frameSize = getMaxFrameSize(func)
    #paramDescrs = [f'{var} (bytesize = {var.type.array_count})' for var in func.parameters]
    #localDescrs = [f'{var} (bytesize = {var.type.array_count})' for var in func.variables]
    #print(f'{func.name} max frame size is {frameSize} bytes, params are {paramDescrs} locals are {localDescrs}')
    #print()
print(count, len(types))

main
typedef int Int32 None None int
typedef int Int32 None None int
typedef unsigned char Bool None None unsigned char

addFlagsFromEnvVar
typedef int Int32 None None int
typedef int Int32 None None int
typedef int Int32 None None int

snocString

mkCell

myMalloc

redundant

usage

license

testf
typedef unsigned char Bool None None unsigned char

uncompress
typedef int Int32 None None int
typedef int Int32 None None int
typedef unsigned char Bool None None unsigned char
typedef unsigned char Bool None None unsigned char
typedef int IntNative None None int
typedef int IntNative None None int

compress
typedef int Int32 None None int
typedef int Int32 None None int
typedef int IntNative None None int

mapSuffix

hasSuffix
typedef int Int32 None None int
typedef int Int32 None None int

containsDubiousChars

applySavedFileAttrToOutputFile
typedef int IntNative None None int

applySavedTimeInfoToOutputFile
typedef int IntNative None None int

saveInputFileMetaInfo
typedef int IntNative 

In [None]:
quotearg_n_style_mem = next((f for f in functions if f.name == 'quotearg_n_style_mem'), None)
print(quotearg_n_style_mem.frame)

In [None]:
for fname in next(os.walk(debugFiledir))[2]:
    debugFilepath = debugFiledir + os.sep + fname
    with open(debugFilepath, 'rb') as fstream:
        if not fstream.peek(1)[:4].hex() == b'\x7fELF'.hex():
            continue
        print('Is ELF, checking for debug info..')
        elf = ELFFile(fstream)
        print(elf.get_machine_arch())

In [None]:
from elftools.elf.elffile import ELFFile
from elftools.dwarf.constants import DW_LNS_copy, DW_LNS_set_file, DW_LNE_define_file, \
                                     DW_LNS_set_basic_block, DW_LNS_set_prologue_end, DW_LNS_set_isa
from dwarf_import.model.module import Module
from dwarf_import.io.dwarf_import import DWARFDB, DWARFImporter, place_component_in_module_tree

for fname in next(os.walk(debugFiledir))[2]:
    debugFilepath = debugFiledir + os.sep + fname
    print(f'Opening {debugFilepath} to check magic..')
    with open(debugFilepath, 'rb') as fstream:
        if not fstream.peek(1)[:4].hex() == b'\x7fELF'.hex():
            continue
        print('Is ELF, checking for debug info..')
        elf = ELFFile(fstream)
        if not elf.has_dwarf_info():
            print('does not contain DWARF info')
            continue
        print('Has debug info, parsing functions..')
        dwarfInfo = elf.get_dwarf_info()
        module, importer = parseDWARF(elf)
        file2funcs[debugFilepath] = getFunctions(module)
        #handleLineprogram(dwarfInfo)
        collectFrameInfo(file2funcs[debugFilepath], elf)


In [None]:
#from elftools.dwarf.descriptions import describe_reg_name

'''should probably be named getMaxFrameSize'''
def getFrameSize(function):
    # The number we get here statically from the .eh_frame section can actually be validated using GDB:
    # ./gdb path/to/prog
    # (gdb) set confirmation off
    # (gdb) break {func.name}
    # (gdb) r
    # (gdb) rbreak .
    # (gdb) c
    # (gdb) info frame
    #  at this point "frame at 0xADDRESS_A" - "called by frame at 0xADDRESS_B" should match our number below
    funcFrameRegs = [(key, val) for d in function.frame for key, val in d.items() if type(key)==int]
    return abs(min(funcFrameRegs, key=lambda t : t[1].arg)[1].arg)

sumFuncs = file2funcs['data/cross-compile-dataset/bin/static/gcc/og/sum']
sumMain = list(filter(lambda func : func.name == 'main', sumFuncs))[0]
print([f'{func.name} => {getFrameSize(func)}' for func in sumFuncs])
#print([describe_reg_name(regno, 'x64') + " => " + str(rule.arg) for (regno, rule) in sumMainFrameRegs])



In [None]:
#file2funcs = parseDirectory(debugFiledir)

#functions = parseELF(debugFilepath)
#generateFeatures(functions)
#assert(not any(filter(lambda x : PAD_TOK in x, data)))

#main = list(filter(lambda x : x.name=='main', functions))[0]
#main_x = generateFeatures(main)


In [None]:
# TODO: newCollectLocals(gdbOut, scopeQueries, functions) and newCollectDisas(gdbOut, disasQueries, functions)
#for scope, func in zip(scopeResults, functions):
#    collectLocals(scope, functions[func])
for disas, func in zip(gdbOut, functions):
    func.disas = [tuple(line.strip().split('\\t')) for line in disas[1:-1]]

In [None]:
X, Y, Z = [], [], []
for func in functions:
    print('////////////////////////')
    print(func.name, hex(func.start), func.frame_base)
    for lvar in func.variables:
        print(lvar.name, lvar.type, "(bytesize = %d)"%lvar.type.byte_size)
        for loc in lvar.locations:
            print(hex(loc.begin) + " to " + hex(loc.end) + ": " \
            + str(loc.type)[13:]
            + str(loc.expr))
        print('')
#    X += [func] #generateFeature(func, functions)]
#    Y += [generateLabel(func, functions)]
#    Z += [generateDebugLabel(func, functions)]
#print([x+" => "+ str(y) for x,y in zip(X,Z)])

#func = 'quotearg_n_style_colon'

In [None]:
'''Doesnt appear to be super useful for stack symbolization as compilers dont seem to emit relevant information'''
def handleLineprogram(dwarfInfo):
    print('looping through compilation units..')
    for cu in dwarfInfo.iter_CUs():
        lp = dwarfInfo.line_program_for_CU(cu)
        if lp == None:
            print('DWARF info is missing a line program for this CU')
            print(cu.keys())
            continue
        cu_filename = lp['file_entry'][0].name.decode('latin-1')
        if len(lp['include_directory']) > 0:
            dir_index = lp['file_entry'][0].dir_index
            if dir_index > 0:
                idir = lp['include_directory'][dir_index - 1]
            else:
                idir = b'.'
            cu_filename = '%s/%s' % (idir.decode('latin-1'), cu_filename)
        print(f'CU: {cu_filename}')
        #print(f'File name                            Line number    Starting address')
        for entry in lp.get_entries():
            '''
            if entry.state is None:
                # Special handling for commands that don't set a new state
                if entry.command == DW_LNS_set_file:
                    file_entry = lp['file_entry'][entry.args[0] - 1]
                    if file_entry.dir_index == 0:
                        # current directory
                        print('\n./%s:[++]' % (
                            file_entry.name.decode('latin-1')))
                    else:
                        print('\n%s/%s:' % (
                            lp['include_directory'][file_entry.dir_index - 1].decode('latin-1'),
                            file_entry.name.decode('latin-1')))
                elif entry.command == DW_LNE_define_file:
                    print('%s:' % (
                        lp['include_directory'][entry.args[0].dir_index].decode('latin-1')))
                elif entry.command in [DW_LNS_set_basic_block, DW_LNS_set_prologue_end, DW_LNS_set_isa]:
                    print('=============================')
                    print(repr(entry))
                    print('=============================')
                else:
                    print('=============================')
                    print('OTHER COMMAND: ' + str(entry))
                    print('=============================')
            elif not entry.state.end_sequence:
                # readelf doesn't print the state after end_sequence
                # instructions. I think it's a bug but to be compatible
                # I don't print them too.
                if lp['version'] < 4:
                    print('%-35s  %11d  %18s' % (
                        lp['file_entry'][state.file - 1].name.decode('latin-1'),
                        state.line,
                        '0' if state.address == 0 else
                            hex(state.address)))
                else:
                    print('%-35s  %11d  %18s[%d]' % (
                        lp['file_entry'][state.file - 1].name.decode('latin-1'),
                        state.line,
                        '0' if state.address == 0 else
                            hex(state.address),
                        state.op_index))
            if entry.command == DW_LNS_copy:
                # Another readelf oddity...
                print()
            '''
            if entry.command in [DW_LNS_set_basic_block, DW_LNS_set_prologue_end, DW_LNS_set_isa]:
                print('=============================')
                print(repr(entry))
                print('=============================')
        #print(dir(lp))
        #line_entry_mapping(lp)

def line_entry_mapping(line_program):
    import collections
    filename_map = collections.defaultdict(int)
    lp_entries = line_program.get_entries()
    for lpe in lp_entries:
        if not lpe.state or lpe.state.file == 0:
            continue # TODO: instruction doesn't correspond to src
        filename = lpe_filename(line_program, lpe.state.file)
        filename_map[filename] += 1
    for filename, lpe_count in filename_map.items():
        print('%s -> %d entries' % (filename, lpe_count))

def lpe_filename(line_program, file_index):
    lp_header = line_program.header
    print(lp_header)
    file_entries = lp_header["file_entry"]
    file_entry = file_entries[file_index - 1]
    dir_index = file_entry["dir_index"]
    if dir_index == 0:
        return file_entry.name.decode()
    directory = lp_header["include_directory"][dir_index - 1]
    return os.path.join(directory, file_entry.name).decode()

    

#print(elf.get_section_by_name('.text').data())