In [None]:
import os

import torch
import torch.nn as nn
import torch.optim as optim
from elftools.elf.elffile import ELFFile
from torch.utils.tensorboard import SummaryWriter

from transformer import Transformer, TOK_LEN, PAD_TOK, MAX_INST_LEN
from stacksyms import getFunctions, parseELF, parseDWARF, parseDirectory, collectFrameInfo

# Directory whose files are scanned for ELF binaries by the cells below.
# Alternative used while experimenting:
#debugFiledir = '.'
debugFiledir = 'data/cross-compile-dataset/bin/static/gcc/og'

# Single binaries that were previously inspected on their own:
#debugFilepath = 'data/cross-compile-dataset/bin/static/gcc/og/parallel'
#debugFilepath = 'data/cross-compile-dataset/bin/static/gcc/og/sum'

# Maps the path of a binary to the functions parsed from it; starts out empty
# (alternative: file2funcs = parseDirectory(debugFiledir)).
file2funcs = {}

In [None]:
#file2funcs = parseDirectory(debugFiledir)

#functions = parseELF(debugFilepath)
#generateFeatures(functions)
#assert(not any(filter(lambda x : PAD_TOK in x, data)))

#main = list(filter(lambda x : x.name=='main', functions))[0]
#main_x = generateFeatures(main)


In [None]:
# TODO get frame size from .eh_frame section and compare func.variables against GDB's 'info scope func.name' output
# For every ELF file directly inside debugFiledir, print each function's frame base and variables.
for fname in next(os.walk(debugFiledir))[2]:
    debugFilepath = os.path.join(debugFiledir, fname)
    with open(debugFilepath, 'rb') as fstream:
        # read(4) returns exactly the first four bytes (fewer only at EOF);
        # peek() may hand back fewer or more bytes than asked for.
        if fstream.read(4) != b'\x7fELF':
            continue
    # parseELF takes the path, so the stream is already closed at this point.
    functions = parseELF(debugFilepath)
    for func in functions: # TODO: frame base is not useful unfortunately.. need frame size
        print(func.name + " frame base = " + str(func.frame_base) + ", variables = " + str(func.variables))


In [None]:
# Print the machine architecture of every ELF file directly inside debugFiledir.
for fname in next(os.walk(debugFiledir))[2]:
    debugFilepath = os.path.join(debugFiledir, fname)
    with open(debugFilepath, 'rb') as fstream:
        # read(4) returns exactly the first four bytes (fewer only at EOF);
        # peek() may hand back fewer or more bytes than asked for.
        if fstream.read(4) != b'\x7fELF':
            continue
        fstream.seek(0)  # rewind so the ELF parser starts at the beginning of the file
        print('Is ELF, checking for debug info..')
        elf = ELFFile(fstream)
        print(elf.get_machine_arch())

In [None]:
from elftools.elf.elffile import ELFFile
from elftools.dwarf.constants import DW_LNS_copy, DW_LNS_set_file, DW_LNE_define_file, \
                                     DW_LNS_set_basic_block, DW_LNS_set_prologue_end, DW_LNS_set_isa
from dwarf_import.model.module import Module
from dwarf_import.io.dwarf_import import DWARFDB, DWARFImporter, place_component_in_module_tree

# Fill file2funcs: every ELF file directly inside debugFiledir that carries DWARF
# info is parsed, its functions are stored under the file's path, and the
# functions are then handed to collectFrameInfo together with the open ELF file.
for fname in next(os.walk(debugFiledir))[2]:
    debugFilepath = os.path.join(debugFiledir, fname)
    print(f'Opening {debugFilepath} to check magic..')
    with open(debugFilepath, 'rb') as fstream:
        # read(4) returns exactly the first four bytes (fewer only at EOF);
        # peek() may hand back fewer or more bytes than asked for.
        if fstream.read(4) != b'\x7fELF':
            continue
        fstream.seek(0)  # rewind so the ELF parser starts at the beginning of the file
        print('Is ELF, checking for debug info..')
        elf = ELFFile(fstream)
        if not elf.has_dwarf_info():
            print('does not contain DWARF info')
            continue
        print('Has debug info, parsing functions..')
        # dwarfInfo is only needed by the disabled handleLineprogram call below.
        dwarfInfo = elf.get_dwarf_info()
        module, importer = parseDWARF(elf)
        file2funcs[debugFilepath] = getFunctions(module)
        #handleLineprogram(dwarfInfo)
        # Must run while fstream is still open because it receives the ELFFile object.
        collectFrameInfo(file2funcs[debugFilepath], elf)


In [None]:
#from elftools.dwarf.descriptions import describe_reg_name

def getFrameSize(function):
    """Return the absolute value of the smallest `.arg` recorded in `function.frame`.

    Should probably be named getMaxFrameSize.

    `function.frame` is iterated as a sequence of dicts. Only entries whose key
    is exactly an `int` take part (presumably register numbers mapped to
    .eh_frame register rules - TODO confirm against collectFrameInfo).
    Raises ValueError when no such entry exists.

    The number obtained statically from the .eh_frame section can be validated
    using GDB:
        ./gdb path/to/prog
        (gdb) set confirmation off
        (gdb) break {func.name}
        (gdb) r
        (gdb) rbreak .
        (gdb) c
        (gdb) info frame
    At this point "frame at 0xADDRESS_A" - "called by frame at 0xADDRESS_B"
    should match the value returned here.
    """
    smallestArg = min(
        rule.arg
        for row in function.frame
        for regno, rule in row.items()
        if type(regno) is int
    )
    return abs(smallestArg)

# Report the frame size of every function found in the 'sum' test binary.
sumFuncs = file2funcs['data/cross-compile-dataset/bin/static/gcc/og/sum']
# First function named 'main'; an IndexError here means the binary has none.
sumMain = [fn for fn in sumFuncs if fn.name == 'main'][0]
print(['{} => {}'.format(fn.name, getFrameSize(fn)) for fn in sumFuncs])
#print([describe_reg_name(regno, 'x64') + " => " + str(rule.arg) for (regno, rule) in sumMainFrameRegs])



In [None]:
def handleLineprogram(dwarfInfo):
    """Print each compilation unit's source file name and selected line-program commands.

    Doesn't appear to be super useful for stack symbolization, as compilers
    don't seem to emit relevant information.

    For every CU yielded by `dwarfInfo.iter_CUs()` the line program is fetched
    with `dwarfInfo.line_program_for_CU(cu)`. A CU without a line program is
    reported and skipped. Otherwise the name of the first file entry is printed
    (prefixed with its include directory when the program lists any), followed
    by the repr of every entry whose command is DW_LNS_set_basic_block,
    DW_LNS_set_prologue_end or DW_LNS_set_isa.

    Args:
        dwarfInfo: object offering iter_CUs() and line_program_for_CU()
            (presumably pyelftools' DWARFInfo - TODO confirm).
    """
    print('looping through compilation units..')
    for cu in dwarfInfo.iter_CUs():
        lp = dwarfInfo.line_program_for_CU(cu)
        if lp is None:
            print('DWARF info is missing a line program for this CU')
            print(cu.keys())
            continue

        # The first file entry is taken as the CU's own source file.
        first_entry = lp['file_entry'][0]
        cu_filename = first_entry.name.decode('latin-1')
        if len(lp['include_directory']) > 0:
            # dir_index is used 1-based; 0 is rendered as the current directory.
            dir_index = first_entry.dir_index
            idir = lp['include_directory'][dir_index - 1] if dir_index > 0 else b'.'
            cu_filename = '%s/%s' % (idir.decode('latin-1'), cu_filename)
        print(f'CU: {cu_filename}')
        #print(f'File name                            Line number    Starting address')

        # A readelf-style dump of every line-table row used to sit here as a
        # disabled string literal. It was dropped: it was evaluated as a no-op
        # for every entry and referred to an undefined name `state`.
        for entry in lp.get_entries():
            if entry.command in (DW_LNS_set_basic_block, DW_LNS_set_prologue_end, DW_LNS_set_isa):
                print('=============================')
                print(repr(entry))
                print('=============================')
        #print(dir(lp))
        #line_entry_mapping(lp)

def line_entry_mapping(line_program):
    """Print, for each source file name, how many line-program entries refer to it.

    Entries without a state, or whose state has file index 0, are skipped
    (TODO: such an instruction doesn't correspond to src). File names are
    resolved with lpe_filename and reported in order of first appearance.
    """
    from collections import Counter

    entries_per_file = Counter(
        lpe_filename(line_program, entry.state.file)
        for entry in line_program.get_entries()
        if entry.state and entry.state.file != 0
    )
    for src_name, entry_count in entries_per_file.items():
        print('%s -> %d entries' % (src_name, entry_count))

def lpe_filename(line_program, file_index):
    """Return the decoded file name that `file_index` selects in `line_program`.

    The file entry is looked up in the line program header's "file_entry" list
    at position `file_index - 1`. When that entry's dir_index is 0 only the
    entry's own name is returned; otherwise the name is joined onto
    "include_directory"[dir_index - 1].

    Args:
        line_program: object whose `.header` holds "file_entry" and
            "include_directory" sequences with bytes names.
        file_index: index of the file entry, used 1-based here.

    Returns:
        The file name as str.
    """
    lp_header = line_program.header
    file_entry = lp_header["file_entry"][file_index - 1]
    dir_index = file_entry["dir_index"]
    if dir_index == 0:
        # No directory recorded for this entry: return the bare name.
        return file_entry.name.decode()
    directory = lp_header["include_directory"][dir_index - 1]
    # Both parts are bytes, so join first and decode the combined path once.
    return os.path.join(directory, file_entry.name).decode()

    

#print(elf.get_section_by_name('.text').data())

In [None]:
from stacksyms import parseDWARF
# Exploration cell: parse DWARF for one binary, list its functions, peek into the
# importer's private DWARF database and ask GDB for a disassembly of each function.
# debugFilepath is not assigned in this cell; it is whatever an earlier cell's
# file loop left behind.
# NOTE(review): parseDWARF receives a path here, but the cell that fills
# file2funcs passes it an ELFFile object - confirm which argument it accepts.
module, importer = parseDWARF(debugFilepath)
# Walk module -> compilation units -> components and print every function found.
for cu in module.children():
    for component in cu.children():
        for func in component.functions:
            print(func)
    
# Private attributes of the importer; inspected for debugging only.
dwarfDB = importer._dwarf_db
dwarfData = dwarfDB._pri
print(dir(dwarfData))
print(dwarfData._die_map)
# TODO: we could get basic block info from DWARF.. maybe
#       look at "objdump --dwarf=line execFile" and DWARF Spec Section 6.2
# UPDATE: nope, see cell above
#lineProg = dwarfData.get_line_program()
#dir(lineProg)
#types = firstUnit.types
#globals = firstUnit.variables
# NOTE(review): firstUnit is not defined anywhere in the visible notebook, so this
# line raises NameError on a fresh kernel - confirm where it should come from.
functions = firstUnit.functions
print([func.name for func in functions])
#scopeQueries = ['info scope ' + func for func in functions]
# One GDB "disas /r" query per function name.
disasQueries = ['disas /r ' + func.name for func in functions]
# NOTE(review): staticGDB is not imported or defined in the visible notebook - confirm its origin.
gdbOut = staticGDB(debugFilepath, functions, disasQueries)
#scopeResults, disasResults = results[0:len(results)//2], results[len(results)//2:]
#scopeResults, disasResults = results[0:len(results)//2], results[len(results)//2:]

In [None]:
# TODO: newCollectLocals(gdbOut, scopeQueries, functions) and newCollectDisas(gdbOut, disasQueries, functions)
#for scope, func in zip(scopeResults, functions):
#    collectLocals(scope, functions[func])
# Attach each GDB result to its function: the first and last line of a result are
# dropped, and every remaining line is stripped and split on the literal
# two-character sequence backslash-t into a tuple.
for disas, func in zip(gdbOut, functions):
    fields_per_line = []
    for line in disas[1:-1]:
        fields_per_line.append(tuple(line.strip().split('\\t')))
    func.disas = fields_per_line

In [None]:
# Accumulators for features, labels and debug labels; the code that would fill
# them is still disabled at the bottom of this cell.
X = []
Y = []
Z = []
# Dump every function with its start address and frame base, followed by each
# variable (name, type, byte size) and the address ranges of its locations.
for func in functions:
    print('////////////////////////')
    print(func.name, hex(func.start), func.frame_base)
    for lvar in func.variables:
        print(lvar.name, lvar.type, "(bytesize = %d)" % lvar.type.byte_size)
        for loc in lvar.locations:
            # str(loc.type) is printed without its first 13 characters.
            print(f"{hex(loc.begin)} to {hex(loc.end)}: {str(loc.type)[13:]}{str(loc.expr)}")
        print()
#    X += [func] #generateFeature(func, functions)]
#    Y += [generateLabel(func, functions)]
#    Z += [generateDebugLabel(func, functions)]
#print([x+" => "+ str(y) for x,y in zip(X,Z)])

#func = 'quotearg_n_style_colon'

In [None]:
# Compact dump: function name and frame base, then every variable with the
# address range and expression of each of its locations (location type omitted).
for func in functions:
    print('////////////////////////')
    print(func.name, func.frame_base)
    for lvar in func.variables:
        print(lvar.name, lvar.type, "(bytesize = %d)" % lvar.type.byte_size)
        for loc in lvar.locations:
            #print(loc)
            # disabled: + str(loc.type)[13:]
            print(f"{hex(loc.begin)} to {hex(loc.end)}: {str(loc.expr)}")
            # a blank line follows every single location
            print()