In [1]:
import os
import torch
import torch.nn as nn
import torch.optim as optim

from stacksyms import parseELF, parseDWARF, getAllFunctions, collectFrameInfo, \
                      assign_frames, processRegisterRuleExpressions, propagateTypeInfo, \
                      collectOpcodes, collectDisassemblyObjdump, checkLabels

# Binary under analysis. Exactly one `filepath` assignment is active; the
# commented-out lines are alternative inputs kept around for quick switching.
# Note that a later cell rebinds `filepath` (joined from `dataroot` and `f`).
#filepath = '/tmp/binary/x86/gcc-32-O1-utillinux-unshare'
filepath = 'data/cross-compile-dataset/bin/static/gcc/o1/pee'
#filepath = 'data/cross-compile-dataset/bin/static/gcc/og/parallel'
#filedir = 'data/cross-compile-dataset/bin/static/gcc/og'
#filepath = 'data/cross-compile-dataset/bin/dynamic/clang/o3/xrdb'
#filedir = '.'
#filepath = 'data/cross-compile-dataset/bin/static/gcc/og/sum'
#filepath = 'data/spec2006/bzip2'

In [2]:
# In principle, on x86 we can read out the stack size of a function from the
# .eh_frame section as follows:
# 1) "nm elf_binary | grep function_name" -> note the address
# 2) "readelf --debug-dump=frames-interp | grep -A10 'address..'"
#    -> read off the maximum stack offset from the last entry under the CFA column
# The problem is that this doesn't work on architectures that do not store the
# return address on the stack, and even on x86 some compiler passes may result in
# code that will try to hold the return address in a register, even though it is
# stored on the stack (e.g., "gcc-32-O1-utillinux-unshare").
from elftools.elf.elffile import ELFFile

# The file object is handed straight to ELFFile and is never closed in this
# notebook; `elf` is reused by the following cells.
# NOTE(review): presumably ELFFile reads from the stream lazily, so the handle has
# to stay open for as long as `elf` is in use -- confirm before adding a close().
elf = ELFFile(open(filepath, 'rb'))
# Project helpers from `stacksyms`: parseDWARF yields (module, importer), and
# getAllFunctions builds `func_dict`, which later cells probe with addresses
# (`symbol['st_value'] not in func_dict`).
module, importer = parseDWARF(elf)
func_dict = getAllFunctions(module, importer, elf)

05/15/2021 15:18:54 INFO:parseDWARF:ELF file says it has some frame info..
05/15/2021 15:18:54 DEBUG:import_components:--pee.c-------------------------------------------------------------------------
05/15/2021 15:18:54 DEBUG:import_subprogram:subprogram main@0x400b46
05/15/2021 15:18:54 DEBUG:import_inlined_subroutine:inlined subroutine fread
05/15/2021 15:18:54 DEBUG:import_inlined_subroutine:inlined subroutine fprintf
05/15/2021 15:18:54 DEBUG:import_inlined_subroutine:inlined subroutine fprintf
05/15/2021 15:18:54 DEBUG:import_subprogram:subprogram close_pipes@0x400aee
05/15/2021 15:18:54 DEBUG:import_global_variable:variable i
05/15/2021 15:18:54 DEBUG:import_global_variable:variable r
05/15/2021 15:18:54 DEBUG:import_global_variable:variable pipes
05/15/2021 15:18:54 DEBUG:import_global_variable:variable buf
05/15/2021 15:18:54 DEBUG:import_global_variable:variable ret
05/15/2021 15:18:54 DEBUG:import_global_variable:variable j
05/15/2021 15:18:54 DEBUG:import_global_variable:var

DW_TAG_formal_parameter (child tag)
DW_TAG_formal_parameter (child tag)
DW_TAG_inlined_subroutine (child tag)
DW_TAG_inlined_subroutine (child tag)
DW_TAG_inlined_subroutine (child tag)
DW_TAG_GNU_call_site (child tag)
DW_TAG_GNU_call_site (child tag)
DW_TAG_GNU_call_site (child tag)
DW_TAG_GNU_call_site (child tag)
DW_TAG_GNU_call_site (child tag)
DW_TAG_GNU_call_site (child tag)
DW_TAG_GNU_call_site (child tag)
DW_TAG_GNU_call_site (child tag)
DW_TAG_GNU_call_site (child tag)
DW_TAG_GNU_call_site (child tag)
DW_TAG_GNU_call_site (child tag)
DW_TAG_GNU_call_site (child tag)
DW_TAG_formal_parameter (child tag)
DW_TAG_formal_parameter (child tag)
DW_TAG_formal_parameter (child tag)
DW_TAG_formal_parameter (child tag)
DW_TAG_GNU_call_site (child tag)
DW_TAG_GNU_call_site_parameter (child tag)
DW_TAG_GNU_call_site_parameter (child tag)
DW_TAG_GNU_call_site_parameter (child tag)
DW_TAG_formal_parameter (child tag)
DW_TAG_formal_parameter (child tag)
DW_TAG_GNU_call_site (child tag)
DW_TAG_

In [3]:
# Enrichment pipeline over `func_dict`: every step takes the current dictionary
# and the result is bound back to the same name, so the call order matters.
# NOTE(review): because `func_dict` is rebound in place, re-running this cell
# feeds already-processed data back into the helpers -- confirm they tolerate
# that, or re-run the parsing cell first.
frame_tables = collectFrameInfo(func_dict, elf)
func_dict = assign_frames(frame_tables, func_dict)
func_dict = processRegisterRuleExpressions(func_dict, importer)
func_dict = propagateTypeInfo(func_dict, importer)
func_dict = collectOpcodes(func_dict, elf)
# Disabled: `functions` is not defined yet at this point in the notebook.
#labels = checkLabels(functions)

05/15/2021 15:18:56 INFO:collectFrameInfo:has .eh_frames
05/15/2021 15:18:57 CRITICAL:assign_frames:No frame table for deregister_tm_clones@0x400a00!
05/15/2021 15:18:57 CRITICAL:assign_frames:No frame table for register_tm_clones@0x400a40!
05/15/2021 15:18:57 CRITICAL:assign_frames:No frame table for __do_global_dtors_aux@0x400a80!
05/15/2021 15:18:57 CRITICAL:assign_frames:No frame table for frame_dummy@0x400ab0!
05/15/2021 15:18:57 CRITICAL:assign_frames:No frame table for __restore_rt@0x451970!
05/15/2021 15:18:57 CRITICAL:assign_frames:No frame table for __writev_nocancel@0x46dee9!
05/15/2021 15:18:57 CRITICAL:assign_frames:No frame table for __open_nocancel@0x441299!
05/15/2021 15:18:57 CRITICAL:assign_frames:No frame table for _init@0x4002e0!
05/15/2021 15:18:57 CRITICAL:assign_frames:No frame table for __read_nocancel@0x4412f9!
05/15/2021 15:18:57 CRITICAL:assign_frames:No frame table for _dl_tlsdesc_resolve_rela@0x47b740!
05/15/2021 15:18:57 CRITICAL:assign_frames:No frame tab

In [None]:
# TODO create Function objects
# Collect STT_FUNC symbols from .symtab whose address is not a key of `func_dict`
# and list them ordered by address.
symtab = elf.get_section_by_name('.symtab')
candidate_symbols = []
if symtab is not None:
    # Show the raw entry of the first symbol as a format sample; the default
    # keeps an empty table from raising StopIteration.
    first_symbol = next(symtab.iter_symbols(), None)
    if first_symbol is not None:
        print(first_symbol.entry)
    for symbol in symtab.iter_symbols():
        is_function = symbol['st_info']['type'] == 'STT_FUNC'
        if is_function and symbol['st_value'] not in func_dict:
            candidate_symbols.append((symbol['st_value'], symbol.name))
    # Sort on the address only, so symbols sharing an address keep table order.
    by_address = sorted(candidate_symbols, key=lambda candidate: candidate[0])
    print('\n'.join(f'{hex(address)}\t{name}' for address, name in by_address))

In [None]:
from elftools.dwarf.callframe import CIE, FDE, ZERO
dwarfInfo = elf.get_dwarf_info()
cfi_entries = dwarfInfo.EH_CFI_entries()

#func = next(filter(lambda func : func.name==funcname, functions))
# Collect the [start, end) address range of every FDE whose start address is not
# a key of `func_dict`. The original loop appended to an undefined name
# (`candidates`) instead of the list initialised here.
candidate_locations = []
for entry in cfi_entries:
    if isinstance(entry, FDE) and entry['initial_location'] not in func_dict:
        range_start = entry['initial_location']
        candidate_locations.append((range_start, range_start + entry['address_range']))
# Only the start addresses are printed, in ascending order.
print('\n'.join(hex(span[0]) for span in sorted(candidate_locations, key=lambda span: span[0])))

In [None]:
# TODO get disassembly by function
# Lists name and start address of every non-inlined function that starts at or
# after the beginning of .text, in ascending address order.
# NOTE(review): `functions` is first assigned in the cell that follows this one,
# so on a fresh kernel this cell only works after that cell has been run.
text = elf.get_section_by_name('.text')
start_address = text['sh_addr']
sorted_functions = sorted(functions, key=lambda func : func.start)
for func in sorted_functions:
    if not func.is_inline and start_address <= func.start:
        print(func.name, hex(func.start))

In [None]:
# Parse the binary at `filepath` with the project helper and hand the result to
# checkLabels. `functions` is also read by the preceding cell, which therefore
# depends on this one having been executed.
functions = parseELF(filepath)
labels = checkLabels(functions)

In [None]:
import os
import json
from stacksyms import parseELF, checkLabels
# Input directory with the binaries and the directory the label files would go to.
dataroot  = "data/cross-compile-dataset/bin/static/gcc/o2"
outputdir = "data/cross-compile-dataset/labels/static/gcc/o2"
# File name of the single binary processed by this cell.
f = 'yes'
# Rebinds the notebook-wide `filepath` and `functions` set by earlier cells.
filepath  = os.path.join(dataroot, f)
functions = parseELF(filepath)
allLabels = checkLabels(functions)
# Persisting the labels as JSON is currently disabled:
#if not os.path.exists(outputdir):
#    os.makedirs(outputdir, exist_ok=True)
#with open(os.path.join(outputdir, f), 'w') as cf:
#    json.dump(allLabels, cf)

In [None]:
from pygdbmi.gdbcontroller import GdbController
from stacksyms import collectDisassembly, generateDisasFeature

ELF_MAGIC = b'\x7fELF'  # first four bytes of every ELF file

# generate input features for learning
# For every ELF file directly inside `debugFiledir` that has no '<file>.data'
# companion yet, disassemble its functions through gdb and write the generated
# feature lines to '<file>.data'.
# NOTE(review): `debugFiledir` is not assigned anywhere in the visible notebook;
# it has to be defined before this cell is run.
for fname in next(os.walk(debugFiledir))[2]:
    debugFilepath = os.path.join(debugFiledir, fname)
    featureFilepath = debugFilepath + '.data'
    # read() returns exactly the requested bytes (fewer only at EOF), whereas the
    # length returned by peek() is not guaranteed.
    with open(debugFilepath, 'rb') as fstream:
        is_elf = fstream.read(len(ELF_MAGIC)) == ELF_MAGIC
    if not is_elf or os.path.exists(featureFilepath):
        continue
    functions = parseELF(debugFilepath)
    gdbmi = GdbController()
    try:
        functions = collectDisassembly(gdbmi, functions, debugFilepath)
    finally:
        # Always shut the gdb subprocess down, even when disassembly fails.
        gdbmi.exit()
    lines = generateDisasFeature(functions)
    with open(featureFilepath, 'w') as disasFeatures:
        print(f'writing disassembly feature to {debugFilepath}.data..')
        disasFeatures.write('\n'.join(lines))
                

In [None]:
# Re-parse the file `debugFilepath` currently points to, i.e. whatever the
# feature-generation loop assigned last.
functions = parseELF(debugFilepath)

In [None]:
#import logging; logging.getLogger().setLevel(logging.DEBUG)
# Report frame base, maximum frame size and debug label for every non-inlined
# function, followed by the same figures for the functions inlined into it.
# NOTE(review): getMaxFrameSize and generateDebugLabel are not imported in any
# visible cell -- presumably they come from `stacksyms`; confirm and import them.
for func in functions:
    if func.is_inline:
        continue
    frame = getMaxFrameSize(func)
    label = generateDebugLabel(func)
    print(f"{func.name} (frame starts at {func.frame_base}, size is {frame} bytes by offset) => {label}")
    if len(func.inlined_functions) > 0:
        print("    inlines ", func.inlined_functions)
        for inlined in func.inlined_functions:
            print("        ", inlined.name, getMaxFrameSize(inlined), generateDebugLabel(inlined))

In [None]:
#PAD_TOK8  = 0xa1 # a 1-byte pad token isn't reliable: the byte may appear in real instructions
PAD_TOK16 = 0x0f78  # vmread (0x0f79 is vmwrite, 'UD2' 0x0f0b would be another option)
TOK_LEN = 2  # bytes per token
PAD_TOK = PAD_TOK16
# Upper bound in bytes for a single instruction.
# NOTE(review): the architectural x86 limit is 15 bytes per the Intel SDM; value kept as-is.
MAX_INST_LEN = 16


def tokenize(features):
    """Split hex strings into integer tokens of ``TOK_LEN`` bytes each.

    Args:
        features: iterable of strings made of hex digits only (no spaces).

    Returns:
        One list of ints per input string. A string whose length is not a
        multiple of ``2 * TOK_LEN`` digits ends with a shorter final token.
    """
    hex_digits_per_token = 2 * TOK_LEN
    tokenized = []
    for hex_string in features:
        offsets = range(0, len(hex_string), hex_digits_per_token)
        tokenized.append([int(hex_string[pos:pos + hex_digits_per_token], 16) for pos in offsets])
    return tokenized


def funcFeatures(function): # features are instructions (16 bytes max)
    """Tokenize the second field of every ``function.disas`` entry.

    Spaces are stripped from that field before it is passed to ``tokenize``.
    # presumably the field holds space-separated instruction bytes in hex -- verify
    """
    return tokenize(entry[1].replace(' ', '') for entry in function.disas)


def generatePositionalEncodings(function): # code addresses
    """Return the first 18 characters of the first field of every ``function.disas`` entry.

    # presumably that is '0x' plus 16 hex digits of the instruction address -- verify
    """
    return [entry[0][0:18] for entry in function.disas]

# Placeholder loop: prints one empty line per function whose `disas` is
# non-empty. The tokenization helpers defined above are not called here yet.
for func in functions:
    if 0<len(func.disas):
        print()

In [None]:
# X/Y/Z are reserved for features, labels and debug labels; they stay empty while
# the generation code at the bottom of this cell is disabled.
X, Y, Z = [], [], []
for func in functions:
    # Per function: a separator line, then name, start address and frame base.
    print('////////////////////////')
    print(func.name, hex(func.start), func.frame_base)
    for lvar in func.variables:
        print(lvar.name, lvar.type, "(bytesize = %d)" % lvar.type.byte_size)
        for loc in lvar.locations:
            address_range = hex(loc.begin) + " to " + hex(loc.end)
            # The first 13 characters of str(loc.type) are dropped before printing.
            print(address_range + ": " + str(loc.type)[13:] + str(loc.expr))
        print('')
#    X += [func] #generateFeature(func, functions)]
#    Y += [generateLabel(func, functions)]
#    Z += [generateDebugLabel(func, functions)]
#print([x+" => "+ str(y) for x,y in zip(X,Z)])

#func = 'quotearg_n_style_colon'

In [None]:
def handleLineprogram(dwarfInfo):
    """Print the basic-block / prologue-end / ISA line-program commands of every CU.

    Doesn't appear to be super useful for stack symbolization, as compilers
    don't seem to emit relevant information.

    Args:
        dwarfInfo: object offering ``iter_CUs()`` and ``line_program_for_CU(cu)``
            (presumably the result of ``elf.get_dwarf_info()`` -- verify at the
            call site).

    Returns:
        None. Everything is reported through ``print``.
    """
    # The opcode constants were never imported anywhere in the notebook, so the
    # comparison below raised NameError. Importing them locally keeps the
    # function usable on a fresh kernel.
    from elftools.dwarf.constants import (
        DW_LNS_set_basic_block, DW_LNS_set_prologue_end, DW_LNS_set_isa)
    reported_commands = (DW_LNS_set_basic_block, DW_LNS_set_prologue_end, DW_LNS_set_isa)

    print('looping through compilation units..')
    for cu in dwarfInfo.iter_CUs():
        lp = dwarfInfo.line_program_for_CU(cu)
        if lp is None:
            print('DWARF info is missing a line program for this CU')
            # A pyelftools CompileUnit only offers item access and `.header`
            # (no keys()), so show the header itself for debugging.
            print(cu.header)
            continue
        # Name of the first file entry, prefixed with its include directory
        # ('.' when dir_index is 0) if the program lists any directories.
        cu_filename = lp['file_entry'][0].name.decode('latin-1')
        if len(lp['include_directory']) > 0:
            dir_index = lp['file_entry'][0].dir_index
            if dir_index > 0:
                idir = lp['include_directory'][dir_index - 1]
            else:
                idir = b'.'
            cu_filename = '%s/%s' % (idir.decode('latin-1'), cu_filename)
        print(f'CU: {cu_filename}')
        #print(f'File name                            Line number    Starting address')
        for entry in lp.get_entries():
            # The string literal below is disabled code kept for reference; as a
            # bare expression statement it has no effect at runtime.
            '''
            if entry.state is None:
                # Special handling for commands that don't set a new state
                if entry.command == DW_LNS_set_file:
                    file_entry = lp['file_entry'][entry.args[0] - 1]
                    if file_entry.dir_index == 0:
                        # current directory
                        print('\n./%s:[++]' % (
                            file_entry.name.decode('latin-1')))
                    else:
                        print('\n%s/%s:' % (
                            lp['include_directory'][file_entry.dir_index - 1].decode('latin-1'),
                            file_entry.name.decode('latin-1')))
                elif entry.command == DW_LNE_define_file:
                    print('%s:' % (
                        lp['include_directory'][entry.args[0].dir_index].decode('latin-1')))
                elif entry.command in [DW_LNS_set_basic_block, DW_LNS_set_prologue_end, DW_LNS_set_isa]:
                    print('=============================')
                    print(repr(entry))
                    print('=============================')
                else:
                    print('=============================')
                    print('OTHER COMMAND: ' + str(entry))
                    print('=============================')
            elif not entry.state.end_sequence:
                # readelf doesn't print the state after end_sequence
                # instructions. I think it's a bug but to be compatible
                # I don't print them too.
                if lp['version'] < 4:
                    print('%-35s  %11d  %18s' % (
                        lp['file_entry'][state.file - 1].name.decode('latin-1'),
                        state.line,
                        '0' if state.address == 0 else
                            hex(state.address)))
                else:
                    print('%-35s  %11d  %18s[%d]' % (
                        lp['file_entry'][state.file - 1].name.decode('latin-1'),
                        state.line,
                        '0' if state.address == 0 else
                            hex(state.address),
                        state.op_index))
            if entry.command == DW_LNS_copy:
                # Another readelf oddity...
                print()
            '''
            if entry.command in reported_commands:
                print('=============================')
                print(repr(entry))
                print('=============================')
        #print(dir(lp))
        #line_entry_mapping(lp)

def line_entry_mapping(line_program):
    """Print how many line-program entries refer to each source file.

    Entries without a state, or whose state has file index 0, are skipped.
    File names are resolved through ``lpe_filename``.

    Args:
        line_program: object offering ``get_entries()``; it is also passed on
            to ``lpe_filename``.
    """
    import collections
    entries_per_file = collections.defaultdict(int)
    for lpe in line_program.get_entries():
        # TODO: instruction doesn't correspond to src
        if lpe.state and lpe.state.file != 0:
            resolved_name = lpe_filename(line_program, lpe.state.file)
            entries_per_file[resolved_name] += 1
    for filename, lpe_count in entries_per_file.items():
        print('%s -> %d entries' % (filename, lpe_count))

def lpe_filename(line_program, file_index):
    """Resolve a 1-based line-program file index to a decoded path string.

    Args:
        line_program: object whose ``header`` maps "file_entry" and
            "include_directory" to sequences. File entries expose ``name``
            (bytes) and support ``["dir_index"]``.
        file_index: 1-based index into the header's file entries. The caller in
            this file filters out index 0 before calling.

    Returns:
        The bare file name when the entry's ``dir_index`` is 0, otherwise the
        include directory joined with the file name, decoded to ``str``.
    """
    # The header used to be printed here on every call, which flooded the output
    # because this runs once per line-program entry.
    lp_header = line_program.header
    file_entry = lp_header["file_entry"][file_index - 1]
    dir_index = file_entry["dir_index"]
    if dir_index == 0:
        return file_entry.name.decode()
    # Directory indices are 1-based as well; both path parts are bytes here, so
    # the join happens on bytes before decoding.
    directory = lp_header["include_directory"][dir_index - 1]
    return os.path.join(directory, file_entry.name).decode()

    

#print(elf.get_section_by_name('.text').data())