In [1]:
import tarfile
import os
import sys
import pickle
import tensorflow as tf
from datetime import datetime

In [2]:
def get_all_tar_filenames(tar_file_dir):
    """Return the names of all entries in ``tar_file_dir`` that end with ``.tar.bz2``.

    Only the bare entry names are returned (they are not joined with
    ``tar_file_dir``), in the order ``os.listdir`` reports them.
    """
    return [entry for entry in os.listdir(tar_file_dir) if entry.endswith(".tar.bz2")]

In [3]:
def untar_one_pickle_file(full_path_tar_file, work_dir):
    """Extract every member of a bzip2-compressed tar archive into ``work_dir``.

    Args:
        full_path_tar_file: Path of the ``.tar.bz2`` archive to open.
        work_dir: Directory the archive members are extracted into.

    Raises:
        tarfile.TarError / OSError: propagated from ``tarfile`` when the archive
            cannot be opened or extracted.
    """
    # The context manager closes the archive even when extraction fails.
    # NOTE(review): extractall() writes wherever the member names point; only use
    # this on archives from a trusted source.
    with tarfile.open(full_path_tar_file, "r:bz2") as tar:
        tar.extractall(work_dir)
    
    


In [4]:
def get_pickle_file_content(full_path_pickle_file):
    """Load and return the object stored in a pickle file.

    The file is unpickled with ``encoding='latin1'``.

    Args:
        full_path_pickle_file: Path of the pickle file to read.

    Returns:
        Whatever object the pickle file contains.
    """
    # NOTE(review): unpickling can execute arbitrary code; only load files that
    # come from a trusted source.
    # The context manager closes the file even if unpickling raises.
    with open(full_path_pickle_file, 'rb') as pickle_file:
        return pickle.load(pickle_file, encoding='latin1')

In [5]:
def print_one_pickle_list_item(pickle_file_content):
    """Print the first item of ``pickle_file_content`` field by field (debug helper).

    The item is indexed at positions 0-7, which are printed with the labels
    function-signature, gdb-ptype, function-name, function-file-name,
    disassembly-att, disassembly-intel, package-name and binary-name.

    If the content is empty or its first item is falsy, ``Error item[0]`` is
    printed instead.

    Args:
        pickle_file_content: Iterable of indexable items.
    """
    field_labels = (
        'function-signature',
        'gdb-ptype',
        'function-name',
        'function-file-name',
        'disassembly-att',
        'disassembly-intel',
        'package-name',
        'binary-name',
    )

    # A default keeps empty content from raising StopIteration; it is reported
    # through the same error message as a falsy first item.
    item = next(iter(pickle_file_content), None)
    if not item:
        print('Error item[0]')
        return

    for position, label in enumerate(field_labels):
        print(f'{label}: {item[position]}')

In [6]:
def get_string_before_function_name(function_signature):
    """Return the part of a signature that precedes the function name.

    The text before the first ``(`` is searched for the last ``*``, ``&`` or
    space; everything up to and including that character is returned with
    surrounding whitespace stripped. An empty string is returned when none of
    these characters occurs before the ``(``.

    Raises:
        ValueError: if ``function_signature`` contains no ``(``.
    """
    ### the first ( marks the end of the function name
    paren_idx = function_signature.index('(')
    head = function_signature[:paren_idx]

    ### right-most *, & or ' ' in front of the function name
    cut = max(head.rfind(delimiter) for delimiter in ('*', ' ', '&'))
    if cut < 0:
        return ''

    return function_signature[:cut + 1].strip()

In [7]:
def get_raw_return_type_from_gdb_ptype(gdb_ptype):
    """Reduce a gdb ``ptype`` string to a coarse return-type label.

    The text ``type =`` is removed from ``gdb_ptype`` and the remainder is
    stripped; the result is then classified:

    * an exact match with one of the known scalar/pointer types is returned as is;
    * a few aliases (``_Bool``, ``ulong``, ``uint``, ``ubyte`` ...) are mapped to
      their plain C spelling;
    * when a ``{`` follows some leading text, that leading text decides between
      ``class`` / ``class *`` / ``class **``, ``struct``, ``enum`` and ``union``;
    * ptypes with exactly two ``(`` and two ``)`` (treated as function
      pointers) and ptypes containing ``substitution`` yield ``'delete'``;
    * everything else yields ``'unknown'`` (a diagnostic line is printed).

    Args:
        gdb_ptype: String expected to contain ``type =`` followed by the type.

    Returns:
        One of the labels above, ``'delete'`` or ``'unknown'``. The function
        always returns a string.
    """
    return_type_list = ['bool', 'bool *', 'const bool',
                        'void', 'void *', 'void **', 'void (*)(void *)', 'void * const',
                        'char', 'char *', 'unsigned char *', 'char **', 'const char *', 'signed char',
                        'const char **', 'unsigned char', 'const char', 'const unsigned char *',
                        'unsigned char **', 'const char * const *', 'char32_t',
                        'signed char *', 'wchar_t *', 'const char16_t *',
                        'unsigned short', 'short', 'unsigned short *', 'short *',
                        'const unsigned short *', 'unsigned short **', 'short **',
                        'int', 'int *', 'unsigned int', 'const int *', 'const unsigned int *',
                        'int **', 'unsigned int **', 'volatile int *',
                        'unsigned int *', 'const unsigned int', 'const int',
                        'long','unsigned long', 'unsigned long long', 'unsigned long *', 'long long',
                        'const unsigned long', 'unsigned long **', 'const long', 'const long *',
                        'long *', 'const unsigned long long *', 'const unsigned long *',
                        'long long *',
                        'double', 'const double *', 'double *', 'const double', 'long double',
                        'double **',
                        'float', 'const float *', 'float *', 'const float',
                        'float **']

    ### aliases that are reported under their plain C spelling
    return_type_aliases = {'_Bool': 'bool',
                           '_Bool *': 'bool *',
                           'ulong': 'unsigned long',
                           'uint': 'unsigned int',
                           'ubyte': 'unsigned char',
                           'ubyte *': 'unsigned char *'}

    if "type =" not in gdb_ptype:
        print(f'No gdb ptype found')
        return 'unknown'

    ### pattern based
    raw_gdb_ptype = gdb_ptype.replace('type =', '').strip()

    ### check if we directly find a valid return type (or an alias of one)
    if raw_gdb_ptype in return_type_list:
        return raw_gdb_ptype
    if raw_gdb_ptype in return_type_aliases:
        return return_type_aliases[raw_gdb_ptype]

    ### check if { is there; a { at position 0 has no leading text to classify
    ### and is handled like "no brace"
    brace_idx = raw_gdb_ptype.find('{')

    if brace_idx > 0:
        front_str = raw_gdb_ptype[:brace_idx].strip()

        if 'class' in front_str:
            ### The pointer stars of a class type are looked for after the closing }.
            ### front_str ends before the first {, so the closing brace has to be
            ### searched in the full ptype; searching front_str never found it and
            ### made this branch fall through and return None.
            closing_idx = raw_gdb_ptype.rfind('}')
            trailing_str = raw_gdb_ptype[closing_idx:] if closing_idx >= 0 else ''

            star_count = trailing_str.count('*')
            if star_count == 0:
                return 'class'
            elif star_count == 1:
                return 'class *'
            elif star_count == 2:
                return 'class **'
            elif 'std::' in front_str:
                return 'delete'
            else:
                print(f'Error star_count class >{star_count}< front_str >{front_str}<')
                return 'unknown'

        elif 'struct' in front_str:
            ### NOTE(review): only stars in front of the { are counted here, so a
            ### ptype of the form "struct foo {...} *" is reported as 'struct' --
            ### confirm whether that is intended.
            star_count = front_str.count('*')
            if star_count == 0:
                return 'struct'
            elif 'std::' in front_str:
                return 'delete'
            elif 'QPair' in front_str:
                return 'delete'
            else:
                print(f'Error star_count struct >{star_count}< front_str >{front_str}<')
                return 'unknown'

        elif 'enum' in front_str:
            star_count = front_str.count('*')
            if star_count == 0:
                return 'enum'
            else:
                print(f'Error star_count enum >{star_count}< front_str >{front_str}<')
                return 'unknown'

        elif 'union' in front_str:
            star_count = front_str.count('*')
            if star_count == 0:
                return 'union'
            else:
                print(f'Error star_count union >{star_count}< front_str >{front_str}<')
                return 'unknown'

        else:
            print(f'---Nothing found')
            print(f'front_str: {front_str}')
            return 'unknown'

    elif (raw_gdb_ptype.count('(') == 2) and (raw_gdb_ptype.count(')') == 2):
        print(f'Found func-pointer as return-type, delete till now')
        return 'delete'
    elif 'substitution' in raw_gdb_ptype:
        print(f'Found substituion-string, dont know, delete it')
        return 'delete'
    else:
        print(f'------no gdb ptype-match for: >{raw_gdb_ptype}<')
        return 'unknown'

In [8]:
def get_function_return_type(string_before_func_name, gdb_ptype):
    """Return the raw return-type label derived from ``gdb_ptype``.

    The label comes from ``get_raw_return_type_from_gdb_ptype`` (e.g. "void" or
    "struct" instead of "struct timeval"). ``string_before_func_name`` is only
    used for a diagnostic print when the label is ``'unknown'``.
    """
    resolved_type = get_raw_return_type_from_gdb_ptype(gdb_ptype)

    if resolved_type != 'unknown':
        return resolved_type

    ### help debugging: show what the signature had in front of the function name
    print(f'string_before_func_name: {string_before_func_name}')
    return resolved_type


In [9]:
def store_as_tfrecord():
    """Unfinished stub for storing data as a TFRecord.

    NOTE(review): ``value`` is not a parameter or local of this function and is
    not assigned anywhere in the visible notebook, so calling this presumably
    raises ``NameError``. The ``tf.train.Feature`` that is built is neither
    returned nor written anywhere. The only (commented-out) call in the
    notebook passes one argument, which this zero-parameter signature would
    not accept -- TODO define the intended interface before using it.
    """
    tf.train.Feature(bytes_list=tf.train.BytesList(value=[value]))


In [10]:
def clean_att_disassembly(att_disassembly):
    """Strip the address column and trailing annotations from disassembly lines.

    Every line is split at tab characters and the second field is used as the
    instruction text. Everything from the first ``#`` or ``<`` onwards
    (whichever comes first) is cut off and the remainder is stripped. Lines
    containing neither marker are kept unchanged (not stripped).

    Lines that do not split into exactly two tab-separated fields are reported
    with ``len:<n>``; lines with fewer than two fields have no instruction text
    and are skipped.

    Args:
        att_disassembly: Iterable of disassembly lines (strings).

    Returns:
        List with one cleaned instruction string per usable input line.
    """
    cleaned = list()

    for dis_line in att_disassembly:
        dis_line_parts = dis_line.split('\t')
        if len(dis_line_parts) != 2:
            print(f'len:{len(dis_line_parts)}')
        if len(dis_line_parts) < 2:
            ### no second field: indexing it would raise IndexError
            continue

        dis_line_front = dis_line_parts[1]

        ### position of each annotation marker that is present
        marker_positions = [dis_line_front.index(marker)
                            for marker in ('#', '<')
                            if marker in dis_line_front]

        if not marker_positions:
            ### 'ako' checks: sanity prints kept from the original code
            ### (presumably left over from debugging one package -- TODO confirm)
            if 'ako' in dis_line_front:
                print("Error herer---3------")
            cleaned.append(dis_line_front)
            continue

        ### Keep everything in front of the earliest marker. Slicing at the marker
        ### itself (instead of one before it) cannot wrap around to -1 when the
        ### marker is the first character and never drops a non-space character;
        ### strip() removes the separating whitespace.
        clean_dis = dis_line_front[:min(marker_positions)].strip()

        if 'ako' in clean_dis:
            if '#' in dis_line_front:
                print(f"Error here---1---------{clean_dis}")
            else:
                print("Error here---2---------")
        cleaned.append(clean_dis)

    for cleaned_line in cleaned:
        if 'ako' in cleaned_line:
            print(f'Error')

    return cleaned

# Put whitespace around (), [] and similar symbols so that each one becomes its own "word".

In [11]:
def build_bag_of_words_style_assembly(cleaned_att_disassembly):
    """Flatten cleaned disassembly lines into one list of tokens.

    The characters ``( ) % , $ *`` are surrounded with spaces so that each one
    becomes a token of its own, then every line is split on whitespace. Any
    token containing ``0x`` is replaced by the plain token ``'0x'`` to keep the
    vocabulary small.

    Args:
        cleaned_att_disassembly: Iterable of instruction strings.

    Returns:
        List of tokens of all lines, in order.
    """
    ### translation table: symbol -> ' symbol '
    spaced_symbols = {ord(symbol): f' {symbol} ' for symbol in '()%,$*'}

    tokens = []
    for instruction in cleaned_att_disassembly:
        for word in instruction.translate(spaced_symbols).split():
            ### replace every addr e.g. 0x764 with 0x, to have a better vocab
            tokens.append('0x' if '0x' in word else word)

    return tokens

In [12]:
def save_new_pickle(full_path_pickle_file, pickle_content):
    """Pickle ``pickle_content`` into ``full_path_pickle_file``.

    An existing file at that path is overwritten.

    Args:
        full_path_pickle_file: Path of the file to write.
        pickle_content: Object to serialize with ``pickle.dump``.
    """
    # The context manager flushes and closes the file even if pickling fails;
    # the file is only written, so plain 'wb' is sufficient.
    with open(full_path_pickle_file, 'wb') as pickle_file:
        pickle.dump(pickle_content, pickle_file)

In [13]:
def get_all_bag_styled_pickle_files(save_dir):
    """Return the names of all entries in ``save_dir`` that end with ``.pickle``.

    Only the bare entry names are returned (they are not joined with
    ``save_dir``), in the order ``os.listdir`` reports them.
    """
    return [entry for entry in os.listdir(save_dir) if entry.endswith(".pickle")]

In [14]:
#### main
### Turns every <name>.pickle.tar.bz2 in tar_file_dir into a bag-of-words style
### pickle file att-<name>.pickle in save_dir. Each saved file holds a list of
### (bag_of_words_style_assembly, return_type) tuples. Archives whose att- file
### already exists in save_dir are skipped, so the cell can be re-run to resume.

start = datetime.now()

#tar_file_dir = "/home/ubu/jupyter-notebooks/build-tf-ds-from-pickle"
tar_file_dir = "/tmp/testtars"   ##dir where .pickle.tar.bz2 files are
work_dir = "/tmp/test"           ##dir where untar files to
save_dir = "/tmp/savetest"       ##dir where we save bag-style pickle files to

tar_suffix = ".tar.bz2"
bag_prefix = "att-"

unique_return_types = set()

disassembly_att_and_ret_types_list = list()

### make sure the output dir exists, otherwise listing it fails on a fresh machine
os.makedirs(save_dir, exist_ok=True)

###get a list with all pickle.tar.bz2 files
all_tar_files = get_all_tar_filenames(tar_file_dir)
#print(all_tar_files)
all_bag_styled_files = get_all_bag_styled_pickle_files(save_dir)

breaker = False

##test
##all_tar_files = ['kakoune.pickle.tar.bz2']
##all_bag_styled_files = []

### set for fast "already done" lookups
already_bag_styled = set(all_bag_styled_files)

### loop through all pickle.tar.bz2 files and untar them
for one_tar_file in all_tar_files:
    pickle_file_name = one_tar_file.replace(tar_suffix, "")
    bag_file_name = bag_prefix + pickle_file_name

    ### Compare against the exact name this loop writes. Stripping 'att-' from the
    ### saved name with replace() removed every occurrence, so names that contain
    ### 'att-' themselves were never recognized as done and got reprocessed.
    if bag_file_name in already_bag_styled:
        #print('Already bag-styled this file')
        continue

    untar_one_pickle_file(os.path.join(tar_file_dir, one_tar_file), work_dir)

    ### read out pickle file
    pickle_file_content = get_pickle_file_content(os.path.join(work_dir, pickle_file_name))

    ##for debug
    #print_one_pickle_list_item(pickle_file_content)

    ### the list is reused; it only ever holds the entries of the current file
    disassembly_att_and_ret_types_list.clear()

    ### loop through all pickle items and build the (bag-of-words, return-type) pairs
    for item in pickle_file_content:
        if not item:
            print("-----No item in pickle file")
            continue

        ### item[0] => function signature, item[1] => gdb ptype
        ### item[4] => att
        ### item[5] => intel
        if item[0].strip() and item[1].strip() and (len(item[4]) > 1) and (len(item[5]) > 1):

            string_before_func_name = get_string_before_function_name(item[0])
            #print(f'string_before_func_name: >{string_before_func_name}<')
            return_type = get_function_return_type(string_before_func_name, item[1])

            #print(f'return_type: >{return_type}<')
            if return_type == 'unknown':
                ### stop the whole run so the unhandled type can be inspected;
                ### the current file is not saved
                print('unknown found')
                breaker = True
                break
            elif return_type == 'delete':
                ### no usable return type found, so this item is dropped
                print('delete found')
            else:
                unique_return_types.add(return_type)
                ### remove addr and stuff
                cleaned_att_disassembly = clean_att_disassembly(item[4])
                bag_of_words_style_assembly = build_bag_of_words_style_assembly(cleaned_att_disassembly)

                ### append it to the last list, which gets stored to tfrecord
                disassembly_att_and_ret_types_list.append((bag_of_words_style_assembly, return_type))

    if breaker:
        break

    ### save to new pickle file, to save_dir
    print(f'Save file: {bag_file_name}')
    save_new_pickle(os.path.join(save_dir, bag_file_name),
                    disassembly_att_and_ret_types_list)


stop = datetime.now()
print(f'Run took:{stop-start} Hours:Min:Sec')

###print 3 list items (of the last processed file)
for d, r in disassembly_att_and_ret_types_list[:3]:
    print(f'bag-style-disas:{d} \n return-type:{r}')
    print('-----------')
    ### can a function end with callq ???????
    ### remove numbers 0x and $0x, etc.


print(f'unique_return_types: {unique_return_types}')
print(f'len disassembly_att_and_ret_types_list: {len(disassembly_att_and_ret_types_list)}')
### sys.getsizeof reports the shallow size of the list object only, not of its items
sz = sys.getsizeof(disassembly_att_and_ret_types_list)
print(f'size of: {sz}')


### if less than 100Mb, store it in tfrecord file (github doesnt allow > 100Mb files)
#store_as_tfrecord(disassembly_att_and_ret_types_list)

Save file: att-gnome-panel-control.pickle
Save file: att-kanshi.pickle
Save file: att-clamdscan.pickle
Save file: att-strace.pickle
Save file: att-booth.pickle
Save file: att-baycomepp.pickle
Save file: att-dmraid.pickle
Save file: att-udev.pickle
Save file: att-bximage.pickle
Save file: att-avahi-autoipd.pickle
Save file: att-otb-bin-qt.pickle
Save file: att-ogdi-bin.pickle
Save file: att-bittwist.pickle
Save file: att-ksshaskpass.pickle
Save file: att-bfs.pickle
Save file: att-kwave.pickle
Save file: att-dpkg.pickle
Save file: att-totem.pickle
Save file: att-xsltproc.pickle
Save file: att-pango1.0-tools.pickle
Save file: att-squid.pickle
Save file: att-kanagram.pickle
Save file: att-irda-utils.pickle
Save file: att-numactl.pickle
Save file: att-kpartloader.pickle
Save file: att-kamailio-berkeley-bin.pickle
Save file: att-kopete.pickle
delete found
Save file: att-osmium-tool.pickle
Save file: att-lockfile-progs.pickle
Save file: att-seahorse.pickle
Save file: att-krename.pickle
Save f

Save file: att-net-tools.pickle
Save file: att-otb-qgis.pickle
Save file: att-gpg-wks-server.pickle
Save file: att-bwbasic.pickle
Save file: att-obexftp.pickle
Save file: att-joyent-mdata-client.pickle
Save file: att-fuse.pickle
Save file: att-kdebugsettings.pickle
Found func-pointer as return-type, delete till now
delete found
Save file: att-network-manager.pickle
Save file: att-cups-browsed.pickle
Save file: att-ntfs-3g-dev.pickle
Save file: att-oar-common.pickle
Save file: att-konversation.pickle
Save file: att-bppsuite.pickle
Save file: att-konqueror.pickle
Save file: att-avahi-utils.pickle
Save file: att-nfs-common.pickle
Save file: att-onedrive.pickle
Found func-pointer as return-type, delete till now
delete found
Found func-pointer as return-type, delete till now
delete found
Found func-pointer as return-type, delete till now
delete found
Save file: att-xserver-xephyr.pickle
Save file: att-kiten.pickle
Save file: att-bogofilter-bdb.pickle
Save file: att-ccache.pickle
Save file: 

Save file: att-crash.pickle
Save file: att-beef.pickle
Save file: att-freeradius-utils.pickle
Save file: att-gnome-terminal.pickle
Save file: att-bind9-dyndb-ldap.pickle
Save file: att-baloo-kf5.pickle
Found func-pointer as return-type, delete till now
delete found
Found func-pointer as return-type, delete till now
delete found
Save file: att-bwctl-server.pickle
Save file: att-sysfsutils.pickle
Save file: att-otb-bin.pickle
Save file: att-openscenegraph.pickle
Save file: att-iputils-ping.pickle
Save file: att-pciutils.pickle
Save file: att-sqlite3.pickle
Save file: att-brasero.pickle
Save file: att-pulseaudio-utils.pickle
Save file: att-owx.pickle
Found func-pointer as return-type, delete till now
delete found
Found func-pointer as return-type, delete till now
delete found
Found func-pointer as return-type, delete till now
delete found
Save file: att-xwayland.pickle
Save file: att-babeld.pickle
Save file: att-powertop.pickle
Save file: att-wdiff.pickle
Save file: att-gjs.pickle
Save fi

Save file: att-rtkit.pickle
Save file: att-sane-utils.pickle
Save file: att-printer-driver-gutenprint.pickle
Save file: att-ow-shell.pickle
Save file: att-biff.pickle
Save file: att-guile-2.2-libs.pickle
Save file: att-accountwizard.pickle
Save file: att-kluppe.pickle
Save file: att-olpc-kbdshim.pickle
Save file: att-wpasupplicant.pickle
Found func-pointer as return-type, delete till now
delete found
Save file: att-barnowl.pickle
Save file: att-xfsdump.pickle
Save file: att-kwalletmanager.pickle
Save file: att-git.pickle
Save file: att-nginx-core.pickle
Save file: att-osmocom-bs11-utils.pickle
Save file: att-k4dirstat.pickle
Save file: att-bcal.pickle
Save file: att-badger.pickle
Save file: att-boinc-manager.pickle
Save file: att-osdlyrics.pickle
Save file: att-xserver-xorg-video-intel.pickle
Save file: att-ocrad.pickle
Save file: att-kgeography.pickle
Save file: att-bibshelf.pickle
Save file: att-finger.pickle
Save file: att-efivar.pickle
Found func-pointer as return-type, delete till

Save file: att-bamtools.pickle
Save file: att-bacula-tray-monitor.pickle
Found func-pointer as return-type, delete till now
delete found
Save file: att-dirmngr.pickle
Save file: att-kickpass.pickle
Save file: att-iscsiuio.pickle
Save file: att-kamoso.pickle
Save file: att-tinycdb.pickle
Save file: att-vim.pickle
Save file: att-keytouch-editor.pickle
Found func-pointer as return-type, delete till now
delete found
Found func-pointer as return-type, delete till now
delete found
Found func-pointer as return-type, delete till now
delete found
Found func-pointer as return-type, delete till now
delete found
Found func-pointer as return-type, delete till now
delete found
Save file: att-kolourpaint.pickle
Save file: att-systemd-container.pickle
Save file: att-rsyslog.pickle
Save file: att-bitseq.pickle
Save file: att-auditd.pickle
Save file: att-hfsutils.pickle
Save file: att-mdadm.pickle
Save file: att-sasl2-bin.pickle
Save file: att-keynav.pickle
Save file: att-quagga-isisd.pickle
Save file: 

Save file: att-bro-aux.pickle
Save file: att-inputattach.pickle
Save file: att-makehrtf.pickle
Found func-pointer as return-type, delete till now
delete found
Found func-pointer as return-type, delete till now
delete found
Found func-pointer as return-type, delete till now
delete found
Save file: att-openssh-client-ssh1.pickle
Save file: att-korganizer.pickle
Found func-pointer as return-type, delete till now
delete found
Found func-pointer as return-type, delete till now
delete found
Found func-pointer as return-type, delete till now
delete found
Found func-pointer as return-type, delete till now
delete found
Found func-pointer as return-type, delete till now
delete found
Found func-pointer as return-type, delete till now
delete found
Found func-pointer as return-type, delete till now
delete found
Found func-pointer as return-type, delete till now
delete found
Found func-pointer as return-type, delete till now
delete found
Found func-pointer as return-type, delete till now
delete foun

Found func-pointer as return-type, delete till now
delete found
Save file: att-reiser4progs.pickle
Save file: att-bossa-cli.pickle
Save file: att-mokutil.pickle
Save file: att-kde-telepathy-send-file.pickle
Save file: att-bluedevil.pickle
Save file: att-biboumi.pickle
Save file: att-bidiv.pickle
Save file: att-kaffeine.pickle
Save file: att-chrony.pickle
Save file: att-openmpi-bin.pickle
Save file: att-bsdiff.pickle
Save file: att-kakasi.pickle
Save file: att-xserver-xorg-core.pickle
Save file: att-beep.pickle
Save file: att-krdc.pickle
Save file: att-winbind.pickle
Save file: att-shared-mime-info.pickle
Save file: att-gnome-startup-applications.pickle
Save file: att-hunspell-tools.pickle
Save file: att-transmission-gtk.pickle
Found func-pointer as return-type, delete till now
delete found
Found func-pointer as return-type, delete till now
delete found
Found func-pointer as return-type, delete till now
delete found
Found func-pointer as return-type, delete till now
delete found
Found f

Save file: att-kvirc.pickle
Save file: att-procps.pickle
Save file: att-kcharselect.pickle
Save file: att-kxstitch.pickle
Save file: att-xmlsec1.pickle
Save file: att-wireless-tools.pickle
Save file: att-bluez-btsco.pickle
Save file: att-kopano-ical.pickle
Save file: att-spice-vdagent.pickle
Save file: att-btscanner.pickle
Save file: att-gobject-introspection.pickle
Save file: att-btag.pickle
bag-style-disas:['endbr64', 'lea', '0x', '(', '%', 'rdi', ')', ',', '%', 'rax', 'mov', '%', 'rsi', ',', '(', '%', 'rdi', ')', 'mov', '%', 'rax', ',', '0x', '(', '%', 'rdi', ')', 'lea', '0x', '(', '%', 'rdi', ')', ',', '%', 'rax', 'mov', '%', 'rax', ',', '0x', '(', '%', 'rdi', ')', 'lea', '0x', '(', '%', 'rdi', ')', ',', '%', 'rax', 'mov', '%', 'rdx', ',', '0x', '(', '%', 'rdi', ')', 'mov', '%', 'rcx', ',', '0x', '(', '%', 'rdi', ')', 'movq', '$', '0x', ',', '0x', '(', '%', 'rdi', ')', 'movl', '$', '0x', ',', '0x', '(', '%', 'rdi', ')', 'movq', '$', '0x', ',', '0x', '(', '%', 'rdi', ')', 'movl', '$