# Extract all possible instructions and special registers of the 64-bit architecture

In [173]:
import json
import os

In [174]:
# Path of the dataset file; the loading cell parses every line of it as a JSON document.
DATASET_PATH = './dataset/original/DUP/dataset.json'
# Folder path for the keyword/asm analysis (not referenced in the visible part of this notebook).
FOLDER_KEYWORD_ASM_ANALYSIS = './keyword_asm_analysis'

In [175]:
# Load the dataset: every non-empty line of the file is one JSON document.
# The context manager closes the file even if reading fails.
with open(DATASET_PATH, 'r') as dataset_file:
    dataset_file_lines = dataset_file.readlines()

# Blank lines (e.g. a trailing empty line) are skipped because json.loads('')
# would raise instead of yielding a record.
json_files = [
    json.loads(line.strip())
    for line in dataset_file_lines
    if line.strip()
]

In [176]:
#can't use json.loads() because the string contains ' instead of ", so this method is more efficient
def lista_asm_to_list(lista_asm):
    '''
        turn the stringified asm list of the json file into a real list,
        one element per line (instruction)
        @lista_asm = asm from json file, textual form "['instr', 'instr', ...]"
        @return list of instruction strings
    '''
    # Drop the enclosing brackets, then cut on "'," (closing quote + comma) so
    # that operand commas such as the one in "mov rbp, rsp" stay in one element.
    chunks = lista_asm[1:-1].strip().split("',")
    # Every chunk still begins with its opening quote: remove that character.
    instructions = [chunk.strip()[1:] for chunk in chunks]
    # The final chunk was not followed by "'," and so keeps its closing quote.
    instructions[-1] = instructions[-1][:-1]
    return instructions

In [177]:
# Sample run: decode the assembly listing of the first dataset entry.
asm_list = lista_asm_to_list(json_files[0]['lista_asm'])

In [178]:
asm_list

['jmp qword ptr [rip + 0x220882]',
 'jmp qword ptr [rip + 0x220832]',
 'jmp qword ptr [rip + 0x220822]',
 'push rbp',
 'mov rbp, rsp',
 'mov dword ptr [rbp - 4], edi',
 'cmp dword ptr [rbp - 4], 8',
 'jge 0x1d',
 'push rbp',
 'mov rbp, rsp',
 'sub rsp, 0x20',
 'mov qword ptr [rbp - 0x10], rdi',
 'cmp qword ptr [rbp - 0x10], 0',
 'je 0x34',
 'mov rax, qword ptr [rbp - 0x10]',
 'cmp dword ptr [rax + 4], 0',
 'jl 0x1d',
 'mov rax, qword ptr [rbp - 0x10]',
 'cmp qword ptr [rax + 8], 0',
 'jne 0x1c',
 'mov qword ptr [rbp - 8], 0',
 'jmp 0x108',
 'mov eax, 0x10',
 'mov edi, eax',
 'call 0xfffffffffffec70f',
 'mov qword ptr [rbp - 0x18], rax',
 'cmp qword ptr [rbp - 0x18], 0',
 'jne 0x1c',
 'mov qword ptr [rbp - 8], 0',
 'jmp 0xe0',
 'mov rax, qword ptr [rbp - 0x10]',
 'mov ecx, dword ptr [rax + 4]',
 'mov dword ptr [rbp - 0x1c], ecx',
 'mov ecx, dword ptr [rbp - 0x1c]',
 'add ecx, 1',
 'mov edi, ecx',
 'call 0xfffffffffffff6b7',
 'mov dword ptr [rbp - 0x20], eax',
 'movsxd rdi, dword ptr [rb

In [179]:
def asm_decomposed_list(lista_asm):
    '''
        convert lista_asm to a list containing a dict for each instruction where keys are: instruction, S, D
        S is the first operand and D the second one; a missing operand is the empty string ''
        operands after the second one (three-operand forms) are not stored
        an empty listing ("[]") gives an empty list

        @lista_asm = asm from json file, textual form "['instr', 'instr', ...]"
        @return list of dicts, one per instruction
    '''
    result = []
    # [1:-2] drops the opening "[" and the closing "']".
    # Split on "'," instead of a single comma to avoid splitting an
    # instruction like "mov rbp, rsp" in the middle of its operands.
    raw_list = lista_asm[1:-2].strip().split("',")

    for raw in raw_list:
        line = raw.strip()[1:]  # remove "'" at the beginning
        if not line:
            # Nothing left after removing the quote: no instruction to record.
            continue

        # INSTRUCTION - S,D : the mnemonic ends at the first space.
        mnemonic, _, operands = line.partition(" ")
        s_d = operands.split(',')
        S = s_d[0].strip()
        # Take the second operand whenever there is one, so that three-operand
        # instructions such as "imul eax, ecx, 0x10" still report D = 'ecx'.
        D = s_d[1].strip() if len(s_d) >= 2 else ''

        result.append({'instruction': mnemonic.strip(), 'S': S, 'D': D})

    return result

In [180]:
# Sample run: decompose the listing of the first dataset entry and display the result.
liss = asm_decomposed_list(json_files[0]['lista_asm'])
liss

[{'instruction': 'jmp', 'S': 'qword ptr [rip + 0x220882]', 'D': ''},
 {'instruction': 'jmp', 'S': 'qword ptr [rip + 0x220832]', 'D': ''},
 {'instruction': 'jmp', 'S': 'qword ptr [rip + 0x220822]', 'D': ''},
 {'instruction': 'push', 'S': 'rbp', 'D': ''},
 {'instruction': 'mov', 'S': 'rbp', 'D': 'rsp'},
 {'instruction': 'mov', 'S': 'dword ptr [rbp - 4]', 'D': 'edi'},
 {'instruction': 'cmp', 'S': 'dword ptr [rbp - 4]', 'D': '8'},
 {'instruction': 'jge', 'S': '0x1d', 'D': ''},
 {'instruction': 'push', 'S': 'rbp', 'D': ''},
 {'instruction': 'mov', 'S': 'rbp', 'D': 'rsp'},
 {'instruction': 'sub', 'S': 'rsp', 'D': '0x20'},
 {'instruction': 'mov', 'S': 'qword ptr [rbp - 0x10]', 'D': 'rdi'},
 {'instruction': 'cmp', 'S': 'qword ptr [rbp - 0x10]', 'D': '0'},
 {'instruction': 'je', 'S': '0x34', 'D': ''},
 {'instruction': 'mov', 'S': 'rax', 'D': 'qword ptr [rbp - 0x10]'},
 {'instruction': 'cmp', 'S': 'dword ptr [rax + 4]', 'D': '0'},
 {'instruction': 'jl', 'S': '0x1d', 'D': ''},
 {'instruction': 'm

In [181]:
#find special instructions and special registers
# all_instr / all_reg collect every distinct mnemonic and every distinct
# register candidate of the dataset, in order of first appearance.
all_instr = []
all_reg = []
# Companion sets give constant-time "already seen?" checks; the lists alone
# would make each membership test scan everything collected so far.
seen_instr = set()
seen_reg = set()

for file in json_files:
    for line in asm_decomposed_list(file['lista_asm']):
        mnemonic = line['instruction']
        if mnemonic not in seen_instr:
            seen_instr.add(mnemonic)
            all_instr.append(mnemonic)

        # S is examined before D so the first-seen order matches the operand order.
        for operand in (line['S'], line['D']):
            # Keep bare tokens only: non-empty, without spaces (drops forms
            # like "dword ptr [rbp - 4]") and without "0x" (drops hex values).
            # Decimal immediates such as '8' still pass this filter.
            if operand != '' and ' ' not in operand and '0x' not in operand:
                if operand not in seen_reg:
                    seen_reg.add(operand)
                    all_reg.append(operand)


In [182]:
all_instr

['jmp',
 'push',
 'mov',
 'cmp',
 'jge',
 'sub',
 'je',
 'jl',
 'jne',
 'call',
 'add',
 'movsxd',
 'pop',
 'ret',
 'ucomisd',
 'jp',
 'pxor',
 'jae',
 'jb',
 'test',
 'movapd',
 'movsd',
 'divsd',
 'addsd',
 'xorpd',
 'xor',
 'nop',
 'lea',
 'sar',
 'shr',
 'and',
 'neg',
 'cdqe',
 'movzx',
 'shl',
 'or',
 'inc',
 'cmovne',
 'jle',
 'dec',
 'bswap',
 'ror',
 'rol',
 'xorps',
 'movaps',
 'movabs',
 'movq',
 'punpcklqdq',
 'pand',
 'movdqa',
 'movsx',
 'paddq',
 'psrlq',
 'por',
 'pshufd',
 'movd',
 'cmovl',
 'jbe',
 'cmove',
 'jg',
 'setne',
 'cmovle',
 'cmovge',
 'setl',
 'sete',
 'ja',
 'js',
 'rep',
 'jns',
 'jnp',
 'leave',
 'not',
 'imul',
 'cmovg',
 'cmovb',
 'setle',
 'setg',
 'pcmpeqd',
 'bt',
 'subsd',
 'mulsd',
 'andpd',
 'cdq',
 'idiv',
 'pinsrw',
 'punpcklbw',
 'punpcklwd',
 'pslld',
 'movdqu',
 'cmovns',
 'shrd',
 'cmovbe',
 'cmova',
 'cvtsi2sd',
 'cvttsd2si',
 'movlpd',
 'mul',
 'setnp',
 'punpckldq',
 'adc',
 'stc',
 'out',
 'scasb',
 'in',
 'sbb',
 'rcl',
 'fiadd',
 'fc

In [183]:
all_reg

['rbp',
 'rsp',
 'edi',
 '8',
 'rdi',
 '0',
 'rax',
 'eax',
 'ecx',
 '1',
 '5',
 'rcx',
 'rsi',
 'rdx',
 'xmm2',
 'xmm0',
 'xmm1',
 'xmm3',
 'esi',
 'rbx',
 'ebp',
 'ebx',
 'r8d',
 'r10d',
 'r9d',
 'cl',
 '3',
 'al',
 '7',
 'r10',
 'r11d',
 'dil',
 '4',
 '6',
 'r11b',
 '2',
 'bl',
 '9',
 'edx',
 'dl',
 'r14',
 'r15',
 'r13',
 'r12',
 'r14d',
 'r15d',
 'r12d',
 'r13d',
 'ch',
 'r8',
 'r9',
 'xmm4',
 'xmm5',
 'bpl',
 '-6',
 'dh',
 'ah',
 'r13b',
 '[rdx*4]',
 'r12b',
 'sil',
 '-8',
 'xmm6',
 '-1',
 'bh',
 'r10b',
 'r8b',
 'r11',
 'r15b',
 'r9b',
 '[r9*8]',
 '[r10*8]',
 '[rdi*8]',
 '[rcx*8]',
 '[rax*8]',
 '[r11*8]',
 '-4',
 '[rsp]',
 '[rbx*4]',
 '-2',
 '[r8*4]',
 '[rbp*4]',
 '[r13*4]',
 'r14b',
 '[r15*8]',
 'xmm7',
 'dx',
 'r9w',
 'r10w',
 'cx',
 'di',
 'ax',
 'bp',
 'r8w',
 'bx',
 'xmm14',
 'r11w',
 'si',
 '[r12*4]',
 '[r15*4]',
 '[rax*4]',
 'xmm8',
 'xmm10',
 'xmm11',
 'xmm9',
 'xmm15',
 'xmm12',
 'xmm13',
 '[r14*4]',
 'r12w',
 'r15w',
 'r14w',
 'r13w',
 '[r13*8]',
 '[rbx*8]',
 '[r11*4]'

## Find special instruction

In [227]:
# Register names treated as already known: the 64-, 32-, 16- and low 8-bit
# forms of rax..r15 (the list is grouped in that order).
known_registers =['rax', 'rcx', 'rdx', 'rbx', 'rsi', 'rdi', 'rsp', 'rbp', 'r8', 'r9', 'r10', 'r11', 'r12', 'r13', 'r14', 'r15', 'eax', 'ecx', 'edx', 'ebx', 'esi', 'edi', 'esp', 'ebp', 'r8d', 'r9d', 'r10d', 'r11d', 'r12d', 'r13d', 'r14d', 'r15d', 'ax', 'cx', 'dx', 'bx', 'si', 'di', 'sp', 'bp', 'r8w', 'r9w', 'r10w', 'r11w', 'r12w', 'r13w', 'r14w', 'r15w', 'al', 'cl', 'dl', 'bl', 'sil', 'dil', 'spl', 'bpl', 'r8b', 'r9b', 'r10b', 'r11b', 'r12b', 'r13b', 'r14b', 'r15b']

# Instruction mnemonics treated as already known.
# NOTE(review): some entries look like AT&T-style spellings ('leaq', 'cltq',
# 'cqto', 'idivq', ...) while the mnemonics extracted from the dataset are
# Intel-style ('lea', 'cdqe', 'cqo', 'idiv') - confirm this is intended.
known_instructions = ['call', 'ret', 'leave', 'mov', 'push', 'pop', 'cwtl', 'cltq', 'cqto', 'inc', 'dec', 'neg', 'not', 'leaq', 'add', 'sub', 'imul', 'xor', 'or', 'and', 'sal', 'shl', 'sar', 'shr', 'imulq', 'mulq', 'idivq', 'divq', 'cmp', 'test', 'sete', 'setz', 'setne', 'setnz', 'sets', 'setns', 'setg', 'setnle', 'setge', 'setnl', 'setl', 'setnge', 'setle', 'setng', 'seta', 'setnbe', 'setae', 'setnb', 'setb', 'setnae', 'setbe', 'setna', 'jmp', 'je', 'jz', 'jne', 'jnz', 'js', 'jns', 'jg', 'jnle', 'jge', 'jnl', 'jl', 'jnge', 'jle', 'jng', 'ja', 'jnbe', 'jae', 'jnb', 'jb', 'jnae', 'jbe', 'jna', 'cmove', 'cmovz', 'cmovne', 'cmovnz', 'cmovs', 'cmovns', 'cmovg', 'cmovnle', 'cmovge', 'cmovnl', 'cmovl', 'cmovnge', 'cmovle', 'cmovng', 'cmova', 'cmovnbe', 'cmovae', 'cmovnb', 'cmovb', 'cmovnae', 'cmovbe', 'cmovna']

In [228]:
def remove_instr(instructions, instr_to_remove):
    '''
        filter out every entry that contains instr_to_remove as a substring
        @instructions = list of instruction strings
        @instr_to_remove = text whose presence discards an entry
        @return new list with the surviving entries, original order kept
    '''
    return [entry for entry in instructions if instr_to_remove not in entry]

In [232]:
# Collect the mnemonics of all_instr that are missing from known_instructions.
# Each one is stored with a leading space, the same form the prefix filtering
# cell builds for known mnemonics (' ' + instruction).
unknown_instr = []
for inst in all_instr:
    tagged = ' ' + inst
    # The duplicate check must use the stored (space-prefixed) form; testing
    # the bare mnemonic against the list could never match.
    if inst not in known_instructions and tagged not in unknown_instr:
        unknown_instr.append(tagged)

len(unknown_instr)#, unknown_instr

129

In [231]:
# Starting from the unknown mnemonics, drop every entry that contains a known
# mnemonic right after the leading space (e.g. ' movsxd' goes away because of
# ' mov'), so only entries unrelated to the known ones remain.
new_list = unknown_instr
for known in known_instructions:
    new_list = remove_instr(new_list, ' ' + known)
len(new_list), new_list

(99,
 [' ucomisd',
  ' jp',
  ' pxor',
  ' divsd',
  ' nop',
  ' lea',
  ' cdqe',
  ' bswap',
  ' ror',
  ' rol',
  ' punpcklqdq',
  ' pand',
  ' paddq',
  ' psrlq',
  ' por',
  ' pshufd',
  ' rep',
  ' jnp',
  ' pcmpeqd',
  ' bt',
  ' mulsd',
  ' cdq',
  ' idiv',
  ' pinsrw',
  ' punpcklbw',
  ' punpcklwd',
  ' pslld',
  ' cvtsi2sd',
  ' cvttsd2si',
  ' mul',
  ' setnp',
  ' punpckldq',
  ' adc',
  ' stc',
  ' out',
  ' scasb',
  ' in',
  ' sbb',
  ' rcl',
  ' fiadd',
  ' fcom',
  ' wait',
  ' cld',
  ' sqrtsd',
  ' psllq',
  ' xadd',
  ' pshuflw',
  ' pcmpeqb',
  ' pandn',
  ' setp',
  ' maxsd',
  ' minsd',
  ' pcmpgtd',
  ' ucomiss',
  ' repne',
  ' cvtsd2ss',
  ' cvtss2sd',
  ' cvtsi2ss',
  ' divss',
  ' cvttss2si',
  ' fld',
  ' fprem',
  ' fnstsw',
  ' cwde',
  ' bsr',
  ' div',
  ' loopne',
  ' pinsrd',
  ' psubq',
  ' mulpd',
  ' unpcklpd',
  ' pmuludq',
  ' fldz',
  ' fstp',
  ' fld1',
  ' jo',
  ' pslldq',
  ' fxch',
  ' fucomi',
  ' fucomip',
  ' ffreep',
  ' punpckhbw',
  '

### Classification of new instructions

In [None]:
 '''
 [' ucomisd', ->comparison_test
  ' jp', ->jump
  ' pxor', -> binary_op_spec
  ' divsd', -> special operations
  ' nop',   ->(no operation) extra 
  ' lea', -> binary_operations
  ' cdqe', -> convert 
  ' bswap', (swap) -> move bit
  ' ror', ->move bit
  ' rol', ->move bit
  ' punpcklqdq', -> extra
  ' pand', -> bin op spec
  ' paddq', -> padd -> bin op spec
  ' psrlq', ->shift
  ' por', -> bin op spec
  ' pshufd', (shuffle) move bit
  ' rep', data_movement
  ' jnp', jump
  ' pcmpeqd', comparison test
  ' bt', comp test
  ' mulsd', special op
  ' cdq', convert
  ' idiv', special op
  ' pinsrw', data move
  ' punpcklbw', data move
  ' punpcklwd', data move
  ' pslld', shift
  ' cvtsi2sd', convert
  ' cvttsd2si', convert
  ' mul', spec op
  ' setnp', condit
  ' punpckldq', extra 
  ' adc', bin op 
  ' stc', extra 
  ' out', extra 
  ' scasb', compar
  ' in', extra 
  ' sbb', bin op spec
  ' rcl', move bit
  ' fiadd', convert
  ' fcom', compar
  ' wait', extra
  ' cld', extra
  ' sqrtsd', spec oper
  ' psllq', shift
  ' xadd', bin op spec
  ' pshuflw', move bit 
  ' pcmpeqb', comp
  ' pandn', bin op spec
  ' setp',  condit
  ' maxsd', comp
  ' minsd', comp
  ' pcmpgtd', comp
  ' ucomiss', comp
  ' repne', comp
  ' cvtsd2ss', conv
  ' cvtss2sd', conv
  ' cvtsi2ss', conv
  ' divss', spec op
  ' cvttss2si', conv
  ' fld', data move
  ' fprem', speci op
  ' fnstsw', data mov
  ' cwde', convert
  ' bsr', move bit 
  ' div', spec op
  ' loopne', extra
  ' pinsrd', data move
  ' psubq', bin op spec
  ' mulpd', spec op
  ' unpcklpd', data mov
  ' pmuludq', special op
  ' fldz', data mov ->fld
  ' fstp', data mov ->fst
  ' fld1',  data mov ->fld
  ' jo', jmp
  ' pslldq', shift
  ' fxch', data mov
  ' fucomi', compare
  ' fucomip', comp -> fucomi
  ' ffreep', data mov
  ' punpckhbw', data mov
  ' punpckhwd', data mov
  ' paddd', ->padd
  ' fchs', -> bin op spec
  ' cqo', convert
  ' lodsd', data mov
  ' xchg', data mov
  ' shufpd', move bit
  ' psrlw', shift
  ' packuswb', convert
  ' fcmove', cond move
  ' jno', jmp
  ' fmul', special op
  ' pshufhw', move bit
  ' paddb', ->padd
  ' fst', data mov
  ' fcmovne', cond mov
  ' fsubp']) -> bin op

  '''

## Find new registers

In [None]:

 'xmm2',
 'xmm0',
 'xmm1',
 'xmm3',
 'xmm4',
 'xmm5',
 'xmm14',
 'xmm6',
 'xmm7',
 'xmm8',
 'xmm10',
 'xmm11',
 'xmm9',
 'xmm15',
 'xmm12',
 'xmm13',

 'st(0)',
 'st(1)',
 'st(6)',


In [None]:
xmm -> register_xmm.reg
st -> register_st.reg 