In [1]:
import tensorflow as tf
import numpy as np
import process

## Verify version

In [2]:
print(tf.__version__)

2.0.0


## Global Variables

In [3]:
malware_path = "../data/malware/"
benign_path = "../data/benign/"
keep_amt = 29
last_key = "other"

max_opcode_len = 9000
lstm_num_units = 128
batch_size = 64
num_epochs = 10
checkpoint_path = 'training_checkpoint.keras'
log_dir = 'logs/'

In [4]:
malware_files, benign_files, most_common_opcodes = process.get_most_common_opcodes(
                                                                    keep_amt=keep_amt, 
                                                                    last_key = last_key, 
                                                                    malware_path = malware_path,
                                                                    benign_path = benign_path)

print(most_common_opcodes)

{'mov': 0, 'push': 1, 'add': 2, 'call': 3, 'cmp': 4, 'jmp': 5, 'xor': 6, 'pop': 7, 'jz': 8, 'jnz': 9, 'lea': 10, 'sub': 11, 'test': 12, 'retn': 13, 'or': 14, 'and': 15, 'inc': 16, 'nop': 17, 'dec': 18, 'shr': 19, 'movzx': 20, 'jb': 21, 'sbb': 22, 'adc': 23, 'shl': 24, 'leave': 25, 'imul': 26, 'jnb': 27, 'jbe': 28, 'other': 29}


In [5]:
def populate_train_set(file_names):
    longest_len = -1
    file_name = ""
    train_set = list()

    # populate mal_train_set
    for file in file_names:
        file_opcodes = list()
        with open(file, 'r') as f:
            opcode = f.readline()

            while opcode:
                arr = np.zeros(keep_amt+1, dtype=np.float)
                opcode = opcode.strip()

                if opcode in most_common_opcodes:
                    arr[most_common_opcodes[opcode]] = 1
                else:
                    arr[most_common_opcodes[last_key]] = 1

                file_opcodes.append(arr)

                opcode = f.readline()
        
        # only add to train set if opcode sequence less than amount specified
        if len(file_opcodes) <= max_opcode_len:
            train_set.append(file_opcodes)
        
            if len(file_opcodes) > longest_len:
                longest_len = len(file_opcodes)
                file_name = file
    print("{} {}".format(file_name, longest_len))

    return train_set, longest_len

In [6]:
print("longest opcode sequences: ")

mal_train_set, mal_longest_len = populate_train_set(malware_files)
mal_labels = [1 for x in range(len(mal_train_set))]

ben_train_set, ben_longest_len = populate_train_set(benign_files)
ben_labels = [0 for x in range(len(ben_train_set))]

train_set = mal_train_set + ben_train_set
train_labels = mal_labels + ben_labels
longest_len = max(mal_longest_len, ben_longest_len)

# longest file: ../data/benign/linux_opcodes\busybox.asm.txt  360169    <---- this was deleted

longest opcode sequences: 
../data/malware/zeroaccess\7921ddb79bea40f457bf2441617070edc77d7e35.asm.txt 8877
../data/benign/linux_opcodes\ps.asm.txt 8740


In [7]:
# pad train_set to longest file length

for data in train_set:
    opcodes_list = data
    opcodes_len = len(data)
    
    for i in range(longest_len - opcodes_len):
        data.append(np.zeros(keep_amt+1, dtype=np.float))

In [8]:
print(len(train_set))
print(len(train_set[0]))
print(len(train_set[0][0]))

7596
8877
30


## Make the model

In [9]:
model = tf.keras.Sequential()
model.add(tf.keras.layers.LSTM(units=lstm_num_units, input_shape=(len(train_set[0]), len(train_set[0][0]))))
model.add(tf.keras.layers.Dense(2, activation='softmax'))

optimizer = tf.keras.optimizers.Adam()
model.compile(loss='categorical_crossentropy', optimizer=optimizer, metrics=['accuracy'])

In [10]:
callback_checkpoint = tf.keras.callbacks.ModelCheckpoint(filepath=checkpoint_path,
                                                         monitor='val_loss',
                                                         verbose=1,
                                                         save_weights_only=True,
                                                         save_best_only=True)

callback_early_stopping = tf.keras.callbacks.EarlyStopping(monitor='val_loss',
                                                           patience=3,
                                                           verbose=1)

callback_tensorboard = tf.keras.callbacks.TensorBoard(log_dir=log_dir,
                                                      histogram_freq=0,
                                                      write_graph=False)

callbacks = [callback_checkpoint, 
             callback_early_stopping, 
             callback_tensorboard]

In [14]:
model.fit(x=train_set,
          y=train_labels,
          batch_size=batch_size,
          epochs=num_epochs,
          verbose=2,
          callbacks=callbacks,
          shuffle=True,
          validation_split=0.1)

ValueError: Failed to find data adapter that can handle input: (<class 'list'> containing values of types {'(<class \'list\'> containing values of types {"<class \'numpy.ndarray\'>"})'}), (<class 'list'> containing values of types {"<class 'int'>"})

In [None]:
print("hi")