In [27]:
import tensorflow as tf
from tensorflow.python import keras
import numpy as np
import process
from file_handler import File_Handler

## Verify version

In [28]:
print(keras.__version__)
print(tf.__version__)

2.2.4-tf
2.0.0


## Global Variables

In [3]:
malware_path = "../data/malware/"
benign_path = "../data/benign/"
keep_amt = 29
last_key = "other"

max_opcode_len = 9000
min_opcode_len = 1000
lstm_num_units = 128
batch_size = 2
num_epochs = 10
checkpoint_path = 'training_checkpoint.keras'
log_dir = 'logs/'

In [4]:
mal_handler, ben_handler, most_common_opcodes = process.get_most_common_opcodes(
                                                                    keep_amt = keep_amt, 
                                                                    last_key = last_key, 
                                                                    malware_path = malware_path,
                                                                    benign_path = benign_path,
                                                                    max_opcode_len = max_opcode_len)

longest_len = max(mal_handler.get_longest_opcode_seq(), ben_handler.get_longest_opcode_seq())

print(most_common_opcodes)
print(longest_len)

{'mov': 0, 'push': 1, 'add': 2, 'call': 3, 'cmp': 4, 'jmp': 5, 'xor': 6, 'pop': 7, 'jz': 8, 'jnz': 9, 'lea': 10, 'sub': 11, 'test': 12, 'retn': 13, 'or': 14, 'and': 15, 'inc': 16, 'nop': 17, 'dec': 18, 'shr': 19, 'movzx': 20, 'jb': 21, 'sbb': 22, 'adc': 23, 'shl': 24, 'leave': 25, 'imul': 26, 'jnb': 27, 'jbe': 28, 'other': 29}
8877


In [5]:
def populate_train_set(file_handler, longest_len):
    num_files = file_handler.get_num_files()
    train_set = np.zeros(shape=(num_files, longest_len, keep_amt+1), dtype=np.float)

    # populate train_set
    for file_counter,file in enumerate(file_handler.get_files()):
        opcode_counter = 0
        with open(file, 'r') as f:
            opcode = f.readline()
            while opcode:
                opcode = opcode.strip()
                if opcode in most_common_opcodes:
                    opcode_key = most_common_opcodes[opcode]
                else:
                    opcode_key = most_common_opcodes[last_key]
                
                train_set[file_counter][opcode_counter][opcode_key] = 1.0
                opcode_counter += 1
                opcode = f.readline()

    return train_set

In [6]:
# labels: 1 for malware, 0 for benign
longest_len = max(mal_handler.get_longest_opcode_seq(), ben_handler.get_longest_opcode_seq())

mal_train_set = populate_train_set(mal_handler, longest_len)
mal_labels = np.ones( shape=(len(mal_train_set),1) )

ben_train_set = populate_train_set(ben_handler, longest_len)
ben_labels = np.zeros( shape=(len(ben_train_set),1) )

train_set = np.concatenate((mal_train_set, ben_train_set), axis=0)
train_labels = np.concatenate((mal_labels, ben_labels), axis=0)

print("train_set shape: {}".format(train_set.shape))
print("train_labels shape: {}".format(train_labels.shape))

train_set shape: (7596, 8877, 30)
train_labels shape: (7596, 1)


In [7]:
print(train_labels)

[[1.]
 [1.]
 [1.]
 ...
 [0.]
 [0.]
 [0.]]


## Make the model

In [19]:
num_opcodes_per_file = len(train_set[0])
num_opcodes = keep_amt+1

model = tf.keras.Sequential()
model.add(tf.keras.layers.LSTM(units=lstm_num_units, input_shape=(num_opcodes_per_file, num_opcodes), activation='sigmoid'))
model.add(tf.keras.layers.Dense(2, activation='softmax'))

optimizer = tf.keras.optimizers.Adam()
model.compile(loss='sparse_categorical_crossentropy', optimizer=optimizer, metrics=['accuracy'])

In [30]:
callback_checkpoint = tf.keras.callbacks.ModelCheckpoint(filepath=checkpoint_path,
                                                         monitor='val_loss',
                                                         verbose=1,
                                                         save_weights_only=True,
                                                         save_best_only=True)

callback_early_stopping = tf.keras.callbacks.EarlyStopping(monitor='val_loss',
                                                           patience=3,
                                                           verbose=1)

callback_tensorboard = tf.keras.callbacks.TensorBoard(log_dir=log_dir,
                                                      histogram_freq=0,
                                                      write_graph=False)

callbacks = [callback_checkpoint,  
             callback_tensorboard]

In [31]:
validation_split = 760 / len(train_set)

validation_split

0.10005265929436545

In [None]:
model.fit(x=train_set,
          y=train_labels,
          batch_size=batch_size,
          epochs=num_epochs,
          verbose=1,
          callbacks=callbacks,
          shuffle=True,
          validation_split=validation_split)

Train on 7596 samples
Epoch 1/10
   2/7596 [..............................] - ETA: 11:20:16 - loss: 0.3455 - accuracy: 1.0000

In [None]:
print("hi")