In [1]:
import tensorflow as tf
from tensorflow.python import keras
import numpy as np
import process
from file_handler import File_Handler

## Verify version

In [2]:
print(keras.__version__)
print(tf.__version__)

2.2.4-tf
2.0.0


## Global Variables

In [3]:
malware_path = "../data/malware/"
benign_path = "../data/benign/"
keep_amt = 30
last_key = "other"

max_opcode_len = 500
lstm_num_units = 128
batch_size = 20
num_epochs = 10
checkpoint_path = 'training_checkpoint.keras'
log_dir = 'logs/'

In [5]:
mal_handler, ben_handler, most_common_opcodes = process.get_most_common_opcodes(
                                                                    keep_amt = keep_amt, 
                                                                    last_key = last_key, 
                                                                    malware_path = malware_path,
                                                                    benign_path = benign_path)

longest_len = max(mal_handler.get_longest_opcode_seq(), ben_handler.get_longest_opcode_seq())

print(most_common_opcodes)
print(longest_len)

{'mov': 0, 'push': 1, 'add': 2, 'call': 3, 'cmp': 4, 'jmp': 5, 'xor': 6, 'pop': 7, 'jz': 8, 'jnz': 9, 'lea': 10, 'sub': 11, 'test': 12, 'retn': 13, 'or': 14, 'and': 15, 'inc': 16, 'nop': 17, 'dec': 18, 'shr': 19, 'movzx': 20, 'jb': 21, 'sbb': 22, 'adc': 23, 'shl': 24, 'leave': 25, 'imul': 26, 'jnb': 27, 'jbe': 28, 'xchg': 29, 'other': 30}
360169


In [5]:
def populate_train_set(file_handler):
    num_files = file_handler.get_num_files()
    train_set = list()

    # populate train_set
    for file_counter,file in enumerate(file_handler.get_files()):
        opcode_counter = 0
        counter = 0
        with open(file, 'r') as f:
            file_copdes = list()
            opcode = f.readline()
            while opcode and counter < max_opcode_len:
                opcode = opcode.strip()
                if opcode in most_common_opcodes:
                    opcode_key = most_common_opcodes[opcode]
                else:
                    opcode_key = most_common_opcodes[last_key]
                
                train_set[file_counter][opcode_counter][opcode_key] = 1.0
                opcode_counter += 1
                opcode = f.readline()

    return train_set

In [6]:
# labels: 1 for malware, 0 for benign
longest_len = max(mal_handler.get_longest_opcode_seq(), ben_handler.get_longest_opcode_seq())

mal_train_set = populate_train_set(mal_handler, longest_len)
mal_labels = np.ones( shape=(len(mal_train_set),1) )

ben_train_set = populate_train_set(ben_handler, longest_len)
ben_labels = np.zeros( shape=(len(ben_train_set),1) )

train_set = np.concatenate((mal_train_set, ben_train_set), axis=0)
train_labels = np.concatenate((mal_labels, ben_labels), axis=0)

print("train_set shape: {}".format(train_set.shape))
print("train_labels shape: {}".format(train_labels.shape))

train_set shape: (7596, 8877, 30)
train_labels shape: (7596, 1)


In [7]:
print(train_labels)

[[1.]
 [1.]
 [1.]
 ...
 [0.]
 [0.]
 [0.]]


## Make the model

In [10]:
# num_opcodes_per_file = len(train_set[0])
# num_opcodes = keep_amt+1


import numpy
import tensorflow as tf
# fix random seed for reproducibility
numpy.random.seed(7)

# load the dataset but only keep the top n words, zero the rest
top_words = 5000
(X_train, y_train), (X_test, y_test) = tf.keras.datasets.imdb.load_data(num_words=top_words)

print(X_train)

# # truncate and pad input sequences
max_review_length = 500
X_train = tf.keras.preprocessing.sequence.pad_sequences(X_train, maxlen=max_review_length)
X_test = tf.keras.preprocessing.sequence.pad_sequences(X_test, maxlen=max_review_length)

print(X_train)



# # create the model
# embedding_vecor_length = 32
# model = tf.keras.models.Sequential()
# #model.add(tf.keras.layers.Embedding(top_words, embedding_vecor_length, input_length=max_review_length))
# model.add(tf.keras.layers.LSTM(100))
# model.add(tf.keras.layers.Dense(1, activation='sigmoid'))
# model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
# model.fit(X_train, y_train, validation_data=(X_test, y_test), epochs=3, batch_size=64)





# model = tf.keras.Sequential()
# model.add(tf.keras.layers.LSTM(units=lstm_num_units, input_shape=(num_opcodes_per_file, num_opcodes), activation='sigmoid'))
# model.add(tf.keras.layers.Dense(1, activation='softmax'))

# optimizer = tf.keras.optimizers.Adam()
# model.compile(loss='binary_crossentropy', optimizer=optimizer, metrics=['mse'])

[list([1, 14, 22, 16, 43, 530, 973, 1622, 1385, 65, 458, 4468, 66, 3941, 4, 173, 36, 256, 5, 25, 100, 43, 838, 112, 50, 670, 2, 9, 35, 480, 284, 5, 150, 4, 172, 112, 167, 2, 336, 385, 39, 4, 172, 4536, 1111, 17, 546, 38, 13, 447, 4, 192, 50, 16, 6, 147, 2025, 19, 14, 22, 4, 1920, 4613, 469, 4, 22, 71, 87, 12, 16, 43, 530, 38, 76, 15, 13, 1247, 4, 22, 17, 515, 17, 12, 16, 626, 18, 2, 5, 62, 386, 12, 8, 316, 8, 106, 5, 4, 2223, 2, 16, 480, 66, 3785, 33, 4, 130, 12, 16, 38, 619, 5, 25, 124, 51, 36, 135, 48, 25, 1415, 33, 6, 22, 12, 215, 28, 77, 52, 5, 14, 407, 16, 82, 2, 8, 4, 107, 117, 2, 15, 256, 4, 2, 7, 3766, 5, 723, 36, 71, 43, 530, 476, 26, 400, 317, 46, 7, 4, 2, 1029, 13, 104, 88, 4, 381, 15, 297, 98, 32, 2071, 56, 26, 141, 6, 194, 2, 18, 4, 226, 22, 21, 134, 476, 26, 480, 5, 144, 30, 2, 18, 51, 36, 28, 224, 92, 25, 104, 4, 226, 65, 16, 38, 1334, 88, 12, 16, 283, 5, 16, 4472, 113, 103, 32, 15, 16, 2, 19, 178, 32])
 list([1, 194, 1153, 194, 2, 78, 228, 5, 6, 1463, 4369, 2, 134, 26, 

In [20]:
callback_checkpoint = tf.keras.callbacks.ModelCheckpoint(filepath=checkpoint_path,
                                                         monitor='val_loss',
                                                         verbose=1,
                                                         save_weights_only=True,
                                                         save_best_only=True)

callback_early_stopping = tf.keras.callbacks.EarlyStopping(monitor='val_loss',
                                                           patience=3,
                                                           verbose=1)

callback_tensorboard = tf.keras.callbacks.TensorBoard(log_dir=log_dir,
                                                      histogram_freq=0,
                                                      write_graph=False)

callbacks = [callback_checkpoint]

In [21]:
model.fit(x=train_set,
          y=train_labels,
          batch_size=batch_size,
          epochs=num_epochs,
          verbose=1,
          callbacks=callbacks,
          shuffle=True,
          validation_split=0.05)

Train on 7216 samples, validate on 380 samples
Epoch 1/10


KeyboardInterrupt: 

In [None]:
print("hi")