In [1]:
import tensorflow as tf
from tensorflow.python import keras
import numpy as np
from sklearn.model_selection import train_test_split
import data_loader

## Verify library versions

In [2]:
# Report the Keras and TensorFlow versions this notebook was run against,
# in that order, one per line.
for module in (keras, tf):
    print(module.__version__)

2.2.4-tf
2.0.0


## Global Variables

In [3]:
# --- Data locations ---------------------------------------------------------
# Paths are relative to the notebook's working directory.
allFilesDir = "../data/samples/"
malwareDir = allFilesDir + "malware/"
benignDir = allFilesDir + "benign/"
malFamFileDir = "../data/DB_RELEASE1.0.sql"

# --- data_loader settings ---------------------------------------------------
# These three values are handed straight to data_loader.getTrainData_malware;
# their exact meaning is defined by that module (not visible here).
keepAmt = 29
lastKey = "other"
numberToClassify = 2

# --- Sequence / model hyperparameters ---------------------------------------
maxOpcodeLen = 10000          # sequence length passed to the loader
embedding_vector_len = 64     # Embedding output_dim
lstm_num_units = 150          # LSTM units

# --- Training settings ------------------------------------------------------
batch_size = 64
num_epochs = 20
test_size = 0.2               # fraction of samples held out by train_test_split

# --- Output artefacts -------------------------------------------------------
checkpoint_path = "training_checkpoint.keras"
log_dir = "logs/"

In [4]:
# 0 for winwebsec, 1 for zbot
trainingData, numLabels = data_loader.getTrainData_malware(malFamFileDir, 
                                                           allFilesDir,
                                                           malwareDir,
                                                           maxOpcodeLen, 
                                                           lastKey,
                                                           numberToClassify,
                                                           keepAmt)





In [None]:
# Show the second value returned by data_loader.getTrainData_malware.
print(numLabels)

In [15]:
# Assemble the binary malware-vs-benign dataset and split it into train/test.
# labels: 1 for malware, 0 for benign
#
# NOTE(review): populate_train_set, mal_handler and ben_handler are not defined
# in any cell visible in this notebook — confirm they are created before this
# cell runs, otherwise Restart & Run All fails here.

max_malware_samples = 200  # cap on the number of malware samples kept
split_seed = 42            # fixed seed so the train/test split is reproducible

# Malware samples: truncate to the cap, then label every kept sample 1.
mal_train_set = populate_train_set(mal_handler)[:max_malware_samples]
mal_labels = np.ones(shape=(len(mal_train_set), 1))

# Benign samples: all kept, labelled 0.
ben_train_set = populate_train_set(ben_handler)
ben_labels = np.zeros(shape=(len(ben_train_set), 1))

# Stack malware first, then benign, so rows line up with the labels below.
train_set = np.concatenate((mal_train_set, ben_train_set), axis=0)
# Pad/truncate every sequence to the configured length. The config cell names
# this constant maxOpcodeLen (the previous max_opcode_len was never defined).
train_set = tf.keras.preprocessing.sequence.pad_sequences(train_set, maxlen=maxOpcodeLen)
train_labels = np.concatenate((mal_labels, ben_labels), axis=0)

# Hold out `test_size` of the samples for evaluation.
train_set, test_set, train_labels, test_labels = train_test_split(
    train_set,
    train_labels,
    test_size=test_size,
    random_state=split_seed,
)

print(len(mal_train_set))
print(len(ben_train_set))
print("train_set shape: {}".format(train_set.shape))
print("train_labels shape: {}".format(train_labels.shape))
print("test_set shape: {}".format(test_set.shape))
print("test_labels shape: {}".format(test_labels.shape))

200
130
train_set shape: (264, 10000)
train_labels shape: (264, 1)
test_set shape: (66, 10000)
test_labels shape: (66, 1)


## Make the model

In [16]:
# Binary sequence classifier: Embedding -> Dropout -> LSTM -> Dropout -> sigmoid.
# Uses keepAmt and maxOpcodeLen from the config cell; the previous keep_amt /
# max_opcode_len names were never defined in the visible cells.
model = tf.keras.models.Sequential([
    # Vocabulary size is keepAmt + 1.
    # NOTE(review): the extra index is presumably the 0 used for padding by
    # pad_sequences — confirm against data_loader's opcode encoding.
    tf.keras.layers.Embedding(input_dim=keepAmt + 1,
                              output_dim=embedding_vector_len,
                              input_length=maxOpcodeLen),
    tf.keras.layers.Dropout(0.2),
    tf.keras.layers.LSTM(lstm_num_units),
    tf.keras.layers.Dropout(0.2),
    # Single sigmoid unit: output is the predicted probability of label 1.
    tf.keras.layers.Dense(1, activation='sigmoid'),
])

optimizer = tf.keras.optimizers.Adam()

model.compile(loss='binary_crossentropy', optimizer=optimizer, metrics=['accuracy'])

In [17]:
# Train on the training split only; no validation data or callbacks are used.
# Left as the cell's final expression so the returned History object is shown.
model.fit(train_set, train_labels, batch_size=batch_size, epochs=num_epochs)

Train on 264 samples
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


<tensorflow.python.keras.callbacks.History at 0x23d017a6f88>

## Test the model

In [18]:
# Evaluate on the held-out split. With metrics=['accuracy'] at compile time,
# scores is [loss, accuracy]; report the accuracy as a percentage.
scores = model.evaluate(test_set, test_labels, verbose=0)
accuracy_pct = scores[1] * 100
print("Accuracy: %0.2f%%" % accuracy_pct)

Accuracy: 96.97%
