In [5]:
import tensorflow as tf
from tensorflow.python import keras
import numpy as np
from sklearn.model_selection import train_test_split
import data_loader

## Verify version

In [6]:
# Report the versions of the libraries the rest of the notebook actually uses.
# The model below is built through the public `tf.keras` namespace, so query
# that namespace instead of the private `tensorflow.python` alias, which is not
# part of the supported API and may point at a different Keras build.
print(tf.keras.__version__)
print(tf.__version__)

2.2.4-tf
2.0.0


## Global Variables

In [7]:
# --- Input locations ---------------------------------------------------------
# Paths are relative to the notebook's working directory.
malFamFileDir = '../data/DB_RELEASE1.0.sql'
allFilesDir = '../data/samples/'
benignDir = allFilesDir + "benign/"
malwareDir = allFilesDir + "malware/"

# --- Data-loader settings ----------------------------------------------------
# All three are forwarded to data_loader.getTrainData_malware; keepAmt is also
# used afterwards to scale the opcode values (division by keepAmt + 1).
# NOTE(review): presumably keepAmt is the number of distinct opcodes kept and
# lastKey the bucket name for everything else -- confirm in data_loader.
numberToClassify = 2
lastKey = "other"
keepAmt = 29

# --- Model / training hyperparameters ----------------------------------------
maxOpcodeLen = 10000   # every sequence is padded/truncated to this many steps
num_lstm_units = 150   # width of the single LSTM layer
embedVectorLen = 64    # not referenced by the cells visible in this notebook
batch_size, num_epochs = 64, 20
test_size = 0.2        # fraction of samples held out for evaluation

# --- Output locations (not referenced by the cells visible in this notebook) -
log_dir = 'logs/'
checkpoint_path = 'training_checkpoint.keras'

## Load the Data

In [8]:
# Label convention for the two malware families: 0 for winwebsec, 1 for zbot.
# The loader is called positionally, so the order of this tuple must match the
# signature of data_loader.getTrainData_malware.
loaderArgs = (
    malFamFileDir,
    allFilesDir,
    malwareDir,
    maxOpcodeLen,
    lastKey,
    numberToClassify,
    keepAmt,
)
trainData, numLabels = data_loader.getTrainData_malware(*loaderArgs)

# Sanity check: total number of loaded sequences, then numLabels.
# NOTE(review): numLabels looks like the per-class sample count (it is used
# later to size each class's label vector) -- confirm in data_loader.
print(len(trainData), numLabels, sep="\n")

4272
2136


## Prepare the Data

In [14]:
# Scale every opcode value by 1 / (keepAmt + 1) so the LSTM sees small floats
# instead of raw integer ids.
# NOTE(review): presumably ids lie in [0, keepAmt], which maps them into
# [0, 1); an id of 0 would then be indistinguishable from the 0.0 padding
# value used in the next cell -- confirm against data_loader.
opcodeScale = float(keepAmt + 1)
trainSet = [
    [opcode / opcodeScale for opcode in sequence]
    for sequence in trainData
]

In [15]:
# Pad every sequence with 0.0 up to maxOpcodeLen steps; with the pad_sequences
# defaults, padding and truncation both happen at the start of the sequence.
# float32 matches Keras' default compute dtype, so storing float64 ('float')
# would only double the memory of this (samples x maxOpcodeLen) array.
trainSet = tf.keras.preprocessing.sequence.pad_sequences(trainSet, maxlen=maxOpcodeLen, dtype='float32', value=0.0)

# Labels are assigned purely by position: the first numLabels samples get 0,
# the next numLabels samples get 1.
# NOTE(review): this assumes data_loader returns all class-0 samples first,
# followed by an equal number of class-1 samples -- confirm in data_loader.
assert len(trainSet) == 2 * numLabels, (
    "expected {} samples (2 * numLabels) but got {}".format(2 * numLabels, len(trainSet))
)
labels0 = np.zeros(shape=(numLabels,1))
labels1 = np.ones(shape=(numLabels,1))
trainLabels = np.concatenate((labels0, labels1), axis=0)

# Fix the seed so the train/test split (and therefore the reported accuracy)
# is reproducible across kernel restarts, and stratify so both splits keep the
# same class balance.
random_seed = 42
trainSet, testSet, trainLabels, testLabels = train_test_split(
    trainSet,
    trainLabels,
    test_size=test_size,
    random_state=random_seed,
    stratify=trainLabels,
)

# reshape trainSet and testSet into 3D arrays: (samples, timesteps, 1 feature),
# the input layout expected by the LSTM layer
trainSet = trainSet[:, :, np.newaxis]
testSet = testSet[:, :, np.newaxis]


print("train_set shape: {}".format(trainSet.shape))
print("train_labels shape: {}".format(trainLabels.shape))
print("test_set shape: {}".format(testSet.shape))
print("test_labels shape: {}".format(testLabels.shape))
print(trainSet[0])

train_set shape: (3417, 10000, 1)
train_labels shape: (3417, 1)
test_set shape: (855, 10000, 1)
test_labels shape: (855, 1)
[[0.        ]
 [0.        ]
 [0.        ]
 ...
 [0.16666667]
 [0.16666667]
 [0.16666667]]


## Make the model

In [16]:
# Binary classifier: one LSTM layer reads the opcode sequence, dropout
# regularises its final output, and a single sigmoid unit emits the
# probability of class 1.
model = tf.keras.models.Sequential([
    tf.keras.layers.LSTM(num_lstm_units),
    tf.keras.layers.Dropout(0.2),
    tf.keras.layers.Dense(1, activation='sigmoid'),
])

# Adam with its default settings; binary cross-entropy pairs with the sigmoid
# output, and accuracy is tracked as the only extra metric.
optimizer = tf.keras.optimizers.Adam()
model.compile(
    optimizer=optimizer,
    loss='binary_crossentropy',
    metrics=['accuracy'],
)

In [17]:
# Train on the training split only; no validation data or callbacks are used.
# The call is the cell's last expression, so the returned History object is
# shown as the cell output.
model.fit(trainSet, trainLabels, epochs=num_epochs, batch_size=batch_size)

Train on 3417 samples
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


<tensorflow.python.keras.callbacks.History at 0x1ab805d84c8>

## Test the model

In [18]:
# evaluate() returns the loss followed by the compiled metrics, so with
# metrics=['accuracy'] index 1 holds the accuracy on the held-out split.
scores = model.evaluate(testSet, testLabels, verbose=0)
test_accuracy = scores[1]
print("Accuracy: %0.2f%%" % (test_accuracy * 100))

Accuracy: 91.11%
