In [1]:
import os
import data_loader
from numpy import trapz
import matplotlib.pyplot as plt
import numpy as np

import tensorflow as tf
from tensorflow.keras.models import Model, Sequential
from tensorflow.keras.layers import Input, Dense, LSTM, Dropout
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical
from sklearn.model_selection import train_test_split

### TensorFlow Version

In [2]:
# Print the version string of the imported TensorFlow package.
print(tf.__version__)

2.3.1


### Global Variables

In [3]:
# --- Paths ---------------------------------------------------------------
all_files_dir = '../data-d/'              # handed to data_loader.getTrainData
saved_model_path = 'saved_model/'         # where the model weights are written / read
opcode_to_int_path = "opcodeToInt.txt"    # handed to data_loader.getTrainData

# --- Data preparation ----------------------------------------------------
keep_amt = 30               # handed to data_loader.getTrainData (meaning defined there)
max_opcode_length = 2000    # padded sequence length and LSTM input length
test_size = 0.15            # fraction of the samples reserved for testing

# --- Model / training hyper-parameters -----------------------------------
embed_vector_length = 128
num_lstm_unit = 16          # units of the first/last LSTM layer (the middle one uses 2x)
dropout_amt = 0.2           # rate of the Dropout layers between the LSTM layers
recurrent_dropout_amt = 0
batch_size = 128
num_epochs = 20

# --- Misc ----------------------------------------------------------------
shutdown = False            # when True, the last cell issues an OS shutdown command


### Load the Data

In [4]:
# Load the per-family opcode sequences (one collection per malware family).
(hotbar_training,
 renos_training,
 vundo_training,
 winwebsec_training,
 zbot_training) = data_loader.getTrainData(all_files_dir,
                                           keep_amt,
                                           max_opcode_length,
                                           opcode_to_int_path)

# Pad data: bring every sequence of every family to max_opcode_length.
(hotbar_training,
 renos_training,
 vundo_training,
 winwebsec_training,
 zbot_training) = [pad_sequences(family_sequences, maxlen=max_opcode_length)
                   for family_sequences in (hotbar_training,
                                            renos_training,
                                            vundo_training,
                                            winwebsec_training,
                                            zbot_training)]

# Stack the families in label order: hotbar, renos, vundo, winwebsec, zbot.
train_set = np.concatenate([hotbar_training,
                            renos_training,
                            vundo_training,
                            winwebsec_training,
                            zbot_training], axis=0)

# Create labels (one row per sample, same order as the families in train_set):
#     0 - hotbar
#     1 - renos
#     2 - vundo
#     3 - winwebsec
#     4 - zbot
#
# Each label array is sized from ITS OWN family. Sizing them all from the
# renos labels (as np.full_like(renos_train_labels, ...) does) only lines up
# with train_set when every family has exactly the same number of samples.
hotbar_train_labels = np.zeros(shape=(len(hotbar_training), 1))
renos_train_labels = np.ones(shape=(len(renos_training), 1))
vundo_train_labels = np.full(shape=(len(vundo_training), 1), fill_value=2.0)
winwebsec_train_labels = np.full(shape=(len(winwebsec_training), 1), fill_value=3.0)
zbot_train_labels = np.full(shape=(len(zbot_training), 1), fill_value=4.0)

train_labels = np.concatenate((hotbar_train_labels,
                               renos_train_labels,
                               vundo_train_labels,
                               winwebsec_train_labels,
                               zbot_train_labels), axis=0)

# Reshape matrices: one feature per time step for the LSTM input, and one
# class id per sample for the labels.
train_set = train_set.reshape(len(train_set), max_opcode_length, 1)
train_labels = train_labels.reshape(len(train_set), 1, 1)

# Split into training and testing data.
# The split is seeded so the held-out set is the same on every run; otherwise
# weights reloaded from disk could be evaluated on samples they were trained on.
split_seed = 42
train_set, test_set, train_labels, test_labels = train_test_split(
    train_set,
    train_labels,
    test_size=test_size,
    random_state=split_seed,
)

print("train_set shape: {}".format(train_set.shape))
print("test_set shape: {}".format(test_set.shape))
print("train_labels shape: {}".format(train_labels.shape))
print("test_labels shape: {}".format(test_labels.shape))

train_set shape: (5576, 2000, 1)
test_set shape: (984, 2000, 1)
train_labels shape: (5576, 1, 1)
test_labels shape: (984, 1, 1)


### Make the Model

In [5]:
def create_model():
    """Build, compile and summarise the stacked-LSTM classifier.

    The network is three LSTM layers (with Dropout after the first two)
    followed by a 5-unit softmax Dense layer. Layer sizes, the input length
    and the dropout rate are read from the notebook's global settings
    (num_lstm_unit, max_opcode_length, dropout_amt).

    Returns:
        The compiled Keras Sequential model. Its summary is printed as a
        side effect.
    """
    stacked_layers = [
        LSTM(units=num_lstm_unit,
             input_shape=(max_opcode_length, 1),
             return_sequences=True,
             name="lstm1"),
        Dropout(dropout_amt),
        LSTM(units=num_lstm_unit*2,
             return_sequences=True,
             name="lstm2"),
        Dropout(dropout_amt),
        # Last recurrent layer emits only its final state for the classifier head.
        LSTM(units=num_lstm_unit,
             name="lstm3"),
        Dense(units=5, activation='softmax', name="dense"),
    ]
    model = Sequential(stacked_layers)

    # Labels are integer class ids, hence the sparse cross-entropy loss.
    model.compile(optimizer=Adam(),
                  loss='sparse_categorical_crossentropy',
                  metrics=['accuracy'])

    model.summary()

    return model

In [6]:
# Early-stopping rule: monitors the training loss with a patience of 3 epochs.
callback = tf.keras.callbacks.EarlyStopping(patience=3, monitor='loss')

# Build and compile the network (also prints its summary).
model = create_model()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
lstm1 (LSTM)                 (None, 2000, 16)          1152      
_________________________________________________________________
dropout (Dropout)            (None, 2000, 16)          0         
_________________________________________________________________
lstm2 (LSTM)                 (None, 2000, 32)          6272      
_________________________________________________________________
dropout_1 (Dropout)          (None, 2000, 32)          0         
_________________________________________________________________
lstm3 (LSTM)                 (None, 16)                3136      
_________________________________________________________________
dense (Dense)                (None, 5)                 85        
Total params: 10,645
Trainable params: 10,645
Non-trainable params: 0
____________________________________________________

In [7]:
# Train the model.
# - The epoch budget comes from the `num_epochs` setting instead of a
#   hard-coded literal, so the configuration cell actually controls it.
# - The EarlyStopping callback is attached here: it only acts during
#   training, so passing it anywhere else has no effect.
model.fit(x=train_set,
          y=train_labels,
          batch_size=batch_size,
          epochs=num_epochs,
          callbacks=[callback])

# Persist the trained weights so they can be reloaded without retraining.
model.save_weights(saved_model_path)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


### Evaluate Model

In [8]:
# Evaluate on the held-out split. `scores` is [loss, accuracy] because the
# model is compiled with metrics=['accuracy'].
# No callbacks are passed: EarlyStopping is a training-time callback and does
# nothing during evaluation.
scores = model.evaluate(test_set, test_labels, verbose=0)
accuracy = scores[1]*100
print("Accuracy: %0.2f%%" % accuracy)

Accuracy: 50.71%


### Load Model From Save and Evaluate

In [None]:
# Rebuild a fresh model with the same architecture, then restore the weights
# previously written to `saved_model_path`.
model = create_model()
model.load_weights(saved_model_path)



In [None]:
# Score the reloaded model on the held-out split; index 1 of the result is
# the accuracy metric, reported here as a percentage.
scores = model.evaluate(test_set, test_labels, verbose=0)
accuracy = 100 * scores[1]
print("Accuracy: %0.2f%%" % accuracy)

## Make ROC Curve

### Make scatter plot

In [None]:
# Separate the winwebsec and zbot samples of the test split.
# Class ids used by the labels of this notebook:
#     0 - hotbar, 1 - renos, 2 - vundo, 3 - winwebsec, 4 - zbot
# Selecting on label 0 would pick hotbar, and a bare `else` would lump every
# other family into the "zbot" set, so both families are matched explicitly.
winwebsec_label = 3
zbot_label = 4

# test_labels carries one class id per sample inside extra singleton axes;
# flatten it so it can be used as a boolean mask over the samples.
flat_test_labels = np.asarray(test_labels).reshape(-1)

# Keep at most 192 winwebsec and 128 zbot samples (same caps as before).
winwebsec_test_data = np.asarray(test_set[flat_test_labels == winwebsec_label][:192])
zbot_test_data = np.asarray(test_set[flat_test_labels == zbot_label][:128])

In [None]:
print(winwebsec_test_data.shape)
print(zbot_test_data.shape)

# The network ends in a 5-way softmax, so model.predict returns one row of
# five class probabilities per sample. The scatter plot and the ROC code
# below need ONE scalar score per sample, where a high score means "zbot"
# and a low score means "winwebsec". The predicted zbot probability
# (class id 4) is used as that score.
# NOTE(review): confirm this is the intended score for the two-family ROC.
zbot_class_index = 4

winwebsecY = model.predict(winwebsec_test_data)[:, zbot_class_index]
winwebsecX = list(range(1, len(winwebsec_test_data) + 1))

zbotY = model.predict(zbot_test_data)[:, zbot_class_index]
zbotX = list(range(1, len(zbot_test_data) + 1))

In [None]:
# Scatter of the per-sample predictions, one colour per family.
plt.figure(100)

# Marker settings shared by both series.
marker_style = {'marker': 'o', 's': 30}

f = plt.scatter(winwebsecX, winwebsecY, c='darkblue', label="winwebsec", **marker_style)
plt.scatter(zbotX, zbotY, c='red', label="zbot", **marker_style)

plt.title("Winwebsec vs. Zbot LSTM Prediction Scatter Plot",
          fontsize=18, wrap=True)
plt.ylabel("Prediction", fontsize=15)

# The x position is only a running sample index, so the axis is hidden.
f.axes.get_xaxis().set_visible(False)

plt.legend(bbox_to_anchor=(1, 1), loc='upper left', fontsize=12)

### Make ROC Curve

In [None]:
def sortByFirstItem(item):
    """Sort key that orders entries by their first element.

    Args:
        item: Any indexable entry, e.g. a ``(score, family)`` tuple.

    Returns:
        ``item[0]``.
    """
    first = item[0]
    return first

In [None]:
# Tag every prediction with its true family and order each family by score.
zbotROC = sorted(((score, "zbot") for score in zbotY), key=sortByFirstItem)
winwebsecROC = sorted(((score, "winwebsec") for score in winwebsecY), key=sortByFirstItem)

# All tagged predictions together, highest score first.
dataROC = sorted(zbotROC + winwebsecROC, key=sortByFirstItem, reverse=True)

In [None]:
def calculate_TPR_FPR(thresholdLine, dataROC):
    """Compute the true/false positive rates at one decision threshold.

    "winwebsec" is the positive class. A sample is predicted winwebsec when
    its score is strictly below ``thresholdLine`` and zbot otherwise. The
    same rule is applied to every sample regardless of its true family, so a
    score exactly on the threshold is consistently treated as zbot.

    Args:
        thresholdLine: Score threshold separating the two predictions.
        dataROC: Iterable of ``(score, family)`` entries, where family is
            ``"winwebsec"`` or ``"zbot"``. Other families are ignored.

    Returns:
        Tuple ``(TPR, FPR)``. A rate whose class has no samples is reported
        as ``0.0`` instead of raising ``ZeroDivisionError``.
    """
    TP = 0
    FN = 0
    TN = 0
    FP = 0

    for data in dataROC:
        yVal = data[0]
        family = data[1]

        # One decision rule for both families (ties fall on the zbot side).
        predicted_winwebsec = yVal < thresholdLine

        if family == "winwebsec":
            if predicted_winwebsec:
                TP += 1
            else:
                FN += 1
        elif family == "zbot":
            if predicted_winwebsec:
                FP += 1
            else:
                TN += 1

    positives = TP + FN
    negatives = TN + FP

    # Guard against an absent class so the ROC sweep does not crash.
    TPR = TP / positives if positives else 0.0
    FPR = FP / negatives if negatives else 0.0

    return TPR, FPR

In [None]:
def calculateAUC(rocData):
    """Approximate the area under a curve with the trapezoidal rule.

    Consecutive points are joined by straight segments and the area under
    each segment is accumulated. The horizontal distance is taken as an
    absolute value, so the points may be ordered by ascending or descending
    x. A flat segment is the special case of a trapezoid with equal sides,
    so one formula covers both.

    Args:
        rocData: Iterable of ``[x, y]`` points (here ``[FPR, TPR]``),
            already ordered along the x axis.

    Returns:
        The accumulated area; ``0`` when fewer than two points are given.
    """
    area = 0

    # `None` marks "no previous point yet". Unlike a numeric sentinel it can
    # never collide with a real coordinate value.
    prevPoint = None

    for point in rocData:
        if prevPoint is not None:
            width = abs(point[0] - prevPoint[0])
            area += (point[1] + prevPoint[1]) * width * 0.5

        prevPoint = point

    return area

In [None]:
rocX = list()  # used to plot
rocY = list()  # used to plot
rocData = list()    # used to calculate AUC

# Sweep every observed score as a threshold, highest first (dataROC order).
# The sweep is bracketed with +inf and -inf so the curve always contains the
# "everything predicted winwebsec" and "everything predicted zbot" end
# points, i.e. (FPR, TPR) = (1, 1) and (0, 0). Without them the AUC
# integration can stop short of the full FPR range and under-report the area.
thresholds = [float("inf")] + [entry[0] for entry in dataROC] + [float("-inf")]

for thresholdLine in thresholds:
    TPR, FPR = calculate_TPR_FPR(thresholdLine, dataROC)

    rocX.append(FPR)
    rocY.append(TPR)
    rocData.append([FPR, TPR])

# Order the points along the FPR axis (ties broken by TPR) for integration.
rocData.sort(key=lambda item: (item[0], item[1]), reverse=True)

AUC = round(calculateAUC(rocData), 3)

In [None]:
# Styling of the text box that shows the AUC value.
props = {'boxstyle': 'round', 'facecolor': 'wheat', 'alpha': 0.5}

# ROC curve: false positive rate on x, true positive rate on y.
plt.figure(200)
plt.plot(rocX, rocY, marker=".", markersize=8)
plt.xlabel("FPR", fontsize=15)
plt.ylabel("TPR", fontsize=15)
plt.title("Winwebsec vs. Zbot LSTM Log Probability ROC", fontsize=18)
plt.grid()

# AUC annotation near the lower-right corner of the plot.
plt.text(x=0.75, y=0, s="AUC: {0}".format(AUC), fontsize=14, bbox=props)

# show plots
plt.show()

In [None]:
# Optionally power the machine off once the notebook has finished; only runs
# when the `shutdown` flag from the settings cell is truthy.
# NOTE(review): `shutdown -s` looks like the Windows form of the command;
# confirm the target OS before enabling the flag.
if shutdown:
    os.system('shutdown -s')