In [1]:
import tensorflow as tf
from tensorflow.python import keras
import numpy as np
from sklearn.model_selection import train_test_split
import data_loader_new
import os
from numpy import trapz
import matplotlib.pyplot as plt

from tensorflow.compat.v1.keras.backend import set_session
config = tf.compat.v1.ConfigProto()
config.gpu_options.allow_growth = True
sess = tf.compat.v1.Session(config=config)
set_session(sess)

### Tensorflow Version

In [2]:
print(tf.__version__)

2.3.1


### Global Variables

In [3]:
all_files_dir = '../data/'
saved_model_path = 'saved_model/'
opcode_to_int_path = "opcodeToInt.txt"
keep_amt = 30
max_opcode_length = 2000
embed_vector_length = 128
num_lstm_unit = 16
dropout_amt = 0.3
batch_size = 32
num_epochs = 20
test_size= 0.15       # reserve for testing

shutdown = False


### Load the Data

In [4]:
winwebsec_train_data, zbot_train_data = data_loader_new.getTrainData(all_files_dir, 
                                                                 keep_amt, 
                                                                 max_opcode_length, 
                                                                 opcode_to_int_path)



winwebsec_train_data = tf.keras.preprocessing.sequence.pad_sequences(winwebsec_train_data, maxlen=max_opcode_length)
zbot_train_data = tf.keras.preprocessing.sequence.pad_sequences(zbot_train_data, maxlen=max_opcode_length)

winwebsec_train_amt = 1408
zbot_train_amt = 832
winwebsec_test_amt = 384
zbot_test_amt = 256

train_set = np.concatenate((winwebsec_train_data, zbot_train_data), axis=0)

# 0 for winwebsec, 1 for zbot
winwebsec_train_labels = np.zeros(shape=(len(winwebsec_train_data), 1))
zbot_train_labels = np.ones(shape=(len(zbot_train_data), 1))

train_labels = np.concatenate((winwebsec_train_labels, zbot_train_labels), axis=0)
train_set, test_set, train_labels, test_labels = train_test_split(train_set, train_labels, test_size=test_size)

# make test and train sets divisible by 64
train_set = train_set[:2432]
train_labels = train_labels[:2432]
test_set = test_set[:384]
test_labels = test_labels[:384]

print("train_set shape: {}".format(train_set.shape))
print("test_set shape: {}".format(test_set.shape))
print("train_labels shape: {}".format(train_labels.shape))
print("test_labels shape: {}".format(test_labels.shape))

train_set shape: (2432, 2000)
test_set shape: (384, 2000)
train_labels shape: (2432, 1)
test_labels shape: (384, 1)


### Make the Model

In [5]:
def create_model():
    model = tf.keras.models.Sequential()
    model.add(tf.keras.layers.Input(batch_shape=(batch_size, max_opcode_length), name="input"))
    model.add(tf.keras.layers.Embedding(input_dim=keep_amt+1,
                                        output_dim=embed_vector_length,
                                        input_length=max_opcode_length, name="embedding"))
    model.add(tf.keras.layers.Dropout(dropout_amt, name="dropout_1"))
    model.add(tf.keras.layers.LSTM(num_lstm_unit, 
                                   input_shape=(None, max_opcode_length), 
                                   stateful=True,
                                   name="lstm"))
    model.add(tf.keras.layers.Dropout(dropout_amt, name="dropout2"))
    model.add(tf.keras.layers.Dense(1, activation='sigmoid', name="dense"))
    optimizer = tf.keras.optimizers.Adam()
    model.compile(loss='binary_crossentropy', optimizer=optimizer, metrics=['accuracy'])

    model.summary()
    
    return model

In [6]:
model = create_model()

callback = tf.keras.callbacks.EarlyStopping(monitor='loss', patience=3)

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (32, 2000, 128)           3968      
_________________________________________________________________
dropout_1 (Dropout)          (32, 2000, 128)           0         
_________________________________________________________________
lstm (LSTM)                  (32, 16)                  9280      
_________________________________________________________________
dropout2 (Dropout)           (32, 16)                  0         
_________________________________________________________________
dense (Dense)                (32, 1)                   17        
Total params: 13,265
Trainable params: 13,265
Non-trainable params: 0
_________________________________________________________________


In [7]:
model.fit(x=train_set,
          y=train_labels,
          batch_size=batch_size,
          epochs=5,)

model.save_weights(saved_model_path) 

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


### Evaluate Model

In [8]:
scores = model.evaluate(test_set, test_labels, verbose=0, callbacks=[callback])
accuracy = scores[1]*100
print("Accuracy: %0.2f%%" % (scores[1]*100))

Accuracy: 96.88%


### Load Model From Save and Evaluate

In [None]:
model = create_model()
model.load_weights(saved_model_path)



In [None]:
scores = model.evaluate(test_set, test_labels, verbose=0)
accuracy = scores[1]*100
print("Accuracy: %0.2f%%" % (scores[1]*100))

## Make ROC Curve

### Make scatter plot

In [None]:
# separate winwebsec and zbot test data
winwebsec_test_data = []
zbot_test_data = []

for i in range(len(test_labels)):
    if test_labels[i] == 0:
        winwebsec_test_data.append(test_set[i])
    else:
        zbot_test_data.append(test_set[i])
        
winwebsec_test_data = np.asarray(winwebsec_test_data[:192])
zbot_test_data = np.asarray(zbot_test_data[:128])

In [None]:
print(winwebsec_test_data.shape)
print(zbot_test_data.shape)


winwebsecY = model.predict(winwebsec_test_data)
winwebsecX = [i+1 for i in range(len(winwebsec_test_data))]

zbotY = model.predict(zbot_test_data)
zbotX = [i+1 for i in range(len(zbot_test_data))]

In [None]:
plt.figure(100)
f = plt.scatter(winwebsecX, winwebsecY, marker='o',
                c='darkblue', s=30, label="winwebsec")
plt.scatter(zbotX, zbotY, marker='o', c='red', s=30, label="zbot")
plt.title("Winwebsec vs. Zbot LSTM Prediction Scatter Plot",
          fontsize=18, wrap=True)
f.axes.get_xaxis().set_visible(False)
plt.ylabel("Prediction", fontsize=15)
plt.legend(bbox_to_anchor=(1, 1), loc='upper left', fontsize=12)

### Make ROC Curve

In [None]:
def sortByFirstItem(item):
    return item[0]

In [None]:
winwebsecROC = [(data, "winwebsec") for data in winwebsecY]
zbotROC = [(data, "zbot") for data in zbotY]

zbotROC.sort(key=sortByFirstItem)
winwebsecROC.sort(key=sortByFirstItem)

dataROC = zbotROC + winwebsecROC
dataROC.sort(key=sortByFirstItem, reverse=True)

In [None]:
def calculate_TPR_FPR(thresholdLine, dataROC):
    TP = 0
    FN = 0
    TN = 0
    FP = 0

    for data in dataROC:
        yVal = data[0]
        family = data[1]

        if family == "winwebsec":
            if yVal < thresholdLine:
                TP += 1
            else:
                FN += 1
        elif family == "zbot":
            if yVal > thresholdLine:
                TN += 1
            else:
                FP += 1
    TPR = TP/(TP+FN)
    FPR = 1 - (TN/(TN+FP))

    return TPR, FPR

In [None]:
def calculateAUC(rocData):
    sum = 0

    # initialization
    prevX = -1
    prevY = -1

    for points in rocData:
        curX = points[0]
        curY = points[1]

        # Skip for first point
        if prevX != -1 and prevY != -1:
            # check if rectangle
            if prevY == curY:
                sum += abs(curX - prevX) * prevY
            # check if trapezoid
            else:
                sum += (curY + prevY) * abs(curX - prevX) * 0.5

        prevX = curX
        prevY = curY

    return sum

In [None]:
rocX = list()  # used to plot
rocY = list()  # used to plot
rocData = list()    # used to calculate AUC

for entry in dataROC:
    thresholdLine = entry[0]
    TPR, FPR = calculate_TPR_FPR(thresholdLine, dataROC)

    rocX.append(FPR)
    rocY.append(TPR)
    rocData.append([FPR, TPR])

rocData.sort(key=lambda item: (item[0], item[1]), reverse=True)

AUC = round(calculateAUC(rocData), 3)

In [None]:
props = dict(boxstyle='round', facecolor='wheat', alpha=0.5)

plt.figure(200)
plt.plot(rocX, rocY, marker=".", markersize=8)
plt.title("Winwebsec vs. Zbot LSTM Log Probability ROC", fontsize=18)
plt.xlabel("FPR", fontsize=15)
plt.ylabel("TPR", fontsize=15)
plt.grid()
plt.text(x=0.75, y=0, s="AUC: {0}".format(AUC), fontsize=14, bbox=props)

# show plots
plt.show()

In [None]:
if shutdown:
    os.system('shutdown -s')