In [None]:
import sys
sys.path.insert(1, '..\\..\\')

import os
import data_loader
from numpy import trapz
import matplotlib.pyplot as plt
import numpy as np

import tensorflow as tf
from tensorflow.keras.models import Model, Sequential
from tensorflow.keras.layers import Input, Dense, LSTM, Dropout
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.utils import to_categorical
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix
from mlxtend.plotting import plot_confusion_matrix
from sklearn.metrics import ConfusionMatrixDisplay

### TensorFlow Version

In [None]:
# Print the TensorFlow version this notebook is running against.
print(tf.__version__)

### Global Variables

In [None]:
# Data directory passed to data_loader.getTrainData.
malware_data_dir = '../../data/'
# Location used by save_weights / load_weights for the trained model weights.
saved_model_path = 'saved_model/'
#opcode_to_int_path = "opcodeToInt.txt"
# Forwarded to data_loader.getTrainData — presumably the opcode vocabulary size; confirm in data_loader.
num_unique_opcodes = 30
# Sequences are padded/truncated to this length; it is also the LSTM input time dimension.
max_opcode_sequence_length = 2000
# NOTE(review): not referenced by any code visible in this notebook.
embed_vector_length = 128
# Base number of units for the LSTM layers.
num_lstm_unit = 16
# Rate given to each Dropout layer.
dropout_amt = 0.3
# Mini-batch size for fit; split_data also trims both splits to a multiple of it.
batch_size = 32
# Upper bound on training epochs for the first training run (early stopping may end it sooner).
num_epochs = 100
test_size= 0.15       # fraction of samples held out for testing (passed to train_test_split)
# NOTE(review): only referenced from commented-out code in this notebook.
results_path = "results.txt"
#num_families_to_use = 20

# When True, the final cell issues an OS shutdown command.
shutdown = False

In [None]:
 def split_data(train_data_raw, train_labels_raw, random_state=None):
     """Split samples and labels into train/test sets trimmed to whole batches.

     The split fraction comes from the module-level ``test_size`` and the
     trimming uses the module-level ``batch_size``.

     Args:
         train_data_raw: Indexable collection of samples.
         train_labels_raw: Labels aligned with ``train_data_raw``.
         random_state: Optional seed forwarded to ``train_test_split`` so the
             shuffle (and therefore the reported results) can be reproduced.
             ``None`` keeps the original unseeded behaviour.

     Returns:
         Tuple ``(train_data, test_data, train_labels, test_labels)`` where the
         train pair and the test pair each contain a multiple of ``batch_size``
         samples; the remainder at the end of each split is dropped.
     """
     train_data, test_data, train_labels, test_labels = train_test_split(
         train_data_raw,
         train_labels_raw,
         test_size=test_size,
         random_state=random_state,
     )

     # Largest multiple of batch_size that fits in each split.
     num_data_train = (len(train_data) // batch_size) * batch_size
     num_data_test = (len(test_data) // batch_size) * batch_size

     return (
         train_data[:num_data_train],
         test_data[:num_data_test],
         train_labels[:num_data_train],
         test_labels[:num_data_test],
     )

In [None]:
def create_model(num_families_to_use):
    """Build, compile and summarise the single-layer LSTM classifier.

    Args:
        num_families_to_use: Number of output classes (size of the softmax layer).

    Returns:
        The compiled Sequential model. Its summary is printed as a side effect.
    """
    layer_stack = [
        # Each sample is a sequence of max_opcode_sequence_length scalar steps.
        LSTM(units=num_lstm_unit,
             input_shape=(max_opcode_sequence_length, 1),
             name="lstm1"),
        Dropout(dropout_amt),
        Dense(units=num_families_to_use, activation='softmax', name="dense"),
    ]
    model = Sequential(layer_stack)

    # Labels are integer class ids, hence the sparse loss.
    model.compile(loss='sparse_categorical_crossentropy',
                  optimizer=Adam(),
                  metrics=['accuracy'])
    model.summary()

    return model

In [None]:
num_families_to_use = 20
results = {}
opcode_to_int_path = "opcodeToInt_" + str(num_families_to_use) + ".txt"

# Load the opcode sequences, keyed by malware family.
raw_train_data = data_loader.getTrainData(
    malware_data_dir,
    num_families_to_use,
    num_unique_opcodes,
    max_opcode_sequence_length,
    opcode_to_int_path,
)

family_names = list(raw_train_data.keys())
print(family_names)

# One entry per family, in the same order as family_names.
train_data = list(raw_train_data.values())

# Bring every sequence to the same length so the families can be stacked.
padded_train_data = [
    pad_sequences(family_opcodes, maxlen=max_opcode_sequence_length)
    for family_opcodes in train_data
]
train_data_raw = np.concatenate(padded_train_data)

print(len(train_data))

# The label of a sample is the position of its family in padded_train_data.
train_labels = [
    np.full(shape=(len(family_block)), fill_value=family_index)
    for family_index, family_block in enumerate(padded_train_data)
]
train_labels_raw = np.concatenate(train_labels)

# Add the trailing feature axis expected by the LSTM input layer.
train_data_raw = train_data_raw.reshape(len(train_data_raw), max_opcode_sequence_length, 1)
train_labels_raw = train_labels_raw.reshape(len(train_data_raw), 1, 1)

# Hold out the test split.
train_data, test_data, train_labels, test_labels = split_data(train_data_raw, train_labels_raw)

# Train, stopping once the training loss stops improving by at least min_delta.
model_train = create_model(num_families_to_use)
early_stopping = EarlyStopping(
    monitor='loss',
    min_delta=0.03,
    patience=2,
    restore_best_weights=True,
    verbose=1,
)
history = model_train.fit(
    x=train_data,
    y=train_labels,
    batch_size=batch_size,
    epochs=num_epochs,
    shuffle=True,
    callbacks=[early_stopping],
)

model_train.save_weights(saved_model_path)


#             scores = model_evaluate.evaluate(test_data, test_labels, verbose=0, callbacks = [early_stopping])
#             accuracy = scores[1]*100
#             print("{0}: {1}".format(num_families_to_use, accuracy))
#             results[num_families_to_use] = accuracy

#             with open(results_path, 'a') as file:
#                 file.write(str(accuracy) + "\n")

In [None]:
# Rebuild the same architecture, then restore the weights written by save_weights.
model = create_model(num_families_to_use)
model.load_weights(saved_model_path)

In [None]:
# Sequential.predict_classes was removed in TensorFlow 2.6. For a softmax output
# the predicted class is the index of the largest probability.
predictions = np.argmax(model.predict(test_data, verbose=1), axis=-1)
# Flatten the labels to one integer per sample so they line up with predictions.
labels = test_labels.reshape((-1))

In [None]:
# Rows are true families, columns are predicted families.
matrix = confusion_matrix(y_true=labels, y_pred=predictions)

# Show normalised proportions instead of absolute counts.
fig, ax = plot_confusion_matrix(
    conf_mat=matrix,
    class_names=family_names,
    show_absolute=False,
    show_normed=True,
    figsize=(12,12),
)

# Shrink the tick labels so all family names fit.
for tick_label in [*ax.get_xticklabels(), *ax.get_yticklabels()]:
    tick_label.set_fontsize(10)

ax.set_title("Confusion Matrix for LSTM Model Without Embedding", fontsize=15, fontweight='bold')
ax.set_ylabel("True Family", fontsize=12, fontweight='bold')
ax.set_xlabel("Predicted Family", fontsize=12, fontweight='bold')



### Load the Data

In [None]:
# NOTE(review): all_files_dir, keep_amt and max_opcode_length are not defined in
# any cell visible in this notebook, and this call's arguments and return shape
# differ from the earlier getTrainData call — confirm which data_loader version
# this section targets before running it.
hotbar_training, renos_training, vundo_training, winwebsec_training, zbot_training = data_loader.getTrainData(
    all_files_dir,
    keep_amt,
    max_opcode_length,
    opcode_to_int_path,
)

# Pad data so every sequence has the same length
hotbar_training = pad_sequences(hotbar_training, maxlen=max_opcode_length)
renos_training = pad_sequences(renos_training, maxlen=max_opcode_length)
vundo_training = pad_sequences(vundo_training, maxlen=max_opcode_length)
winwebsec_training = pad_sequences(winwebsec_training, maxlen=max_opcode_length)
zbot_training = pad_sequences(zbot_training, maxlen=max_opcode_length)

train_set = np.concatenate((hotbar_training, renos_training, vundo_training, winwebsec_training, zbot_training), axis=0)

# Create labels:
#     0 - hotbar
#     1 - renos
#     2 - vundo
#     3 - winwebsec
#     4 - zbot
# Each label array is sized from its OWN family's sample count. Sizing vundo,
# winwebsec and zbot from the renos labels is only right when every family has
# the same number of samples.
hotbar_train_labels = np.zeros(shape=(len(hotbar_training), 1))
renos_train_labels = np.ones(shape=(len(renos_training), 1))
vundo_train_labels = np.full(shape=(len(vundo_training), 1), fill_value=2.0)
winwebsec_train_labels = np.full(shape=(len(winwebsec_training), 1), fill_value=3.0)
zbot_train_labels = np.full(shape=(len(zbot_training), 1), fill_value=4.0)

train_labels = np.concatenate((hotbar_train_labels,
                               renos_train_labels,
                               vundo_train_labels,
                               winwebsec_train_labels,
                               zbot_train_labels), axis=0)

# Reshape matrices: add the trailing feature axis expected by the LSTM input
train_set = train_set.reshape(len(train_set), max_opcode_length, 1)
train_labels = train_labels.reshape(len(train_set), 1, 1)

# Split into training and testing data
train_set, test_set, train_labels, test_labels = train_test_split(train_set, train_labels, test_size=test_size)

print("train_set shape: {}".format(train_set.shape))
print("test_set shape: {}".format(test_set.shape))
print("train_labels shape: {}".format(train_labels.shape))
print("test_labels shape: {}".format(test_labels.shape))

### Make the Model

In [None]:
def create_model(num_families_to_use=5):
    """Build, compile and summarise the three-layer stacked LSTM classifier.

    Defining this function replaces the earlier single-layer ``create_model``
    in the kernel. Making the class count a parameter keeps it callable both as
    ``create_model()`` and as ``create_model(num_families_to_use)``.

    Args:
        num_families_to_use: Number of output classes (size of the softmax
            layer). Defaults to 5, the number of families labelled in this
            section.

    Returns:
        The compiled Sequential model. Its summary is printed as a side effect.
    """
    model = Sequential()
    # The first two LSTM layers return the full sequence so the next LSTM
    # receives one vector per time step.
    model.add(LSTM(units=num_lstm_unit,
                   input_shape=(max_opcode_length, 1),
                   return_sequences=True,
                   name="lstm1"))
    model.add(Dropout(dropout_amt))
    model.add(LSTM(units=num_lstm_unit*2,
                   return_sequences=True,
                   name="lstm2"))
    model.add(Dropout(dropout_amt))
    # The last LSTM returns only its final output, which feeds the classifier.
    model.add(LSTM(units=num_lstm_unit,
                   name="lstm3"))
    model.add(Dense(units=num_families_to_use, activation='softmax', name="dense"))
    optimizer = Adam()
    # Labels are integer class ids, hence the sparse loss.
    model.compile(loss='sparse_categorical_crossentropy', optimizer=optimizer, metrics=['accuracy'])

    model.summary()

    return model

In [None]:
# Fresh, untrained network.
model = create_model()

# Early-stopping callback: watches the loss with a patience of 3 epochs.
callback = EarlyStopping(monitor='loss', patience=3)

In [None]:
# Train for at most 20 epochs. The early-stopping callback created in the
# previous cell only has an effect when it is passed to fit.
model.fit(x=train_set,
          y=train_labels,
          batch_size=batch_size,
          epochs=20,
          callbacks=[callback])

# NOTE(review): this writes to the same saved_model_path as the first training
# run in this notebook, so those weights are overwritten.
model.save_weights(saved_model_path)

### Evaluate Model

In [None]:
# EarlyStopping only acts between training epochs, so it is not passed to evaluate.
scores = model.evaluate(test_set, test_labels, verbose=0)
# scores is [loss, accuracy]; accuracy is the only metric the model was compiled with.
accuracy = scores[1]*100
print("Accuracy: %0.2f%%" % accuracy)

### Load Model From Save and Evaluate

In [None]:
# Rebuild the architecture from scratch and restore the weights written by save_weights.
model = create_model()
model.load_weights(saved_model_path)



In [None]:
# Score the reloaded model on the held-out split; scores is [loss, accuracy].
scores = model.evaluate(test_set, test_labels, verbose=0)
accuracy = 100 * scores[1]
print("Accuracy: %0.2f%%" % accuracy)

## Make ROC Curve

### Make Scatter Plot

In [None]:
# separate winwebsec and zbot test data
winwebsec_test_data = []
zbot_test_data = []

for i in range(len(test_labels)):
    if test_labels[i] == 0:
        winwebsec_test_data.append(test_set[i])
    else:
        zbot_test_data.append(test_set[i])
        
winwebsec_test_data = np.asarray(winwebsec_test_data[:192])
zbot_test_data = np.asarray(zbot_test_data[:128])

In [None]:
print(winwebsec_test_data.shape)
print(zbot_test_data.shape)


def _zbot_scores(samples, zbot_class_index=4):
    """Return one scalar score per sample for the scatter plot and ROC curve.

    The model defined in this section ends in a 5-way softmax, so predict
    returns five probabilities per sample, while the plotting and ROC cells
    compare a single number per sample against a threshold. The ROC code treats
    low scores as winwebsec and high scores as zbot, so the zbot-class
    probability is used as the score.

    NOTE(review): zbot_class_index=4 follows the label table in this notebook —
    confirm if the label order changes. A single-output model is simply flattened.
    """
    probabilities = np.asarray(model.predict(samples))
    if probabilities.ndim == 2 and probabilities.shape[1] > 1:
        return probabilities[:, zbot_class_index]
    return probabilities.reshape(-1)


# The X values are just 1-based sample indices for plotting.
winwebsecY = _zbot_scores(winwebsec_test_data)
winwebsecX = list(range(1, len(winwebsec_test_data) + 1))

zbotY = _zbot_scores(zbot_test_data)
zbotX = list(range(1, len(zbot_test_data) + 1))

In [None]:
plt.figure(100)

# Both families share the same marker shape and size; only colour and label differ.
marker_style = dict(marker='o', s=30)
f = plt.scatter(winwebsecX, winwebsecY, c='darkblue', label="winwebsec", **marker_style)
plt.scatter(zbotX, zbotY, c='red', label="zbot", **marker_style)

plt.title("Winwebsec vs. Zbot LSTM Prediction Scatter Plot",
          fontsize=18, wrap=True)
plt.ylabel("Prediction", fontsize=15)

# The x axis is only a sample index, so it is hidden.
f.axes.get_xaxis().set_visible(False)

# Anchor the legend just outside the top-right corner of the axes.
plt.legend(bbox_to_anchor=(1, 1), loc='upper left', fontsize=12)

### Make ROC Curve

In [None]:
def sortByFirstItem(item):
    """Sort key: return the leading element of ``item``."""
    leading_element = item[0]
    return leading_element

In [None]:
# Tag every prediction score with the family it came from, lowest score first.
winwebsecROC = sorted(((score, "winwebsec") for score in winwebsecY), key=sortByFirstItem)
zbotROC = sorted(((score, "zbot") for score in zbotY), key=sortByFirstItem)

# Pool both families and order them from highest to lowest score.
dataROC = sorted(zbotROC + winwebsecROC, key=sortByFirstItem, reverse=True)

In [None]:
def calculate_TPR_FPR(thresholdLine, dataROC):
    """Compute the true/false positive rates at one score threshold.

    winwebsec is the positive class. A sample is predicted zbot when its score
    is strictly above ``thresholdLine`` and winwebsec otherwise. The same rule
    is applied to both families, so a sample sitting exactly on the threshold
    gets one prediction regardless of its true label. Treating ties differently
    per family would count a winwebsec tie as a miss and a zbot tie as a false
    alarm.

    Args:
        thresholdLine: Score threshold separating the two predictions.
        dataROC: Iterable of ``(score, family)`` entries, where family is
            ``"winwebsec"`` or ``"zbot"``. Other families are ignored.

    Returns:
        Tuple ``(TPR, FPR)``.

    Raises:
        ZeroDivisionError: If ``dataROC`` has no winwebsec or no zbot entries.
    """
    TP = 0
    FN = 0
    TN = 0
    FP = 0

    for data in dataROC:
        yVal = data[0]
        family = data[1]

        # Single decision rule shared by both families.
        predicted_zbot = yVal > thresholdLine

        if family == "winwebsec":
            if predicted_zbot:
                FN += 1
            else:
                TP += 1
        elif family == "zbot":
            if predicted_zbot:
                TN += 1
            else:
                FP += 1

    TPR = TP/(TP+FN)
    FPR = 1 - (TN/(TN+FP))

    return TPR, FPR

In [None]:
def calculateAUC(rocData):
    """Integrate a curve given as consecutive ``[x, y]`` points (trapezoid rule).

    Each pair of neighbouring points contributes a trapezoid whose width is the
    absolute x distance, so the points may be ordered by ascending or
    descending x. A flat segment is the special case where both heights are
    equal.

    "No previous point yet" is tracked with ``None`` rather than a ``-1``
    sentinel, so points that legitimately contain ``-1`` are not skipped.

    Args:
        rocData: Iterable of ``[x, y]`` points in curve order.

    Returns:
        The accumulated area; ``0`` when fewer than two points are given.
    """
    area = 0
    previous_point = None

    for point in rocData:
        if previous_point is not None:
            width = abs(point[0] - previous_point[0])
            area += (point[1] + previous_point[1]) * width * 0.5
        previous_point = point

    return area

In [None]:
# Use every observed score as a threshold; each one yields a (TPR, FPR) pair.
roc_rates = [calculate_TPR_FPR(entry[0], dataROC) for entry in dataROC]

rocX = [fpr for _, fpr in roc_rates]  # used to plot
rocY = [tpr for tpr, _ in roc_rates]  # used to plot

# Points as [FPR, TPR], ordered from the largest FPR down, for the AUC calculation.
rocData = sorted(
    ([fpr, tpr] for tpr, fpr in roc_rates),
    key=lambda point: (point[0], point[1]),
    reverse=True,
)

AUC = round(calculateAUC(rocData), 3)

In [None]:
# Styling for the text box that shows the AUC value.
props = {'boxstyle': 'round', 'facecolor': 'wheat', 'alpha': 0.5}
axis_label_size = 15

plt.figure(200)
plt.plot(rocX, rocY, marker=".", markersize=8)
plt.title("Winwebsec vs. Zbot LSTM Log Probability ROC", fontsize=18)
plt.xlabel("FPR", fontsize=axis_label_size)
plt.ylabel("TPR", fontsize=axis_label_size)
plt.grid()

# Place the AUC read-out at the lower right of the plot.
auc_caption = "AUC: {0}".format(AUC)
plt.text(x=0.75, y=0, s=auc_caption, fontsize=14, bbox=props)

# show plots
plt.show()

In [None]:
# Optionally issue an OS shutdown command once the notebook has finished,
# controlled by the `shutdown` flag in the global-variables cell.
# NOTE(review): `shutdown -s` looks like Windows syntax — confirm before using elsewhere.
if shutdown:
    os.system('shutdown -s')