In [1]:
import sys
sys.path.insert(1, '..\\..\\')

import os
import data_loader
from numpy import trapz
import matplotlib.pyplot as plt
import numpy as np

import tensorflow as tf
from tensorflow.keras.models import Model, Sequential
from tensorflow.keras.layers import Input, Dense, LSTM, Dropout
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.utils import to_categorical
from sklearn.model_selection import train_test_split

### Tensorflow Version

In [2]:
print(tf.__version__)

2.3.1


### Global Variables

In [3]:
malware_data_dir = '../../data/'
saved_model_path = 'saved_model/'
opcode_to_int_path = "opcodeToInt.txt"
num_unique_opcodes = 30
max_opcode_sequence_length = 2000
embed_vector_length = 128
num_lstm_unit = 16
num_dense_unit = 50
dropout_amt = 0.3
batch_size = 64
num_epochs = 20
test_size= 0.15       # reserve for testing
num_families_to_use = 5

shutdown = False

### Load the Data

In [4]:
raw_train_data = data_loader.getTrainData(malware_data_dir, 
                                          num_families_to_use, 
                                          num_unique_opcodes, 
                                          max_opcode_sequence_length, 
                                          opcode_to_int_path)

Getting list of paths to training data
{'winwebsec': 6862260, 'vundo': 3492760, 'zbot': 3256944, 'hotbar': 2952000, 'renos': 2612858}
Loading training data for hotbar
1476
Loading training data for renos
1309
Loading training data for vundo
1784
Loading training data for winwebsec
3651
Loading training data for zbot
1785
All training data loaded


### Data preprocessing

In [5]:
family_names = list(raw_train_data.keys())
print(family_names)

# Split opcode family data in individual lists
train_data = list()
for family, data in raw_train_data.items():
    train_data.append(data)
    
# Pad training data to ensure uniformity
padded_train_data = list()
for family_opcodes in train_data:
    padded_sequence = pad_sequences(family_opcodes, 
                                    maxlen=max_opcode_sequence_length)
    padded_train_data.append(padded_sequence)
    print(len(padded_sequence))
    
# Concatenate all training data into 1 long list instead of multiple lists
train_data = np.concatenate(padded_train_data)

print(len(train_data))

['hotbar', 'renos', 'vundo', 'winwebsec', 'zbot']
1476
1309
1784
3651
1785
10005


### Make the labels

In [6]:
train_labels = []

for count, data in enumerate(padded_train_data):
    labels_list = np.full(shape=(len(data)), fill_value=count)
    train_labels.append(labels_list)

train_labels = np.concatenate(train_labels)

### Reshape and split into training and testing sets

In [7]:
# Reshape matrices
train_data = train_data.reshape(len(train_data), max_opcode_sequence_length, 1)
train_labels = train_labels.reshape(len(train_data), 1, 1)

# Split into training and testing data
train_data, test_data, train_labels, test_labels = train_test_split(train_data, train_labels, test_size=test_size)

print("train_data shape: {}".format(train_data.shape))
print("test_data shape: {}".format(test_data.shape))
print("train_labels shape: {}".format(train_labels.shape))
print("test_labels shape: {}".format(test_labels.shape))

train_data shape: (8504, 2000, 1)
test_data shape: (1501, 2000, 1)
train_labels shape: (8504, 1, 1)
test_labels shape: (1501, 1, 1)


### Make the Model

In [11]:
def create_model():
    model = Sequential()
    model.add(LSTM(units=num_lstm_unit, 
                   input_shape=(max_opcode_sequence_length, 1),
                   #return_sequences=True,
                   name="lstm1"))
    model.add(Dropout(dropout_amt))
#     model.add(LSTM(units=num_lstm_unit*2,
#                    return_sequences=True,
#                    name="lstm2"))
#     model.add(Dropout(dropout_amt))
#     model.add(LSTM(units=num_lstm_unit,
#                    name="lstm3"))
    model.add(Dense(units=5, activation='softmax', name="dense"))
    optimizer = Adam()
    model.compile(loss='sparse_categorical_crossentropy', optimizer=optimizer, metrics=['accuracy'])

    model.summary()
    
    return model

In [12]:
model = create_model()

early_stopping = EarlyStopping(monitor='loss', verbose=1, patience=2)

Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
lstm1 (LSTM)                 (None, 16)                1152      
_________________________________________________________________
dropout_1 (Dropout)          (None, 16)                0         
_________________________________________________________________
dense (Dense)                (None, 5)                 85        
Total params: 1,237
Trainable params: 1,237
Non-trainable params: 0
_________________________________________________________________


In [13]:
model.fit(x=train_data,
          y=train_labels,
          batch_size=batch_size,
          callbacks=[early_stopping],
          epochs=20,)

model.save_weights(saved_model_path) 

Epoch 1/20
 14/133 [==>...........................] - ETA: 7s - loss: 1.7804 - accuracy: 0.2188

KeyboardInterrupt: 

### Evaluate Model

In [11]:
scores = model.evaluate(test_data, test_labels, verbose=0, callbacks=[early_stopping])
accuracy = scores[1]*100
print("Accuracy: %0.2f%%" % (scores[1]*100))

Accuracy: 42.38%


### Load Model From Save and Evaluate

In [12]:
model = create_model()
model.load_weights(saved_model_path)



Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
lstm1 (LSTM)                 (None, 16)                1152      
_________________________________________________________________
dropout_1 (Dropout)          (None, 16)                0         
_________________________________________________________________
dense (Dense)                (None, 5)                 85        
Total params: 1,237
Trainable params: 1,237
Non-trainable params: 0
_________________________________________________________________


<tensorflow.python.training.tracking.util.CheckpointLoadStatus at 0x227c89e9340>

In [13]:
scores = model.evaluate(test_data, test_labels, verbose=0)
accuracy = scores[1]*100
print("Accuracy: %0.2f%%" % (scores[1]*100))

Accuracy: 42.38%


## Make ROC Curve

### Make scatter plot

In [14]:
# separate winwebsec and zbot test data
winwebsec_test_data = []
zbot_test_data = []

for i in range(len(test_labels)):
    if test_labels[i] == 0:
        winwebsec_test_data.append(test_set[i])
    else:
        zbot_test_data.append(test_set[i])
        
winwebsec_test_data = np.asarray(winwebsec_test_data[:192])
zbot_test_data = np.asarray(zbot_test_data[:128])

NameError: name 'test_set' is not defined

In [None]:
print(winwebsec_test_data.shape)
print(zbot_test_data.shape)


winwebsecY = model.predict(winwebsec_test_data)
winwebsecX = [i+1 for i in range(len(winwebsec_test_data))]

zbotY = model.predict(zbot_test_data)
zbotX = [i+1 for i in range(len(zbot_test_data))]

In [None]:
plt.figure(100)
f = plt.scatter(winwebsecX, winwebsecY, marker='o',
                c='darkblue', s=30, label="winwebsec")
plt.scatter(zbotX, zbotY, marker='o', c='red', s=30, label="zbot")
plt.title("Winwebsec vs. Zbot LSTM Prediction Scatter Plot",
          fontsize=18, wrap=True)
f.axes.get_xaxis().set_visible(False)
plt.ylabel("Prediction", fontsize=15)
plt.legend(bbox_to_anchor=(1, 1), loc='upper left', fontsize=12)

### Make ROC Curve

In [None]:
def sortByFirstItem(item):
    return item[0]

In [None]:
winwebsecROC = [(data, "winwebsec") for data in winwebsecY]
zbotROC = [(data, "zbot") for data in zbotY]

zbotROC.sort(key=sortByFirstItem)
winwebsecROC.sort(key=sortByFirstItem)

dataROC = zbotROC + winwebsecROC
dataROC.sort(key=sortByFirstItem, reverse=True)

In [None]:
def calculate_TPR_FPR(thresholdLine, dataROC):
    TP = 0
    FN = 0
    TN = 0
    FP = 0

    for data in dataROC:
        yVal = data[0]
        family = data[1]

        if family == "winwebsec":
            if yVal < thresholdLine:
                TP += 1
            else:
                FN += 1
        elif family == "zbot":
            if yVal > thresholdLine:
                TN += 1
            else:
                FP += 1
    TPR = TP/(TP+FN)
    FPR = 1 - (TN/(TN+FP))

    return TPR, FPR

In [None]:
def calculateAUC(rocData):
    sum = 0

    # initialization
    prevX = -1
    prevY = -1

    for points in rocData:
        curX = points[0]
        curY = points[1]

        # Skip for first point
        if prevX != -1 and prevY != -1:
            # check if rectangle
            if prevY == curY:
                sum += abs(curX - prevX) * prevY
            # check if trapezoid
            else:
                sum += (curY + prevY) * abs(curX - prevX) * 0.5

        prevX = curX
        prevY = curY

    return sum

In [None]:
rocX = list()  # used to plot
rocY = list()  # used to plot
rocData = list()    # used to calculate AUC

for entry in dataROC:
    thresholdLine = entry[0]
    TPR, FPR = calculate_TPR_FPR(thresholdLine, dataROC)

    rocX.append(FPR)
    rocY.append(TPR)
    rocData.append([FPR, TPR])

rocData.sort(key=lambda item: (item[0], item[1]), reverse=True)

AUC = round(calculateAUC(rocData), 3)

In [None]:
props = dict(boxstyle='round', facecolor='wheat', alpha=0.5)

plt.figure(200)
plt.plot(rocX, rocY, marker=".", markersize=8)
plt.title("Winwebsec vs. Zbot LSTM Log Probability ROC", fontsize=18)
plt.xlabel("FPR", fontsize=15)
plt.ylabel("TPR", fontsize=15)
plt.grid()
plt.text(x=0.75, y=0, s="AUC: {0}".format(AUC), fontsize=14, bbox=props)

# show plots
plt.show()

In [None]:
if shutdown:
    os.system('shutdown -s')