In [9]:
import sys
sys.path.insert(1, '..\\..\\')

import os
import data_loader
from numpy import trapz
import matplotlib.pyplot as plt
import matplotlib as mpl
import numpy as np
import win32api

import tensorflow as tf
from tensorflow.keras.models import Model, Sequential
from tensorflow.keras.layers import Input, Dense, LSTM, Dropout, Embedding, Bidirectional, Conv1D, MaxPooling1D
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.utils import to_categorical
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix
from mlxtend.plotting import plot_confusion_matrix
from sklearn.metrics import ConfusionMatrixDisplay

### TensorFlow Version

In [10]:
# Record the TensorFlow version this notebook was executed with.
print(tf.__version__)

2.3.1


### Global Variables

In [11]:
# Directory handed to data_loader.getTrainData as the location of the malware data.
malware_data_dir = '../../data/'
# Path used by model.save_weights / model.load_weights.
saved_model_path = 'saved_model/'
# opcode_to_int_path = "opcodeToInt.txt"
# Text file that run_model() opens in append mode.
results_path = "results.txt"
# Passed to data_loader.getTrainData; the Embedding layer uses this value + 1 as input_dim.
num_unique_opcodes = 30
# maxlen used when padding opcode sequences, and the model's input sequence length.
max_opcode_sequence_length = 2000
# output_dim of the Embedding layer.
embed_vector_length = 128
# Number of units of the LSTM wrapped by the Bidirectional layer.
num_lstm_unit = 16
# Rate given to every Dropout layer.
dropout_amt = 0.3
# Batch size for fit(); also the fixed batch dimension of the model input,
# which is why split_data() trims the data to a multiple of it.
batch_size = 32
# Upper bound on training epochs (training also uses an EarlyStopping callback).
num_epochs = 100
test_size= 0.15       # test_size given to train_test_split: share reserved for testing
# num_families_to_use = 20

# When True, the last cell of the notebook issues an OS shutdown command.
shutdown = False

In [12]:
 def split_data(train_data_raw, train_labels_raw, random_state=None):
     """Split samples and labels into train/test sets sized to whole batches.

     Args:
         train_data_raw: Indexable collection of samples.
         train_labels_raw: Labels aligned with ``train_data_raw``.
         random_state: Optional seed forwarded to ``train_test_split`` so the
             split can be reproduced. ``None`` (the default) keeps the previous
             behaviour of a different random split on every call.

     Returns:
         Tuple ``(train_data, test_data, train_labels, test_labels)``; each
         set is truncated so its length is a multiple of ``batch_size``.
     """
     train_data, test_data, train_labels, test_labels = train_test_split(
         train_data_raw,
         train_labels_raw,
         test_size=test_size,
         random_state=random_state)

     # The model input is declared with a fixed batch dimension, so trailing
     # samples that would not fill a complete batch are dropped.
     num_data_train = (len(train_data) // batch_size) * batch_size
     num_data_test = (len(test_data) // batch_size) * batch_size

     return (train_data[:num_data_train],
             test_data[:num_data_test],
             train_labels[:num_data_train],
             test_labels[:num_data_test])

In [13]:
def create_model(num_families_to_use):
    """Build and compile the embedding + bidirectional-LSTM classifier.

    The network is: Embedding -> Dropout -> Bidirectional(LSTM) -> Dropout ->
    softmax Dense. It is compiled with Adam and sparse categorical
    cross-entropy, and its summary is printed before it is returned.

    Args:
        num_families_to_use: Number of output classes (units of the final
            softmax layer).

    Returns:
        The compiled ``Sequential`` model.
    """
    model = Sequential()
    # Fixed batch dimension: inputs must arrive in batches of exactly batch_size.
    model.add(Input(batch_shape=(batch_size, max_opcode_sequence_length), name="input"))
    # +1 on input_dim: presumably index 0 is reserved for padding - TODO confirm
    # against data_loader's opcode-to-int mapping.
    model.add(Embedding(input_dim=num_unique_opcodes+1,
                        output_dim=embed_vector_length,
                        input_length=max_opcode_sequence_length, name="embedding"))
    model.add(Dropout(dropout_amt, name="dropout1"))
    # No input_shape here: it is only honoured on the first layer of a
    # Sequential model, and this layer's input comes from the layer above.
    model.add(Bidirectional(LSTM(num_lstm_unit)))
    model.add(Dropout(dropout_amt, name="dropout2"))
    model.add(Dense(num_families_to_use, activation='softmax', name="dense"))
    model.compile(loss='sparse_categorical_crossentropy',
                  optimizer=Adam(),
                  metrics=['accuracy'])

    model.summary()

    return model

In [14]:
def run_model():
    """Load the data, then train and evaluate the classifier per family count.

    For every entry of the internal family-count list the function appends
    the count to ``results_path``, loads and pads the opcode sequences, builds
    integer labels (one per family, in the order the loader returned them),
    splits the data, trains with early stopping on the training loss, saves
    the trained weights to ``saved_model_path`` and prints the test accuracy.

    Returns:
        dict: Maps each family count to its test accuracy in percent.
        (Previously the dict was built but discarded.)
    """
    num_families_to_use_list = [20]
    results = {}

    for num_families_to_use in num_families_to_use_list:
        print("{0} families....".format(num_families_to_use))

        with open(results_path, 'a') as file:
            file.write(str(num_families_to_use) + "\n")

        opcode_to_int_path = "opcodeToInt_" + str(num_families_to_use) + ".txt"
        # Get train data
        raw_train_data = data_loader.getTrainData(malware_data_dir,
                                                  num_families_to_use,
                                                  num_unique_opcodes,
                                                  max_opcode_sequence_length,
                                                  opcode_to_int_path)
        family_names = list(raw_train_data.keys())
        print(family_names)

        # Pad each family's sequences to a common length; one array per family.
        padded_train_data = [
            pad_sequences(family_opcodes, maxlen=max_opcode_sequence_length)
            for family_opcodes in raw_train_data.values()
        ]
        # Number of families that were loaded.
        print(len(padded_train_data))

        # Label every sample with the position of its family in family_names.
        family_labels = [
            np.full(shape=(len(family_samples)), fill_value=family_index)
            for family_index, family_samples in enumerate(padded_train_data)
        ]

        # Flatten the per-family arrays into one data array and one label array.
        train_data_raw = np.concatenate(padded_train_data)
        train_labels_raw = np.concatenate(family_labels)

        train_data_raw = train_data_raw.reshape(len(train_data_raw), max_opcode_sequence_length, 1)
        train_labels_raw = train_labels_raw.reshape(len(train_data_raw), 1, 1)

        # get train and test data
        train_data, test_data, train_labels, test_labels = split_data(train_data_raw, train_labels_raw)

        # train model
        model_train = create_model(num_families_to_use)
        early_stopping = EarlyStopping(monitor='loss',
                                       verbose=1,
                                       patience=2,
                                       restore_best_weights=True,
                                       min_delta=0.03)
        model_train.fit(x=train_data,
                        y=train_labels,
                        batch_size=batch_size,
                        callbacks=[early_stopping],
                        epochs=num_epochs,
                        shuffle=True)

        model_train.save_weights(saved_model_path)

        # Evaluate on a fresh model carrying the trained weights.
        model_evaluate = create_model(num_families_to_use)
        model_evaluate.set_weights(model_train.get_weights())

        # EarlyStopping only acts during training, so it is not passed here.
        scores = model_evaluate.evaluate(test_data, test_labels, verbose=0)
        accuracy = scores[1]*100
        print("{0}: {1}".format(num_families_to_use, accuracy))
        results[num_families_to_use] = accuracy

    return results

In [7]:
# NOTE(review): model_train, train_data, train_labels and early_stopping are
# assigned only inside run_model() or by cells further down, so on a fresh
# kernel run top to bottom this cell fails with the NameError recorded below.
history = model_train.fit(x=train_data,
                          y=train_labels,
                          batch_size=batch_size,
                          callbacks = [early_stopping],
                          epochs=num_epochs,
                          shuffle=True)

# Persist the trained weights to saved_model_path.
model_train.save_weights(saved_model_path) 

NameError: name 'model_train' is not defined

In [15]:
# Run the full load / train / evaluate pipeline; progress is printed as it runs.
run_model()

20 families....
Getting list of paths to training data
{'winwebsec': 6862260, 'vundo': 3492760, 'zbot': 3256944, 'hotbar': 2952000, 'renos': 2612858, 'onlinegames': 2554166, 'obfuscator': 2502965, 'bho': 2315982, 'alureon': 2287866, 'zeroaccess': 2238000, 'delfinject': 2167855, 'startpage': 2164599, 'adload': 2088000, 'fakerean': 2082110, 'cycbot': 2058000, 'vobfus': 1848000, 'lolyda': 1830000, 'ceeinject': 1725371, 'agent': 1625496, 'rbot': 1623452}
Loading training data for hotbar
1476
Loading training data for renos
1309
Loading training data for vundo
1784
Loading training data for winwebsec
3651
Loading training data for zbot
1785
Loading training data for alureon
1325
Loading training data for bho
1159
Loading training data for obfuscator
1310
Loading training data for onlinegames
1284
Loading training data for zeroaccess
1119
Loading training data for adload
1044
Loading training data for cycbot
1029
Loading training data for delfinject
1090
Loading training data for fakerean
10

In [18]:
# Rebuild the 20-family model and restore the weights stored at saved_model_path.
# NOTE(review): test_data, test_labels, early_stopping, num_families_to_use and
# results are locals of run_model(), not notebook-level names, so this cell
# stops with the NameError recorded below unless a later cell defined them.
model_evaluate = create_model(20)
model_evaluate.load_weights(saved_model_path)

scores = model_evaluate.evaluate(test_data, test_labels, verbose=0, callbacks = [early_stopping])
# scores[1] is the accuracy metric requested at compile time; convert to percent.
accuracy = scores[1]*100
print("{0}: {1}".format(num_families_to_use, accuracy))
results[num_families_to_use] = accuracy

#             with open(results_path, 'a') as file:
#                 file.write(str(accuracy) + "\n")

Model: "sequential_2"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (32, 2000, 128)           3968      
_________________________________________________________________
dropout1 (Dropout)           (32, 2000, 128)           0         
_________________________________________________________________
bidirectional_2 (Bidirection (32, 32)                  18560     
_________________________________________________________________
dropout2 (Dropout)           (32, 32)                  0         
_________________________________________________________________
dense (Dense)                (32, 20)                  660       
Total params: 23,188
Trainable params: 23,188
Non-trainable params: 0
_________________________________________________________________


NameError: name 'test_data' is not defined

In [16]:
# Sequential.predict_classes is deprecated (and removed in later TensorFlow
# releases). For a multi-unit softmax output it is the arg-max over the
# class axis of predict(), which is what is computed here.
predictions_train_set = np.argmax(model_evaluate.predict(train_data, verbose=1), axis=-1)
predictions_test_set = np.argmax(model_evaluate.predict(test_data, verbose=1), axis=-1)

# Flatten the label arrays so they line up one-to-one with the predictions.
labels_train_set = train_labels.reshape((-1))
labels_test_set = test_labels.reshape((-1))

# Train and test results are pooled, in the same order for both arrays.
predictions = np.concatenate((predictions_train_set, predictions_test_set))
labels = np.concatenate((labels_train_set, labels_test_set))

NameError: name 'model_evaluate' is not defined

In [None]:
# Raise the figure resolution (this rcParams change stays in effect afterwards).
mpl.rcParams['figure.dpi'] = 300

# Rows are true families, columns are predicted families.
matrix = confusion_matrix(labels, predictions)

fig, ax = plot_confusion_matrix(
    conf_mat=matrix,
    class_names=family_names,
    figsize=(12,12),
    show_absolute=False,
    show_normed=True,
)

# Same font size for the tick labels on both axes.
for item in [*ax.get_xticklabels(), *ax.get_yticklabels()]:
    item.set_fontsize(10)

ax.set_xlabel("Predicted Family", fontsize=12, fontweight='bold')
ax.set_ylabel("True Family", fontsize=12, fontweight='bold')
ax.set_title("Confusion Matrix for LSTM Model With Embedding and BiLSTM", fontsize=15, fontweight='bold')

plt.show()


In [None]:
# Powering the machine off is opt-in: honour the notebook-level `shutdown`
# flag (as the final cell does) so a plain "Run All" does not kill the
# session before the remaining cells execute.
if shutdown:
    os.system('shutdown -s -t 0')

### Load the Data

In [None]:
# Load the raw training data through the project's data_loader module.
# NOTE(review): num_families_to_use and opcode_to_int_path have no
# notebook-level definition (both are commented out in the globals cell and
# are otherwise only locals of run_model) - define them before running this.
raw_train_data = data_loader.getTrainData(malware_data_dir, 
                                          num_families_to_use, 
                                          num_unique_opcodes, 
                                          max_opcode_sequence_length, 
                                          opcode_to_int_path)

### Data preprocessing

In [None]:
# Family names in the order the loader returned them.
family_names = [name for name in raw_train_data.keys()]
print(family_names)

# One list entry per family, holding that family's opcode sequences.
train_data = [family_sequences for _, family_sequences in raw_train_data.items()]

# Pad every family's sequences to max_opcode_sequence_length.
padded_train_data = [
    pad_sequences(family_opcodes, maxlen=max_opcode_sequence_length)
    for family_opcodes in train_data
]

# Stack the per-family arrays into a single array of samples.
train_data_raw = np.concatenate(padded_train_data)

# Number of families.
print(len(train_data))

### Make the labels

In [None]:
# One constant-valued label array per family: every sample of the k-th family
# (in padded_train_data order) is labelled k.
train_labels = [
    np.full(shape=(len(family_samples)), fill_value=family_index)
    for family_index, family_samples in enumerate(padded_train_data)
]

# Single label array aligned with train_data_raw.
train_labels_raw = np.concatenate(train_labels)

### Split into training and testing sets

In [None]:
def split_data(train_data_raw, train_labels_raw):
    """Randomly split into train/test sets and trim each to whole batches.

    Uses the notebook-level ``test_size`` and ``batch_size``. The shapes of
    the four resulting arrays are printed before they are returned as
    ``(train_data, test_data, train_labels, test_labels)``.
    """
    splits = train_test_split(train_data_raw, train_labels_raw, test_size=test_size)
    train_data, test_data, train_labels, test_labels = splits

    # Largest multiples of batch_size that fit into each set.
    train_keep = int(len(train_data)/batch_size) * batch_size
    test_keep = int(len(test_data)/batch_size) * batch_size

    train_data, train_labels = train_data[:train_keep], train_labels[:train_keep]
    test_data, test_labels = test_data[:test_keep], test_labels[:test_keep]

    print("train_data shape: {}".format(train_data.shape))
    print("test_data shape: {}".format(test_data.shape))
    print("train_labels shape: {}".format(train_labels.shape))
    print("test_labels shape: {}".format(test_labels.shape))

    return train_data, test_data, train_labels, test_labels

### Make the Model

In [None]:
def create_model(num_families=None):
    """Build and compile the embedding + Conv1D + bidirectional-LSTM classifier.

    The network is: Embedding -> Dropout -> Conv1D -> Dropout -> MaxPooling1D
    -> Dropout -> Bidirectional(LSTM) -> Dropout -> softmax Dense, compiled
    with Adam and sparse categorical cross-entropy. The summary is printed.

    Args:
        num_families: Number of output classes. Optional so that both the
            zero-argument calls in this part of the notebook and the
            one-argument calls used earlier (``create_model(20)``) work after
            this definition replaces the earlier one. When omitted, the
            notebook-level ``num_families_to_use`` is used.

    Returns:
        The compiled ``Sequential`` model.
    """
    if num_families is None:
        # Legacy zero-argument call: fall back to the notebook-level setting.
        num_families = num_families_to_use

    model = Sequential()
    # Fixed batch dimension: inputs must arrive in batches of exactly batch_size.
    model.add(Input(batch_shape=(batch_size, max_opcode_sequence_length), name="input"))
    model.add(Embedding(input_dim=num_unique_opcodes+1,
                        output_dim=embed_vector_length,
                        input_length=max_opcode_sequence_length, name="embedding"))
    model.add(Dropout(dropout_amt, name="dropout1"))
    model.add(Conv1D(filters=embed_vector_length, kernel_size=3, padding='same', activation='relu'))
    model.add(Dropout(dropout_amt, name="dropout2"))
    model.add(MaxPooling1D(pool_size=2))
    model.add(Dropout(dropout_amt, name="dropout3"))
    # No input_shape here: it is only honoured on the first layer of a
    # Sequential model, and this layer's input comes from the layer above.
    model.add(Bidirectional(LSTM(num_lstm_unit)))
    model.add(Dropout(dropout_amt, name="dropout4"))
    model.add(Dense(num_families, activation='softmax', name="dense"))
    model.compile(loss='sparse_categorical_crossentropy',
                  optimizer=Adam(),
                  metrics=['accuracy'])

    model.summary()

    return model

In [None]:
# Test accuracy (in percent) of each repetition.
results = []

for i in range(4):
    # Fresh random train/test split for this repetition.
    train_data, test_data, train_labels, test_labels = split_data(train_data_raw, train_labels_raw)

    # Early stopping watches the training loss.
    early_stopping = EarlyStopping(
        monitor='loss',
        min_delta=0.001,
        patience=2,
        restore_best_weights=True,
        verbose=1,
    )

    # Train a new model from scratch.
    model_train = create_model()
    history = model_train.fit(
        x=train_data,
        y=train_labels,
        batch_size=batch_size,
        epochs=num_epochs,
        shuffle=True,
        callbacks=[early_stopping],
    )

    # Evaluate on a second model instance that receives the trained weights.
    model_evaluate = create_model()
    model_evaluate.set_weights(model_train.get_weights())
    scores = model_evaluate.evaluate(test_data, test_labels, verbose=0, callbacks=[early_stopping])

    accuracy = scores[1]*100
    print(accuracy)
    results.append(accuracy)

In [None]:
# Print each recorded accuracy on its own line.
for recorded_accuracy in results:
    print(recorded_accuracy)

# Pop up a Windows message box when done; 0x1000 is the MB_SYSTEMMODAL style flag.
win32api.MessageBox(0, 'done', 'title', 0x1000)


### Use History to inspect loss and accuracy throughout training

In [None]:
# Display the metric values recorded by the most recent fit() call.
history.history

### Evaluate Model

In [None]:
# Evaluate on a fresh model instance that receives the trained weights.
model_evaluate = create_model()
model_evaluate.set_weights(model_train.get_weights())

# The held-out samples are named test_data in this notebook (see split_data);
# `test_set` and `callback` were never defined. No callback is needed for
# evaluation.
scores = model_evaluate.evaluate(test_data, test_labels, verbose=0)
# scores[1] is the accuracy metric requested at compile time; convert to percent.
accuracy = scores[1]*100
print("Accuracy: %0.2f%%" % accuracy)

### Load Model From Save and Evaluate

In [None]:
# Rebuild the architecture and restore weights from saved_model_path.
# NOTE(review): the only save_weights calls in this notebook are for models
# built by the earlier create_model(num_families_to_use), which has no
# Conv1D/MaxPooling1D layers; confirm the checkpoint matches the architecture
# produced by the create_model() definition in effect here.
model = create_model()
model.load_weights(saved_model_path)



In [None]:
# The held-out samples are named test_data in this notebook (see split_data);
# `test_set` was never defined.
scores = model.evaluate(test_data, test_labels, verbose=0)
# scores[1] is the accuracy metric requested at compile time; convert to percent.
accuracy = scores[1]*100
print("Accuracy: %0.2f%%" % accuracy)

## Make ROC Curve

### Make scatter plot

In [None]:
# separate winwebsec and zbot test data
winwebsec_test_data = []
zbot_test_data = []

for i in range(len(test_labels)):
    if test_labels[i] == 0:
        winwebsec_test_data.append(test_set[i])
    else:
        zbot_test_data.append(test_set[i])
        
winwebsec_test_data = np.asarray(winwebsec_test_data[:192])
zbot_test_data = np.asarray(zbot_test_data[:128])

In [None]:
print(winwebsec_test_data.shape)
print(zbot_test_data.shape)

# Y values are the model outputs; X values are 1-based sample positions.
winwebsecY = model.predict(winwebsec_test_data)
winwebsecX = list(range(1, len(winwebsec_test_data) + 1))

zbotY = model.predict(zbot_test_data)
zbotX = list(range(1, len(zbot_test_data) + 1))

In [None]:
plt.figure(100)

# One point per test sample, coloured by family.
f = plt.scatter(winwebsecX, winwebsecY, s=30, c='darkblue', marker='o',
                label="winwebsec")
plt.scatter(zbotX, zbotY, s=30, c='red', marker='o', label="zbot")

# The x position is only a sample index, so hide that axis.
f.axes.xaxis.set_visible(False)

plt.ylabel("Prediction", fontsize=15)
plt.title("Winwebsec vs. Zbot LSTM Prediction Scatter Plot",
          fontsize=18, wrap=True)
# Place the legend just outside the upper-right corner of the axes.
plt.legend(loc='upper left', bbox_to_anchor=(1, 1), fontsize=12)

### Make ROC Curve

In [None]:
def sortByFirstItem(item):
    """Sort key that yields the first element of ``item``."""
    first_element = item[0]
    return first_element

In [None]:
# Tag every prediction with its family and order each family by score.
zbotROC = sorted(((data, "zbot") for data in zbotY), key=sortByFirstItem)
winwebsecROC = sorted(((data, "winwebsec") for data in winwebsecY), key=sortByFirstItem)

# All tagged predictions together, highest score first.
dataROC = sorted(zbotROC + winwebsecROC, key=sortByFirstItem, reverse=True)

In [None]:
def calculate_TPR_FPR(thresholdLine, dataROC):
    """Compute the true- and false-positive rate for one threshold.

    "winwebsec" is the positive class: a winwebsec entry scoring strictly
    below ``thresholdLine`` is a true positive, and a "zbot" entry scoring
    strictly above it is a true negative. Entries with any other family tag
    are ignored.

    Args:
        thresholdLine: Score threshold to evaluate.
        dataROC: Iterable of ``(score, family)`` entries.

    Returns:
        Tuple ``(TPR, FPR)``.

    Raises:
        ZeroDivisionError: If ``dataROC`` has no winwebsec or no zbot entry.
    """
    winwebsec_scores = [entry[0] for entry in dataROC if entry[1] == "winwebsec"]
    zbot_scores = [entry[0] for entry in dataROC if entry[1] == "zbot"]

    true_positives = sum(1 for score in winwebsec_scores if score < thresholdLine)
    true_negatives = sum(1 for score in zbot_scores if score > thresholdLine)

    TPR = true_positives / len(winwebsec_scores)
    FPR = 1 - (true_negatives / len(zbot_scores))

    return TPR, FPR

In [None]:
def calculateAUC(rocData):
    """Area under a piecewise-linear curve, by the trapezoid rule.

    Consecutive points are joined in the order given. Each segment
    contributes ``(y0 + y1) / 2 * |x1 - x0|``, so the points may be ordered
    by increasing or decreasing x. A flat segment is the rectangle special
    case of the same formula.

    Unlike the previous sentinel-based loop, a point whose x or y equals -1
    is handled like any other point instead of being mistaken for "no
    previous point yet".

    Args:
        rocData: Iterable of points; element 0 is x (FPR), element 1 is y (TPR).

    Returns:
        The accumulated area; 0 when fewer than two points are given.
    """
    points = list(rocData)
    area = 0

    # Walk over consecutive point pairs.
    for previous, current in zip(points, points[1:]):
        width = abs(current[0] - previous[0])
        area += (current[1] + previous[1]) * width * 0.5

    return area

In [None]:
rocX = []  # FPR per threshold, used to plot
rocY = []  # TPR per threshold, used to plot

# Use every observed score, in dataROC order, as the threshold once.
for entry in dataROC:
    TPR, FPR = calculate_TPR_FPR(entry[0], dataROC)
    rocX.append(FPR)
    rocY.append(TPR)

# [FPR, TPR] pairs ordered by decreasing FPR, then decreasing TPR; used to calculate AUC.
rocData = sorted(
    ([fpr, tpr] for fpr, tpr in zip(rocX, rocY)),
    key=lambda point: (point[0], point[1]),
    reverse=True,
)

AUC = round(calculateAUC(rocData), 3)

In [None]:
# Styling for the text box that shows the AUC value.
props = {'boxstyle': 'round', 'facecolor': 'wheat', 'alpha': 0.5}

plt.figure(200)
plt.plot(rocX, rocY, marker=".", markersize=8)
plt.grid()
plt.xlabel("FPR", fontsize=15)
plt.ylabel("TPR", fontsize=15)
plt.title("Winwebsec vs. Zbot LSTM Log Probability ROC", fontsize=18)
# AUC annotation anchored at data coordinates (0.75, 0).
plt.text(0.75, 0, "AUC: {0}".format(AUC), fontsize=14, bbox=props)

# show plots
plt.show()

In [None]:
# Optionally issue the OS shutdown command once the notebook has finished;
# controlled by the `shutdown` flag set in the global-variables cell.
if shutdown:
    os.system('shutdown -s')