In [1]:
import sys
sys.path.insert(1, '..\\..\\')
import data_loader

# Import our dependencies
import tensorflow as tf
import tensorflow_hub as hub
import os
import numpy as np
from sklearn.model_selection import train_test_split
from tensorflow.keras.layers import Input, Lambda, Bidirectional, Dense, Dropout, LSTM
from tensorflow.keras.models import Model

In [2]:
malware_data_dir = '../../data/'
saved_model_path = 'saved_model/'
opcode_to_int_path = "opcodeToInt.txt"
num_unique_opcodes = 30
max_opcode_sequence_length = 2000
embed_vector_length = 128
num_lstm_unit = 16
dropout_amt = 0.3
batch_size = 32
num_epochs = 20
test_size= 0.15       # reserve for testing
num_families_to_use = 5

shutdown = False

In [3]:
raw_train_data = data_loader.getTrainData(malware_data_dir, 
                                          num_families_to_use, 
                                          num_unique_opcodes, 
                                          max_opcode_sequence_length, 
                                          opcode_to_int_path)

Getting list of paths to training data
{'winwebsec': 6862260, 'vundo': 3492760, 'zbot': 3256944, 'hotbar': 2952000, 'renos': 2612858}
Loading training data for hotbar
Loading training data for renos
Loading training data for vundo
Loading training data for winwebsec
Loading training data for zbot
All training data loaded


In [4]:
family_names = list(raw_train_data.keys())
print(family_names)

# Split opcode family data in individual lists
train_data = list()
for family, data in raw_train_data.items():
    train_data.append(data)
    
# Pad training data to ensure uniformity
padded_train_data = list()
for family_opcodes in train_data:
    padded_sequence = tf.keras.preprocessing.sequence.pad_sequences(family_opcodes, 
                                    maxlen=max_opcode_sequence_length)
    padded_train_data.append(padded_sequence)
    
# Concatenate all training data into 1 long list instead of multiple lists
train_data_raw = np.concatenate(padded_train_data)

print(len(train_data))

['hotbar', 'renos', 'vundo', 'winwebsec', 'zbot']
5


In [5]:
train_labels = []

for count, data in enumerate(padded_train_data):
    labels_list = np.full(shape=(len(data)), fill_value=count)
    train_labels.append(labels_list)

train_labels_raw = np.concatenate(train_labels)

In [6]:
def split_data(train_data_raw, train_labels_raw):
    # Split into training and testing data
    train_data, test_data, train_labels, test_labels = train_test_split(train_data_raw, train_labels_raw, test_size=test_size)

    # Make divisible by batch size
    num_data_train = int(len(train_data)/batch_size) * batch_size
    num_data_test = int(len(test_data)/batch_size) * batch_size

    train_data = train_data[:num_data_train]
    train_labels = train_labels[:num_data_train]
    test_data = test_data[:num_data_test]
    test_labels = test_labels[:num_data_test]
    
    return train_data, test_data, train_labels, test_labels

In [12]:
train_data, test_data, train_labels, test_labels = split_data(train_data_raw, train_labels_raw)

In [14]:
print("train_data shape: {}".format(train_data.shape))
print("test_data shape: {}".format(test_data.shape))
print("train_labels shape: {}".format(train_labels.shape))
print("test_labels shape: {}".format(test_labels.shape))

train_data shape: (8480, 2000)
test_data shape: (1472, 2000)
train_labels shape: (8480,)
test_labels shape: (1472,)


In [7]:
def ELMoEmbedding(input_text):
    elmo = hub.Module("https://tfhub.dev/google/elmo/3", trainable=True)
    return elmo(tf.reshape(tf.cast(input_text, tf.string), [-1]), signature="default", as_dict=True)["elmo"]

In [17]:
def build_model():
    input_layer = Input(batch_shape=(batch_size, max_opcode_sequence_length), name="Input_layer")
    embedding_layer = Lambda(ELMoEmbedding, output_shape=(1024, ), name="Elmo_Embedding")(input_layer)
    BiLSTM = Bidirectional(LSTM(num_lstm_unit, return_sequences= False), name="BiLSTM")(embedding_layer)
    output_layer = Dense(num_families_to_use, activation='sigmoid')(BiLSTM)
    model = Model(inputs=[input_layer], outputs=output_layer, name="BiLSTM with ELMo Embeddings")
    model.summary()
    model.compile(loss='sparse_categorical_crossentropy',optimizer='adam',metrics=['accuracy'])
    return model

elmo_BiDirectional_model = build_model()

INFO:tensorflow:Saver not created because there are no variables in the graph to restore


INFO:tensorflow:Saver not created because there are no variables in the graph to restore


Model: "BiLSTM with ELMo Embeddings"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
Input_layer (InputLayer)     [(32, 2000)]              0         
_________________________________________________________________
Elmo_Embedding (Lambda)      (None, None, 1024)        0         
_________________________________________________________________
BiLSTM (Bidirectional)       (None, 32)                133248    
_________________________________________________________________
dense_1 (Dense)              (None, 5)                 165       
Total params: 133,413
Trainable params: 133,413
Non-trainable params: 0
_________________________________________________________________


In [9]:
early_stopping = tf.keras.callbacks.EarlyStopping(monitor='loss', verbose=1, patience=2)

In [18]:
elmo_BiDirectional_model.fit(x=train_data,
                             y=train_labels,
                             batch_size=batch_size,
                             callbacks=[early_stopping],
                             epochs=num_epochs,)

Train on 8480 samples
Epoch 1/20


UnimplementedError: Cast float to string is not supported
	 [[{{node Elmo_Embedding_1/Cast}}]]