In [None]:
import sys

# Make the directory two levels up importable so the project-local
# `data_loader` module resolves. Forward slashes are accepted on Windows as
# well as POSIX; the earlier '..\\..\\' spelling only denoted a parent
# directory on Windows (on POSIX a backslash is an ordinary filename character).
sys.path.insert(1, '../..')
import data_loader

# Import our dependencies
import tensorflow as tf
import pandas as pd
import tensorflow_hub as hub
import os
import re
from keras import backend as K
import keras.layers as layers
from keras.layers import Input, Lambda, Bidirectional, Dense, Dropout
from keras.models import Model, load_model
from keras.engine import Layer
import numpy as np
from sklearn.model_selection import train_test_split

# Initialize session
# Create one explicit TensorFlow session and register it with the Keras
# backend so that Keras runs its ops in this session.
sess = tf.Session()
K.set_session(sess)

In [None]:
# --- Paths (relative paths, resolved against the current working directory) ---
malware_data_dir = '../../data/'      # passed to data_loader.getTrainData
saved_model_path = 'saved_model/'     # not referenced by the cells visible here
opcode_to_int_path = "opcodeToInt.txt"  # passed to data_loader.getTrainData

# --- Data / model hyper-parameters ---
num_unique_opcodes = 30               # passed to data_loader.getTrainData
max_opcode_sequence_length = 2000     # loader argument and `maxlen` for pad_sequences
embed_vector_length = 128             # only used by the commented-out Embedding layer
num_lstm_unit = 16                    # LSTM units inside create_model()
dropout_amt = 0.3                     # not referenced by the cells visible here
batch_size = 32                       # used by split_data() and by fit()
num_epochs = 20                       # upper bound on epochs passed to fit()
test_size= 0.15       # reserve for testing
num_families_to_use = 5               # passed to data_loader.getTrainData

# NOTE(review): `shutdown` is not read by any cell visible here - confirm it is still needed.
shutdown = False

In [None]:
# Load the raw training data through the project-local loader, configured by
# the constants defined above. The result is consumed below as a mapping
# (via .keys() / .items()) with one entry per malware family.
raw_train_data = data_loader.getTrainData(malware_data_dir, 
                                          num_families_to_use, 
                                          num_unique_opcodes, 
                                          max_opcode_sequence_length, 
                                          opcode_to_int_path)

In [None]:
# Family names, in the same order in which the per-family data is stacked below
family_names = [name for name in raw_train_data]
print(family_names)

# One list entry per family, holding that family's opcode sequences
train_data = list(raw_train_data.values())

# Pad every family's sequences to a common length so they can be stacked
padded_train_data = [
    tf.keras.preprocessing.sequence.pad_sequences(
        sequences, maxlen=max_opcode_sequence_length)
    for sequences in train_data
]

# Stack all families into a single array instead of one array per family
train_data_raw = np.concatenate(padded_train_data)

print(len(train_data))

In [None]:
# One label array per family: every sample of the k-th entry of
# padded_train_data receives the integer label k.
train_labels = [
    np.full(len(family_block), family_index)
    for family_index, family_block in enumerate(padded_train_data)
]

# Flatten to a single label vector aligned with train_data_raw
train_labels_raw = np.concatenate(train_labels)

In [None]:
def split_data(train_data_raw, train_labels_raw, random_state=None):
    """Split samples into train/test sets trimmed to whole batches.

    The module-level `test_size` fraction is held out for testing, then both
    partitions are truncated so their lengths are exact multiples of the
    module-level `batch_size`.

    Args:
        train_data_raw: Array of samples, indexed along the first axis.
        train_labels_raw: Labels aligned with `train_data_raw`.
        random_state: Optional seed forwarded to `train_test_split` so a split
            can be reproduced. The default (None) keeps the previous
            behaviour of a different random split on every call.

    Returns:
        Tuple `(train_data, test_data, train_labels, test_labels)`.
    """
    # Split into training and testing data
    train_data, test_data, train_labels, test_labels = train_test_split(
        train_data_raw,
        train_labels_raw,
        test_size=test_size,
        random_state=random_state,
    )

    # Make divisible by batch size (floor division keeps the arithmetic exact).
    # A partition smaller than one batch is truncated to length zero.
    num_data_train = (len(train_data) // batch_size) * batch_size
    num_data_test = (len(test_data) // batch_size) * batch_size

    train_data = train_data[:num_data_train]
    train_labels = train_labels[:num_data_train]
    test_data = test_data[:num_data_test]
    test_labels = test_labels[:num_data_test]

    return train_data, test_data, train_labels, test_labels

In [None]:
# Create a custom layer that allows us to update weights (lambda layers do not have trainable parameters!)

class ElmoEmbeddingLayer(Layer):
    """Keras layer wrapping the TF-Hub ELMo v2 module.

    The module's variables are appended to the layer's `trainable_weights`
    in `build`, so they are visible to Keras when the layer is used directly
    in a model.

    Input is cast to string and squeezed on axis 1 in `call`, so it is
    expected to be shaped (batch, 1). The output shape reported by
    `compute_output_shape` is (batch, 1024).
    """
    def __init__(self, **kwargs):
        """Set the output width and trainable flag, then forward `kwargs` to `Layer`."""
        self.dimensions = 1024
        self.trainable=True
        super(ElmoEmbeddingLayer, self).__init__(**kwargs)

    def build(self, input_shape):
        """Instantiate the hub module and register its trainable variables."""
        # The module is named after this layer so the scope regex below only
        # matches variables that belong to this instance.
        self.elmo = hub.Module('https://tfhub.dev/google/elmo/2', trainable=self.trainable,
                               name="{}_module".format(self.name))

        self.trainable_weights += K.tf.trainable_variables(scope="^{}_module/.*".format(self.name))
        super(ElmoEmbeddingLayer, self).build(input_shape)

    def call(self, x, mask=None):
        """Run the module's 'default' signature on `x` and return its 'default' output.

        `mask` is accepted but not used.
        """
        # NOTE(review): the 'default' output is presumably the pooled
        # per-string vector, matching the 2-D shape reported by
        # compute_output_shape - confirm against the TF-Hub ELMo documentation.
        result = self.elmo(K.squeeze(K.cast(x, tf.string), axis=1),
                      as_dict=True,
                      signature='default',
                      )['default']
        return result

    # Masking is currently disabled:
#     def compute_mask(self, inputs, mask=None):
#         return K.not_equal(inputs, '--PAD--')

    def compute_output_shape(self, input_shape):
        """Return (batch, self.dimensions), keeping the batch axis of `input_shape`."""
        return (input_shape[0], self.dimensions)

In [10]:
import tensorflow_hub as hub
import tensorflow as tf
from tensorflow.keras.layers import Input, Lambda, Bidirectional, Dense, Dropout, LSTM
from tensorflow.keras.models import Model

In [6]:
# Module-level TF-Hub ELMo module (trainable), called by ELMoEmbedding below.
# Note this loads .../elmo/3 whereas ElmoEmbeddingLayer loads .../elmo/2.
elmo = hub.Module("https://tfhub.dev/google/elmo/3", trainable=True)

In [7]:
def ELMoEmbedding(input_text):
    """Embed a tensor of strings with the module-level `elmo` hub module.

    The input is cast to `tf.string` and flattened to one dimension before
    the module's "default" signature is invoked; the "elmo" entry of the
    resulting output dict is returned.
    """
    flat_strings = tf.reshape(tf.cast(input_text, tf.string), [-1])
    module_outputs = elmo(flat_strings, signature="default", as_dict=True)
    return module_outputs["elmo"]

In [15]:
def build_model():
    """Build, summarise and compile the ELMo -> BiLSTM -> sigmoid model.

    A (batch, 1) string input is embedded by `ELMoEmbedding` inside a Lambda
    layer, fed through a bidirectional LSTM that returns only its final
    state, and reduced to a single sigmoid unit. The model is compiled with
    binary cross-entropy, Adam and an accuracy metric.

    Returns:
        The compiled Keras model.
    """
    input_layer = Input(shape=(1,), dtype="string", name="Input_layer")
    # The embedding yields one 1024-wide vector per token, i.e. a
    # (batch, timesteps, 1024) tensor, so the declared per-sample shape is
    # (None, 1024). The previous (1024,) omitted the timestep axis.
    embedding_layer = Lambda(ELMoEmbedding, output_shape=(None, 1024), name="Elmo_Embedding")(input_layer)
    BiLSTM = Bidirectional(
        LSTM(1024, return_sequences=False, recurrent_dropout=0.2, dropout=0.2),
        name="BiLSTM",
    )(embedding_layer)
    output_layer = Dense(1, activation='sigmoid')(BiLSTM)
    model = Model(inputs=[input_layer], outputs=output_layer, name="BiLSTM with ELMo Embeddings")
    model.summary()
    model.compile(loss='binary_crossentropy',optimizer='adam',metrics=['accuracy'])
    return model
elmo_BiDirectional_model = build_model()

INFO:tensorflow:Saver not created because there are no variables in the graph to restore


INFO:tensorflow:Saver not created because there are no variables in the graph to restore


Model: "BiLSTM with ELMo Embeddings"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
Input_layer (InputLayer)     [(None, 1)]               0         
_________________________________________________________________
Elmo_Embedding (Lambda)      (None, None, 1024)        0         
_________________________________________________________________
BiLSTM (Bidirectional)       (None, 2048)              16785408  
_________________________________________________________________
dense_4 (Dense)              (None, 1)                 2049      
Total params: 16,787,457
Trainable params: 16,787,457
Non-trainable params: 0
_________________________________________________________________


In [13]:
# NOTE(review): X_train / y_train are not defined in any cell visible here
# (the recorded output of this cell is a NameError) - define them before running.
model_elmo = elmo_BiDirectional_model.fit(X_train, y_train, epochs=100, batch_size=128)
# Weights live on the model, not on the value returned by fit():
# elmo_BiDirectional_model.save_weights('./model_elmo_neural_network_weights.h5')

NameError: name 'X_train' is not defined

In [None]:
def create_model():
    """Build, compile and summarise the ELMo -> BiLSTM -> softmax classifier.

    The model takes a (batch, 1) string input, embeds it with
    `ElmoEmbeddingLayer`, runs a bidirectional LSTM with `num_lstm_unit`
    units per direction, and predicts one of `num_families_to_use` classes.
    It is compiled with sparse categorical cross-entropy (integer labels),
    Adam and an accuracy metric.

    Returns:
        The compiled Keras model.
    """
    input_text = layers.Input(shape=(1,), name="input", dtype="string")

    # Call the custom layer directly. Wrapping it in `layers.Lambda` kept its
    # ELMo variables out of the model's weights, which is exactly what the
    # custom layer exists to avoid.
    elmo_layer = ElmoEmbeddingLayer()
    embedding = elmo_layer(input_text)

    # ElmoEmbeddingLayer reports a (batch, dimensions) output, while a
    # recurrent layer requires (batch, timesteps, features); present the
    # vector as a length-1 sequence so the LSTM can consume it.
    # NOTE(review): for a real per-token sequence the layer would have to
    # expose the module's token-level output instead - confirm intent.
    sequence = layers.Reshape((1, elmo_layer.dimensions))(embedding)

    dense = layers.Bidirectional(layers.LSTM(num_lstm_unit))(sequence)

    # softmax gives the single probability distribution over classes that
    # sparse_categorical_crossentropy expects (sigmoid scored each unit
    # independently). Width follows the configured number of families.
    pred = layers.Dense(num_families_to_use, activation='softmax')(dense)

    model = Model(inputs=[input_text], outputs=pred)

    model.compile(loss='sparse_categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
    model.summary()

    return model
    
    
#     model = Sequential()
#     model.add(layers.Input(batch_shape=(batch_size, max_opcode_sequence_length), name="input"))
#     model.add(layers.Embedding(input_dim=num_unique_opcodes+1,
#                                   output_dim=embed_vector_length,
#                                   input_length=max_opcode_sequence_length, name="embedding"))
#     model.add(LSTM(num_lstm_unit, 
#                    input_shape=(None, max_opcode_sequence_length),
#                    name="lstm1"))
#     model.add(Dense(5, activation='softmax', name="dense"))
#     optimizer = Adam()
#     model.compile(loss='sparse_categorical_crossentropy', optimizer=optimizer, metrics=['accuracy'])

#     #model.summary()
    
#     return model

# Build the model once as a check; create_model() prints the summary itself.
create_model()

In [None]:
# Test accuracy (in percent) of each repetition
results = []

# Repeat the split / train / evaluate cycle 5 times. split_data() is called
# without a seed, so every repetition draws a fresh random split.
for i in range(5):
    # get train and test data
    train_data, test_data, train_labels, test_labels = split_data(train_data_raw, train_labels_raw)
    
    # train model
    model_train = create_model()
    # Stop once the training loss has not improved for 2 consecutive epochs
    early_stopping = tf.keras.callbacks.EarlyStopping(monitor='loss', verbose=1, patience=2)
    # Only the first 160 training samples are used (a multiple of batch_size = 32)
    history = model_train.fit(x=train_data[:160],
                          y=train_labels[:160],
                          batch_size=batch_size,
                          callbacks = [early_stopping],
                          epochs=num_epochs,)
    
    # evaluate
    # A second model is built and given the trained weights before evaluating.
    # NOTE(review): evaluating model_train directly looks equivalent - confirm
    # whether the copy is needed.
    model_evaluate = create_model()
    model_evaluate.set_weights(model_train.get_weights())

    # Only the first 64 test samples are scored; with metrics=['accuracy'] the
    # result is [loss, accuracy], so index 1 is the accuracy.
    scores = model_evaluate.evaluate(test_data[:64], test_labels[:64], verbose=0)
    accuracy = scores[1]*100
    print(accuracy)
    results.append(accuracy)

In [None]:
# Round-trip check: predictions must be unchanged after saving the model and
# loading its weights into a freshly built one.
# NOTE(review): `model` and `test_text` are not defined in any cell visible
# here. Since the reload below uses build_model(), `model` presumably has to
# be a model produced by build_model() - confirm and bind both names first.
model.save('ElmoModel.h5')
pre_save_preds = model.predict(test_text[0:100]) # predictions before we clear and reload model

# Clear and load model
model = None
model = build_model()
model.load_weights('ElmoModel.h5')

post_save_preds = model.predict(test_text[0:100]) # predictions after we clear and reload model
# Compare the arrays as a whole. Builtin all() over a 2-D array takes the
# truth value of each row, which raises for rows with more than one element.
np.array_equal(pre_save_preds, post_save_preds) # Are they the same?