This is taken from Chapter 3 of the book [Machine Learning for Cybersecurity Cookbook](https://github.com/PacktPublishing/Machine-Learning-for-Cybersecurity-Cookbook/tree/master/Chapter03/).

Using Generative Adversarial Networks (GANs), we can create adversarial malware
samples to train and improve our detection methodology, as well as to identify gaps before
an adversary does. The code here is based on j40903272/MalConv-keras. The adversarial 
malware samples are malware samples that have been modified by padding them with a
small, but carefully calculated, sequence of bytes, selected so as to fool the neural network
(in this case, MalConv) being used to classify the samples.

First, get the input sample list from the book's GitHub repository.

In [27]:
!wget https://raw.githubusercontent.com/PacktPublishing/Machine-Learning-for-Cybersecurity-Cookbook/master/Chapter03/MalGan/MalGAN_input/samplesIn.csv

--2021-05-09 06:04:29--  https://raw.githubusercontent.com/PacktPublishing/Machine-Learning-for-Cybersecurity-Cookbook/master/Chapter03/MalGan/MalGAN_input/samplesIn.csv
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.111.133, 185.199.108.133, 185.199.109.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.111.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 1700 (1.7K) [text/plain]
Saving to: ‘samplesIn.csv.1’


2021-05-09 06:04:29 (23.6 MB/s) - ‘samplesIn.csv.1’ saved [1700/1700]



In [28]:
import pandas as pd

# Load the downloaded sample list into a DataFrame.
# NOTE(review): read_csv treats the first row as a header by default, and the
# preview below shows a sample name as a column label; a later cell reads this
# same file with header=None — confirm whether the first sample is being
# consumed as column names here.
data = pd.read_csv('samplesIn.csv')

In [29]:
# Preview the first rows of the sample list.
data.head()

Unnamed: 0,0778a070b283d5f4057aeb3b42d58b82ed20e4eb_f205bd9628ff8dd7d99771f13422a665a70bb916,0
0,fbd1a4b23eff620c1a36f7c9d48590d2fccda4c2_cc822...,0
1,c095da034535f15a27c073dce54212a28e1af683_8e864...,0
2,488e5eea345e24440f7d0d2a32fbafda314ee6ca_df473...,0
3,7a359bcc1c7ac5f18eff7c3459dadefa9f9e4610_3b7ac...,0
4,509038aad80431b8aa0c9b29bfce07fe7134fc7a_263fb...,0


Let's start by importing the code for MalGAN.

### MalGAN_preprocess.py

In [30]:
import os
import time
import pickle
import argparse
import pandas as pd
from keras.preprocessing.sequence import pad_sequences

In [31]:
# Command-line interface carried over from the MalGAN_preprocess.py script.
parser = argparse.ArgumentParser(
    description='Malconv-keras classifier',
)
# Optional flags, each with a default value.
parser.add_argument(
    '--max_len',
    default=200000,
    type=int,
)
parser.add_argument(
    '--save_path',
    default='../saved/preprocess_data.pkl',
    type=str,
)
# Required positional argument.
parser.add_argument('csv', type=str)

_StoreAction(option_strings=[], dest='csv', nargs=None, const=None, default=None, type=<class 'str'>, choices=None, help=None, metavar=None)

In [32]:
def preprocess(fn_list, max_len):
    """Read files as byte sequences and pad/truncate them to ``max_len``.

    Paths in ``fn_list`` that are not regular files are reported on stdout
    and skipped, so the outputs may hold fewer entries than ``fn_list``.

    Returns:
        A ``(seq, len_list)`` pair. ``seq`` is the ``pad_sequences`` result
        (padding and truncation both applied at the end of each sequence);
        ``len_list`` holds the original byte count of every file that was read.
    """
    raw_docs = []
    for path in fn_list:
        if os.path.isfile(path):
            with open(path, 'rb') as handle:
                raw_docs.append(handle.read())
        else:
            print(path, 'not exist')

    # Iterating a bytes object yields ints, so each file becomes a list of byte values.
    byte_seqs = [list(blob) for blob in raw_docs]
    lengths = list(map(len, byte_seqs))
    padded = pad_sequences(byte_seqs, maxlen=max_len, padding='post', truncating='post')
    return padded, lengths

In [33]:
def call_preprocess(name, maxlen=200000, savepath='../saved/preprocess_data.pkl' ):
    """Preprocess every file listed in a CSV and pickle the padded sequences.

    Args:
        name: Path of a header-less CSV whose first column holds file paths.
        maxlen: Length every byte sequence is padded/truncated to.
        savepath: Destination of the pickle file holding the padded sequences.

    Returns:
        None. The padded sequences are written to ``savepath``.
    """
    df = pd.read_csv(name, header=None)
    fn_list = df[0].values

    print('preprocessing... we might be a while here')
    st = time.time()
    # preprocess returns (sequences, lengths); only the sequences are stored.
    processed_data = preprocess(fn_list, maxlen)[0]
    print('Finished ... %d sec' % int(time.time()-st))

    with open(savepath, 'wb') as f:
        # Fixed: the original dumped the undefined name `preprocessed_data`.
        pickle.dump(processed_data, f)
    # Fixed: the original message never said where the data was stored.
    print('Preprocessed data store in', savepath)

### MalGAN_utils.py

In [34]:
import numpy as np
import tensorflow as tf

# Show which TensorFlow version this notebook runs against.
print(tf.__version__)

2.4.1


In [35]:
def fgsm(model, inp, pad_idx, pad_len, e, step_size=0.001):
    """Iteratively perturb the padding region of an embedded input.

    Performs gradient ascent on the mean of ``model.output[:, 0]`` with
    respect to the output of ``model.layers[1]``, updating only positions
    ``pad_idx`` .. ``pad_idx + pad_len - 1``.

    Args:
        model: Keras model whose ``layers[1]`` output is the tensor attacked.
        inp: Array fed in place of the ``layers[1]`` output; it is copied,
            not modified.
        pad_idx: First position that may be modified.
        pad_len: Number of positions that may be modified.
        e: Unused; kept so existing callers keep working.
        step_size: Scale applied to the normalised gradient on every step.

    Returns:
        ``(adv, g, loss_value)``: the perturbed copy of ``inp``, the sum of
        all applied updates, and the last evaluated loss (NaN if no step ran).

    NOTE(review): ``K.gradients`` generally needs graph (non-eager) execution
    under TensorFlow 2.x — confirm the backend configuration before use.
    """
    adv = inp.copy()
    loss = K.mean(model.output[:,0])
    grads = K.gradients(loss, model.layers[1].output)[0]
    # Normalise by the RMS of the gradient; the epsilon avoids division by zero.
    grads /= (K.sqrt(K.mean(K.square(grads))) + 1e-8)

    # Zero the gradient everywhere except the appended padding region.
    mask = np.zeros(model.layers[1].output.shape[1:])
    mask[pad_idx:pad_idx + pad_len] = 1
    grads *= K.constant(mask)

    iterate = K.function([model.layers[1].output], [loss, grads])
    g = 0.
    loss_value = float('nan')  # defined even if the loop body never runs
    step = int(1/step_size) * 10
    for _ in range(step):
        loss_value, grads_value = iterate([adv])
        grads_value *= step_size
        g += grads_value
        adv += grads_value

        # Fixed: the original tested the misspelled, undefined `loss_vale`.
        if loss_value >= 0.9:
            break
    # Fixed: the original returned the undefined name `h` instead of `g`.
    return adv, g, loss_value

In [36]:
def limit_gpu_memory(per):
    """Install a Keras session limited to a fraction of GPU memory.

    Args:
        per: Value assigned to ``per_process_gpu_memory_fraction``.

    The original used ``tf.ConfigProto`` / ``tf.Session`` and an un-imported
    ``set_session``; the ``tf.compat.v1`` namespace provides all three.
    """
    tf_v1 = tf.compat.v1
    config = tf_v1.ConfigProto()
    config.gpu_options.per_process_gpu_memory_fraction = per
    tf_v1.keras.backend.set_session(tf_v1.Session(config=config))

In [37]:
def train_test_split(data, label, val_size=0.1):
    """Randomly split ``data`` and ``label`` into train and held-out parts.

    The first ``int(len(data) * val_size)`` entries of a shuffled index order
    form the held-out part; the remainder is the training part.

    Returns:
        ``(x_train, x_test, y_train, y_test)``.
    """
    n_samples = len(data)
    order = np.arange(n_samples)
    np.random.shuffle(order)

    n_held_out = int(n_samples * val_size)
    held_out, kept = order[:n_held_out], order[n_held_out:]
    return data[kept], data[held_out], label[kept], label[held_out]

In [38]:
def data_generator(data, labels, max_len=200000, batch_size=64, shuffle=True):
    """Yield ``(sequences, labels)`` batches indefinitely.

    Args:
        data: Array of file paths; each batch is passed to ``preprocess``.
        labels: Array of labels indexed in parallel with ``data``.
        max_len: Length every byte sequence is padded/truncated to.
        batch_size: Maximum number of samples per batch.
        shuffle: Shuffle the sample order once, before batching.

    Yields:
        ``(xx, yy)`` where ``xx`` is the padded sequence array for one batch
        and ``yy`` the matching labels. Nothing is yielded for empty ``data``.

    NOTE(review): ``preprocess`` skips paths that are not files, so ``xx``
    can end up shorter than ``yy`` — confirm every path exists.
    """
    idx = np.arange(len(data))
    if shuffle:
        np.random.shuffle(idx)
    # Stepping by batch_size never creates the empty trailing batch the
    # original `len(data)//batch_size+1` produced for evenly divisible sizes.
    batches = [idx[start:start + batch_size]
               for start in range(0, len(data), batch_size)]
    if not batches:
        # Without this guard the loop below would spin forever yielding nothing.
        return
    while True:
        for batch in batches:
            xx = preprocess(data[batch], max_len)[0]
            yy = labels[batch]
            yield (xx, yy)

In [39]:
class logger():
    """Collects per-file results of adversarial generation and exports them as CSV."""

    def __init__(self):
        # Parallel lists: position i in every list describes the i-th logged file.
        self.fn, self.len, self.pad_len = [], [], []
        self.loss, self.pred, self.org = [], [], []

    def write(self, fn, org_score, file_len, pad_len, loss, pred):
        """Store one file's results and print a summary of them."""
        base_name = fn.split('/')[-1]
        recorded = (
            (self.fn, base_name),
            (self.org, org_score),
            (self.len, file_len),
            (self.pad_len, pad_len),
            (self.loss, loss),
            (self.pred, pred),
        )
        for column, value in recorded:
            column.append(value)

        print('\nFILE:', fn)
        if not pad_len > 0:
            # No padding was applied to this file.
            print('\tfile length:', file_len, ', Exceed max length ! Ignored !')
        else:
            print('\tfile length:', file_len)
            print('\tpad length:', pad_len)
            print('\tloss:', loss)
            print('\tscore:', pred)
        print('\toriginal score:', org_score)

    def save(self, path):
        """Write everything recorded so far to ``path`` as a CSV without an index column."""
        column_order = ['filename', 'original score',
                        'file length', 'pad length',
                        'loss', 'predict score']
        series = [self.fn, self.org, self.len, self.pad_len, self.loss, self.pred]
        table = pd.DataFrame(data=dict(zip(column_order, series)))
        table.to_csv(path, index=False, columns=column_order)
        print('\nLog saved to "%s"\n' % path)

### MalGAN_gen_adv_examples.py

In [40]:
from sklearn.neighbors import NearestNeighbors
from keras import backend as K
import numpy as np

In [45]:
def gen_adv_samples(model, fn_list, pad_percent=0.1, step_size=0.001, thres=0.5):
    """Generate padded adversarial variants of the files in ``fn_list``.

    Each file is embedded with ``model.layers[1]``. If the model's score for
    it is below ``thres`` and there is room for padding, the padding region
    is perturbed with ``fgsm`` and every perturbed embedding is mapped back
    to the index of its nearest embedding row.

    Args:
        model: Keras model; ``layers[1]`` is used as the embedding layer.
        fn_list: Iterable of file paths, processed one at a time.
        pad_percent: Padding length as a fraction of the file length, capped
            by the space left before the model's input length.
        step_size: Step size forwarded to ``fgsm``.
        thres: Files scoring at or above this value are left unmodified.

    Returns:
        ``(adv_samples, log)``: one ``bytes`` object per file and the
        ``logger`` holding the per-file statistics.

    NOTE(review): a path ``preprocess`` cannot read yields an empty
    ``len_list``, so ``len_list[0]`` raises IndexError — confirm inputs exist.
    NOTE(review): ``bytes(...)`` needs every value in 0..255; presumably the
    embedding has more than 256 rows, so verify the mapped indices stay in range.
    """
    def emb_search(org, adv, pad_idx, pad_len, neigh):
        # Replace every padded position with the index of its nearest
        # embedding row, using one batched query instead of one per position.
        out = org.copy()
        region = slice(pad_idx, pad_idx + pad_len)
        nearest = neigh.kneighbors(adv[region], n_neighbors=1, return_distance=False)
        out[0][region] = nearest[:, 0]
        return out

    max_len = int(model.input.shape[1])
    emb_layer = model.layers[1]
    emb_weight = emb_layer.get_weights()[0]
    # [function] Map sequence to embedding
    inp2emb = K.function([model.input] + [K.learning_phase()], [emb_layer.output])

    # Build the neighbour search; n_neighbors is passed by keyword because
    # recent scikit-learn releases reject it as a positional argument.
    neigh = NearestNeighbors(n_neighbors=1)
    neigh.fit(emb_weight)

    log = logger()
    adv_samples = []

    for e, fn in enumerate(fn_list):

        ###   run one file at a time due to different padding length, [slow]
        # Fixed: the original called call_preprocess, which expects a CSV
        # path and returns None, so this unpacking could never succeed.
        inp, len_list = preprocess([fn], max_len)
        inp_emb = np.squeeze(np.array(inp2emb([inp, False])), 0)

        pad_idx = len_list[0]
        pad_len = max(min(int(len_list[0]*pad_percent), max_len-pad_idx), 0)
        org_score = model.predict(inp)[0][0]    ### original score, 0 -> malicious, 1 -> benign
        loss, pred = float('nan'), float('nan')

        # Default to the unmodified file. The original left final_adv unbound
        # (or stale from the previous file) whenever pad_len was not positive.
        final_adv = inp[0][:pad_idx]

        if pad_len > 0 and org_score < thres:
            adv_emb, _, loss = fgsm(model, inp_emb, pad_idx, pad_len, e, step_size)
            adv = emb_search(inp, adv_emb[0], pad_idx, pad_len, neigh)
            pred = model.predict(adv)[0][0]
            final_adv = adv[0][:pad_idx+pad_len]

        log.write(fn, org_score, pad_idx, pad_len, loss, pred)

        # sequence to bytes
        bin_adv = bytes(list(final_adv))
        adv_samples.append(bin_adv)

    return adv_samples, log
    
    
    

### creating GANs

In [42]:
import os
import pandas as pd
from keras.models import load_model


Download the pre-trained model, then specify the input and output paths.

In [43]:
!wget https://github.com/PacktPublishing/Machine-Learning-for-Cybersecurity-Cookbook/raw/master/Chapter03/MalGan/MalGAN_input/malconv.h5

--2021-05-09 06:04:30--  https://github.com/PacktPublishing/Machine-Learning-for-Cybersecurity-Cookbook/raw/master/Chapter03/MalGan/MalGAN_input/malconv.h5
Resolving github.com (github.com)... 140.82.113.3
Connecting to github.com (github.com)|140.82.113.3|:443... connected.
HTTP request sent, awaiting response... 302 Found
Location: https://raw.githubusercontent.com/PacktPublishing/Machine-Learning-for-Cybersecurity-Cookbook/master/Chapter03/MalGan/MalGAN_input/malconv.h5 [following]
--2021-05-09 06:04:31--  https://raw.githubusercontent.com/PacktPublishing/Machine-Learning-for-Cybersecurity-Cookbook/master/Chapter03/MalGan/MalGAN_input/malconv.h5
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.111.133, 185.199.108.133, 185.199.109.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.111.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 12459832 (12M) [application/octet-stream]
Saving to: ‘malconv.h

In [49]:
# Run configuration for adversarial sample generation.
save_path = "MalGAN_output"  # directory the disabled driver below writes samples to
model_path = "malconv.h5"  # model file passed to load_model
log_path = "MalGAN_output/adversarial_log.csv"  # CSV path for the disabled log.save call
pad_percent = 0.1
threshold = 0.6
step_size = 0.01
limit = 0.  # NOTE(review): not referenced anywhere in the visible code
input_samples = "samplesIn.csv"

# The sample list is read without a header row; column 0 holds the file names.
df = pd.read_csv(input_samples, header=None)
fn_list = df[0].values
model = load_model(model_path)

# NOTE(review): the driver below is disabled. Before re-enabling it, note that
# `fn.split()` returns a list, which os.path.join cannot accept — presumably
# the file's base name was intended — and that nothing here creates save_path.
# print(gen_adv_samples(model, fn_list, pad_percent,step_size, threshold))
# adv_samples, log= gen_adv_samples(model, fn_list, pad_percent,step_size, threshold)

# log.save(log_path)

# for fn, adv in zip(fn_list, adv_samples):
#     _fn = fn.split()
#     dst = os.path.join(save_path, _fn)
#     print(dst)
#     with open(dst, 'wb') as f:
#         f.write(adv)