In [1]:
import argparse
import logging
import os
import string
from datetime import datetime
import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow.keras import Input
from tensorflow.keras import backend as K
from tensorflow.keras.optimizers import SGD, Adam, RMSprop
from tensorflow.keras.callbacks import TensorBoard, ModelCheckpoint
from tensorflow.keras.layers import Conv1D, Dropout, concatenate, LSTM, RepeatVector, Dense, TimeDistributed, \
    LeakyReLU, BatchNormalization, AveragePooling1D, MaxPooling1D,Lambda, ReLU, Flatten, Reshape, Softmax, \
    Activation, Embedding
from tensorflow.keras.models import Sequential, Model
from tensorflow.keras.models import load_model
from tensorflow.keras.preprocessing import sequence
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.utils import to_categorical, plot_model
from tensorflow.keras.utils import Progbar
import matplotlib.pyplot as plt
import matplotlib as mpl
from sklearn.model_selection import train_test_split
from jiwer import wer
from pathlib import Path
import json
from datasets import load_metric
metric = load_metric("wer")


tf.config.run_functions_eagerly(True)
print(tf.__version__)

2.4.1


In [2]:
def save_model(model,fileModelJSON,fileWeights):
    print("Saving model to disk: ",fileModelJSON,"and",fileWeights)
    if Path(fileModelJSON).is_file():
        os.remove(fileModelJSON)
    json_string = model.to_json()
    with open(fileModelJSON,'w' ) as f:
        json.dump(json_string, f)
    if Path(fileWeights).is_file():
        os.remove(fileWeights)
    model.save_weights(fileWeights)

In [3]:
def load_model(fileModelJSON,fileWeights):
    with open(fileModelJSON, 'r') as f:
            model_json = json.load(f)
            model=model_from_json(model_json)
    model.load_weights(fileWeights)
    return model

In [4]:
df5 = pd.read_csv('../data/alexa_domains.txt',names = ['url','IsMalicious'],header = None, sep = " ")

df = pd.DataFrame()
df = df.append([df5], ignore_index=True)
df = df.sample(frac=1, replace=True, random_state=100)
df.shape

(1000000, 2)

In [5]:
#df = df.loc[df['url'].str.len() > 5]
maxlen= 20
X_ = df['url'].values
tk = Tokenizer(char_level=True)
tk.fit_on_texts(string.ascii_lowercase + string.digits + '-' + '.')
seq = tk.texts_to_sequences(X_)
X = sequence.pad_sequences(seq, maxlen=maxlen)
inv_map = {v: k for k, v in tk.word_index.items()}
X_tmp = []
for x in X:
    X_tmp.append(to_categorical(x,39))
b =tk.document_count
X = np.array(X_tmp)
c = X[int(X.shape[0] * 0.1):, :, :]
data_dict =  {'X_train': X[int(X.shape[0] * 0.1):, :, :],
            "X_test": X[:int(X.shape[0] * 0.1), :, :],
            "word_index": tk.document_count,
            "inv_map": inv_map,
            "legit_domain":X_}

In [6]:
def __np_sample(preds, temperature=1.0):
    # helper function to sample an index from a probability array

    if temperature <= 0:
        return np.argmax(preds)
    preds = np.asarray(preds).astype('float64')
    preds = np.log(preds) / temperature
    exp_preds = np.exp(preds)
    preds = exp_preds / np.sum(exp_preds)
    probas = np.random.multinomial(1, preds, 1)
    return np.argmax(probas)

In [7]:
def __to_readable_domain(decoded, inv_map):
    domains = []
    for j in range(decoded.shape[0]):
        word = ""
        for i in range(decoded.shape[1]):
            if decoded[j][i] != 0:
                word = word + inv_map[decoded[j][i]]
        domains.append(word)
    return domains

In [8]:
def tokenize(df):
    tk = Tokenizer(char_level=True)
    tk.fit_on_texts(string.ascii_lowercase + string.digits + '-' + '.')
    seq = tk.texts_to_sequences(df)
    X = sequence.pad_sequences(seq, maxlen=20)
    return X

In [9]:
def detokenize(gen_dom):
    sampled = []
    for x in gen_dom:
        word = []
        for y in x:
            word.append(__np_sample(y))
        sampled.append(word)
    readable = __to_readable_domain(np.array(sampled), inv_map=data_dict['inv_map'])
    return readable

In [10]:
def encoder_model():
    cnn_filters = [256, 256, 256, 8]
    cnn_kernels = [2, 3, 4, 2]
    cnn_strides = [1, 1, 1, 1]
    en_convs = []
    
    inputs = Input(shape=(20,39,),name="Encoder_Input")
    #encoder = Embedding(1000, 39,input_length=20)(inputs)
    for i in range(3):
        conv = Conv1D(cnn_filters[i],
                      cnn_kernels[i],
                      padding='same',
                      strides=cnn_strides[i],
                      name='en_conv%s' % i)(inputs)
        conv = ReLU()(conv)
        en_convs.append(conv)

    encoder = concatenate(en_convs)
    encoder = Conv1D(cnn_filters[3],
                      cnn_kernels[3],
                      padding='same',
                      strides=cnn_strides[3],
                      name='en_conv%s' % 3)(encoder)
    encoder = ReLU()(encoder)
    encoder = Flatten()(encoder)
    
    model = Model(inputs=inputs, outputs=encoder, name='Encoder')
    return model

end = encoder_model()
end.summary()

Model: "Encoder"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
Encoder_Input (InputLayer)      [(None, 20, 39)]     0                                            
__________________________________________________________________________________________________
en_conv0 (Conv1D)               (None, 20, 256)      20224       Encoder_Input[0][0]              
__________________________________________________________________________________________________
en_conv1 (Conv1D)               (None, 20, 256)      30208       Encoder_Input[0][0]              
__________________________________________________________________________________________________
en_conv2 (Conv1D)               (None, 20, 256)      40192       Encoder_Input[0][0]              
____________________________________________________________________________________________

In [11]:
def decoder_model(latent_vector=160):
    cnn_filters = [256, 256, 256, 32, 39]
    cnn_kernels = [2, 3, 4, 3, 3]
    cnn_strides = [1, 1, 1, 1, 1]
    dec_convs = []
    dece =int(latent_vector/20)
    word_index = 20

    inputs = Input(shape=(latent_vector),name="Decoder_Input")
    decoder = Reshape([word_index,dece],input_shape = (latent_vector,))(inputs)
    for i in range(3):
        conv = Conv1D(cnn_filters[i],
                      cnn_kernels[i],
                      padding='same',
                      strides=cnn_strides[i],
                      name='dec_conv%s' % i)(decoder)
        conv = ReLU()(conv)
        dec_convs.append(conv)

    decoder = concatenate(dec_convs)
    decoder = Conv1D(cnn_filters[3],
                      cnn_kernels[3],
                      padding='same',
                      strides=cnn_strides[3],
                      name='dec_conv%s' % 3)(decoder)
    decoder = ReLU()(decoder)
    decoder = Conv1D(cnn_filters[4],
                      cnn_kernels[4],
                      padding='same',
                      strides=cnn_strides[4],
                      name='dec_conv%s' % 4)(decoder)
    decoder = Softmax()(decoder)
    #decoder = Flatten()(decoder)
    #decoder = Dense(word_index)(decoder)
    model = Model(inputs=inputs, outputs=decoder, name='Decoder')
    return model

decd = decoder_model()
decd.summary()

Model: "Decoder"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
Decoder_Input (InputLayer)      [(None, 160)]        0                                            
__________________________________________________________________________________________________
reshape (Reshape)               (None, 20, 8)        0           Decoder_Input[0][0]              
__________________________________________________________________________________________________
dec_conv0 (Conv1D)              (None, 20, 256)      4352        reshape[0][0]                    
__________________________________________________________________________________________________
dec_conv1 (Conv1D)              (None, 20, 256)      6400        reshape[0][0]                    
____________________________________________________________________________________________

In [12]:
def generator_model():
    """
    Generator model:
    param: noise vector
    :return: generator model
    """
    model = Sequential()
    model.add(Input(shape=(20,)))
    model.add(Dense(480, activation='relu'))
    model.add(ReLU())
    model.add(decoder_model(480)) 
    return model
              
genr = generator_model()
genr.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense (Dense)                (None, 480)               10080     
_________________________________________________________________
re_lu_8 (ReLU)               (None, 480)               0         
_________________________________________________________________
Decoder (Functional)         (None, 20, 39)            133607    
Total params: 143,687
Trainable params: 143,687
Non-trainable params: 0
_________________________________________________________________


In [13]:
def discriminator_model():
    """
    Discriminator model:
    :return: Discriminator model
    """
    model = Sequential()
    model.add(encoder_model())
    model.add(Dense(1, activation='relu'))
    model.add(Activation('relu'))
    return model
              
disc = discriminator_model()
disc.summary()

Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
Encoder (Functional)         (None, 160)               102920    
_________________________________________________________________
dense_1 (Dense)              (None, 1)                 161       
_________________________________________________________________
activation (Activation)      (None, 1)                 0         
Total params: 103,081
Trainable params: 103,081
Non-trainable params: 0
_________________________________________________________________


In [14]:
def adversarial(g, d):
    """
    Adversarial Model
    :return: Adversarial model
    """
    adv_model = Sequential()
    adv_model.add(g)
    d.trainable = False
    adv_model.add(d)
    return adv_model

In [15]:
disc = discriminator_model()
genr = generator_model()
gan = adversarial(genr, disc)

In [16]:
discr_opt = SGD(
        lr=0.00001,
        clipvalue=1.0,
        decay=1e-8)
gan_opt = Adam(
        lr=0.000001,
        beta_1=0.9,
        beta_2=0.999,
        epsilon=1e-8,
        decay=1e-8,
        clipvalue=1.0)

In [17]:
gan.compile(loss='binary_crossentropy', optimizer=discr_opt, metrics=["accuracy"])
disc.trainable = True
disc.compile(loss='binary_crossentropy', optimizer=gan_opt, metrics=["accuracy"])

In [18]:
EPOCH = 5
n=20
e = encoder_model()
d = decoder_model()

adv_model = Sequential()
adv_model.add(e)
adv_model.add(d)
print(adv_model.summary())

train,test = train_test_split(df, test_size=0.1)
train_new,test_new = tokenize(train), tokenize(test)
train_new, test_new= data_dict['X_train'],data_dict['X_test']
print(train_new.shape,test_new.shape)


adv_model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
adv_model.fit(train_new, train_new,
             verbose=1,
             validation_data = (test_new, test_new),
             batch_size=128,
             epochs=EPOCH)
loss, accuracy = adv_model.evaluate(test_new, test_new, verbose=1)
print("Loss:",loss,"Accuracy:",accuracy)

model_name = "Autoencodermodel"
MODEL_HOME = "../model/GAN_Models/"
save_model(adv_model,MODEL_HOME + model_name + ".json", MODEL_HOME + model_name + ".h5")

print("testing")
predictions = adv_model.predict(test_new, verbose=1)
sampled = []
for x in predictions:
    word = []
    for y in x:
        word.append(__np_sample(y))
    sampled.append(word)

print("results")
readable = __to_readable_domain(np.array(sampled), inv_map=data_dict['inv_map'])
dfa= df['url'].tolist()
print("WER: {:2f}".format(100 * wer(hypothesis=readable[:10], truth=dfa[:10])))
print(dfa[:10])
print(readable[:10])

Model: "sequential_5"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
Encoder (Functional)         (None, 160)               102920    
_________________________________________________________________
Decoder (Functional)         (None, 20, 39)            96743     
Total params: 199,663
Trainable params: 199,663
Non-trainable params: 0
_________________________________________________________________
None
(900000, 20, 39) (100000, 20, 39)


  "Even though the tf.config.experimental_run_functions_eagerly "


Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Loss: 0.0006740284734405577 Accuracy: 0.9999330043792725
Saving model to disk:  ../model/GAN_Models/Autoencodermodel.json and ../model/GAN_Models/Autoencodermodel.h5
testing


  import sys


results
WER: 10.000000
['vinylpersen.nl', 'missoulian.com', 'fileden.com', 'mythoseditora.com.br', 'cotemosaique.com', 'cost1action.com', 'kolbe.com', 'innogear.net', 'freebirdgames.com', '3dprintersuperstore.com.au']
['vinylpersen.nl', 'missoulian.com', 'fileden.com', 'mythoseditora.com.br', 'cotemosaique.com', 'cost1action.com', 'kolbe.com', 'innogear.net', 'freebirdgames.com', 'tersuperstore.com.au']


In [19]:
BATCH_SIZE = 1000
train = data_dict['X_train']
test = data_dict['X_test']
batch = (int(train.shape[0] / BATCH_SIZE)/10)
EPOCHS = 5

for epoch in range(EPOCHS):
        print("Epoch is %s" % epoch)
        print("Number of batches %s" % int(train.shape[0] / BATCH_SIZE))
        print("Batch size: %s" % BATCH_SIZE)

        for index in range(int(train.shape[0] / BATCH_SIZE)):
            noise = np.random.normal(0, 1,size=(BATCH_SIZE, 20))
            normal_domains = train[(index * BATCH_SIZE):(index + 1) * BATCH_SIZE]
            
            generated_domains = genr.predict(noise, verbose=0)

            labels_size = (BATCH_SIZE, 1)
            
            labels_real = np.random.normal(0, 1, size=labels_size)
            labels_fake = np.zeros(shape=labels_size)

            if index % 2 == 0:
                training_domains = normal_domains
                labels = labels_real
            else:
                training_domains = generated_domains
                labels = labels_fake

            # training discriminator on both Normal and generated domains
        
            disc.trainable = True
            #disc_history = disc.train_on_batch(training_domains, labels,reset_metrics=True,return_dict=True)
            disc_history1 = disc.train_on_batch(normal_domains, labels_real,reset_metrics=True,return_dict=True)
            disc_history2 = disc.train_on_batch(generated_domains, labels_fake,reset_metrics=True,return_dict=True)
            disc_history = np.mean([disc_history1['loss'], disc_history2['loss']])
            disc_acc = np.mean([disc_history1['accuracy'], disc_history2['accuracy']])
            disc_dict = {'loss': disc_history, 'accuracy': disc_acc}
            disc.trainable = False
            
            noise = np.random.normal(0, 1, size=(BATCH_SIZE, 20))  # random latent vectors.
            misleading_targets = np.random.normal(0, 1, size=labels_size)
            gan_history = gan.train_on_batch(noise, misleading_targets,reset_metrics=True,return_dict=True) 
            if (index % 100 == 0):
                print({index:gan_history},{index:disc_dict})
print(gan.summary())
print(genr.summary())
print(disc.summary())

Epoch is 0
Number of batches 900
Batch size: 1000
{0: {'loss': 0.16671010851860046, 'accuracy': 0.0}} {0: {'loss': 0.09856456518173218, 'accuracy': 0.5}}
{100: {'loss': 0.6310011148452759, 'accuracy': 0.0}} {100: {'loss': 0.11298344284296036, 'accuracy': 0.5}}
{200: {'loss': 0.659405529499054, 'accuracy': 0.0}} {200: {'loss': -0.1056501492857933, 'accuracy': 0.5}}
{300: {'loss': -0.3547500669956207, 'accuracy': 0.0}} {300: {'loss': 0.039784058928489685, 'accuracy': 0.5}}
{400: {'loss': -0.3407435417175293, 'accuracy': 0.0}} {400: {'loss': 0.0076449280604720116, 'accuracy': 0.5}}
{500: {'loss': -0.2170599102973938, 'accuracy': 0.0}} {500: {'loss': 0.07997946441173553, 'accuracy': 0.5}}
{600: {'loss': -0.1953703910112381, 'accuracy': 0.0}} {600: {'loss': 0.0767197534441948, 'accuracy': 0.5}}
{700: {'loss': 0.6993337869644165, 'accuracy': 0.0}} {700: {'loss': 0.3392060399055481, 'accuracy': 0.5}}
{800: {'loss': 0.6591306924819946, 'accuracy': 0.0}} {800: {'loss': -0.11989404261112213, 'ac

In [20]:
model_name = "Generatormodel"
MODEL_HOME = "../model/GAN_Models/"
save_model(genr,MODEL_HOME + model_name + ".json", MODEL_HOME + model_name + ".h5")

model_name = "Discriminatormodel"
MODEL_HOME = "../model/GAN_Models/"
save_model(disc,MODEL_HOME + model_name + ".json", MODEL_HOME + model_name + ".h5")

model_name = "GANmodel"
MODEL_HOME = "../model/GAN_Models/"
save_model(gan,MODEL_HOME + model_name + ".json", MODEL_HOME + model_name + ".h5")

Saving model to disk:  ../model/GAN_Models/Generatormodel.json and ../model/GAN_Models/Generatormodel.h5
Saving model to disk:  ../model/GAN_Models/Discriminatormodel.json and ../model/GAN_Models/Discriminatormodel.h5
Saving model to disk:  ../model/GAN_Models/GANmodel.json and ../model/GAN_Models/GANmodel.h5


In [21]:
sampled = []
for x in normal_domains:
    word = []
    for y in x:
        word.append(__np_sample(y))
    sampled.append(word)

print("results")
readablen = __to_readable_domain(np.array(sampled), inv_map=data_dict['inv_map'])
readablen[:10]

  import sys


results


['startupily.com',
 'aznoticias.mx',
 'angelhotel.com.tw',
 'mwt.ru',
 'hku-szh.org',
 'goodmart.com',
 'irobot-jp.com',
 'ruhraktuell.com',
 'raz.ru',
 'asanatlar.com']

In [22]:
sampled = []
for x in generated_domains:
    word = []
    for y in x:
        word.append(__np_sample(y))
    sampled.append(word)

print("results")
readableg = __to_readable_domain(np.array(sampled), inv_map=data_dict['inv_map'])
readableg[:10]

results


['igj5wyyyzmy2zqfsh65v',
 'beta45yaipv4q6q3m3on',
 'jo601dpse76aij3ii.x5',
 'n4sb5zpbggy38gys7p9',
 '852zir35niey2ayh1i2u',
 '-4uhvh8rmwwaehcu9w7l',
 '7a4obtg81sy51lde0v38',
 't4v6752tndh6e9arrrc',
 'h6vs879jyfqxayyynuz',
 'ablf7b3pkz6ju1ab1lwb']