In [160]:
import gc
import os
import nltk
import tqdm
import numpy as np
import pandas as pd
import string 
import keras.backend as K
nltk.download("punkt")

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Eshanka\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [161]:
from keras.callbacks import Callback, EarlyStopping, ModelCheckpoint
from keras.engine import Layer
from keras.layers import Activation, Add, Bidirectional, Conv1D, Dense, Dropout, Embedding, Flatten
from keras.layers import concatenate, GRU, Input, LSTM, MaxPooling1D
from keras.layers import GlobalAveragePooling1D,  GlobalMaxPooling1D, SpatialDropout1D
from keras.models import Model
from keras.optimizers import Adam
from keras.preprocessing.text import Tokenizer
from keras.preprocessing import text, sequence
from keras.utils import to_categorical
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score, roc_auc_score, log_loss
from sklearn.model_selection import train_test_split
from keras import initializers, regularizers, constraints, optimizers, layers, callbacks

Setting up parameter globals

In [162]:
gru_len = 128
Routings = 5
Num_capsule = 10
Dim_capsule = 16
dropout_p = 0.3
rate_drop_dense = 0.3

batch_size = 256
recurrent_units = 64
dropout_rate = 0.3 
dense_size = 32
sentences_length = 20
fold_count = 10

Non-linear activation fucntion for capsule layer

In [163]:
def squash(x, axis=-1):
    s_squared_norm = K.sum(K.square(x), axis, keepdims=True)
    scale = K.sqrt(s_squared_norm + K.epsilon())
    return x / scale

Generating the capsule network algorithm with slight tweaks to optimize with the dataset


In [164]:
class Capsule(Layer):
    def __init__(self, num_capsule, dim_capsule, routings=3, kernel_size=(9, 1), share_weights=True, activation='default', **kwargs):
        super(Capsule, self).__init__(**kwargs)
        self.num_capsule = num_capsule
        self.dim_capsule = dim_capsule
        self.routings = routings
        self.kernel_size = kernel_size
        self.share_weights = share_weights
        if activation == 'default':
            self.activation = squash
        else:
            self.activation = Activation(activation)

    def build(self, input_shape):
        super(Capsule, self).build(input_shape)
        input_dim_capsule = input_shape[-1]
        if self.share_weights:
            self.W = self.add_weight(name='capsule_kernel', shape=(1, input_dim_capsule, self.num_capsule * self.dim_capsule), initializer='glorot_uniform', trainable=True)
        else:
            input_num_capsule = input_shape[-2]
            self.W = self.add_weight(name='capsule_kernel', shape=(input_num_capsule, input_dim_capsule, self.num_capsule * self.dim_capsule), initializer='glorot_uniform', trainable=True)

    def call(self, u_vecs):
        if self.share_weights:
            u_hat_vecs = K.conv1d(u_vecs, self.W)
        else:
            u_hat_vecs = K.local_conv1d(u_vecs, self.W, [1], [1])

        batch_size = K.shape(u_vecs)[0]
        input_num_capsule = K.shape(u_vecs)[1]
        u_hat_vecs = K.reshape(u_hat_vecs, (batch_size, input_num_capsule, self.num_capsule, self.dim_capsule))
        u_hat_vecs = K.permute_dimensions(u_hat_vecs, (0, 2, 1, 3))
        # final u_hat_vecs.shape = [None, num_capsule, input_num_capsule, dim_capsule]

        b = K.zeros_like(u_hat_vecs[:, :, :, 0])  # shape = [None, num_capsule, input_num_capsule]
        for i in range(self.routings):
            b = K.permute_dimensions(b, (0, 2, 1))  # shape = [None, input_num_capsule, num_capsule]
            c = K.softmax(b)
            c = K.permute_dimensions(c, (0, 2, 1))
            b = K.permute_dimensions(b, (0, 2, 1))
            outputs = self.activation(K.batch_dot(c, u_hat_vecs, [2, 2]))
            #if i < self.routings - 1:
            #    b = K.batch_dot(outputs, u_hat_vecs, [2, 3])

        return outputs

    def compute_output_shape(self, input_shape):
        return (None, self.num_capsule, self.dim_capsule)

In [165]:
def get_model(embedding_matrix, sequence_length, dropout_rate, recurrent_units, dense_size):
    inputs = Input(shape=(sequence_length,))
    embededLayer = Embedding(embedding_matrix.shape[0], embedding_matrix.shape[1], batch_size=None, weights=[embedding_matrix],input_length=10 , trainable=True)(inputs)
    embededLayer = SpatialDropout1D(rate_drop_dense)(embededLayer)
    bi = Bidirectional(GRU(gru_len, activation='relu', dropout=dropout_rate, recurrent_dropout=dropout_rate, return_sequences=True))(embededLayer)
    capsule = Capsule(num_capsule=Num_capsule, dim_capsule=Dim_capsule, routings=Routings, share_weights=True)(bi)
    capsule = Flatten()(capsule)
    capsule = Dropout(dropout_rate)(capsule)
    output = Dense(20, activation='sigmoid')(capsule)
    model = Model(inputs=inputs, outputs=output, name="CapsDGA")
    return model

In [166]:
def _train_model(model, batch_size, train_x, train_y, val_x, val_y):
    num_labels = int(train_y.shape[1])
    patience = 5
    best_loss = -1
    best_weights = None
    best_epoch = 0
    
    current_epoch = 0
    
    while True:
        model.fit(train_x, train_y, batch_size=batch_size, epochs=1)
        y_pred = model.predict(val_x, batch_size=batch_size)

        total_loss = 0
        for j in range(num_labels):
            loss = log_loss(val_y[:, j], y_pred[:, j])
            total_loss += loss

        total_loss /= num_labels

        print("Epoch {0} loss {1} best_loss {2}".format(current_epoch, total_loss, best_loss))

        current_epoch += 1
        if total_loss < best_loss or best_loss == -1:
            best_loss = total_loss
            best_weights = model.get_weights()
            best_epoch = current_epoch
        else:
            if current_epoch - best_epoch == patience:
                break

    model.set_weights(best_weights)
    return model

In [167]:
def train_folds(X, y, X_test, fold_count, batch_size, dgaModel):
    print("="*75)
    fold_size = len(X) // fold_count
    models = []
    result_path = "predictions"
    if not os.path.exists(result_path):
        os.mkdir(result_path)
    for fold_id in range(0, fold_count):
        fold_start = fold_size * fold_id
        fold_end = fold_start + fold_size

        if fold_id == fold_size - 1:
            fold_end = len(X)

        train_x = np.concatenate([X[:fold_start], X[fold_end:]])
        train_y = np.concatenate([y[:fold_start], y[fold_end:]])

        print(train_y.shape)

        val_x = np.array(X[fold_start:fold_end])
        val_y = np.array(y[fold_start:fold_end])

        dgaModel.compile( loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
        model = _train_model(dgaModel, batch_size, train_x, train_y, val_x, val_y)
        train_predicts_path = os.path.join(result_path, "train_predicts{0}.npy".format(fold_id))
        test_predicts_path = os.path.join(result_path, "test_predicts{0}.npy".format(fold_id))
        train_predicts = model.predict(X, batch_size=512, verbose=1)
        test_predicts = model.predict(X_test, batch_size=512, verbose=1)
        np.save(train_predicts_path, train_predicts)
        np.save(test_predicts_path, test_predicts)

    return models

In [168]:
def tokenize_hosts(df):
    tk = Tokenizer(char_level=True)
    tk.fit_on_texts(string.ascii_lowercase + string.digits + '-' + '.')
    seq = tk.texts_to_sequences(df)
    tokens = sequence.pad_sequences(seq, maxlen=20)
    return tokens
    

Reading dataset and Tokenizing the host urls

In [180]:
dframe = pd.read_csv('Datasets/dga_data.csv', sep = ',')
dframe = pd.DataFrame(dframe)
df = dframe[["host"]]
tk = Tokenizer(char_level=True)
inv_map = {v: k for k, v in tk.word_index.items()}
tk.fit_on_texts(string.ascii_lowercase + string.digits + '-' + '.')
seq = tk.texts_to_sequences(df)
tokens = sequence.pad_sequences(seq, maxlen=20)
tmp_tokens = []
for tmps in tokens:
    tmp_tokens.append(to_categorical(tmps,39))
temp = np.array(tmp_tokens)
c = temp[int(temp.shape[0] * 0.1):, :, :]
#print(temp.shape)
data_dict =  {'X_train': temp[int(temp.shape[0] * 0.1):, :, :],
        "X_test": temp[:int(temp.shape[0] * 0.1), :, :],
        "word_index": tk.document_count,
        "inv_map": inv_map,
        "legit_domain":df}



In [181]:
df2 = dframe[["isDGA"]]
seq2 = tk.texts_to_sequences(df2)
tokensY = sequence.pad_sequences(seq2, maxlen=20)
tmp_tokensY = []
for tmps in tokensY:
    tmp_tokensY.append(to_categorical(tmps,39))
newtemp = np.array(tmp_tokensY)
data_dict_y =  {'Y_train': newtemp[int(newtemp.shape[0] * 0.1):, :, :],
        "Y_test": newtemp[:int(newtemp.shape[0] * 0.1), :, :],
        "legit_val":df2}

In [184]:
#df = df.loc[df["isDGA"].values == "dga"]
train,test = train_test_split(df, test_size=0.1)
train_new,test_new = tokenize_hosts(train), tokenize_hosts(test)
train_new, test_new= data_dict['X_train'],data_dict['X_test']
ytrain,ytest = train_test_split(df2, test_size=0.1)
y_train_new,y_test_new = tokenize_hosts(ytrain), tokenize_hosts(ytest)
y_train_new, y_test_new= data_dict_y['Y_train'],data_dict_y['Y_test']
print(train_new.shape,test_new.shape,y_train_new.shape,y_test_new.shape)

#train=df.sample(frac=0.8,random_state=200) #random state is a seed value
#test=df.drop(train.index)
#train.head
#dga_host_train = train["host"].values
#dga_host_test = test["host"].values
# y_train = train["isDGA"].values
# print(y_train)

(1, 20, 39) (0, 20, 39) (1, 20, 39) (0, 20, 39)


In [185]:
train_new = np.argmax(train_new, axis=2)
test_new = np.argmax(test_new, axis=2)
y_train_new = np.argmax(y_train_new, axis=2)
y_test_new = np.argmax(y_test_new, axis=2)
X_train = np.array(train_new)
X_test = np.array(test_new)
Y_train = np.array(y_train_new)
Y_test = np.array(y_test_new)


In [171]:
# train_new.shape
X_train.max()
#weight = [tokenized_train]
#print(weight)


20

In [172]:
def __np_sample(preds, temperature=1.0):
    # helper function to sample an index from a probability array

    if temperature <= 0:
        return np.argmax(preds)
    preds = np.asarray(preds).astype('float64')
    preds = np.log(preds) / temperature
    exp_preds = np.exp(preds)
    preds = exp_preds / np.sum(exp_preds)
    probas = np.random.multinomial(1, preds, 1)
    return np.argmax(probas)
    
def __to_readable_domain(decoded, inv_map):
    domains = []
    for j in range(decoded.shape[0]):
        word = ""
        for i in range(decoded.shape[1]):
            if decoded[j][i] != 0:
                word = word + inv_map[decoded[j][i]]
        domains.append(word)
    return domains

In [173]:
def detokenize(gen_dom):
    sampled = []
    for x in gen_dom:
        word = []
        for y in x:
            word.append(__np_sample(y))
        sampled.append(word)
    readable = __to_readable_domain(np.array(sampled), inv_map=inv_map)
    return readable

In [174]:
dgaModel = get_model(X_train, sentences_length, dropout_rate, recurrent_units, dense_size)
dgaModel.summary()


Model: "CapsDGA"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_16 (InputLayer)        [(None, 20)]              0         
_________________________________________________________________
embedding_12 (Embedding)     (None, 20, 20)            20        
_________________________________________________________________
spatial_dropout1d_6 (Spatial (None, 20, 20)            0         
_________________________________________________________________
bidirectional_6 (Bidirection (None, 20, 256)           115200    
_________________________________________________________________
capsule_6 (Capsule)          (None, 10, 10, 16)        40960     
_________________________________________________________________
flatten_6 (Flatten)          (None, 1600)              0         
_________________________________________________________________
dropout_6 (Dropout)          (None, 1600)              0   

In [175]:
dgaModel.compile( loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

In [186]:
print("Starting to train models...")
dgaModel.fit(X_train, Y_train, validation_data=(X_test, Y_test), batch_size=batch_size, epochs=1, verbose = 1)
#models = train_folds(X_train, y_train, X_test, fold_count, batch_size, dgaModel)

Starting to train models...


InvalidArgumentError:  indices[0,16] = 8 is not in [0, 1)
	 [[node CapsDGA/embedding_12/embedding_lookup (defined at <ipython-input-176-6a60030bc82c>:2) ]] [Op:__inference_train_function_43681]

Errors may have originated from an input operation.
Input Source operations connected to node CapsDGA/embedding_12/embedding_lookup:
 CapsDGA/embedding_12/embedding_lookup/40328 (defined at C:\Users\Eshanka\.conda\envs\IITMLsesh\lib\contextlib.py:113)

Function call stack:
train_function
