In [1]:
"""
Created on Fri Sep 25 14:50:13 2020

The model and preprocessing for estimation of domain name suspiciousness

We use decoded domains in the format <tld>.<domain_name>.
The preprocessing part splits the domain name into bigrams, including dots.
Here we use a bigram vocabulary that was formed from a benign data set.

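The model is a recurrent (GRU or LSTM) regression network trained with the rmsprop optimizer;
the LSTM variant is the one currently compiled, the GRU variant is kept commented out.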

@author: marina
"""

import tensorflow as tf
import pandas as pd
import numpy as np
import re
from sklearn.model_selection import train_test_split
from keras.preprocessing.sequence import pad_sequences
import json
import time
import sklearn.metrics as ms
#from matplotlib import pyplot as plt

Using TensorFlow backend.


In [2]:
# Display the TensorFlow version this notebook is running against.
tf.__version__

'2.1.0'

In [3]:
# Directory (relative to the notebook) that holds the input files.
PATH = 'data/'
# Load the labelled samples (list of domains with labels) from a CSV file.
#data = pd.read_csv('/home/sungria/py_progs/Domains/rawdata/100k_samples-nn1-bs2_uniq.csv')
data = pd.read_csv(PATH + 'samples-nn1-bs2_uniq.csv')

### Load the bigram vocabulary from JSON.
### It is used later as a mapping bigram -> integer (see bigrams2int).
bigrams_vocab2 = {}
with open(PATH + 'bigram_vocabulary_all.json') as json_file:
#with open('/home/sungria/py_progs/Domains/bigram_vocabulary_all.json') as json_file:
    bigrams_vocab2 = json.load(json_file)


# X is the input data: the 'domain' column, one domain string per row.
X = data['domain']
# y is the label column 'rep': 0 - benign, 1 - malicious.
# NOTE(review): the labels are later rescaled with (label + 1) / 2, which
# suggests the raw values may actually be -1 / 1 -- confirm the encoding.
y = data['rep']

In [4]:
# Sanity check: True if any entry of the domain column is missing.
X.isnull().values.any()

False

In [5]:
### some useful functions
### all domains should be decoded in human-readable format
def urlDecode(url1):
    '''
    Decode an IDNA (punycode) encoded domain into human-readable text.

    Parameters
    ----------
    url1 : string
        Domain name, possibly containing 'xn--' encoded labels.

    Returns
    -------
    res : string
        The decoded domain. An empty string is returned (and a message is
        printed) when the value cannot be processed.
    '''
    try:
        res = url1.encode('utf-8').decode('idna')
    except Exception:
        # Best effort: any ordinary failure (invalid IDNA data, non-string
        # input, ...) yields an empty result. Unlike a bare `except:`, this
        # lets KeyboardInterrupt / SystemExit propagate, so a long-running
        # preprocessing pass can still be interrupted.
        print ("Can't process domain: ", url1)
        res = ''
    return res

### split domain to bigrams
def findBigrams(input_string):
    '''
    Split a domain string into its overlapping bigrams.

    Parameters
    ----------
    input_string : string
        The string to split.

    Returns
    -------
    bigram_list : list
        Every pair of adjacent characters, in order of appearance.
        Empty when the input has fewer than two characters.

    '''
    # Pair each character with its right-hand neighbour.
    neighbours = zip(input_string, input_string[1:])
    return [left + right for left, right in neighbours]

### encode bigrams to integers 
def bigrams2int(bigram_list, vocab=None, oov_index=1):
    '''
    Encode a list of bigrams as a list of integers.

    Parameters
    ----------
    bigram_list : list
        Bigrams to encode.
    vocab : dict, optional
        Mapping bigram -> integer. When omitted, the notebook-level
        ``bigrams_vocab2`` dictionary is used (the original behaviour).
    oov_index : int, optional
        Integer emitted for a bigram that is not in the vocabulary
        (out of vocabulary token). Defaults to 1.

    Returns
    -------
    bigram_int : list
        list of integers, one per input bigram, in the same order.

    '''
    if vocab is None:
        # Resolved at call time, so re-running the vocabulary loading cell
        # takes effect without re-defining this function.
        vocab = bigrams_vocab2
    # Single dictionary lookup per bigram, with the OOV index as fallback.
    return [vocab.get(item, oov_index) for item in bigram_list]
 
### Domain preprocessing:
### we get data as a domain string, we need to process it to vector format:
def preprocessing(domain_str):
    '''
    Convert a domain string into the list of integers fed to the model.

    Parameters
    ----------
    domain_str : string
        input is a domain string in format <tld>.<domain_name> :
            'com.greycortex'
        It is lower-cased, every digit is replaced with '0', and every
        character other than '.', '-', 0-9 and a-z is replaced with '?'.
        The result is split into bigrams:
            co, om, m., .g, gr, re, ey, yc, co, or, rt, te, ex
        For the embedding layer, each bigram is replaced with an integer
        (according to the vocabulary), for example:
            850, 469, 91, 264, 384, 186, 575, 351, 850, 82, 461, 753, 435
        (illustrative values; the actual ones depend on the loaded vocabulary).

    Returns
    -------
    int_list : list
        A list of integers where each integer corresponds to a bigram.
    '''
    ### decoding (if necessary)
    #domain_str = urlDecode(domain_str)
    ### lower case
    domain_low = domain_str.lower()
    ### replace digits with 0
    ### (raw string: '\d' in a normal string literal is an invalid escape
    ### sequence and triggers a warning on recent Python versions)
    domain0 = re.sub(r'\d', '0', domain_low)
    ### replace everything outside [.\-0-9a-z] (e.g. non-ascii) with ?
    domain_ascii = re.sub(r'[^\.\-0-9a-z]', '?', domain0)
    ### create bigrams
    bigrams = findBigrams(domain_ascii)  # list of bigrams
    ### encode bigrams to integers
    int_list = bigrams2int(bigrams)  # list of integers

    return int_list
    
# Fit models
def fit_model(X_train, y_train, model, epochs, batch):
    '''
    Train `model` on the given data and return the result of `model.fit`.

    Parameters
    ----------
    X_train, y_train :
        Training inputs and targets, passed to `model.fit` unchanged.
    model :
        Object exposing a Keras-style `fit` method.
    epochs : int
        Number of training epochs.
    batch : int
        Batch size.

    Returns
    -------
    history :
        Whatever `model.fit` returns.
    '''
    # No callbacks are passed; an EarlyStopping callback on 'val_loss' was
    # sketched here previously but is not in use.
    fit_options = {
        'epochs': epochs,
        'validation_split': 0.2,   # fraction of the data held out for validation
        'batch_size': batch,
        'shuffle': False,
    }
    return model.fit(X_train, y_train, **fit_options)

In [6]:
### Label preprocessing
# Rescale the labels with (label + 1) / 2 so that they lie in [0, 1].
# NOTE(review): this maps -1 -> 0.0 and 1 -> 1.0; if 'rep' were already
# 0 / 1 the result would be 0.5 / 1.0 -- confirm the raw label encoding.
y0 = ((y + 1) / 2).astype(float)
# plain numpy array of the rescaled labels
y_arr = y0.values

In [7]:
# Encode every domain string as a list of bigram integers
X_int = X.map(preprocessing)

In [8]:
# Sanity check: True if any encoded entry is missing.
# NOTE(review): `preprocessing` returns a list for every input, so this
# presumably cannot flag empty sequences (domains shorter than two
# characters produce an empty list) -- confirm whether those need filtering.
X_int.isnull().values.any()

False

In [9]:
# Number of bigram tokens in each encoded sample
len_seq = X_int.map(len)
### The sequences have different lengths, while Keras expects vectorized
### inputs of one common length, so every sequence is padded (with zeros,
### at the end) to the same size.
# Common length: mean sequence length plus three standard deviations.
length_cutoff = len_seq.mean() + 3 * len_seq.std()
max_length = int(round(length_cutoff))
# NOTE(review): `truncating` is left at its default, so sequences longer
# than max_length are cut according to that default -- confirm which end
# of the domain should be dropped.
X_padded = pad_sequences(X_int, maxlen=max_length, padding='post')

In [11]:
### split data to train and test sets
# A fixed seed keeps the (very small, 0.1%) held-out set identical across
# kernel restarts, so training and prediction results are reproducible.
SPLIT_SEED = 42
X_train, X_test, y_train, y_test = train_test_split(
    X_padded, y_arr, test_size=0.001, random_state=SPLIT_SEED)
### save to file
#np.save('X_test_01.npy', X_test)
#np.save('y_test_01.npy', y_test)

#np.save('X_train_99.npy', X_train)
#np.save('y_train_99.npy', y_train)

In [12]:
### Model definition
### Embedding input size = length of vocabulary + OOV token + padding token
input_size = len(bigrams_vocab2) + 2

### mask_zero=True in the embedding layer allows flexible input length
### (index 0, the padding value, is treated as a mask)
embedding_layer = tf.keras.layers.Embedding(
    input_size, 32, input_length=max_length, mask_zero=True)

### Alternative GRU model (currently not used)
#model_gru64 = tf.keras.Sequential([
#    embedding_layer,
#    tf.keras.layers.GRU(64, dropout=0.2, go_backwards=True),
#    tf.keras.layers.Dense(1, activation='sigmoid')])
#model_gru64.compile(optimizer='rmsprop', loss='mse')

### LSTM model: embedding -> LSTM -> single sigmoid output in [0, 1]
### (despite the "64" in the variable name, the LSTM layer has 32 units)
recurrent_layer = tf.keras.layers.LSTM(32, dropout=0.2, go_backwards=True)
output_layer = tf.keras.layers.Dense(1, activation='sigmoid')

model_lstm64 = tf.keras.Sequential([embedding_layer, recurrent_layer, output_layer])
model_lstm64.compile(optimizer='rmsprop', loss='mse')

In [13]:
## training
# Train the LSTM model for 25 epochs with batch size 128 and keep the value
# returned by `fit_model` (the result of `model.fit`) for later inspection.
#history_gru64 = fit_model(X_train, y_train, model_gru64, epochs=35, batch=128)
history_lstm64 = fit_model(X_train, y_train, model_lstm64, epochs=25, batch=128)
# NOTE(review): `plot_loss` is not defined in the visible part of this notebook.
#plot_loss (history_gru64)

Train on 2889016 samples, validate on 722254 samples
Epoch 1/25
Epoch 2/25
Epoch 3/25
Epoch 4/25
Epoch 5/25
Epoch 6/25
Epoch 7/25
Epoch 8/25
Epoch 9/25
Epoch 10/25
Epoch 11/25
Epoch 12/25
Epoch 13/25
Epoch 14/25
Epoch 15/25
Epoch 16/25
Epoch 17/25
Epoch 18/25
Epoch 19/25
Epoch 20/25
Epoch 21/25
Epoch 22/25
Epoch 23/25
Epoch 24/25
Epoch 25/25


In [14]:
## prediction
# Score the held-out test samples with the trained LSTM model.
#y_predict_gru64 = model_gru64.predict(X_test)
Y_predict_lstm64 = model_lstm64.predict(X_test)

In [15]:
# Export the trained model in TensorFlow SavedModel format
from datetime import datetime

# Second-resolution timestamp used as the suffix of the export directory
now = datetime.now().strftime('%Y-%m-%dT%H:%M:%S')

# `now` is already a string, so it is appended to the path prefix directly
saved_model_path = 'models/domain_bigrams-furt-' + now
tf.saved_model.save(model_lstm64, saved_model_path)

Instructions for updating:
If using Keras pass *_constraint arguments to layers.


Instructions for updating:
If using Keras pass *_constraint arguments to layers.


INFO:tensorflow:Assets written to: models/domain_bigrams-furt-2020-11-07T23:13:50/assets


INFO:tensorflow:Assets written to: models/domain_bigrams-furt-2020-11-07T23:13:50/assets


In [16]:
# Spot-check a single prediction (test sample at index 345).
Y_predict_lstm64[345]

array([0.06270289], dtype=float32)