In [1]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import sys
import pickle
import numpy as np
import tensorflow as tf
from tensorflow.keras.models import Sequential, load_model
from tensorflow.keras import layers, callbacks
from sklearn.model_selection import train_test_split
import pandas as pd

import warnings
warnings.filterwarnings("ignore")

In [2]:
DATA_FILE = '/content/drive/My Drive/DgaDetect-master/data/traindata.pkl'

def init_data(data_file=None):
    """Load the pickled domain dataset and encode it as padded integer sequences.

    Args:
        data_file: Optional path to the pickle file. When omitted, the
            module-level ``DATA_FILE`` is used. The pickle must hold a mapping
            with a ``"text"`` entry (iterable of strings) and a ``"label"``
            entry (iterable of labels).

    Returns:
        A 4-tuple ``(X, y, max_features, max_len)`` where
        ``X`` is an int32 array of shape ``(n_samples, max_len)`` holding the
        character codes, left-padded with 0;
        ``y`` is an array of shape ``(n_samples, 1)`` with 0 for the label
        ``'benign'`` and 1 for every other label;
        ``max_features`` is the number of known characters plus one (the
        padding code 0);
        ``max_len`` is the length of the longest sample (0 if there are none).

    Raises:
        KeyError: if a sample contains a character that is not in the
            ``valid_chars`` table (e.g. an upper-case letter or a dot).
    """
    path = DATA_FILE if data_file is None else data_file
    # NOTE(security): pickle.load can execute arbitrary code; only load files
    # that come from a trusted source.
    with open(path, 'rb') as handle:
        data = pickle.load(handle)
    samples = data["text"]
    labels = data["label"]
    # Dictionary of valid char [a-z0-9][-_]
    valid_chars = {'7': 1, 'i': 2, 'x': 3, '5': 4, 'w': 5, 't': 6, 'v': 7, 'g': 8, 'k': 9, 'd': 10, 'z': 11, '6': 12, '-': 13, '_': 14, 'a': 15, 'p': 16, 'e': 17, '9': 18, 'b': 19, 'f': 20, 'y': 21, '2': 22, 'c': 23, 'l': 24, 's': 25, 'n': 26, 'h': 27, '3': 28, 'u': 29, 'm': 30, '0': 31, 'r': 32, 'j': 33, '8': 34, 'o': 35, '4': 36, '1': 37, 'q': 38}
    # Code 0 is reserved for padding, hence the "+ 1".
    max_features = len(valid_chars) + 1
    # Translate every character to its integer code.
    encoded = [[valid_chars[ch] for ch in sample] for sample in samples]
    max_len = max((len(seq) for seq in encoded), default=0)
    # Left-pad ("pre" padding) every sequence with zeros up to max_len.
    # Because max_len is the longest sequence, nothing is ever truncated.
    X = np.zeros((len(encoded), max_len), dtype='int32')
    for row, seq in enumerate(encoded):
        if seq:
            X[row, max_len - len(seq):] = seq
    # Convert labels to 0-1
    y = [0 if label == 'benign' else 1 for label in labels]

    return X, np.array(y).reshape(len(y), 1), max_features, max_len


# Load the encoded samples and labels, then hold out 10% of them for evaluation.
X, Y, max_features, max_len = init_data()
x_train, x_test, y_train, y_test = train_test_split(
    X,
    Y,
    test_size=0.1,
    shuffle=True,
)

# Training dataset: every training row, converted to int64 tensors.
train = torch.utils.data.TensorDataset(
    torch.LongTensor(x_train),
    torch.LongTensor(y_train),
)

# Evaluation dataset: at most the first 21000 held-out rows.
test = torch.utils.data.TensorDataset(
    torch.LongTensor(x_test[:21000]),
    torch.LongTensor(y_test[:21000]),
)

class Arguments:
    """Plain container for the hyper-parameters and run settings of this notebook."""

    def __init__(self):
        # Mini-batch sizes for the training and evaluation loaders.
        self.batch_size, self.test_batch_size = 50, 1000
        # Upper bound on the number of training epochs.
        self.epochs = 30
        # Optimiser settings.
        self.lr, self.momentum = 0.001, 0.5
        # Set to True to force CPU execution even when CUDA is available.
        self.no_cuda = False
        # Seed handed to torch.manual_seed.
        self.seed = 1
        # Print the training loss every `log_interval` batches.
        self.log_interval = 30

# Run configuration shared by the rest of the notebook.
args = Arguments()

# CUDA is used only when the config allows it and a device is actually present.
use_cuda = not args.no_cuda and torch.cuda.is_available()

# Seed PyTorch's random number generator.
torch.manual_seed(args.seed)

# Pick the compute device and the matching DataLoader options.
if use_cuda:
    device = torch.device("cuda")
    kwargs = {'num_workers': 1, 'pin_memory': True}
else:
    device = torch.device("cpu")
    kwargs = {}

# Batched, shuffled iterators over the training and evaluation datasets.
train_loader = torch.utils.data.DataLoader(
    dataset=train,
    batch_size=args.batch_size,
    shuffle=True,
    **kwargs
)

test_loader = torch.utils.data.DataLoader(
    dataset=test,
    batch_size=args.test_batch_size,
    shuffle=True,
    **kwargs
)


class Net(nn.Module):
    """Bidirectional-LSTM binary classifier over integer-encoded character sequences.

    The network embeds each character code, runs a bidirectional LSTM over the
    sequence, flattens the per-step LSTM outputs, and feeds them through two
    linear layers ending in a sigmoid, producing one probability per sample.

    Args:
        vocab_size: Number of distinct input codes (including the padding code).
        embedding_dim: Size of each character embedding.
        hidden_dim: Hidden size of the LSTM, per direction.
        seq_len: Fixed length of every input sequence; the first linear layer
            is sized from it, so inputs must have exactly this many steps.
        num_layers: Number of stacked LSTM layers.
        dropout: Dropout probability applied to the flattened LSTM output.

    The defaults reproduce the original hard-coded architecture
    (39 codes, 128-d embeddings, 128 hidden units, sequences of length 47).
    """

    def __init__(self, vocab_size=39, embedding_dim=128, hidden_dim=128,
                 seq_len=47, num_layers=1, dropout=0.5):
        super(Net, self).__init__()
        # Kept as attributes so init_hidden() and forward() can size tensors.
        self.hidden_dim = hidden_dim
        self.num_layers = num_layers
        self.seq_len = seq_len
        self.num_directions = 2  # the LSTM below is bidirectional

        self.word_embeddings = nn.Embedding(vocab_size, embedding_dim)
        self.lstm = nn.LSTM(embedding_dim, hidden_dim, num_layers=num_layers,
                            batch_first=True, bidirectional=True)
        self.dropout = nn.Dropout(dropout)
        # Every time step contributes hidden_dim features per direction.
        self.linear1 = nn.Linear(seq_len * self.num_directions * hidden_dim, 100)
        self.linear2 = nn.Linear(100, 1)

    def init_hidden(self, batch_size=1):
        """Return zero-filled ``(h_0, c_0)`` states for the LSTM.

        Args:
            batch_size: Number of sequences the states are created for.

        Returns:
            Two tensors of shape
            ``(num_layers * num_directions, batch_size, hidden_dim)`` on the
            same device and with the same dtype as the model's parameters.
        """
        shape = (self.num_layers * self.num_directions, batch_size, self.hidden_dim)
        reference = self.linear2.weight
        return (reference.new_zeros(shape), reference.new_zeros(shape))

    def forward(self, x):
        """Compute one probability per input sequence.

        Args:
            x: Integer tensor of shape ``(batch, seq_len)`` holding character codes.

        Returns:
            Float tensor of shape ``(batch, 1)`` with values in ``[0, 1]``.
        """
        embedded = self.word_embeddings(x)
        lstm_out, _ = self.lstm(embedded)
        # Flatten per sample. Keeping the batch dimension explicit means a
        # wrong sequence length fails loudly in linear1 instead of silently
        # mixing rows of different samples.
        features = lstm_out.reshape(lstm_out.size(0), -1)
        features = F.relu(features)
        features = self.dropout(features)
        features = F.relu(self.linear1(features))
        # torch.sigmoid replaces the deprecated F.sigmoid.
        return torch.sigmoid(self.linear2(features))

# Build the classifier and move its parameters to the selected device.
model = Net().to(device)

# Adam with the learning rate taken from the run configuration, so that
# changing args.lr actually affects training.
optimizer = optim.Adam(params=model.parameters(), lr=args.lr)
# Halve the learning rate when the monitored value (the evaluation accuracy
# passed to scheduler.step) has not increased for more than `patience` epochs.
scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer, 'max', verbose=True, patience=2, factor=0.5)

def train(args, model, device, train_loader, epoch):
    """Run one training epoch over ``train_loader``.

    Uses the module-level ``optimizer`` to update ``model``.

    Args:
        args: Object providing ``batch_size`` and ``log_interval``.
        model: Module returning one probability per sample.
        device: Device the batches are moved to before the forward pass.
        train_loader: Iterable yielding ``(data, target)`` batches.
        epoch: Epoch number, used only in the progress message.
    """
    model.train()
    num_batches = len(train_loader)
    for batch_idx, (data, target) in enumerate(train_loader):
        data, target = data.to(device), target.to(device)
        optimizer.zero_grad()
        # Flatten both sides to shape (batch,). binary_cross_entropy requires
        # identical shapes, and squeeze() would collapse a final batch of
        # size 1 into a 0-d tensor.
        output = model(data).reshape(-1)
        loss = F.binary_cross_entropy(output, target.float().reshape(-1))
        loss.backward()
        optimizer.step()
        if batch_idx % args.log_interval == 0:
            print('Train Epoch: {} [{}/{} ({:.0f}%)]\tLoss: {:.6f}'.format(
                epoch, batch_idx * args.batch_size, num_batches * args.batch_size,
                100. * batch_idx / num_batches, loss.item()))

def test(args, model, device, test_loader, max_accuracy, epochs_no_improve):
    model.eval()
    test_loss = 0
    correct = 0
    pred_list = []
    test_list = []
    with torch.no_grad():
        for data, target in test_loader:
            data, target = data.to(device), target.to(device)
            output = model(data)
            test_loss += F.binary_cross_entropy(output.squeeze(), target.float(), reduction='sum').item() # sum up batch loss
            pred = torch.round(output.squeeze())
            correct += pred.eq(target.float().view_as(pred)).sum().item()
            pred_list.append(pred.numpy())
            test_list.append(target.view_as(pred).numpy())

    test_loss /= len(test_loader.dataset)

    print('\nTest set: Average loss: {:.4f}, Accuracy: {}/{} ({:.2f}%)\n'.format(
        test_loss, correct, len(test_loader.dataset),
        100. * correct / len(test_loader.dataset)))
    
    accuracy = 100. * correct / len(test_loader.dataset)
    if accuracy > max_accuracy:
        epochs_no_improve = 0
        max_accuracy = accuracy
    else:
        epochs_no_improve += 1

    scheduler.step(100. * correct / len(test_loader.dataset))

    return pred_list, test_list, max_accuracy, epochs_no_improve


# Early-stopping state: best accuracy so far and the number of consecutive
# epochs that failed to beat it.
max_accuracy, epochs_no_improve = 0.0, 0

for epoch in range(1, args.epochs + 1):
    train(args, model, device, train_loader, epoch)
    pred_list, test_list, max_accuracy, epochs_no_improve = test(
        args, model, device, test_loader, max_accuracy, epochs_no_improve)

    # Save a checkpoint after every epoch, named by the epoch number.
    checkpoint_path = "{}{}.pt".format(
        "/content/drive/My Drive/Colab Notebooks/models_storage/pytorch_models/pytorch_Bidirectional_LSTM_binary/",
        epoch)
    torch.save(model.state_dict(), checkpoint_path)

    # Keep training while the accuracy improved within the last few epochs.
    if epochs_no_improve <= 3:
        continue
    print('Early stopping!')
    break


# Flatten the per-batch arrays from the last evaluation into flat Python lists.
# np.ravel keeps a batch of size 1 as a one-element array, so it is still
# appended instead of breaking the concatenation.
pred_list = np.concatenate([np.ravel(a) for a in pred_list]).tolist() if len(pred_list) else []
test_list = np.concatenate([np.ravel(a) for a in test_list]).tolist() if len(test_list) else []

# Map the numeric classes back to their names: 0 is 'benign', anything else 'dga'.
pred_list = ['benign' if x==0 else 'dga' for x in pred_list]
test_list = ['benign' if x==0 else 'dga' for x in test_list]

from sklearn.metrics import classification_report
print(classification_report(test_list, pred_list))

from sklearn.metrics import confusion_matrix
import seaborn as sns
# Fix the class order and label the axes so the table is readable on its own:
# rows are the true classes, columns the predicted classes.
class_names = ['benign', 'dga']
confusion_matrix_df = pd.DataFrame(
    confusion_matrix(test_list, pred_list, labels=class_names),
    index=['true_' + name for name in class_names],
    columns=['pred_' + name for name in class_names])
print(confusion_matrix_df)
#sns.heatmap(confusion_matrix_df, annot=True)


Test set: Average loss: 0.0677, Accuracy: 20492/21000 (97.58%)


Test set: Average loss: 0.0536, Accuracy: 20607/21000 (98.13%)


Test set: Average loss: 0.0540, Accuracy: 20597/21000 (98.08%)


Test set: Average loss: 0.0507, Accuracy: 20638/21000 (98.28%)


Test set: Average loss: 0.0477, Accuracy: 20657/21000 (98.37%)


Test set: Average loss: 0.0464, Accuracy: 20672/21000 (98.44%)


Test set: Average loss: 0.0523, Accuracy: 20668/21000 (98.42%)


Test set: Average loss: 0.0541, Accuracy: 20672/21000 (98.44%)


Test set: Average loss: 0.0474, Accuracy: 20679/21000 (98.47%)


Test set: Average loss: 0.0534, Accuracy: 20667/21000 (98.41%)


Test set: Average loss: 0.0603, Accuracy: 20678/21000 (98.47%)


Test set: Average loss: 0.0512, Accuracy: 20664/21000 (98.40%)

Epoch    12: reducing learning rate of group 0 to 5.0000e-04.

Test set: Average loss: 0.0621, Accuracy: 20697/21000 (98.56%)


Test set: Average loss: 0.0785, Accuracy: 20693/21000 (98.54%)


Test set: Average loss: 0.0