In [1]:
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import seaborn as sns
sns.set()

from collections import Counter

from preprocessing.text import normalize_path, load_csv
from preprocessing.array import fix_length, byte_filter, remap

df = load_csv("data/bohacek_20211022113102.csv") # ~7.5s
df.x = df.x.apply(normalize_path) # 5s
df.x = df.x.str.encode("utf-8", "ignore").apply(lambda x: np.array(list(x), dtype=int)) # 2s

In [2]:
counter = Counter([x for y in df.x.values for x in y]) # 4s
keep_bytes = [x[0] for x in counter.most_common(150)] # 0.5s

df.x = df.x.apply(fix_length) # 7.5s

X_ti = np.stack(df.x.values)
X_ti = byte_filter(X_ti, keep_bytes+[0]) # 3.5s
y_ti = np.stack(df.y)

Loading clean Win 10 system files too:

In [4]:
with open("data/win10_fullfilesystem.txt") as f:
    win10files = f.readlines()

win10files = pd.read_csv("data/win10_fullfilesystem.txt", header=None) # 0.5s
win10files.columns = ["x"]
win10files = win10files.x.apply(normalize_path) # 2s
win10files = win10files.str.encode("utf-8", "ignore").apply(lambda x: np.array(list(x), dtype=int)) # 2s
win10files = win10files.apply(fix_length) # 2s

X_win10files = np.stack(win10files.values)
X_win10files = byte_filter(X_win10files, keep_bytes+[0]) # 3.5s
y_win10files = np.zeros(X_win10files.shape[0])

Stacking all data together and remapping UTF-8 bytes to sequential 0-152 integers (needed from `nn.Embedding`):

In [4]:
print(f"TI data: {y_ti[y_ti==0].shape[0]} benign, {y_ti[y_ti==1].shape[0]} malicious")
print(f"Windows 10 data: {y_win10files.shape[0]} benign")

X = np.vstack([X_ti, X_win10files])
# remapping for embedding: ~ 5 s
X = remap(X, keep_bytes)

y = np.vstack([y_ti.reshape(-1,1), y_win10files.reshape(-1,1)]).squeeze()

print(f"Total: Benign {y[y==0].shape[0]*100/y.shape[0]:.2f} %, Malicious {y[y==1].shape[0]*100/y.shape[0]:.2f} %")

TI data: 141158 benign, 125120 malicious
Windows 10 data: 122410 benign
Total: Benign 67.81 %, Malicious 32.19 %


# Modelling

In [5]:
import random
import time

import torch
from torch import nn, optim
from torch.nn import functional as F

from sklearn.model_selection import train_test_split


def set_seed(seed_value=42):
    """Set seed for reproducibility."""
    random.seed(seed_value)
    np.random.seed(seed_value)
    torch.manual_seed(seed_value)
    torch.cuda.manual_seed_all(seed_value)


class Model_1st(nn.Module):
    def __init__(self, 
                vocab_size = 152, 
                embedding_dim = 32,
                filter_sizes = [2, 3, 4, 5],
                num_filters = [128, 128, 128, 128],
                num_classes = 2,
                dropout = 0.5):
        super().__init__()

        # embdding
        self.embedding = nn.Embedding(vocab_size, 
                                  embedding_dim, 
                                  padding_idx=0)
        
        # convolutions
        self.conv1d_list = nn.ModuleList([
                            nn.Conv1d(in_channels=embedding_dim,
                                out_channels=num_filters[i],
                                kernel_size=filter_sizes[i])
                            for i in range(len(filter_sizes))
                            ])

        # Fully-connected layers and Dropout
        self.fc_hidden = nn.Linear(np.sum(num_filters), 128)
        self.fc_output = nn.Linear(128, num_classes)
        self.dropout = nn.Dropout(p=dropout)

        # Non-linearities
        self.relu = torch.nn.ReLU()
    

    @staticmethod
    def conv_and_max_pool(x, conv):
        """Convolution and global max pooling layer"""
        return F.relu(conv(x).permute(0, 2, 1).max(1)[0])
    

    def forward(self, inputs):
        # Get embeddings from `x`. 
        # Output shape: (b, max_len, embed_dim), 
        # torch.Size([1024, 150, 32])
        embedded = self.embedding(inputs).permute(0, 2, 1)
        # .permute() to change sequence of max_len and embed_dim, so shape is:
        # torch.Size([1024, 32, 150])
        # needed for Conv1D
        
        # Apply CNN and ReLU. Output shape: (b, num_filters[i], L_out)
        x_conv = [self.conv_and_max_pool(embedded, conv1d) for conv1d in self.conv1d_list]
        
        # USED IN PAPER SOMETHING LIKE THIS?
        #x_norm_list = [nn.LayerNorm(x.shape)(x) for x in x_conv_list]
        
        # Concatenate x_pool_list to feed the fully connected layer.
        # Output shape: (b, sum(num_filters))
        x_fc = self.dropout(torch.cat(x_conv, dim=1))
        x_h = self.relu(self.fc_hidden(x_fc))
        out = self.fc_output(x_h)
        
        return out


In [8]:
# ====== ENSURING REPRODUCIBILITY =======
SEED = 1763
set_seed(seed_value=SEED)

# ====== DATA PREPARATION =======
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, shuffle=True, random_state=SEED)

BATCH_SIZE = 1024

train_loader = torch.utils.data.DataLoader(
    torch.utils.data.TensorDataset(torch.LongTensor(X_train),torch.LongTensor(y_train)),
    batch_size = BATCH_SIZE, shuffle=True)

val_loader = torch.utils.data.DataLoader(
    torch.utils.data.TensorDataset(torch.LongTensor(X_val),torch.LongTensor(y_val)),
    batch_size = BATCH_SIZE, shuffle=True)

# ====== MODEL & TRAINING ENVIRONMENT DEFINITION =========

EMBEDDING_DIM = 32
VOCAB = len(keep_bytes) + 2 # + 2 since: 0 - pad, 1 - rare byte
EPOCHS = 4
device = "cpu"

model = Model_1st(vocab_size=VOCAB, embedding_dim=EMBEDDING_DIM).to(device)
optimizer = optim.SGD(model.parameters(), lr=0.01, momentum=0.5)
criterion = nn.CrossEntropyLoss()

# ======== TRAINING & EVAL FUNCTIONS ===========

def train(model, device, train_loader, optimizer, epoch):
    model.train()

    train_accuracy = []
    train_loss = []

    for batch_idx, (data, target) in enumerate(train_loader):
        data, target = data.to(device), target.to(device)
        optimizer.zero_grad()

        logits = model(data)
        
        loss = criterion(logits, target)
        train_loss.append(loss.item())
        
        loss.backward() # derivatives
        optimizer.step() # parameter update

        preds = torch.argmax(logits, dim=1).flatten()
        accuracy = (preds == target).cpu().numpy().mean() * 100
        train_accuracy.append(accuracy)
        
        if batch_idx % 100 == 0:
            print('Train Epoch: {} [{}/{} ({:.0f}%)]\tLoss: {:.6f}\tAccuracy: {:.2f}'.format(
                epoch, batch_idx * len(data), len(train_loader.dataset),
                100. * batch_idx / len(train_loader), loss.item(), np.mean(train_accuracy)))

    return train_loss, train_accuracy


def evaluate(model, device, val_loader):
    model.eval()

    val_accuracy = []
    val_loss = []

    # For each batch in our validation set...
    for data, target in val_loader:
        data, target = data.to(device), target.to(device)

        with torch.no_grad():
            logits = model(data)
        
        loss = criterion(logits, target)
        val_loss.append(loss.item())

        preds = torch.argmax(logits, dim=1).flatten()

        accuracy = (preds == target).cpu().numpy().mean() * 100
        val_accuracy.append(accuracy)

    return val_loss, val_accuracy


# ======== ACTUAL TRAINING ===========

train_losses = []
val_losses = []

for epoch in range(1, EPOCHS + 1):
    t0_epoch = time.time()

    train_loss, train_acc = train(model, device, train_loader, optimizer, epoch) # ~140s per 1024 samples
    train_losses.extend(train_loss)

    if val_loader is not None:
            # After the completion of each training epoch, measure the model's
            # performance on our validation set.
            val_loss, val_acc = evaluate(model, device, val_loader)
            val_losses.extend(val_loss)

            # Print performance over the entire training data
            time_elapsed = time.time() - t0_epoch
            print(f"{epoch + 1:^7} | {np.mean(train_loss):^12.6f} | {np.mean(train_acc):^9.2f} | {np.mean(val_loss):^10.6f} | {np.mean(val_acc):^9.2f} | {time_elapsed:^9.2f}")

   2    |   0.539600   |   72.03   |  0.423571  |   79.67   |  442.77  
   3    |   0.450021   |   77.45   |  0.382166  |   81.16   |  421.19  
   4    |   0.415310   |   79.42   |  0.343584  |   83.17   |  417.78  
   5    |   0.395659   |   80.47   |  0.323936  |   85.18   |  418.02  


In [13]:
EPOCHS = 15
for epoch in range(6, EPOCHS + 1):
    t0_epoch = time.time()

    train_loss, train_acc = train(model, device, train_loader, optimizer, epoch) # ~140s per 1024 samples
    train_losses.extend(train_loss)

    if val_loader is not None:
            # After the completion of each training epoch, measure the model's
            # performance on our validation set.
            val_loss, val_acc = evaluate(model, device, val_loader)
            val_losses.extend(val_loss)

            # Print performance over the entire training data
            time_elapsed = time.time() - t0_epoch
            print(f"{epoch + 1:^7} | {np.mean(train_loss):^12.6f} | {np.mean(train_acc):^9.2f} | {np.mean(val_loss):^10.6f} | {np.mean(val_acc):^9.2f} | {time_elapsed:^9.2f}")

   7    |   0.378999   |   81.54   |  0.313622  |   85.46   |  425.63  
   8    |   0.367778   |   82.12   |  0.302396  |   86.21   |  419.92  
   9    |   0.358282   |   82.67   |  0.306084  |   85.25   |  431.39  
  10    |   0.349409   |   83.17   |  0.289382  |   86.77   |  427.62  
  11    |   0.341794   |   83.61   |  0.282026  |   87.27   |  426.21  
  12    |   0.335450   |   83.99   |  0.280148  |   87.04   |  423.52  
  13    |   0.329883   |   84.30   |  0.274923  |   87.34   |  426.82  
  14    |   0.323190   |   84.73   |  0.268977  |   87.89   |  416.89  
  15    |   0.318778   |   84.90   |  0.265323  |   88.13   |  425.85  
  16    |   0.313251   |   85.20   |  0.261025  |   88.23   |  432.68  
