In [10]:
import os
import numpy as np

data = r"..\dataset"
LIMIT = 5 
X = [np.load(os.path.join(data, x)) for x in os.listdir(data)[:LIMIT] if x.endswith(".npy")]

In [11]:
X[0].shape # each sample represents 1000 API calls, each 102 dims

(1000, 102)

In [12]:
import re
from sklearn.feature_extraction import FeatureHasher

class FeatureType(object):
    ''' Base class from which each feature type may inherit '''

    name = ''
    dim = 0

    def __repr__(self):
        return '{}({})'.format(self.name, self.dim)

    def raw_features(self, input_dict):
        ''' Generate a JSON-able representation of the file '''
        raise (NotImplemented)

    def process_features(self, raw_obj):
        ''' Generate a feature vector from the raw features '''
        raise (NotImplemented)
    
    def process_raw_features(self, raw_obj):
        ''' Generate a feature vector from the raw features '''
        raise (NotImplemented)

    def feature_vector(self, input_dict):
        ''' Directly calculate the feature vector from the sample itself. This should only be implemented differently
        if there are significant speedups to be gained from combining the two functions. '''
        return self.process_raw_features(self.raw_features(input_dict))


class APIName(FeatureType):
    ''' api_name hash info '''

    name = 'api_name'
    dim = 8

    def __init__(self):
        super(FeatureType, self).__init__()
        self._name = re.compile('^[a-z]+|[A-Z][^A-Z]*')

    def raw_features(self, input_dict):
        """
        input_dict: string
        """
        tmp = self._name.findall(input_dict)
        hasher = FeatureHasher(self.dim, input_type="string").transform([tmp]).toarray()[0]
        return hasher
    
    def process_raw_features(self, raw_obj):
        return raw_obj


class IntInfo(FeatureType):
    ''' int hash info '''

    name = 'int'
    dim = 16

    def __init__(self):
        super(FeatureType, self).__init__()

    def raw_features(self, input_dict):
        hasher = FeatureHasher(self.dim).transform([input_dict]).toarray()[0]
        return hasher

    def process_raw_features(self, raw_obj):
        return raw_obj


class PRUIInfo(FeatureType):
    ''' Path, Registry, Urls, IPs hash info '''

    name = 'prui'
    dim = 16 + 8 + 12 + 16 + 12

    def __init__(self):
        super(FeatureType, self).__init__()
        self._paths = re.compile('^c:\\\\', re.IGNORECASE)
        self._dlls = re.compile('.+\.dll$', re.IGNORECASE)
        self._urls = re.compile('^https?://(.+?)[/|\s|:]', re.IGNORECASE)
        self._registry = re.compile('^HKEY_')
        self._ips = re.compile('^\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}')

    def raw_features(self, input_dict):
        paths = np.zeros((16,), dtype=np.float32)
        dlls = np.zeros((8,), dtype=np.float32)
        registry = np.zeros((12,), dtype=np.float32)
        urls = np.zeros((16,), dtype=np.float32)
        ips = np.zeros((12,), dtype=np.float32)
        for str_name, str_value in input_dict.items():
            if self._dlls.match(str_value):
                tmp = re.split('//|\\\\|\.', str_value)[:-1]
                tmp = ['\\'.join(tmp[:i]) for i in range(1, len(tmp) + 1)]
                dlls += FeatureHasher(8, input_type="string").transform([tmp]).toarray()[0]
            if self._paths.match(str_value):
                tmp = re.split('//|\\\\|\.', str_value)[:-1]
                tmp = ['\\'.join(tmp[:i]) for i in range(1, len(tmp) + 1)]
                paths += FeatureHasher(16, input_type="string").transform([tmp]).toarray()[0]
            elif self._registry.match(str_value):
                tmp = str_value.split('\\')[:6]
                tmp = ['\\'.join(tmp[:i]) for i in range(1, len(tmp) + 1)]
                registry += FeatureHasher(12, input_type="string").transform([tmp]).toarray()[0]
            elif self._urls.match(str_value):
                tmp = self._urls.split(str_value + "/")[1]
                tmp = tmp.split('.')[::-1]
                tmp = ['.'.join(tmp[:i][::-1]) for i in range(1, len(tmp) + 1)]
                urls += FeatureHasher(16, input_type="string").transform([tmp]).toarray()[0]
            elif self._ips.match(str_value):
                tmp = str_value.split('.')
                tmp = ['.'.join(tmp[:i]) for i in range(1, len(tmp) + 1)]
                ips += FeatureHasher(12, input_type="string").transform([tmp]).toarray()[0]
        return np.hstack([paths, dlls, registry, urls, ips]).astype(np.float32)

    def process_raw_features(self, raw_obj):
        return raw_obj


class StringsInfo(FeatureType):
    ''' Other printable strings hash info '''

    name = 'strings'
    dim = 8

    def __init__(self):
        super(FeatureType, self).__init__()
        self._allstrings = re.compile(b'[\x20-\x7f]{5,}')
        self._paths = re.compile(b'c:\\\\', re.IGNORECASE)
        self._dlls = re.compile(b'\\.dll', re.IGNORECASE)
        self._urls = re.compile(b'https?://', re.IGNORECASE)
        self._registry = re.compile(b'HKEY_')
        self._mz = re.compile(b'MZ')
        self._ips = re.compile(b'\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}')
        super(FeatureType, self).__init__()

    def raw_features(self, input_dict):
        bytez = '\x11'.join(input_dict.values()).encode('UTF-8', 'ignore')
        allstrings = self._allstrings.findall(bytez)
        if allstrings:
            # statistics about strings:
            string_lengths = [len(s) for s in allstrings]
            avlength = sum(string_lengths) / len(string_lengths)
            # map printable characters 0x20 - 0x7f to an int array consisting of 0-95, inclusive
            as_shifted_string = [b - ord(b'\x20') for b in b''.join(allstrings)]
            c = np.bincount(as_shifted_string, minlength=96)  # histogram count
            # distribution of characters in printable strings
            csum = c.sum()
            p = c.astype(np.float32) / csum
            wh = np.where(c)[0]
            H = np.sum(-p[wh] * np.log2(p[wh]))  # entropy
        else:
            avlength = 0
            c = np.zeros((96,), dtype=np.float32)
            H = 0
            csum = 0
        return {
            'numstrings': len(allstrings),
            'avlength': avlength,
            'printables': int(csum),
            'entropy': float(H),
            'paths': len(self._paths.findall(bytez)),
            'dlls': len(self._dlls.findall(bytez)),
            'urls': len(self._urls.findall(bytez)),
            'registry': len(self._registry.findall(bytez)),
            'ips': len(self._ips.findall(bytez)),
            'MZ': len(self._mz.findall(bytez))
        }

    def process_raw_features(self, raw_obj):
        return np.hstack([
            raw_obj['numstrings'], raw_obj['avlength'], raw_obj['printables'],
            raw_obj['entropy'], raw_obj['paths'], raw_obj['dlls'], raw_obj['urls'],
            raw_obj['registry'], raw_obj['ips'], raw_obj['MZ']
        ]).astype(np.float32)

features = dict((fe.name, fe) for fe in
                [APIName(), IntInfo(), PRUIInfo(), StringsInfo()])

In [13]:
arguments = {
    "file_attributes": -1,
    "filepath_r": "C:\\Users\\cuckoo\\AppData\\Local\\Temp\\1F60EAF2E2F2E0B2F816BB7EE54D094510D9163F9896937F2711FC7D6E4E192F_7526400.dll.manifest",
    "filepath": "C:\\Users\\cuckoo\\AppData\\Local\\Temp\\1F60EAF2E2F2E0B2F816BB7EE54D094510D9163F9896937F2711FC7D6E4E192F_7526400.dll.manifest"
}

api_parser = APIName()
api_example = "GetFileAttributesW"
api_name_hashed = api_parser.feature_vector(api_example)

api_int_dict, api_str_dict = {}, {}
for c_n, c_v in arguments.items():
    if isinstance(c_v, (list, dict, tuple)):
        continue
    if isinstance(c_v, (int, float)):
        api_int_dict[c_n] = np.log(np.abs(c_v) + 1)
    else:
        if c_v[:2] == '0x':
            continue
        api_str_dict[c_n] = c_v

print(api_int_dict)
print(api_str_dict)

api_int_hashed = features['int'].feature_vector(api_int_dict)
api_prui_hashed = features['prui'].feature_vector(
    api_str_dict)
api_str_hashed = features['strings'].feature_vector(
    api_str_dict)
hashed_feature = np.hstack(
    [api_name_hashed, api_int_hashed, api_prui_hashed, api_str_hashed]).astype(
    np.float32)

print(hashed_feature)
print(hashed_feature.shape)

{'file_attributes': 0.6931471805599453}
{'filepath_r': 'C:\\Users\\cuckoo\\AppData\\Local\\Temp\\1F60EAF2E2F2E0B2F816BB7EE54D094510D9163F9896937F2711FC7D6E4E192F_7526400.dll.manifest', 'filepath': 'C:\\Users\\cuckoo\\AppData\\Local\\Temp\\1F60EAF2E2F2E0B2F816BB7EE54D094510D9163F9896937F2711FC7D6E4E192F_7526400.dll.manifest'}
[ -2.          0.          0.          0.          0.          0.
  -1.          1.          0.         -0.6931472   0.          0.
   0.          0.          0.          0.          0.          0.
   0.          0.          0.          0.          0.          0.
   0.          0.         -2.          0.          0.          0.
   2.          0.          0.          0.          0.         -2.
   2.         -4.          0.          0.          0.          0.
   0.          0.          0.          0.          0.          0.
   0.          0.          0.          0.          0.          0.
   0.          0.          0.          0.          0.          0.
   0.        

In [14]:
from keras import Input
from keras.models import Model
from keras.layers import BatchNormalization, Conv1D, Multiply

x = Input(shape=(1000, 102), batch_size=1)
y = BatchNormalization()(x)
m = Model(x, y)

x_0 = Conv1D(128, 2, strides=1, padding='same')(y)
x_1 = Conv1D(128, 2, strides=1, activation="sigmoid", padding='same')(y)
gated_0 = Multiply()([x_0, x_1])
m2 = Model(x, gated_0)

print(m(X[0].reshape(1, 1000, 102)).shape)
print(m2(X[0].reshape(1, 1000,102)).shape)

(1, 1000, 102)
(1, 1000, 128)


In [38]:
import torch
import torch.nn as nn
import torch.nn.functional as F

from functools import reduce
from operator import __add__

class Conv2dSamePadding(nn.Conv1d):
    def __init__(self,*args,**kwargs):
        super().__init__(*args, **kwargs)
        self.zero_pad_2d = nn.ZeroPad2d(reduce(__add__,
                  [(k // 2 + (k - 2 * (k // 2)) - 1, k // 2) for k in self.kernel_size[::-1]]))

    def forward(self, input):
        return  self._conv_forward(self.zero_pad_2d(input), self.weight, self.bias)

class GatedCNN(nn.Module):
    def __init__(
            self,
            ndim=102,
            seq_len=1000,
            conv_out_dim=128,
            lstm_hidden=100,
            dense_hidden=64,
            dropout=0.5,
            num_classes=1
    ):
        super().__init__()
        self.conv1 = Conv2dSamePadding(ndim, conv_out_dim, kernel_size=2)#, stride=1, padding=1)
        self.sig1 = nn.Sigmoid()
        self.conv2 = Conv2dSamePadding(ndim, conv_out_dim, kernel_size=2)#, stride=1, padding=1)
        self.sig2 = nn.Sigmoid()
        
        self.conv3 = Conv2dSamePadding(ndim, conv_out_dim, kernel_size=3)#, stride=1, padding=1)
        self.sig3 = nn.Sigmoid()
        self.conv4 = Conv2dSamePadding(ndim, conv_out_dim, kernel_size=3)#, stride=1, padding=1)
        self.sig4 = nn.Sigmoid()
        
        self.lstm = nn.LSTM(conv_out_dim*2, lstm_hidden, bidirectional=True, batch_first=True)
        self.dense1 = nn.Linear(lstm_hidden*2, dense_hidden)
        self.relu = nn.ReLU()
        self.dropout = nn.Dropout(p=dropout)
        self.dense2 = nn.Linear(dense_hidden, num_classes)
        
        self.batch_norm1 = nn.BatchNorm1d(seq_len)
        self.batch_norm2 = nn.BatchNorm1d(seq_len)

    def forward(self, x):
        """
        Input: (B, L, C) where B - batch size, L - length of sequence, C - feature dim of each sequence element 
        """
        x = self.batch_norm1(x)
        x = torch.permute(x, (0, 2, 1))

        gated_0 = self.conv1(x)
        gated_0 = self.sig1(gated_0)
        gated_0 = gated_0 * self.conv2(x)
        
        gated_1 = self.conv3(x)
        gated_1 = self.sig3(gated_1)
        gated_1 = gated_1 * self.conv4(x)
        
        x = torch.cat([gated_0, gated_1], dim=1)
        x = torch.permute(x, (0, 2, 1))
        x = self.batch_norm2(x)
        
        x, _ = self.lstm(x)
        
        x = torch.max(x, dim=1)[0]
        
        x = self.dense1(x)
        x = self.relu(x)
        x = self.dropout(x)
        x = self.dense2(x)
        return x

mt = GatedCNN()
x = torch.Tensor(X[0].reshape(-1, 1000, 102))

mt(x)

tensor([[0.0264]], grad_fn=<AddmmBackward0>)

In [39]:
x = torch.Tensor(np.stack(X))
mt(x)

tensor([[-0.0350],
        [ 0.0240],
        [-0.0925],
        [ 0.1308]], grad_fn=<AddmmBackward0>)