In [1]:
import random
from random import choice
import numpy as np
import time
import torch
from tqdm.notebook import tqdm

In [2]:
# Default number of flattened-value slots per encoded tensor (used as the
# default `embed_size` of tensor_flatten_pad and for the None placeholder size).
EMBEDDING_SIZE = 150
# Default width of the shape section: tensor_flatten_pad pads the dimension
# list to SHAPE_EMBEDDING_SIZE - 1 entries and appends one -1 separator.
SHAPE_EMBEDDING_SIZE = 6

# Feature toggles read by tensor_flatten_pad / encode_dataset at call time.
USE_SHAPE_ENCODING = True   # append the padded tensor shape
USE_TYPE_ENCODING = True    # prepend a dtype code followed by a -1 separator
USE_VALUE_ENCODING = True   # bucket values through encode_values_to_code first

# Base file names: '<FUZZING_DATA_NAME>.pt' is loaded as the input dataset and
# '<EMBEDDING_DATA_NAME>.embedding.pt' is written with the encoded splits.
FUZZING_DATA_NAME = 'Single_100000_synthetic'
EMBEDDING_DATA_NAME = 'Single_100000_synthetic'

In [3]:
def encode_values_to_code(tensor):
    """Collapse large-magnitude entries of ``tensor`` into bucket codes.

    Works on a clone, so the argument is never modified. Entries that fall
    outside every bucket below are returned unchanged.

        [100, 1000)    -> 100
        [1000, +inf)   -> 101
        (-100, -20]    -> -20
        (-1000, -100]  -> -21
        (-inf, -1000]  -> -22

    Args:
        tensor: torch.Tensor whose values should be bucketed.

    Returns:
        A new tensor with the same shape and dtype as ``tensor``.
    """
    coded = tensor.clone()

    # (selector, code) pairs. Each selector is evaluated against the
    # current state of `coded`, right before its own write, in this order.
    bucket_rules = (
        (lambda t: (t >= 100) & (t < 1000), 100),
        (lambda t: t >= 1000, 101),
        (lambda t: (t <= -20) & (t > -100), -20),
        (lambda t: (t <= -100) & (t > -1000), -21),
        (lambda t: t <= -1000, -22),
    )
    for select, code in bucket_rules:
        coded[select(coded)] = code

    return coded

In [4]:
import sklearn.datasets
import torch
import numpy as np
import torch.nn.functional as F

def tensor_flatten_pad(tensor, embed_size=EMBEDDING_SIZE, shape_embed_size=SHAPE_EMBEDDING_SIZE):
    """Encode one tensor as a fixed-length 1-D float vector.

    The result is the concatenation of the following sections, each one
    terminated by a ``-1`` separator:

    1. type code (only when ``USE_TYPE_ENCODING``):
       ``1`` for bool, ``2`` for any floating-point dtype, ``0`` otherwise;
    2. the flattened values, zero-padded to ``embed_size`` entries (bucketed
       through ``encode_values_to_code`` first when ``USE_VALUE_ENCODING``);
    3. the shape (only when ``USE_SHAPE_ENCODING``): the dimension sizes,
       zero-padded to ``shape_embed_size - 1`` entries.

    Args:
        tensor: a ``torch.Tensor`` or anything ``torch.tensor`` accepts.
        embed_size: number of value slots in section 2.
        shape_embed_size: width of section 3 including its separator.

    Returns:
        A 1-D tensor converted with ``.float()``.
    """
    if not isinstance(tensor, torch.Tensor):
        tensor = torch.tensor(tensor)

    values = torch.flatten(tensor)
    if USE_VALUE_ENCODING:
        values = encode_values_to_code(values)

    # NOTE(review): when the tensor has more than `embed_size` elements the pad
    # amount is negative; F.pad appears to crop in that case rather than raise.
    # Confirm that silently truncating long tensors is intended.
    value_pad = embed_size - values.shape[-1]
    values = F.pad(input=values, pad=(0, value_pad), mode='constant', value=0)

    separator = [-1]
    encoding = []

    if USE_TYPE_ENCODING:
        if tensor.dtype == torch.bool:
            type_code = 1
        elif tensor.is_floating_point():
            # Covers every float dtype (float16/bfloat16/float32/float64);
            # comparing against torch.float alone only matched float32.
            type_code = 2
        else:
            type_code = 0
        encoding += [type_code] + separator

    encoding += values.tolist() + separator

    if USE_SHAPE_ENCODING:
        dims = list(tensor.shape)
        # Same negative-pad caveat as above for tensors of very high rank.
        shape_pad = shape_embed_size - 1 - len(dims)
        dims_padded = F.pad(input=torch.tensor(dims), pad=(0, shape_pad), mode='constant', value=0)
        encoding += dims_padded.tolist() + separator

    return torch.tensor(encoding).float()


In [5]:
# Load the raw fuzzing dataset from '<FUZZING_DATA_NAME>.pt'.
# SECURITY: torch.load unpickles the file, so only load files from a trusted source.
# NOTE(review): recent PyTorch releases changed the default of `weights_only`;
# confirm this file still loads on the PyTorch version in use.
dataset = torch.load(FUZZING_DATA_NAME+'.pt')

In [6]:
import itertools
from random import sample

def split_dataset(orig_dataset, train_frac=0.9, seed=None):
    """Randomly split a dataset into train / validation / test lists.

    ``int(len * train_frac)`` items go to the training set; the remainder is
    halved (rounding down) for validation and whatever is left becomes the
    test set. The input sequence itself is not modified.

    Args:
        orig_dataset: indexable sequence supporting ``len()``.
        train_frac: fraction of items for the training set, within [0, 1].
        seed: optional seed for a private ``random.Random`` so the split is
            reproducible. With the default ``None`` the module-level
            ``random`` state is used, exactly as before.

    Returns:
        Tuple ``(train_set, valid_set, test_set)`` of lists.

    Raises:
        ValueError: if ``train_frac`` is outside [0, 1].
    """
    if not 0 <= train_frac <= 1:
        raise ValueError('train_frac must be within [0, 1], got {}'.format(train_frac))

    print('Length of original synthetic dataset: {}'.format(len(orig_dataset)))

    length = len(orig_dataset)
    train_length = int(length * train_frac)
    valid_length = int((length - train_length) / 2)
    valid_end = train_length + valid_length

    idx = list(range(length))
    # A dedicated generator keeps a seeded split independent of (and without
    # disturbing) the global random state.
    shuffler = random if seed is None else random.Random(seed)
    shuffler.shuffle(idx)

    train_set = [orig_dataset[i] for i in idx[:train_length]]
    valid_set = [orig_dataset[i] for i in idx[train_length:valid_end]]
    test_set = [orig_dataset[i] for i in idx[valid_end:]]

    print("Training: {}\nValidation: {}\nTest: {}".format(len(train_set), len(valid_set), len(test_set)))
    return (train_set, valid_set, test_set)

# Default split: 90% training, the remaining items halved between validation
# and test. The shuffle uses the global `random` state, so membership of each
# split changes from run to run unless that state is seeded beforehand.
train_set, valid_set, test_set = split_dataset(dataset)

Length of original synthetic dataset: 3300
Training: 2970
Validation: 165
Test: 165


In [7]:
# Maps each API identifier to an integer index, assigned in first-seen order.
# Shared (and extended) across every encode_dataset call.
api2indx = {}
def encode_dataset(dataset):
    """Encode every row of ``dataset`` into a flat feature vector.

    Each row is read positionally: ``row[:3]`` are up to three input tensors
    (``None`` marks a missing input), ``row[3]`` is the output tensor and
    ``row[5]`` is the API identifier. The three inputs and the output are each
    encoded with ``tensor_flatten_pad``, stacked, and flattened into one vector.

    Side effect: any API identifier not seen before is added to the
    module-level ``api2indx`` with the next free index.

    Args:
        dataset: sized iterable of rows as described above.

    Returns:
        Tuple ``(X, y)`` where ``X`` is a list of 1-D tensors and ``y`` is the
        list of API identifiers as stored in the rows (not their indices).
    """
    # Placeholder for a missing input: all zeros with a trailing -1, sized to
    # match what tensor_flatten_pad produces under the current USE_* flags.
    # Built once; torch.stack copies it, so sharing the tensor is safe.
    placeholder_size = EMBEDDING_SIZE + 1
    if USE_SHAPE_ENCODING:
        placeholder_size += SHAPE_EMBEDDING_SIZE
    if USE_TYPE_ENCODING:
        placeholder_size += 2
    missing_input = torch.zeros(placeholder_size)
    missing_input[-1] = -1

    X = []
    y = []

    for row in tqdm(dataset, total=len(dataset)):
        api = row[5]
        output = row[3]

        if api not in api2indx:
            # Indices stay contiguous because entries are only ever appended.
            api2indx[api] = len(api2indx)

        it_pad = [
            missing_input if input_tensor is None else tensor_flatten_pad(input_tensor)
            for input_tensor in row[:3]
        ]
        ot_pad = tensor_flatten_pad(output)

        # Exactly three inputs are expected; fewer raises IndexError here.
        api_seq_x = torch.flatten(torch.stack((it_pad[0], it_pad[1], it_pad[2], ot_pad)))

        X.append(api_seq_x)
        y.append(api)

    return (X, y)

In [8]:
# Encode each split. All three calls extend the shared `api2indx` mapping in
# first-seen order, so the call order below (train, test, validation)
# determines which index every API identifier receives.
print("Encoding Training Data")
X_train, y_train = encode_dataset(train_set)
print("Encoding Test Data")
X_test, y_test = encode_dataset(test_set)
print('Encoding Validation Data')
X_valid, y_valid = encode_dataset(valid_set)

Encoding Training Data


  0%|          | 0/2970 [00:00<?, ?it/s]

Encoding Test Data


  0%|          | 0/165 [00:00<?, ?it/s]

Encoding Validation Data


  0%|          | 0/165 [00:00<?, ?it/s]

In [10]:
# Persist everything as a single tuple in '<EMBEDDING_DATA_NAME>.embedding.pt'.
# Readers must unpack in the same order:
# (api2indx, X_train, y_train, X_test, y_test, X_valid, y_valid).
torch.save((api2indx, X_train,y_train,X_test,y_test,X_valid,y_valid), EMBEDDING_DATA_NAME+'.embedding.pt')