In [None]:
import random
import numpy as np
import torch
import torch as T

In [None]:
class pycoder_parameters:
    """Configuration shared by the fuzzing, embedding and classification steps.

    All settings are class attributes. The file-name attributes at the bottom
    are derived once, at class-definition time, from the settings above them.
    `setNoiseLevel` overrides the noise level (and `embedding`) per instance.
    """

    # --- Path ---
    Path = 'gen1_33' #'exhaustive_17api' #'seq_30api'

    # --- Core Fuzzing Parameters ---
    NUM_FUZZ_PER_API = 100001 #000
    NUM_TEST_FUZZ = 2
    FLOAT_TENSOR = False #We either generate float or integer tensors
    UNIT_TEST = False
    COMPOSITE = True

    # --- Fuzzing Detailed Parameters ---
    MAX_TENSOR_DIMENSIONS = 3 #how many rows, columns, etc.
    MIN_VAL_PER_DIMENSION = 1 # e.g., min number of rows, columns, etc.
    MAX_VAL_PER_DIMENSION = 5 # e.g., max number of rows, columns, etc.

    #So far limiting to integers
    MIN_TENSOR_VALUE = 1
    MAX_TENSOR_VALUE = 15

    # --- Embedding Parameters ---
    EMBEDDING_NOISE_LEVEL = 0 #0 noise by default
    EMBEDDING_SIZE = 150
    SHAPE_EMBEDDING_SIZE = 6

    # --- Derived artifact file names ---
    data_type = 'float' if FLOAT_TENSOR else 'integer'
    model_type = 'Composite_' if COMPOSITE else 'Single_'
    file_name = model_type + str(NUM_FUZZ_PER_API) + '_' + data_type
    fuzzing = file_name + '.pt'
    # Must be a plain string: a trailing comma here would silently turn the
    # attribute into a 1-tuple and break any path concatenation that uses it.
    embedding = file_name + '.embedding' + '.pt'
    classification = file_name + '.model_result' + '.pt'
    # NOTE(review): unlike the other names there is no '.' separator before
    # 'train_valid_test'; kept as-is so previously written files still resolve.
    train_valid_test = file_name + 'train_valid_test.pt'

    def setNoiseLevel(self, noise):
        """Set the embedding noise level on this instance and refresh `embedding`.

        Args:
            noise: value stored in EMBEDDING_NOISE_LEVEL; it is also embedded
                (via str()) in the embedding file name.
        """
        self.EMBEDDING_NOISE_LEVEL = noise
        # Single source of truth for the file-name format.
        self.embedding = self.getEmbeddingFile()

    def getEmbeddingFile(self):
        """Return the embedding file name for the current noise level."""
        return self.file_name + '.embedding' + '_' + str(self.EMBEDDING_NOISE_LEVEL) + '.pt'

    def getVisulizationFile(self):
        """Return the t-SNE visualization file name for the current noise level."""
        return self.file_name + '.embedding' + '_' + str(self.EMBEDDING_NOISE_LEVEL) + '_' + 'tSNE.pt'

In [None]:
# Create the shared configuration object and pin the embedding noise level.
NOISE = 0
f = pycoder_parameters()
f.setNoiseLevel(NOISE)
f.embedding = f.getEmbeddingFile()

# Show the resolved embedding file name and the shape-embedding width.
print(f.embedding, f.SHAPE_EMBEDDING_SIZE, sep='\n')

In [None]:
import numpy as np
 
def add_noise(orig_tensor, noise_level=None):
    """Return a flattened float32 copy of `orig_tensor` with random entries zeroed.

    A fraction `noise_level` of the elements (rounded down) is chosen uniformly
    at random with `np.random.shuffle` and set to 0. The input tensor is never
    modified.

    Args:
        orig_tensor: tensor of any shape whose values can be cast to float64.
        noise_level: fraction of elements to zero. When None (the default) the
            module-level configuration `f.EMBEDDING_NOISE_LEVEL` is used.

    Returns:
        A 1-D float32 tensor with as many elements as `orig_tensor`.
    """
    if noise_level is None:
        noise_level = f.EMBEDDING_NOISE_LEVEL

    # Force a private copy. `.double()` is a no-op on float64 input and
    # `.numpy()` shares storage, so without the copy the zeroing below would
    # write straight through into the caller's tensor.
    flat = torch.flatten(orig_tensor.detach().cpu())
    data = flat.to(torch.float64, copy=True).numpy()

    # Clamp so out-of-range levels cannot turn into a surprising slice.
    num_zeroed = min(max(int(data.size * noise_level), 0), data.size)

    # Mark the first `num_zeroed` slots, then shuffle to pick random positions.
    mask = np.zeros(data.size, dtype=bool)
    mask[:num_zeroed] = True
    np.random.shuffle(mask)

    data[mask] = 0
    return torch.from_numpy(data).to(torch.float32)

In [None]:
import torch
import numpy as np
import torch.nn.functional as F

# Module-level aliases for the embedding widths held by the config object `f`.
EMBEDDING_SIZE, SHAPE_EMBEDDING_SIZE = f.EMBEDDING_SIZE, f.SHAPE_EMBEDDING_SIZE

def encode_values_to_code(tensor):
    """Return a copy of `tensor` with large-magnitude values collapsed to codes.

    Buckets (values outside every bucket are left unchanged):
        [100, 1000)    -> 100
        [1000, +inf)   -> 101
        (-100, -20]    -> -20
        (-1000, -100]  -> -21
        (-inf, -1000]  -> -22
    """
    coded = tensor.clone()
    # The buckets are disjoint and applied in this fixed order.
    buckets = (
        ((tensor >= 100) & (tensor < 1000), 100),
        (tensor >= 1000, 101),
        ((tensor <= -20) & (tensor > -100), -20),
        ((tensor <= -100) & (tensor > -1000), -21),
        (tensor <= -1000, -22),
    )
    for selector, code in buckets:
        coded[selector] = code
    return coded


def tensor_flatten_pad(tensor, embed_size=EMBEDDING_SIZE, shape_embed_size=SHAPE_EMBEDDING_SIZE, isNoise=False):
    """Encode `tensor` as a fixed-layout 1-D float embedding.

    Layout of the result, before value bucketing:
        [type code, -1, values padded to `embed_size`, -1,
         shape padded to `shape_embed_size - 1`, -1]
    The shape section and its trailing -1 are omitted when
    `shape_embed_size` is not positive.

    Type code: 1 for bool, 2 for float16/float32/float64, 0 otherwise.

    Args:
        tensor: tensor to embed.
        embed_size: width of the value section.
        shape_embed_size: width reserved for the shape section plus separator.
        isNoise: when exactly True, the flattened values go through `add_noise`.

    Returns:
        The embedding after `encode_values_to_code` has been applied.
    """
    separator = [-1]

    # Value section: flatten, optionally add noise, then pad the last dimension.
    flat_values = torch.flatten(tensor)
    if isNoise is True:
        flat_values = add_noise(flat_values)
    value_pad = (0, embed_size - flat_values.shape[-1])
    padded_values = F.pad(input=flat_values, pad=value_pad, mode='constant', value=0).type(torch.FloatTensor)

    # Leading dtype marker.
    if tensor.dtype == torch.bool:
        dtype_code = 1
    elif tensor.dtype in (torch.float64, torch.float32, torch.float16):
        dtype_code = 2
    else:
        dtype_code = 0

    pieces = [dtype_code] + separator + padded_values.tolist() + separator

    # Optional shape section.
    if shape_embed_size > 0:
        dims = list(tensor.shape)
        shape_pad = (0, shape_embed_size - 1 - len(dims))
        padded_shape = F.pad(input=torch.Tensor(dims), pad=shape_pad, mode='constant', value=0).type(torch.float)
        pieces = pieces + padded_shape.tolist() + separator

    return encode_values_to_code(torch.Tensor(pieces))


In [None]:
# Tensors seen so far; `remove_duplicate` only reads this list.
all_x_values = []


def remove_duplicate(x):
    """Return True if a tensor equal to `x` is already in `all_x_values`.

    Despite the name, nothing is removed or recorded; this is a membership
    check based on `torch.equal`.
    """
    return any(torch.equal(seen, x) for seen in all_x_values)


In [None]:
def load_training_data(dataset):
    """Turn fuzzing records into per-sequence embedding and label lists.

    Each item of `dataset` is a sequence of steps. A step is either the
    sentinel -1 (skipped) or an indexable record where `step[0]` is the API
    key, `step[1]` the step's output tensor and `step[2]` the list of input
    tensors. A sequence whose last step is -1 is skipped entirely.

    For every step, up to three input embeddings plus the embedding of the
    sequence's final output are stacked and flattened into one vector. An
    input equal to the previous step's output, or a missing input slot, is
    represented by an all-zero placeholder ending in -1.

    Side effect: unseen API keys are added to the module-level `api2indx`.

    Returns:
        (X, y): X[k] is the list of step vectors of the k-th kept sequence and
        y[k] the matching list of API indices.
    """

    def _placeholder():
        # Zero vector of the embedding width with a trailing -1.
        blank = torch.zeros(EMBEDDING_SIZE + SHAPE_EMBEDDING_SIZE + 1 + 2)
        blank[-1] = -1
        return blank

    X, y = [], []
    next_index = len(api2indx)

    for sequence in dataset:
        if sequence[-1] == -1:
            continue

        final_output = sequence[-1][1]
        prev_out = torch.Tensor()
        step_vectors, step_labels = [], []

        for step in sequence:
            if step == -1:
                continue

            api = step[0]
            if api2indx.get(api, -1) == -1:
                api2indx[api] = next_index
                next_index += 1
            label = api2indx[api]

            input_embeddings = []
            for input_tensor in step[2]:
                repeats_previous = (
                    input_tensor.shape == prev_out.shape
                    and torch.all(input_tensor.eq(prev_out)).item()
                )
                if repeats_previous:
                    # Input is exactly the previous step's output.
                    input_embeddings.append(_placeholder())
                else:
                    input_embeddings.append(tensor_flatten_pad(input_tensor, isNoise=True))

            # Fill the remaining slots so there are at least three inputs.
            while len(input_embeddings) < 3:
                input_embeddings.append(_placeholder())

            # Every step is paired with the sequence's final output.
            output_embedding = tensor_flatten_pad(final_output, isNoise=True)
            stacked = torch.stack(input_embeddings[:3] + [output_embedding])
            step_vectors.append(torch.flatten(stacked))
            step_labels.append(label)
            prev_out = step[1]

        X.append(step_vectors)
        y.append(step_labels)

    print(len(X), len(y))
    return (X, y)


In [None]:
def shuffle_dataset(X_dataset, Y_dataset):
    """Shuffle two aligned sequences with one shared random permutation.

    The permutation covers `len(X_dataset)` positions and is drawn with
    `random.shuffle`, so pairs `(X[i], Y[i])` stay together.

    Returns:
        (x, y): new lists holding the reordered elements.
    """
    print('len orig_dataset', len(X_dataset),  len(Y_dataset))

    order = list(range(len(X_dataset)))
    random.shuffle(order)  # one permutation, applied to both sequences

    shuffled_x = list(map(X_dataset.__getitem__, order))
    shuffled_y = list(map(Y_dataset.__getitem__, order))

    print(len(shuffled_x), len(shuffled_y))
    return (shuffled_x, shuffled_y)

# Test: shuffle two small aligned lists and show both results.
x, y = shuffle_dataset([1, 2, 3, 4], [5, 6, 7, 8])
print(x, y, sep='\n')

In [None]:
# Set the noise level on the shared config to 0; setNoiseLevel also rebuilds
# f.embedding. add_noise reads f.EMBEDDING_NOISE_LEVEL, so no values are zeroed.
f.setNoiseLevel(0)

In [None]:
# API key -> integer label; filled in by load_training_data as new keys appear.
api2indx = {}

Training/Pretraining Embedding Generation

In [None]:
# Build and save training embeddings for fuzzing chunks 10000 .. 80000.
SAVE_FILE = f.fuzzing
for i in range(1,9):
    chunk_tag = str(i*10000)
    fuzz_file = f.Path + '/fuzzing_data/' + chunk_tag + '_' + SAVE_FILE
    embed_file = f.Path + '/training_embeddings/' + chunk_tag + '_training_embedding.pt' #+ f.embedding
    print(fuzz_file, embed_file)
    # NOTE(review): torch.load unpickles the file; only load trusted data.
    l = torch.load(fuzz_file)
    x, y = load_training_data(l)
    torch.save((x,y),embed_file)
    # load_training_data may add API keys on any chunk, so persist the mapping
    # after every chunk; saving only once after the first chunk would leave
    # keys first seen in later chunks out of the file.
    torch.save(api2indx, f.Path + '/api2indx.pt')
    print('saving done')
    # Drop the loaded records before the next chunk is read.
    l.clear()

In [None]:
# Display the API -> index mapping collected so far.
api2indx

In [None]:
# Persist the API -> index mapping under the configured data directory.
torch.save(api2indx, f.Path + '/api2indx.pt')

In [None]:
# Build and save the embedding for the held-out chunk (index 9 -> 90000).
SAVE_FILE = f.fuzzing
i = 9
chunk_tag = str(i*10000)
fuzz_file = f.Path + '/fuzzing_data/' + chunk_tag + '_' + SAVE_FILE
embed_file = f.Path + '/' + chunk_tag + '_test_embedding.pt' #+ f.embedding
print(embed_file)

# NOTE(review): torch.load unpickles the file; only load trusted data.
l = torch.load(fuzz_file)
x, y = load_training_data(l)
torch.save((x,y),embed_file)

# Release the loaded records once the embedding is on disk.
l.clear()
print('saving done')