In [None]:
import random
from random import choice
import numpy as np
import time
import torch
import torch as T

In [None]:
class pycoder_parameters:
    """Configuration for the fuzzing / embedding / classification pipeline.

    Every setting is a class attribute. The derived file names at the bottom
    are computed once, when the class body executes, from the settings above
    them. ``setNoiseLevel`` overrides the noise level and the embedding file
    name on the instance only; the class-level defaults stay untouched.
    """

    # ---- Path ----
    Path =  'exhaustive_17api' #'seq_30api'

    # ---- Core fuzzing parameters ----
    NUM_FUZZ_PER_API= 100001 #000
    NUM_TEST_FUZZ = 2
    FLOAT_TENSOR = False #We either generate float or integer tensors
    UNIT_TEST = False
    COMPOSITE = True

    # ---- Detailed fuzzing parameters ----
    MAX_TENSOR_DIMENSIONS = 3 #how many rows, columns, etc.
    MIN_VAL_PER_DIMENSION = 1 # e.g., min number of rows, columns, etc.
    MAX_VAL_PER_DIMENSION = 5 # e.g., max number of rows, columns, etc.

    #So far limiting to integers
    MIN_TENSOR_VALUE = 1
    MAX_TENSOR_VALUE = 15

    # ---- Embedding parameters ----
    EMBEDDING_NOISE_LEVEL = 0 #0 noise by default
    EMBEDDING_SIZE = 150
    SHAPE_EMBEDDING_SIZE = 6

    # ---- Derived file names ----
    data_type = 'float' if FLOAT_TENSOR else 'integer'
    model_type = 'Composite_' if COMPOSITE else 'Single_'
    file_name = str(model_type) + str(NUM_FUZZ_PER_API) + '_' + data_type
    fuzzing   = file_name + '.pt'
    # A trailing comma here used to turn this attribute into a 1-tuple.
    embedding = file_name + '.embedding' + '.pt'
    classification = file_name + '.model_result' + '.pt'
    train_valid_test = file_name + 'train_valid_test.pt'

    def setNoiseLevel(self, noise):
        """Set the noise level on this instance and refresh ``self.embedding``."""
        self.EMBEDDING_NOISE_LEVEL = noise
        self.embedding = self.getEmbeddingFile()

    def getEmbeddingFile(self):
        """Return the embedding file name, tagged with the current noise level."""
        return self.file_name + '.embedding' + '_' + str(self.EMBEDDING_NOISE_LEVEL) + '.pt'

    def getVisulizationFile(self):
        """Return the t-SNE visualization file name for the current noise level."""
        return self.file_name + '.embedding' + '_' + str(self.EMBEDDING_NOISE_LEVEL) + '_' + 'tSNE.pt'

In [None]:
# Fraction of embedding elements that add_noise() zeroes out; 0 leaves data untouched.
NOISE = 0
# Shared configuration object read by the cells below.
f = pycoder_parameters()
f.setNoiseLevel(NOISE)
# setNoiseLevel() already assigns f.embedding; this re-assignment produces the same string.
f.embedding = f.getEmbeddingFile() 
print(f.embedding)
print(f.SHAPE_EMBEDDING_SIZE)

In [None]:
import numpy as np
 
def add_noise(orig_tensor, noise_level=None):
    """Return a flattened copy of ``orig_tensor`` with a random fraction zeroed.

    Args:
        orig_tensor: tensor of any shape, dtype or device. It is never modified.
        noise_level: fraction of elements to zero. When ``None`` (default) the
            notebook-wide ``f.EMBEDDING_NOISE_LEVEL`` is used.

    Returns:
        A 1-D float32 CPU tensor holding the flattened values, with
        ``int(numel * noise_level)`` randomly chosen positions set to 0.
        Positions are drawn with ``np.random.shuffle``, so seed NumPy's global
        RNG for reproducible output.
    """
    if noise_level is None:
        noise_level = f.EMBEDDING_NOISE_LEVEL

    # Work on a private CPU copy. Without the clone, a contiguous float64 CPU
    # input would share memory with the NumPy array and be zeroed in place.
    data = torch.flatten(orig_tensor).detach().double().cpu().clone().numpy()

    n_zeroed = int(data.size * noise_level)
    mask = np.zeros(data.size, dtype=bool)
    mask[:n_zeroed] = True

    # Shuffling the mask spreads the zeroed positions uniformly at random.
    np.random.shuffle(mask)
    data[mask] = 0
    return torch.Tensor(data)

# Smoke test: print a small tensor next to its noised version.
# With the noise level configured as 0 above, no element is zeroed.
t1 = torch.tensor([1,2,3,4,5]) #[[1,2,3]],[[4,5,6]], [[7,8,9]]])
print(t1, add_noise(t1))

In [None]:
import sklearn.datasets
import torch
import numpy as np
import torch.nn.functional as F

# Module-level copies of the embedding sizes. tensor_flatten_pad() binds them as
# default arguments when it is defined, and load_test_data() reads them as globals.
EMBEDDING_SIZE = f.EMBEDDING_SIZE
SHAPE_EMBEDDING_SIZE = f.SHAPE_EMBEDDING_SIZE

def encode_values_to_code(tensor):
    """Collapse large-magnitude values into a handful of bucket codes.

    Works on a clone, so the argument is left untouched. Values in
    (-20, 100) pass through unchanged; everything else is replaced:

        [100, 1000)    -> 100        (-100, -20]    -> -20
        [1000, inf)    -> 101        (-1000, -100]  -> -21
                                     (-inf, -1000]  -> -22
    """
    coded = tensor.clone()
    # Applied in order; no replacement code falls inside a later bucket.
    buckets = (
        (lambda v: (v >= 100) & (v < 1000), 100),
        (lambda v: v >= 1000, 101),
        (lambda v: (v <= -20) & (v > -100), -20),
        (lambda v: (v <= -100) & (v > -1000), -21),
        (lambda v: v <= -1000, -22),
    )
    for in_bucket, code in buckets:
        coded[in_bucket(coded)] = code
    return coded


def tensor_flatten_pad(tensor, embed_size=EMBEDDING_SIZE, shape_embed_size=SHAPE_EMBEDDING_SIZE, isNoise=False):
    """Encode a tensor as a fixed-length 1-D embedding.

    Layout of the result (before bucket encoding by encode_values_to_code):

        [dtype_code, -1, <embed_size padded values>, -1,
         <shape_embed_size - 1 padded dims>, -1]

    The trailing shape section is omitted when ``shape_embed_size`` <= 0.
    ``dtype_code`` is 1 for bool tensors, 2 for float16/32/64 and 0 otherwise.
    When ``isNoise`` is True the flattened values first go through add_noise().
    """
    flat = torch.flatten(tensor)
    if isNoise is True:
        flat = add_noise(flat)

    # Right-pad the flattened values with zeros up to embed_size.
    value_pad = (0, embed_size - flat.shape[-1])
    values = F.pad(input=flat, pad=value_pad, mode='constant', value=0).type(torch.FloatTensor).tolist()

    # The dtype code comes from the ORIGINAL tensor, not the noised copy.
    if tensor.dtype == torch.bool:
        dtype_code = 1
    elif tensor.dtype in (torch.float64, torch.float32, torch.float16):
        dtype_code = 2
    else:
        dtype_code = 0

    pieces = [dtype_code, -1] + values + [-1]

    if shape_embed_size > 0:
        # Shape section: the dims, zero-padded to shape_embed_size - 1 entries.
        dims = list(tensor.shape)
        shape_pad = (0, shape_embed_size - 1 - len(dims))
        shape_values = F.pad(input=torch.Tensor(dims), pad=shape_pad, mode='constant', value=0).type(torch.float).tolist()
        pieces = pieces + shape_values + [-1]

    return encode_values_to_code(torch.Tensor(pieces))


In [None]:
import itertools
from random import sample

def split_dataset(orig_dataset, train_frac=0.9):
    """Randomly partition a dataset into train / validation / test lists.

    ``train_frac`` of the items go to train; the remainder is halved between
    validation and test, with test taking the odd item. The order is drawn
    with ``random.shuffle``. The three sizes are printed before and after
    splitting.

    Returns:
        (train_set, valid_set, test_set)
    """
    print('len orig_dataset', len(orig_dataset))

    dataset = orig_dataset  # sub-sampling hook kept from earlier experiments

    n_total = len(dataset)
    print('len dataset', n_total)

    n_train = int(n_total * train_frac)
    n_valid = int((n_total - n_train) / 2)
    n_test = n_total - n_train - n_valid

    print(n_train, n_valid, n_test)

    # Shuffle positions, not items, so the dataset itself is never reordered.
    order = list(range(n_total))
    random.shuffle(order)

    cut = n_train + n_valid
    parts = (order[:n_train], order[n_train:cut], order[cut:])
    train_set, valid_set, test_set = ([dataset[pos] for pos in part] for part in parts)

    print(len(train_set), len(valid_set), len(test_set))
    return (train_set, valid_set, test_set)


In [None]:
import itertools
from random import sample

def sample_dataset(orig_dataset, frac=0.9):
    """Return a random subset holding ``int(len * frac)`` items of the dataset.

    Items are drawn without replacement, using ``random.shuffle`` on the
    index list. The dataset size is printed twice, matching split_dataset's
    log format.
    """
    print('len orig_dataset', len(orig_dataset))

    dataset = orig_dataset  # sub-sampling hook kept from earlier experiments

    print('len dataset', len(dataset))

    total = len(dataset)
    keep = int(total * frac)

    order = list(range(total))
    random.shuffle(order)
    return [dataset[pos] for pos in order[:keep]]


In [None]:
def load_test_data(dataset):
    """Turn fuzzed API sequences into per-step embeddings and API labels.

    Each item of ``dataset`` is a sequence of steps. A step is indexed as
    ``step[0]`` (API key looked up in ``api2indx``), ``step[1]`` (the step's
    output tensor) and ``step[2]`` (iterable of input tensors). Sequences whose
    last entry is -1 are skipped entirely, and -1 entries inside a sequence
    are skipped individually.

    Side effects: APIs missing from the global ``api2indx`` are appended to it
    with the next free index. Prints ``len(X), len(y)`` before returning.

    Returns:
        (X, X_alt, y), three parallel lists with one entry per kept sequence:
          * X[i][j]     - flattened stack of 3 input embeddings + 1 output
                          embedding for step j; an input equal to the previous
                          step's output is replaced by a placeholder embedding.
          * X_alt[i][j] - same, but every input is always embedded as-is.
          * y[i][j]     - API index of step j.
    """
       
    global EMBEDDING_SIZE
    global api2indx
    
    X=[]
    X_alt = []
    y=[]
    # Next free index for APIs not yet present in api2indx.
    dict_indx = len(api2indx)
    for data_list in dataset:
        if data_list[-1] == -1:
            continue
        # Every step of the sequence is paired with the LAST step's output.
        final_output = data_list[-1][1]
        # Empty tensor: cannot match any real input on the first step.
        prev_out = torch.Tensor()
        api_seq_x = []
        api_seq_x_alt = []
        api_seq_y = []
        for data in data_list:        
            if data == -1:
                continue    
            api = data[0]
            if api2indx.get(api, -1) == -1: 
                api2indx[api] = dict_indx
                dict_indx += 1
                 
            api_indx = api2indx[api]
            input_list = data[2] #.get_input()
            output_tensor = final_output #data.get_output()

            it_pad = []
            it_pad_alt = []
            for input_tensor in input_list:
                if input_tensor.shape == prev_out.shape and torch.all(input_tensor.eq(prev_out)).item():
                    # Input is the previous step's output: use a placeholder
                    # (all zeros, last element -1) of the full embedding length.
                    t = torch.zeros(EMBEDDING_SIZE + SHAPE_EMBEDDING_SIZE + 1 + 2)
                    t[-1] = -1
                    it_pad.append(t)
                else:         
                    # Embed the actual input tensor.
                    it_pad.append(tensor_flatten_pad(input_tensor,isNoise=True))

                # The "alt" view always embeds the actual input tensor.
                it_pad_alt.append(tensor_flatten_pad(input_tensor,isNoise=True))
                
            
            # Add placeholder embeddings until there are 3 input slots.
            # Only slots 0..2 are used below, so inputs beyond the third are ignored.
            for i in range(len(it_pad),3):
                t = torch.zeros(EMBEDDING_SIZE + SHAPE_EMBEDDING_SIZE + 1 + 2)
                t[-1] = -1
                it_pad.append(t)
                it_pad_alt.append(t)
            
            ot_pad = tensor_flatten_pad(output_tensor, isNoise=True)
            
            x = T.flatten(T.stack((it_pad[0],it_pad[1], it_pad[2], ot_pad)))      
            api_seq_x.append(x) 
            api_seq_y.append(api_indx)

            x_alt = T.flatten(T.stack((it_pad_alt[0],it_pad_alt[1], it_pad_alt[2], ot_pad)))      
            api_seq_x_alt.append(x_alt) 


            # Remember this step's own output for the next step's input comparison.
            prev_out = data[1]

        X.append(api_seq_x)
        X_alt.append(api_seq_x_alt)
        y.append(api_seq_y)
    print(len(X), len(y))
    return(X,X_alt, y)

In [None]:
# Load the API -> index mapping stored under f.Path. load_test_data() and
# process_dataY() mutate this dict in place.
# NOTE(review): torch.load unpickles the file; only load files from a trusted source.
api2indx = torch.load(f.Path + '/api2indx.pt')

In [None]:
SAVE_FILE = f.fuzzing
# i selects the fuzzing shard; file names are prefixed with i*10000.
i = 10
fuzz_file = f.Path + '/fuzzing_data/' + str(i*10000) + '_' + SAVE_FILE
# embed_file is only printed in this notebook section; nothing is written to it here.
embed_file = f.Path + '/' + str(i*10000) + '_test_embedding.pt' #+ f.embedding
print(embed_file)
# NOTE(review): torch.load unpickles the file; only load files from a trusted source.
l = torch.load(fuzz_file)

In [None]:
# Evaluate on a random 2% subset of the loaded fuzzing data.
test_set = sample_dataset(l,0.02)

In [None]:
# Number of sequences in the sampled test subset.
len(test_set)

In [None]:
class FinalEmbedding:
    """Index-aligned pairing of predictors (x) and targets (y).

    ``ds[i]`` returns ``{'predictors': x[i], 'targets': y[i]}``; the length is
    the length of ``x``.
    """

    def __init__(self, x, y):
        self.x_data, self.y_data = x, y

    def __len__(self):
        return len(self.x_data)

    def __getitem__(self, idx):
        return {
            'predictors': self.x_data[idx],
            'targets': self.y_data[idx],
        }

In [None]:
import torch
from torch import nn
import numpy as np
# Use the GPU when one is available, otherwise fall back to the CPU so the
# notebook still runs on machines without CUDA.
device = T.device("cuda" if T.cuda.is_available() else "cpu")

In [None]:
def process_dataX(tensor_list):
    """Stack 1-3 step embeddings into a (1, 3, ...) batch, zero-padding the tail.

    Args:
        tensor_list: sequence of 1 to 3 equally-shaped tensors, one per API step.

    Returns:
        Tensor of shape ``(1, 3, *tensor_list[0].shape)``. Missing steps are
        filled with zeros matching the first tensor's dtype and device.

    Raises:
        ValueError: if ``tensor_list`` is empty or has more than 3 entries
            (previously this surfaced as an unbound-variable or index error).
    """
    n_steps = len(tensor_list)
    if not 1 <= n_steps <= 3:
        raise ValueError('process_dataX expects 1 to 3 tensors, got %d' % n_steps)

    # zeros_like keeps dtype/device consistent with the real steps, so the
    # stack cannot fail on a CPU/GPU or dtype mismatch.
    filler = torch.zeros_like(tensor_list[0])
    steps = list(tensor_list) + [filler] * (3 - n_steps)

    return torch.stack([torch.stack(steps)])

In [None]:
# Reverse mapping (index -> API); rebuilt from api2indx on every process_dataY call.
indx2api = {}
# End-of-sequence marker used to pad label sequences shorter than 3 steps.
EOS = '<eol>'

def process_dataY(api_seq):
    """Turn 1-3 API indices into a (1, 3) label tensor, padded with the EOS index.

    Side effects: rebuilds the global ``indx2api`` from ``api2indx`` and, on
    first use, registers ``EOS`` in both dictionaries under the next free index.

    Args:
        api_seq: sequence of 1 to 3 integer API indices.

    Returns:
        Integer tensor of shape ``(1, 3)``.

    Raises:
        ValueError: if ``api_seq`` is empty or has more than 3 entries
            (previously: a printed warning followed by an unbound-name error).
    """
    global indx2api
    global api2indx

    n_steps = len(api_seq)
    if not 1 <= n_steps <= 3:
        raise ValueError('process_dataY expects 1 to 3 API indices, got %d' % n_steps)

    # Add <eol> to the dictionaries if it is not there yet.
    indx2api = {v: k for k, v in api2indx.items()}
    if api2indx.get(EOS, -1) == -1:
        # default=-1 makes EOS index 0 when the mapping is still empty.
        eos_index = max(indx2api, default=-1) + 1
        indx2api[eos_index] = EOS
        api2indx[EOS] = eos_index

    eos = api2indx[EOS]
    labels = list(api_seq) + [eos] * (3 - n_steps)

    return torch.stack([torch.tensor(labels)])

In [None]:
class RNNModel(nn.Module):
    """Bidirectional vanilla RNN followed by a per-timestep linear classifier.

    Args:
        input_size: feature size of each timestep.
        output_size: number of classes produced by the linear layer.
        hidden_dim: hidden size per direction.
        n_layers: number of stacked RNN layers.
    """

    def __init__(self, input_size, output_size, hidden_dim, n_layers):
        super(RNNModel, self).__init__()

        self.hidden_dim = hidden_dim
        self.n_layers = n_layers

        # batch_first=True: inputs are (batch, seq, input_size).
        self.rnn = nn.RNN(input_size, hidden_dim, n_layers, batch_first=True, bidirectional=True)

        # The two directions are concatenated, hence hidden_dim*2 inputs.
        self.fc = nn.Linear(hidden_dim*2, output_size)

    def forward(self, x):
        """Run the RNN over ``x`` of shape (batch, seq, input_size).

        Returns:
            (logits, hidden, out) where ``logits`` is (batch*seq, output_size),
            ``hidden`` is the final hidden state (n_layers*2, batch, hidden_dim)
            and ``out`` is the raw RNN output (batch, seq, hidden_dim*2).
        """
        batch_size = x.size(0)

        # Create the initial state on the input's own device and dtype, so the
        # model does not depend on a notebook-level `device` variable.
        hidden = self.init_hidden(batch_size, device=x.device).to(x.dtype)

        out, hidden = self.rnn(x, hidden)

        # Flatten (batch, seq) so the linear layer scores every timestep.
        out1 = self.fc(out.contiguous().view(-1, self.hidden_dim*2))

        return out1, hidden, out

    def init_hidden(self, batch_size, device=None):
        """Return an all-zero initial hidden state.

        Args:
            batch_size: number of sequences in the batch.
            device: target device; defaults to the device of the model's
                parameters.
        """
        if device is None:
            device = next(self.parameters()).device
        return torch.zeros(self.n_layers*2, batch_size, self.hidden_dim, device=device)

In [None]:
class FFNet(T.nn.Module):
  """Feed-forward classifier: input -> 500 -> 250 -> 100 -> n_classes (tanh).

  Args:
    input_dim: size of the flattened input. Defaults to four embeddings of
      length ``f.EMBEDDING_SIZE + f.SHAPE_EMBEDDING_SIZE + 1 + 2``.
    n_classes: number of output logits. Defaults to ``len(api2indx)``.
  """

  def __init__(self, input_dim=None, n_classes=None):
    super(FFNet, self).__init__()

    # Fall back to the notebook-wide configuration when sizes are not given.
    if input_dim is None:
      input_dim = 4*(f.EMBEDDING_SIZE+f.SHAPE_EMBEDDING_SIZE+1+2)
    if n_classes is None:
      n_classes = len(api2indx)

    self.hid1 = T.nn.Linear(input_dim, 500)
    self.hid2 = T.nn.Linear(500, 250)
    self.hid3 = T.nn.Linear(250, 100)
    self.oupt = T.nn.Linear(100, n_classes)

    # NOTE(review): hid3 keeps PyTorch's default initialisation, as before;
    # confirm whether it was meant to get Xavier init like the other layers.
    T.nn.init.xavier_uniform_(self.hid1.weight)
    T.nn.init.zeros_(self.hid1.bias)
    T.nn.init.xavier_uniform_(self.hid2.weight)
    T.nn.init.zeros_(self.hid2.bias)
    T.nn.init.xavier_uniform_(self.oupt.weight)
    T.nn.init.zeros_(self.oupt.bias)

    # A Dropout module used to be constructed here and immediately discarded;
    # it was never applied, so it has been removed. forward() is unchanged,
    # which keeps previously pickled models loadable.

  def forward(self, x):
    """Return ``(logits, z3, z2, z1)``: the logits plus every hidden activation."""
    z1 = T.tanh(self.hid1(x))
    z2 = T.tanh(self.hid2(z1))
    z3 = T.tanh(self.hid3(z2))
    z = self.oupt(z3)  # no softmax: CrossEntropyLoss()
    return (z, z3, z2, z1)

In [None]:
# When True, query_model() prints the API name of every predicted index.
DEBUG = False

In [None]:
def embed_tensor_for_model(domain_io):
    """Embed ``domain_io`` and return the first (X, Y) pair on ``device``.

    The input goes through embed_tensors(); the first sequence of the result
    is padded by process_dataX / process_dataY, wrapped in a FinalEmbedding,
    and sample 0 is moved to ``device``.
    """
    embedded_x, embedded_y = embed_tensors(domain_io)

    wrapped = FinalEmbedding(process_dataX(embedded_x[0]), process_dataY(embedded_y[0]))
    first = wrapped[0]

    predictors = first['predictors'].to(device)
    targets = first['targets'].to(device)  # [0] [1] or [2]
    return (predictors, targets)

In [None]:
def beam_search(top3_list):
    """Rank every 3-step API sequence built from per-step candidate lists.

    ``top3_list[s]`` is an iterable of ``(api, score)`` candidates for step
    ``s`` (s = 0, 1, 2). The score of a sequence is the product of its step
    scores.

    Returns:
        List of ``((api0, api1, api2), score)`` sorted by descending score.
        Ties keep their generation order (the sort is stable).
    """
    by_score = lambda entry: entry[1]

    # Steps 0 and 1 combined, best pairs first.
    pairs = sorted(
        (((a[0], b[0]), a[1] * b[1]) for a in top3_list[0] for b in top3_list[1]),
        key=by_score,
        reverse=True,
    )

    # Extend every pair with each step-2 candidate (candidate-major order).
    triples = [
        ((pair[0][0], pair[0][1], c[0]), pair[1] * c[1])
        for c in top3_list[2]
        for pair in pairs
    ]
    triples.sort(key=by_score, reverse=True)

    return triples
            

In [None]:
def query_model(X, Y):
    """Run the feed-forward net and the RNN on one sample and decode predictions.

    Uses the notebook-level ``net`` and ``rnn_model``. The third hidden
    activation of ``net`` is given a leading batch dimension and fed to
    ``rnn_model``.

    Args:
        X: predictors passed to ``net``.
        Y: target tensor; only converted to a list and returned alongside.

    Returns:
        (h, top_indx) where ``h = (rnn_output, target_list, rnn_input)`` and
        ``top_indx`` lists the arg-max class index for each row of the RNN's
        logits.
    """
    # Pure inference: keep the forward passes and the decoding out of autograd.
    with T.no_grad():
        predicts, z3, z2, z1 = net(X)
        temp_z3 = torch.unsqueeze(z3, 0)
        model_output, hidden, int_output = rnn_model(temp_z3)

        target_list = list(Y.cpu().numpy())

        top_indx = []
        for m in model_output:
            prob = nn.functional.softmax(m, dim=0)
            # Take the class with the highest probability for this row.
            api_ind = torch.max(prob, dim=0)[1].item()
            top_indx.append(api_ind)
            if DEBUG:
                print(indx2api[api_ind])

    h = (int_output, target_list, temp_z3)

    return (h, top_indx)


In [None]:
def api_edit_distance(seq1, seq2):
    """Count the positions at which two API sequences differ.

    Equal-length inputs give the plain position-wise mismatch count. When the
    lengths differ, every position present in only one sequence counts as one
    mismatch. Previously a shorter ``seq2`` raised IndexError and the tail of
    a longer ``seq2`` was silently ignored.
    """
    edit_distance = 0

    for left, right in zip(seq1, seq2):
        if left != right:
            edit_distance += 1

    # Unmatched tail of the longer sequence.
    edit_distance += abs(len(seq1) - len(seq2))

    return edit_distance

In [None]:
# Load the trained models onto `device`, wherever they were saved from;
# without map_location a checkpoint saved on a GPU fails to load on a
# CPU-only machine.
# NOTE(review): these files are full pickled modules, so the FFNet / RNNModel
# classes must be defined before loading. Only load trusted files. Recent
# PyTorch releases may need `weights_only=False` here - confirm against the
# installed version.
net = torch.load('net_model.pt', map_location=device)
rnn_model = torch.load('rnn_model.pt', map_location=device)

# The notebook only runs inference, so switch both models to eval mode.
net.eval()
rnn_model.eval()

In [None]:
# x: embeddings with chained inputs replaced by placeholders; x_alt: embeddings
# with every input kept as-is; y: API indices per step.
x, x_alt, y = load_test_data(test_set)

In [None]:
def validation_fos(x, x_alt, y):
    """Print first-step accuracy when every step is queried on its own.

    Each step ``x_alt[i][j]`` is fed to the model as a one-step sequence and
    the prediction at position 0 is compared with ``y[i][j]``.

    Args:
        x: unused; kept so the signature matches the data returned by
            load_test_data().
        x_alt: per-sequence lists of step embeddings.
        y: per-sequence lists of API indices.

    Prints ``total_query ncorrect accuracy``. Accuracy is ``nan`` when there
    are no queries; previously that case raised ZeroDivisionError.
    """
    total_query = 0
    ncorrect = 0

    for i in range(len(x_alt)):
        for j in range(len(y[i])):
            test_set_x1 = process_dataX([x_alt[i][j]])
            test_set_y1 = process_dataY([y[i][j]])
            ds = FinalEmbedding(test_set_x1, test_set_y1)
            X1 = ds[0]['predictors'].to(device)
            Y1 = ds[0]['targets'].to(device)
            model_output1, top_index1 = query_model(X1, Y1)
            total_query += 1
            if top_index1[0] == Y1[0]:
                ncorrect += 1

    accuracy = ncorrect / total_query if total_query else float('nan')
    print (total_query, ncorrect, accuracy)

In [None]:
# Baseline: accuracy when every step is queried in isolation.
validation_fos(x, x_alt, y)

In [None]:
import scipy

In [None]:
from scipy.spatial import distance

# h2 experiment: for each sequence, compare the RNN output at every position of
# the full sequence with the position-0 output obtained when step 1 is queried
# on its own.
final_hiddens = []

#comparing h2 of generative model with h1 of first in seq model
hidden_states0 = []
hidden_states1 = []
y_axis = []

ncorrect = 0
# sim_index[k] counts how often position k of the full sequence is the most
# similar one to the standalone query.
sim_index = [0,0,0]

# similar: cosine similarity at position 1 (the matching step);
# dissimilar: cosine similarity at position 0.
similar = []
dissimilar = []

for i in range(0,len(x)):
    if len(x[i]) < 2:
        continue
    if y[i][0] == y[i][1]:
        continue #removing the same APIs to reduce noise
    test_set_x = process_dataX(x[i])
    test_set_y = process_dataY(y[i])
    ds = FinalEmbedding(test_set_x,test_set_y)
    X = ds[0]['predictors'].to(device)
    Y = ds[0]['targets'].to(device)
    model_output0, top_index0 = query_model(X,Y)

    # Keep only sequences whose step 1 is predicted correctly in context.
    if top_index0[1] != Y[1]:
        continue
    else:
        ncorrect += 1
    
    # Query step 1 on its own, using the embedding that keeps its real inputs.
    test_set_x1 = process_dataX([x_alt[i][1]])
    test_set_y1 = process_dataY([y[i][1]])
    ds = FinalEmbedding(test_set_x1,test_set_y1)
    X1 = ds[0]['predictors'].to(device)
    Y1 = ds[0]['targets'].to(device)
    model_output1, top_index1 = query_model(X1,Y1)
    
    # ...and require the standalone prediction to be correct as well.
    if top_index1[0] != Y1[0]:
        continue

    # Element 0 of query_model's first return value is the raw RNN output.
    hiddens0 = model_output0[0].cpu()#.tolist()
    hiddens1 = model_output1[0].cpu()#.tolist()

    # Drop the leading batch dimension added inside query_model.
    hiddens0 = hiddens0[0]
    hiddens1 = hiddens1[0]

    hidden_states0.append(hiddens0[1])
    hidden_states1.append(hiddens1[0])
    y_axis.append(Y1[0].item())

    final_hiddens.append((hiddens0[0], hiddens0[1], hiddens0[2], hiddens1[0], Y1[0].item()))

    # Cosine similarity between each in-context position and the standalone state.
    results = []
    for k in [0,1,2]:
        result = 1 - distance.cosine(hiddens0[k], hiddens1[0])
        results.append(result)
        if k == 1:
            similar.append(result)
        elif k==0:
            dissimilar.append(result)

    max_value = max(results)
    max_index = results.index(max_value)
    sim_index[max_index] += 1

In [None]:
# Persist the collected states of the h2 experiment.
# NOTE(review): the output directory is hard-coded and differs from f.Path - confirm it exists.
torch.save(final_hiddens,'seq2_33/final_hiddens.pt')

In [None]:
''' Report h3 and h3' '''
from scipy.spatial import distance

# h3 experiment: same procedure as the h2 cell, but for position n = 2. This
# cell rebinds final_hiddens, hidden_states0/1, y_axis, ncorrect, sim_index and
# similar; `dissimilar` from the h2 cell is NOT rebound here.
final_hiddens = []

#comparing h2 of generative model with h1 of first in seq model
hidden_states0 = []
hidden_states1 = []
y_axis = []

ncorrect = 0
sim_index = [0,0,0]
# Position (0-based) of the step under study.
n = 2

# similar: cosine similarity at position n; dissimilar0 / dissimilar1: at
# positions 0 and 1. Each kept sequence appends exactly one value to each list.
similar = []
dissimilar0 = []
dissimilar1 = []

for i in range(0,len(x)):
    if len(x[i]) < 3:
        continue
    if y[i][n-1] == y[i][n]:
        continue #removing the same APIs to reduce noise
    test_set_x = process_dataX(x[i])
    test_set_y = process_dataY(y[i])
    ds = FinalEmbedding(test_set_x,test_set_y)
    X = ds[0]['predictors'].to(device)
    Y = ds[0]['targets'].to(device)
    model_output0, top_index0 = query_model(X,Y)

    # Keep only sequences whose step n is predicted correctly in context.
    if top_index0[n] != Y[n]:
        continue
    else:
        ncorrect += 1
    
    # Query step n on its own, using the embedding that keeps its real inputs.
    test_set_x1 = process_dataX([x_alt[i][n]])
    test_set_y1 = process_dataY([y[i][n]])

    ds = FinalEmbedding(test_set_x1,test_set_y1)
    X1 = ds[0]['predictors'].to(device)
    Y1 = ds[0]['targets'].to(device)
    model_output1, top_index1 = query_model(X1,Y1)

    # The standalone prediction must agree with the in-context prediction at
    # index 2 (which equals the target, given the check above).
    if top_index1[0] != top_index0[2]:
        continue

    # Element 0 of query_model's first return value is the raw RNN output.
    hiddens0 = model_output0[0].cpu()#.tolist()
    hiddens1 = model_output1[0].cpu()#.tolist()

    # Drop the leading batch dimension added inside query_model.
    hiddens0 = hiddens0[0]
    hiddens1 = hiddens1[0]

    hidden_states0.append(hiddens0[n])
    hidden_states1.append(hiddens1[0])
    y_axis.append(Y1[0].item())

    final_hiddens.append((hiddens0[0], hiddens0[1], hiddens0[2], hiddens1[0], Y1[0].item()))

    # Cosine similarity between each in-context position and the standalone state.
    results = []
    for k in [0,1,2]:
        result = 1 - distance.cosine(hiddens0[k], hiddens1[0])
        results.append(result)
        if k == n:
            similar.append(result)
        elif k==0:
            dissimilar0.append(result)
        elif k==1:
            dissimilar1.append(result)

    max_value = max(results)
    max_index = results.index(max_value)

    sim_index[max_index] += 1


In [None]:
import scipy.stats  # make sure the stats submodule is loaded before it is used

# Paired t-tests for the h3 experiment. `similar`, `dissimilar0` and
# `dissimilar1` get one value per kept sequence in the h3 cell, so they are
# properly paired. The previous call used `dissimilar`, which still held the
# h2 experiment's values and is not paired with the current `similar`.
scipy.stats.ttest_rel(dissimilar0, similar), scipy.stats.ttest_rel(dissimilar1, similar)

In [None]:
# Median cosine similarity at positions 0, 1 and n of the h3 experiment.
np.median(dissimilar0),  np.median(dissimilar1), np.median(similar)

In [None]:
def pad_or_truncate(some_list, target_len):
    """Return a new list of length ``target_len``: cut the tail or pad with zeros."""
    head = some_list[:target_len]
    missing = target_len - len(some_list)
    # A non-positive multiplier yields an empty padding list.
    return head + [0] * missing

In [None]:
from scipy import spatial

# All-pairs comparison of the states collected by the last analysis cell:
# matching indices (i == j) go to `similar`, every other pair to `dissimilar`.
# This rebinds both lists.
similar = []
dissimilar = []

for i in range(0,len(hidden_states0)):
    for j in range(0,len(hidden_states1)):
        result = 1 - spatial.distance.cosine(hidden_states0[i], hidden_states1[j])
        # NOTE(review): prints one line per pair, i.e. len(hidden_states0) * len(hidden_states1) lines.
        print(i, j, result)
        if i == j:
            similar.append(result)
        else:
            dissimilar.append(result)
        #results[i][j] = result

In [None]:
from scipy.stats import ttest_ind
import scipy.stats

In [None]:
# Independent two-sample t-test between matching-pair and non-matching-pair
# similarities from the all-pairs cell above.
t = scipy.stats.ttest_ind(similar, dissimilar)
t