In [None]:
import random
from random import choice
import numpy as np
import time
import torch
import torch as T
import torch.nn as nn
import itertools
import sklearn.datasets
import torch.nn.functional as F
from tqdm.notebook import tqdm
from collections import defaultdict, Counter
from sklearn.manifold import TSNE
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Fraction of embedding entries that add_noise() zeroes out (0 disables noise).
EMBEDDING_NOISE_LEVEL = 0
# Number of slots reserved for the flattened tensor values in an embedding.
EMBEDDING_SIZE = 150
# Number of slots reserved for the shape section of an embedding
# (shape entries plus one trailing -1 separator).
SHAPE_EMBEDDING_SIZE = 6
# Prefer the GPU, but fall back to the CPU so the notebook still runs on
# machines without CUDA instead of failing at the first `.to(device)`.
device = T.device("cuda" if T.cuda.is_available() else "cpu")

In [None]:
def add_noise(orig_tensor, noise_level=None):
    """Return a flattened float64 copy of ``orig_tensor`` with random entries zeroed.

    Args:
        orig_tensor: tensor of any shape and dtype; it is never modified.
        noise_level: fraction of entries to zero. ``None`` (the default) uses
            the module-level ``EMBEDDING_NOISE_LEVEL``. The resulting count is
            clamped to ``[0, number of elements]``.

    Returns:
        A 1-D ``torch.float64`` tensor with ``int(n * noise_level)`` entries
        set to 0 at positions chosen with ``np.random.shuffle``.
    """
    if noise_level is None:
        noise_level = EMBEDDING_NOISE_LEVEL

    # `.double()` returns the very same tensor for float64 input and `flatten`
    # may return a view, so clone before writing in place; otherwise the
    # caller's tensor would be silently corrupted.
    data = torch.flatten(orig_tensor.double()).clone()

    total = data.nelement()
    n_zeroed = min(max(int(total * noise_level), 0), total)

    # The shuffle is kept even when nothing is zeroed so the NumPy RNG stream
    # advances exactly as it did before.
    mask = [True] * n_zeroed + [False] * (total - n_zeroed)
    np.random.shuffle(mask)
    data[torch.tensor(mask, dtype=torch.bool, device=data.device)] = 0
    return data

def encode_values_to_code(tensor):
    """Collapse large-magnitude values into a small set of bucket codes.

    Works on a copy; the input is left untouched. Values outside the buckets
    below are returned unchanged.

        [100, 1000)    -> 100
        [1000, inf)    -> 101
        (-100, -20]    -> -20
        (-1000, -100]  -> -21
        (-inf, -1000]  -> -22
    """
    coded = tensor.clone()
    # The buckets are disjoint and no code lands in another bucket, so the
    # masks can all be computed from the untouched input.
    buckets = (
        ((tensor >= 100) & (tensor < 1000), 100),
        (tensor >= 1000, 101),
        ((tensor <= -20) & (tensor > -100), -20),
        ((tensor <= -100) & (tensor > -1000), -21),
        (tensor <= -1000, -22),
    )
    for bucket_mask, code in buckets:
        coded[bucket_mask] = code
    return coded

def tensor_flatten_pad(tensor, embed_size=EMBEDDING_SIZE, shape_embed_size=SHAPE_EMBEDDING_SIZE, isNoise=False):
    """Build the fixed-layout embedding vector for one tensor.

    Layout of the returned 1-D tensor (before bucket encoding):

        [type_code, -1, <embed_size value slots>, -1,
         <shape_embed_size - 1 shape slots>, -1]

    The shape section and its trailing ``-1`` are only present when
    ``shape_embed_size > 0``. ``type_code`` is 1 for bool tensors, 2 for
    float16/float32/float64 tensors and 0 otherwise.

    Args:
        tensor: tensor to embed.
        embed_size: number of value slots; the flattened values are
            right-padded with zeros up to this length.
        shape_embed_size: size of the shape section including its separator.
        isNoise: when exactly ``True``, the flattened values go through
            ``add_noise`` before padding.

    Returns:
        The embedding after ``encode_values_to_code`` has been applied.
    """
    flat = torch.flatten(tensor)
    if isNoise is True:
        flat = add_noise(flat)
    value_slots = F.pad(flat, (0, embed_size - flat.shape[-1])).float().tolist()

    # The dtype marker is taken from the original tensor, not the noisy copy.
    if tensor.dtype == torch.bool:
        type_code = 1
    elif tensor.dtype in {torch.float64, torch.float32, torch.float16}:
        type_code = 2
    else:
        type_code = 0

    pieces = [type_code, -1] + value_slots + [-1]

    if shape_embed_size > 0:
        dims = list(tensor.shape)
        shape_pad = (0, shape_embed_size - 1 - len(dims))
        shape_slots = F.pad(torch.Tensor(dims), shape_pad).float().tolist()
        pieces = pieces + shape_slots + [-1]

    return encode_values_to_code(torch.Tensor(pieces))

In [None]:
def split_dataset(orig_dataset, train_frac=0.9):
    """Shuffle a copy of the dataset and cut it into train/valid/test parts.

    The first ``int(len * train_frac)`` shuffled items form the training set;
    the remainder is split in half (validation gets the floor, test gets the
    rest). The three sizes are printed. ``orig_dataset`` itself is not
    reordered.

    Returns:
        ``(train_set, valid_set, test_set)``
    """
    shuffled = orig_dataset.copy()
    total = len(shuffled)
    train_end = int(total * train_frac)
    valid_end = train_end + int((total - train_end) / 2)

    random.shuffle(shuffled)

    parts = (
        shuffled[:train_end],
        shuffled[train_end:valid_end],
        shuffled[valid_end:],
    )
    print(*(len(part) for part in parts))
    return parts

def sample_dataset(orig_dataset, frac=0.9):
    """Draw a random sample, without replacement, from ``orig_dataset``.

    Args:
        orig_dataset: indexable sequence to sample from.
        frac: if greater than 1 it is an absolute item count (truncated to an
            int); otherwise it is the fraction of the dataset to draw.

    Returns:
        A list of the sampled items in random order. A count larger than the
        dataset is clamped to the dataset size instead of raising.

    Raises:
        ValueError: if ``frac`` is negative.
    """
    if frac < 0:
        raise ValueError(f'frac must be non-negative, got {frac!r}')

    total = len(orig_dataset)
    # int() keeps float counts such as 2.0 from reaching random.sample, which
    # only accepts integers.
    count = int(frac) if frac > 1 else int(total * frac)
    # random.sample raises when asked for more items than exist; take all.
    count = min(count, total)

    chosen = random.sample(range(total), count)
    return [orig_dataset[i] for i in chosen]

In [None]:
def get_empty_t():
    """Return the placeholder embedding used for a missing tensor.

    The vector has ``EMBEDDING_SIZE + SHAPE_EMBEDDING_SIZE + 1 + 2`` entries,
    all zero except the final one, which is -1.
    """
    width = EMBEDDING_SIZE + SHAPE_EMBEDDING_SIZE + 1 + 2
    placeholder = torch.zeros(width)
    placeholder[width - 1] = -1
    return placeholder

def load_test_data(dataset):
    """Turn raw API-call records into per-step model inputs and target indices.

    Each element of ``dataset`` is iterated as a sequence of
    ``(api, output, input_list)`` triples. Records whose last element equals
    ``-1`` are skipped.

    For every step two feature vectors are built, each the flattened stack of
    three input embeddings plus the embedding of the record's final output:

    * ``X``     - an input tensor equal to the previous step's output is
                  replaced by the empty placeholder embedding;
    * ``X_alt`` - every input tensor is embedded as-is.

    Side effect: API names not yet present in the module-level ``api2indx``
    are added to it with the next free index.

    Returns:
        ``(X, X_alt, y)`` - three parallel lists with one entry per kept
        record; each entry is a list with one item per step (feature tensors
        for ``X``/``X_alt``, integer API indices for ``y``).
    """
    X, X_alt, y =[], [], []
    empty_t = get_empty_t()
    
    for data_list in dataset:
        # Skip records flagged with a trailing -1.
        if data_list[-1] == -1:
            continue
        # The output tensor of the last step is the target of the whole record.
        final_output = data_list[-1][1]
        ot_pad = tensor_flatten_pad(final_output, isNoise=True)
        # Empty tensor so that no input matches "previous output" on the first step.
        prev_out = torch.Tensor().long()
        api_seq_x = []
        api_seq_x_alt = []
        api_seq_y = []    
        
        for api, output, input_list in data_list:
#             if data == -1:
#                 continue  
            # Grow the vocabulary on the fly for APIs not seen before.
            if api not in api2indx:
                api2indx[api] = len(api2indx)

            it_pad = []
            it_pad_alt = []
            for input_tensor in input_list:
                # Hide an input that is just the previous step's output (same shape and values).
                if input_tensor.shape == prev_out.shape and torch.all(input_tensor.eq(prev_out)).item():
                    it_pad.append(empty_t)
                else:
                    it_pad.append(tensor_flatten_pad(input_tensor,isNoise=True))
                # The alternative view always sees the real input.
                it_pad_alt.append(tensor_flatten_pad(input_tensor,isNoise=True))
                
            
            # pad with placeholder embeddings so every step has at least 3 input slots
            for i in range(len(it_pad), 3):
                it_pad.append(empty_t)
                it_pad_alt.append(empty_t)
            
            # Only the first three inputs are used; any further inputs are ignored.
            x = T.flatten(T.stack((it_pad[0],it_pad[1], it_pad[2], ot_pad)))      
            x_alt = T.flatten(T.stack((it_pad_alt[0],it_pad_alt[1], it_pad_alt[2], ot_pad)))      
            api_seq_x.append(x) 
            api_seq_x_alt.append(x_alt) 
            api_seq_y.append(api2indx[api])
            prev_out = output

        X.append(api_seq_x)
        X_alt.append(api_seq_x_alt)
        y.append(api_seq_y)
    return(X, X_alt, y)

In [None]:
class FinalEmbedding:
    """Index-aligned container pairing predictor data with target data."""

    def __init__(self, x, y):
        self.x_data, self.y_data = x, y

    def __len__(self):
        """Number of samples, taken from the predictor collection."""
        return len(self.x_data)

    def __getitem__(self, idx):
        """Return sample ``idx`` as a dict with 'predictors' and 'targets'."""
        return {
            'predictors': self.x_data[idx],
            'targets': self.y_data[idx],
        }
    
class RNNModel(nn.Module):
    """Bidirectional multi-layer RNN followed by a per-step linear classifier.

    Args:
        input_size: feature size of each step of the input sequence.
        output_size: number of classes produced by the linear layer.
        hidden_dim: hidden size of each RNN direction.
        n_layers: number of stacked RNN layers.
    """

    def __init__(self, input_size, output_size, hidden_dim, n_layers):
        super(RNNModel, self).__init__()

        self.hidden_dim = hidden_dim
        self.n_layers = n_layers

        # batch_first=True: inputs are (batch, seq, feature).
        self.rnn = nn.RNN(input_size, hidden_dim, n_layers, batch_first=True, bidirectional=True)

        # Both directions are concatenated, hence hidden_dim * 2 input features.
        self.fc = nn.Linear(hidden_dim*2, output_size)

    def forward(self, x):
        """Run the RNN over ``x`` and classify every time step.

        Args:
            x: tensor of shape (batch, seq, input_size).

        Returns:
            ``(logits, hidden, rnn_out)`` where ``logits`` has shape
            (batch * seq, output_size), ``hidden`` is the final hidden state
            returned by ``nn.RNN`` and ``rnn_out`` is the raw RNN output of
            shape (batch, seq, hidden_dim * 2).
        """
        batch_size = x.size(0)

        # Create the initial state next to the input rather than on a global
        # device, so the model works wherever the data actually lives.
        hidden = self.init_hidden(batch_size, device=x.device, dtype=x.dtype)

        out, hidden = self.rnn(x, hidden)

        # Merge batch and sequence dimensions for the linear layer.
        out1 = self.fc(out.contiguous().view(-1, self.hidden_dim*2))

        return out1, hidden, out

    def init_hidden(self, batch_size, device=None, dtype=None):
        """Return an all-zero initial hidden state.

        Args:
            batch_size: number of sequences in the batch.
            device: target device; defaults to the device of the model's
                parameters.
            dtype: target dtype; defaults to the torch default dtype.

        Returns:
            Zeros of shape (n_layers * 2, batch_size, hidden_dim); the factor
            2 accounts for the two RNN directions.
        """
        if device is None:
            device = next(self.parameters()).device
        return torch.zeros(self.n_layers*2, batch_size, self.hidden_dim,
                           device=device, dtype=dtype)
    
class FFNet(T.nn.Module):
    """Feed-forward classifier: input -> 500 -> 250 -> 100 -> n_classes (tanh).

    Args:
        input_dim: size of the input vector. Defaults to
            ``4 * (EMBEDDING_SIZE + SHAPE_EMBEDDING_SIZE + 1 + 2)``, i.e. four
            stacked tensor embeddings.
        n_classes: number of output logits. Defaults to ``len(api2indx)``.
    """

    def __init__(self, input_dim=None, n_classes=None):
        super(FFNet, self).__init__()

        # The original code read these sizes from an undefined name `f`
        # (NameError on construction); use the notebook-level constants.
        if input_dim is None:
            input_dim = 4*(EMBEDDING_SIZE+SHAPE_EMBEDDING_SIZE+1+2)
        if n_classes is None:
            n_classes = len(api2indx)

        self.hid1 = T.nn.Linear(input_dim, 500)
        self.hid2 = T.nn.Linear(500, 250)
        self.hid3 = T.nn.Linear(250, 100)
        self.oupt = T.nn.Linear(100, n_classes)

        # NOTE(review): hid3 keeps PyTorch's default initialisation while the
        # other layers use Xavier; left as-is to avoid changing training.
        T.nn.init.xavier_uniform_(self.hid1.weight)
        T.nn.init.zeros_(self.hid1.bias)
        T.nn.init.xavier_uniform_(self.hid2.weight)
        T.nn.init.zeros_(self.hid2.bias)
        T.nn.init.xavier_uniform_(self.oupt.weight)
        T.nn.init.zeros_(self.oupt.bias)

        # NOTE(review): a Dropout(p=0.2) module used to be constructed here
        # and thrown away; it was never applied in forward(), so the network
        # runs without dropout, exactly as before.

    def forward(self, x):
        """Return ``(logits, z3, z2, z1)``: the output plus all hidden activations."""
        z1 = T.tanh(self.hid1(x))
        z2 = T.tanh(self.hid2(z1))
        z3 = T.tanh(self.hid3(z2))
        z = self.oupt(z3)  # no softmax: CrossEntropyLoss()
        return (z, z3, z2, z1)

In [None]:
def process_dataX(tensor_list, seq_len=3):
    """Stack a sequence of step tensors into one zero-padded batch of size 1.

    Args:
        tensor_list: non-empty sequence of equally shaped tensors, one per step.
        seq_len: minimum number of steps in the result; shorter sequences are
            padded at the end with all-zero tensors. Longer sequences are kept
            at their own length.

    Returns:
        Tensor of shape ``(1, max(len(tensor_list), seq_len), *step_shape)``.

    Raises:
        IndexError: if ``tensor_list`` is empty (there is no tensor to take
            the padding shape from).
    """
    filler = torch.zeros_like(tensor_list[0])
    # list() lets tuples and other sequences be concatenated with the padding.
    io_seq = list(tensor_list) + [filler] * (seq_len - len(tensor_list))
    return torch.stack(io_seq).unsqueeze(0)

def process_dataY(api_seq):
    """Pad a list of API indices to length 3 with the EOS index.

    Returns a tensor of shape (1, L); sequences already holding three or more
    items are not padded or truncated.
    """
    eos_index = api2indx[EOS]
    missing = 3 - len(api_seq)
    padded = api_seq + [eos_index] * missing
    return torch.tensor(padded).unsqueeze(0)

def embed_tensor_for_model(domain_io):
    """Embed one input/output example and shape it for the models.

    NOTE(review): ``embed_tensors`` is not defined in the visible part of this
    notebook; it must be supplied elsewhere before this function is called.

    Returns:
        ``(X, Y)`` built from the first element of each sequence returned by
        ``embed_tensors``, via ``process_dataX`` and ``process_dataY``.
    """
    embedded_inputs, embedded_targets = embed_tensors(domain_io)
    return (
        process_dataX(embedded_inputs[0]),
        process_dataY(embedded_targets[0]),
    )

def query_model(X, Y):
    """Run one example through ``net`` and ``rnn_model`` without gradients.

    Args:
        X: list of per-step feature tensors (padded by ``process_dataX``).
        Y: target API indices. Kept for interface compatibility; predictions
            do not depend on it.

    Returns:
        ``(int_output, top_indx)``: the third value returned by ``rnn_model``
        with singleton dimensions squeezed out, and the arg-max class indices
        of its first return value as a Python list.
    """
    X = process_dataX(X).to(device)
    with torch.no_grad():
        # Only the second return value of `net` is fed to the RNN
        # (presumably the last hidden activation of FFNet - confirm).
        _, z3, _, _ = net(X)
        model_output, _, int_output = rnn_model(z3)

    # The previous version also padded Y, moved it to the device and copied it
    # back to the CPU only to fill an unused variable; that per-call round
    # trip has been removed.
    top_indx = torch.argmax(model_output, dim=-1).tolist()
    return int_output.squeeze(), top_indx

def api_edit_distance(seq1, seq2):
    """Count the positions at which two API sequences differ.

    The sequences are compared pairwise up to the length of the shorter one;
    trailing items of the longer sequence are ignored, as before.

    The previous implementation incremented the counter for every pair
    regardless of equality, so it always returned ``min(len(seq1), len(seq2))``.
    """
    return sum(1 for a, b in zip(seq1, seq2) if a != b)

### Get Data

In [None]:
# Directory holding the saved vocabulary, models and dataset.
# NOTE: torch.load unpickles arbitrary Python objects; only point this at
# files you trust.
DATA_DIR = '/home/skim131/local/input_output'
# Number of records drawn from the dataset for the analysis below.
N_SAMPLES = 20000

# api2indx
EOS = '<eol>'
api2indx = torch.load(f'{DATA_DIR}/api2indx.pt')
indx2api = {i:api for api, i in api2indx.items()}
# Report the vocabulary size, not the whole dictionary.
print(f'Loaded {len(api2indx)} apis!')

# model
# map_location puts the weights on the device that query_model sends its
# inputs to, even if the models were saved from a different device.
net = torch.load(f'{DATA_DIR}/2_train_net_model.pt', map_location=device)
rnn_model = torch.load(f'{DATA_DIR}/2_train_rnn_model.pt', map_location=device)
print('Loaded model!')

# dataset
dataset = torch.load(f'{DATA_DIR}/100000_Composite_100001_integer.pt')

sample = sample_dataset(dataset, N_SAMPLES)
X_data, X_alt_data, y_data = load_test_data(sample)
print(f'Loaded {len(sample)} sample datapoints!')

In [None]:
# Hidden states of correctly predicted examples, each stored as [vector, api_index]:
#   'h1'  - step 1 of the regular input
#   'h2'  - step 2 of the regular input
#   'h2p' - step 2 of the alternative input (see load_test_data)
final_hiddens = {'h1':[], 'h2':[], 'h2p':[]}
ncorrect = 0

for x, x_alt, y in tqdm(zip(X_data, X_alt_data, y_data), total=len(y_data)):
    if len(x) < 2:
        continue
    if y[0] == y[1]:
        continue # removing the same APIs to reduce noise

    hiddens, idx = query_model(x, y)
    hiddens_alt, idx_alt = query_model(x_alt, y)

    # Keep only examples predicted correctly under both input variants.
    # NOTE(review): idx[:2] is compared with the whole of y, so sequences with
    # more than two APIs can never pass this filter - confirm that is intended.
    if idx[:2] != y or idx_alt[1] != y[1]:
        continue

    # This increment had been commented out, which made the ratio printed
    # below always 0.
    ncorrect += 1
    final_hiddens['h1'].append([hiddens[0], y[0]])
    final_hiddens['h2'].append([hiddens[1], y[1]])
    final_hiddens['h2p'].append([hiddens_alt[1], y[1]])

# Share of all loaded examples (including skipped ones) that were kept.
print(ncorrect / len(y_data) if y_data else 0.0)

### TSNE 

In [None]:
# t-SNE projection of a single group of hidden states, coloured by API
name = 'h2'
hidden_vectors, api_indices = zip(*final_hiddens[name])
labels = [indx2api[api_index] for api_index in api_indices]
values = [vector.tolist() for vector in hidden_vectors]

tsne = TSNE(n_components=2, verbose=1, perplexity=30, n_iter=300)
result = tsne.fit_transform(values)
df = pd.DataFrame({
    'x': result[:, 0],
    'y': result[:, 1],
    'labels': labels,
})

plt.figure(figsize=(16, 10))
scatter_options = dict(
    x='x',
    y='y',
    hue='labels',
    data=df,
    legend="full",
    alpha=0.3,
)
sns.scatterplot(**scatter_options)

In [None]:
# gather 'h2' and 'h2p' hidden states for a joint t-SNE chart
tsne = TSNE(n_components=2, verbose=1, perplexity=30, n_iter=300)
df = pd.DataFrame()
names_combined, labels_combined, values_combined = [], [], []

# pick at most n random examples per API, based on the 'h2' entries
n = 500
idx_by_name = defaultdict(list)
for i, (_, label_i) in enumerate(final_hiddens['h2']):
    api_name = indx2api[label_i]
    idx_by_name[api_name].append(i)

idx_to_choose = []
for name, idxes in idx_by_name.items():
    sample_size = min(len(idxes), n)
    idx_to_choose.extend(random.sample(idxes, sample_size))

# the same indices are used for both groups, so entry k of 'h2' and entry k
# of 'h2p' refer to the same example
for name in ('h2', 'h2p'):
    lst = [final_hiddens[name][i] for i in idx_to_choose]
    values, labels = zip(*lst)
    names_combined.extend([name] * len(labels))
    labels_combined.extend([indx2api[i] for i in labels])
    values_combined.extend([t.tolist() for t in values])

In [None]:
# display the known API names in vocabulary order
list(api2indx)

In [None]:
# Project the combined vectors. This call had been commented out, which made
# the cell silently reuse `result` from an earlier cell (different data and,
# in general, a different number of rows).
result = tsne.fit_transform(values_combined)

# Build a fresh frame so the cell can be re-run safely.
df = pd.DataFrame({
    'x': result[:, 0],
    'y': result[:, 1],
    'labels': labels_combined,
    'hiddens': names_combined,
}).set_index(['labels', 'hiddens'])

### plot ###
api_order = list(api2indx.keys())
palette = sns.color_palette('hls', len(api2indx))
fig, ax = plt.subplots(figsize=(16, 10))
# 'h2' points are drawn as crosses, 'h2p' points as dots, with one colour per API.
sns.scatterplot(
    x='x', y='y',
    hue='labels',
    hue_order=api_order,
    palette=palette,
    data=df.xs('h2', level=1),
    legend="full",
    marker='x',
    ax=ax,
)

sns.scatterplot(
    x='x', y='y',
    hue='labels',
    hue_order=api_order,
    palette=palette,
    data=df.xs('h2p', level=1),
    legend=None,
    marker='.',
    ax=ax,
)


### plot some close pairs ###
# The first half of the rows is 'h2' and the second half is 'h2p', in the same
# example order, so rows i and i + half belong to the same example.
half = len(names_combined) // 2
data_plot = []
labels_collected = set()
for i in range(half):
    label = labels_combined[i]
    if label in labels_collected:
        continue  # highlight at most one pair per API
    x, y = df.iloc[i].values
    xp, yp = df.iloc[i + half].values
    # np.hypot replaces math.sqrt(...): `math` was never imported here.
    if 0.2 < np.hypot(xp - x, yp - y) < 1:
        data_plot.append([label, (x, y), (xp, yp)])
        labels_collected.add(label)

for label, (x,  y), (xp, yp) in data_plot:
    color = 'black'
    ax.scatter(x, y, c=color)
    ax.text(x, y, f'{label}', fontsize=12)

    ax.scatter(xp, yp, c=color)

In [None]:
# gather every hidden-state group for the per-API charts
tsne = TSNE(n_components=2, verbose=1, perplexity=30, n_iter=300)
df = pd.DataFrame()
names_combined, labels_combined, values_combined = [], [], []
for name in final_hiddens:
    values, labels = zip(*final_hiddens[name])
    names_combined.extend([name] * len(labels))
    labels_combined.extend([indx2api[i] for i in labels])
    values_combined.extend([t.tolist() for t in values])


# keep at most n random points per API label (across all groups)
n = 300
idx_by_name = defaultdict(list)
for i, label in enumerate(labels_combined):
    idx_by_name[label].append(i)

idx_to_choose = []
for name, idxes in idx_by_name.items():
    idx_to_choose.extend(random.sample(idxes, min(len(idxes), n)))

# restrict the three parallel lists to the chosen rows
names_combined, labels_combined, values_combined = (
    [column[i] for i in idx_to_choose]
    for column in (names_combined, labels_combined, values_combined)
)
print(len(values_combined))

In [None]:
result = tsne.fit_transform(values_combined)

# Build a fresh frame so the cell can be re-run safely.
df = pd.DataFrame({
    'x': result[:, 0],
    'y': result[:, 1],
    'labels': labels_combined,
    'hiddens': names_combined,
}).set_index(['labels', 'hiddens'])

# One panel per API, four per row. Keep the original 4x4 layout as a minimum
# and add rows when there are more than 16 APIs (previously the resulting
# IndexError was swallowed and those APIs silently disappeared).
apis = list(api2indx)
n_cols = 4
n_rows = max(4, -(-len(apis) // n_cols))
fig, axes = plt.subplots(n_rows, n_cols, figsize=(16, 3 * n_rows))

for i, api in enumerate(apis):
    ax = axes[divmod(i, n_cols)]
    try:
        api_points = df.xs(api).reset_index()
    except KeyError:
        # No sampled point for this API: leave the panel empty. Any other
        # error is a real problem and is no longer hidden by a bare except.
        continue
    sns.scatterplot(
        ax=ax,
        x='x', y='y',
        hue='hiddens',
        hue_order=list(final_hiddens),
        data=api_points,
        legend='full' if i == 0 else None,
    )
    if i == 0:
        ax.legend(bbox_to_anchor=(-0.5, 1), fontsize='large')
    ax.set_title(api)