In [1]:
import random
from random import choice
import numpy as np
import time
import torch as T
import torch
import torch.nn.functional as F
# Device that the model and every tensor are moved to via .to(device).
device = torch.device("cpu")

In [2]:
# Base term of the input-layer width computed in Net.__init__.
EMBEDDING_SIZE = 150
# Added to the input-layer width in Net.__init__ when USE_SHAPE_ENCODING is True.
SHAPE_EMBEDDING_SIZE = 6

# Feature switches read by Net.__init__ when sizing its first layer:
# shape encoding adds SHAPE_EMBEDDING_SIZE, type encoding adds 2.
USE_SHAPE_ENCODING = True
USE_TYPE_ENCODING = True
# NOTE(review): USE_VALUE_ENCODING is not read anywhere in the visible notebook;
# Net adds an unconditional +1 that may correspond to it -- confirm.
USE_VALUE_ENCODING = True

# Filename prefix shared by the embedding file that is loaded and by every
# artifact this notebook saves (model, label mapping, t-SNE data).
EMBEDDING_DATA_NAME = 'Single_100000_synthetic'

In [3]:
# Load the pre-computed bundle: the API mapping followed by the (inputs, labels)
# pair of the train, validation and test splits.
# NOTE(review): torch.load unpickles the file -- only load trusted data.
(
    api2indx,
    train_set_x0,
    train_set_y0,
    valid_set_x0,
    valid_set_y0,
    test_set_x0,
    test_set_y0,
) = torch.load(EMBEDDING_DATA_NAME + ".embedding.pt")

In [4]:
# Merge each split's sequence of encodings into one tensor; stacking along
# dim 0 (the default) puts one sample per row.
train_set_x = torch.stack(train_set_x0, dim=0)
valid_set_x = torch.stack(valid_set_x0, dim=0)
test_set_x = torch.stack(test_set_x0, dim=0)

In [5]:
# To reduce RAM usage: the un-stacked inputs are no longer needed once stacked.
del train_set_x0, valid_set_x0, test_set_x0

In [6]:
# List of labels (i.e., API function names) into a 1-d tensor of indices.
# The mapping is module-level so that every split is encoded with the same ids.
updated_api2indx = {}


def convert_labels_to_tensor(y):
    """Encode labels (API function names) as a 1-d LongTensor of class indices.

    Labels not yet present in the module-level ``updated_api2indx`` are added
    to it with the next free index, so successive calls (train, validation,
    test) share one consistent mapping.

    Args:
        y: iterable of hashable labels.

    Returns:
        torch.LongTensor holding one index per element of ``y``.
    """
    indices = []
    for api in y:
        if api not in updated_api2indx:
            # The next id is the current vocabulary size. A counter restarted
            # at 0 on every call would give a label first seen in a later
            # split an index that is already taken by another class.
            updated_api2indx[api] = len(updated_api2indx)
        indices.append(updated_api2indx[api])

    return torch.tensor(indices, dtype=torch.long)

In [7]:
# Encode the label lists of the three splits, in the order train, validation,
# test (map() is consumed left to right by the unpacking).
train_set_y, valid_set_y, test_set_y = map(
    convert_labels_to_tensor,
    (train_set_y0, valid_set_y0, test_set_y0),
)

In [8]:
# The raw label lists are no longer needed once encoded as tensors.
del train_set_y0, valid_set_y0, test_set_y0

In [None]:
# Print the shapes of the stacked training inputs and of the training label tensor.
print(train_set_x.shape, train_set_y.shape)

In [10]:
class Net(T.nn.Module):
  """Feed-forward classifier: input -> 500 -> 250 -> 100 -> len(api2indx) logits.

  The input width is ``4 * (first_layer_size + 1)``, where ``first_layer_size``
  starts at EMBEDDING_SIZE and grows by SHAPE_EMBEDDING_SIZE and by 2 when the
  shape and type encodings are enabled. Hidden layers use tanh.
  """

  def __init__(self):
    super().__init__()
    # Re-seed on every construction so that each Net() starts from the same
    # weights. This also resets the global torch and numpy RNG state.
    T.manual_seed(0)
    T.backends.cudnn.deterministic = True
    T.backends.cudnn.benchmark = False
    np.random.seed(0)

    first_layer_size = EMBEDDING_SIZE
    if USE_SHAPE_ENCODING:
      first_layer_size += SHAPE_EMBEDDING_SIZE
    if USE_TYPE_ENCODING:
      first_layer_size += 2
    # NOTE(review): the factor 4 and the +1 are hard-coded; the +1 may be the
    # value encoding (USE_VALUE_ENCODING is never consulted) -- confirm.
    self.hid1 = T.nn.Linear(4*(first_layer_size+1), 500)
    self.hid2 = T.nn.Linear(500, 250)
    self.hid3 = T.nn.Linear(250, 100)
    self.oupt = T.nn.Linear(100, len(api2indx))

    # Xavier-uniform weights and zero biases, applied in the same order as
    # before so the random stream is consumed identically.
    # NOTE(review): hid3 is left on PyTorch's default Linear initialisation,
    # exactly as in the original; confirm whether that omission is intended.
    for layer in (self.hid1, self.hid2, self.oupt):
      T.nn.init.xavier_uniform_(layer.weight)
      T.nn.init.zeros_(layer.bias)

    # A T.nn.Dropout(p=0.2) used to be constructed here and thrown away. It was
    # never registered or called in forward(), so removing it changes nothing;
    # to really use dropout, assign it to an attribute and call it in forward().

  def forward(self, x):
    """Return ``(logits, z3, z2, z1)``: the output and the three hidden activations."""
    z1 = T.tanh(self.hid1(x))
    z2 = T.tanh(self.hid2(z1))
    z3 = T.tanh(self.hid3(z2))
    z = self.oupt(z3)
    return (z, z3, z2, z1)

In [11]:
def accuracy(model, ds):
  """Return the fraction of samples in ``ds`` that ``model`` classifies correctly.

  Intended to be run after ``model.eval()``. Samples are evaluated one at a
  time (granular but slow).

  Args:
    model: callable returning a 4-tuple whose first element holds the logits.
    ds: indexable dataset whose items are dicts with 'predictors' and
      'targets' tensors.

  Returns:
    Accuracy as a float in [0, 1]; 0.0 for an empty dataset (the unguarded
    division used to raise ZeroDivisionError in that case).
  """
  n_total = len(ds)
  if n_total == 0:
    return 0.0

  n_correct = 0
  with T.no_grad():
    for i in range(n_total):
      sample = ds[i]  # index once; each item holds both tensors
      X = sample['predictors'].to(device)  # model input
      Y = sample['targets'].to(device)  # ground truth
      oupt, z3, z2, z1 = model(X)
      if T.argmax(oupt) == Y:
        n_correct += 1

  return n_correct / n_total

In [12]:
class FinalEmbedding:
    """Map-style dataset that pairs entry ``i`` of the inputs with entry ``i`` of the labels."""

    def __init__(self, x, y):
        self.x_data, self.y_data = x, y

    def __len__(self):
        # The number of samples is taken from the inputs.
        return len(self.x_data)

    def __getitem__(self, idx):
        # Each sample is a dict so that consumers can address fields by name.
        return {
            'predictors': self.x_data[idx],
            'targets': self.y_data[idx],
        }

In [None]:
# 0. get started
print("\nBegin predict  \n")
# Seed numpy and torch. Net.__init__ re-seeds both with 0 when the network is
# constructed, and the training loop re-seeds torch at the start of each epoch.
np.random.seed(1)
T.manual_seed(1)

In [None]:
# 1. create DataLoader objects
print("Creating Final Datasets ")

bat_size = 128

# Wrap each split in the dict-returning dataset class.
train_ds, valid_ds, test_ds = (
    FinalEmbedding(train_set_x, train_set_y),
    FinalEmbedding(valid_set_x, valid_set_y),
    FinalEmbedding(test_set_x, test_set_y),
)

# Only the training split gets a shuffled, batched loader; the evaluation
# helpers index valid_ds / test_ds directly.
train_ldr = T.utils.data.DataLoader(
    dataset=train_ds,
    batch_size=bat_size,
    shuffle=True,
)

In [17]:
# 2. create network
net = Net().to(device)

# 3. train model
# NOTE: the training loop iterates range(0, max_epochs+1), i.e. max_epochs + 1 passes.
max_epochs = 50
# Validation accuracy is printed every ep_log_interval epochs.
ep_log_interval = 10
# Per-batch loss values, appended by the training loop.
list_loss = []

In [18]:
import torch.nn as nn

lrn_rate = 0.001
loss_func = T.nn.CrossEntropyLoss()  # applies log-softmax itself, so the net emits raw logits
optimizer = T.optim.SGD(net.parameters(), lr=lrn_rate)

print("\nbat_size = %3d " % bat_size)
print("loss = " + str(loss_func))
print("optimizer = SGD")
print("max_epochs = %3d " % max_epochs)
print("lrn_rate = %0.3f " % lrn_rate)

print("\nStarting train with saved checkpoints")
# Epochs run from 0 to max_epochs inclusive, so the last epoch is also logged.
for epoch in range(0, max_epochs+1):
  # The logging branch below switches the net to eval mode; switch back at the
  # start of every epoch so training never silently continues in eval mode.
  net.train()
  T.manual_seed(1 + epoch)  # recovery reproducibility
  epoch_loss = 0  # for one full epoch
  for (batch_idx, batch) in enumerate(train_ldr):
    X = batch['predictors']  # inputs
    Y = batch['targets']     # class indices, one per sample in the batch

    optimizer.zero_grad()
    oupt, z3, z2, z1 = net(X.to(device))

    loss_val = loss_func(oupt, Y.to(device))  # avg loss in batch
    batch_loss = loss_val.item()
    epoch_loss += batch_loss  # sum of per-batch averages
    loss_val.backward()
    optimizer.step()
    list_loss.append(batch_loss)

  if epoch % ep_log_interval == 0:
    net.eval()
    acc_valid = accuracy(net, valid_ds)  # item-by-item
    print("epoch = %4d   loss = %0.4f valid_data accuracy = %0.4f" %
          (epoch, epoch_loss, acc_valid))


bat_size = 128 
loss = CrossEntropyLoss()
optimizer = SGD
max_epochs =  50 
lrn_rate = 0.001 

Starting train with saved checkpoints


epoch =    0   loss = 85.6776 valid_data accuracy = 0.0303


epoch =   10   loss = 78.9780 valid_data accuracy = 0.0545


epoch =   20   loss = 73.7288 valid_data accuracy = 0.2242


epoch =   30   loss = 69.2402 valid_data accuracy = 0.2788


epoch =   40   loss = 65.2908 valid_data accuracy = 0.3333


epoch =   50   loss = 61.9270 valid_data accuracy = 0.3394


In [19]:
from manifold.clients.python import ManifoldClient
from datetime import timedelta
import logging
import pandas as pd
from io import BytesIO

In [20]:
# Persist the trained weights (the state_dict only, not the Net object itself).
torch.save(net.state_dict(), EMBEDDING_DATA_NAME + '_model.pt')

In [23]:
# Persist the label mapping built by convert_labels_to_tensor, i.e. the one the
# target tensors were encoded with.
torch.save(updated_api2indx, EMBEDDING_DATA_NAME + '_api2indx.pt')

In [None]:
# Run this cell if you need to load saved state_dict
# It also rebinds api2indx to the saved label mapping; Net() sizes its output
# layer from it and the per-API evaluation helpers decode labels through it.
api2indx = torch.load(EMBEDDING_DATA_NAME + "_api2indx.pt")
net = Net().to(device)
# map_location places the tensors on `device` whatever device they were saved from.
checkpoint = torch.load(EMBEDDING_DATA_NAME + "_model.pt", map_location=device)
net.load_state_dict(checkpoint)
net.eval()

In [None]:
# 4. evaluate model accuracy
print("\nComputing model accuracy")
net.eval()
acc_train = accuracy(net, train_ds)  # item-by-item
print("Accuracy on training data = %0.4f" % acc_train)
acc_valid = accuracy(net, valid_ds)  # item-by-item
print("Accuracy on validation data = %0.4f" % acc_valid)
acc_test = accuracy(net, test_ds)  # item-by-item as well (same helper, not batched)
print("Accuracy on test data = %0.4f" % acc_test)

In [None]:
def accuracy_by_api(model, ds):
  """Evaluate ``model`` on ``ds`` and break the result down per API.

  Prints one summary line per API plus the overall accuracy.

  The module-level ``api2indx`` must be the mapping the targets in ``ds`` were
  encoded with, because target and predicted indices are decoded through it.

  Args:
    model: callable returning a 4-tuple whose first element holds the logits.
    ds: indexable dataset whose items are dicts with 'predictors' and
      'targets' tensors.

  Returns:
    ``(error_per_api, error_pair)`` where ``error_per_api`` maps each API name
    to ``(n_correct, n_wrong, 1 - n_correct / total)`` (the last term is 1 for
    an API without samples) and ``error_pair`` maps
    ``(ground_truth_api, predicted_api)`` to how often that confusion occurred.

  Raises:
    KeyError: if a target or predicted index has no entry in ``api2indx``.
  """
  # Reverse lookup built once, not by rebuilding and scanning two lists for
  # every sample. setdefault keeps the first name registered for an index,
  # which is the entry list.index() used to return.
  indx2api = {}
  for api, indx in api2indx.items():
    indx2api.setdefault(int(indx), api)

  ground_truth = dict.fromkeys(api2indx, 0)
  correct = dict.fromkeys(api2indx, 0)
  wrong = dict.fromkeys(api2indx, 0)
  error_pair = {}

  n_correct = 0
  n_wrong = 0

  with T.no_grad():
    for i in range(len(ds)):
      sample = ds[i]
      X = sample['predictors'].to(device)
      Y = sample['targets'].to(device)
      ground_truth_api = indx2api[Y.item()]
      ground_truth[ground_truth_api] += 1

      oupt, z3, z2, z1 = model(X)  # logits form

      prediction = T.argmax(oupt)
      prediction_api = indx2api[prediction.item()]

      if prediction == Y:
        n_correct += 1
        correct[prediction_api] += 1
      else:
        n_wrong += 1
        wrong[ground_truth_api] += 1
        ep = (ground_truth_api, prediction_api)
        error_pair[ep] = error_pair.get(ep, 0) + 1

  error_per_api = dict.fromkeys(api2indx, 0)
  for key, correct_val in correct.items():
    gt = ground_truth[key]
    wrong_val = wrong[key]
    total = wrong_val + correct_val
    # Fraction classified correctly; stays 0 for an API without samples.
    pcntg = correct_val / total if total != 0 else 0

    # Every sample of an API is counted as either correct or wrong.
    assert gt == total

    error_per_api[key] = (correct_val, wrong_val, 1 - pcntg)
    print("{:18s}: correct_val = {:5d}, wrong_val = {:5d}, percentage = {:.2f}, total = {:6d}".format(key, correct_val, wrong_val, pcntg, total))

  n_seen = n_correct + n_wrong
  # An empty dataset used to raise ZeroDivisionError here.
  acc = (n_correct * 1.0) / n_seen if n_seen else 0.0
  print("accuracy: %0.4f" % acc)
  return (error_per_api, error_pair)

# Per-API breakdown on the test split; err_dict counts each
# (ground truth, prediction) confusion pair.
error_per_api, err_dict = accuracy_by_api(net, test_ds)

# Figure 6 - Confusing Pairs from Classification Model

In [None]:
# print confusing pairs, most frequent first
sorted_err_dict = dict(
    sorted(err_dict.items(), key=lambda pair_count: pair_count[1], reverse=True)
)
print("\n".join("{}\t{}".format(pair, count) for pair, count in sorted_err_dict.items()))

In [None]:
import matplotlib.pyplot as plt

# One bar per confused (ground truth, prediction) pair, in the order of
# sorted_err_dict; the label joins the two API names with "_".
pairs = ["_".join(pair) for pair in sorted_err_dict]
num = list(sorted_err_dict.values())

fig = plt.figure(figsize=(10, 5))

# creating the bar plot
plt.bar(pairs, num, color='red', width=0.6)

plt.title("Number of Confused Pairs")
plt.xlabel("Confused Pairs")
plt.ylabel("Number")
plt.xticks(rotation=90, fontsize=15)
# Restrict the visible x range to [0, 40].
plt.xlim([0, 40])
plt.show()

In [75]:
# Store embeddings (z3) of each input-output in the given dataset
def store_embeddings_for_tsne(model, ds):
  """Collect ``(ground_truth_api, predicted_api, z3)`` for every sample of ``ds``.

  ``z3`` is the second element of the tuple returned by ``model`` and is moved
  to the CPU before it is stored. Indices are decoded through the module-level
  ``api2indx``, which must be the mapping the targets were encoded with.

  Args:
    model: callable returning ``(logits, z3, z2, z1)``.
    ds: indexable dataset whose items are dicts with 'predictors' and
      'targets' tensors.

  Returns:
    List with one ``(ground_truth_api, predicted_api, z3)`` tuple per sample.

  Raises:
    KeyError: if a target or predicted index has no entry in ``api2indx``.
  """
  # Reverse lookup built once, not by rebuilding and scanning two lists for
  # every sample. setdefault keeps the first name registered for an index,
  # which is the entry list.index() used to return.
  indx2api = {}
  for api, indx in api2indx.items():
    indx2api.setdefault(int(indx), api)

  model_result = []
  with T.no_grad():
    for i in range(len(ds)):
      sample = ds[i]
      X = sample['predictors'].to(device)
      Y = sample['targets'].to(device)
      ground_truth_api = indx2api[Y.item()]

      oupt, z3, z2, z1 = model(X)  # logits form

      prediction_idx_api = indx2api[T.argmax(oupt).item()]
      model_result.append((ground_truth_api, prediction_idx_api, z3.cpu()))

  return model_result

# Collect (ground truth, prediction, z3) for every test sample and save the
# list next to the other artifacts.
model_result = store_embeddings_for_tsne(net, test_ds)
torch.save(model_result, EMBEDDING_DATA_NAME + '.tSNE.pt')