In [1]:
import os
import sys
module_path = os.path.abspath(os.path.join('..'))
if module_path not in sys.path:
    sys.path.append(module_path)
    
import pandas as pd
import random
import torch
import time
import numpy as np
from gensim.models.word2vec import Word2Vec
from model import BatchProgramClassifier
from torch.autograd import Variable
from torch.utils.data import DataLoader

import torch.nn as nn
import torch.nn.functional as F
import torch
from torch.autograd import Variable

import re

from pycparser.c_ast import TypeDecl, ArrayDecl
from pycparser import c_ast

In [2]:
root = '/home/david/projects/university/astnn/data/'

word2vec = Word2Vec.load(root+"train/embedding/node_w2v_128").wv
embeddings = np.zeros((word2vec.vectors.shape[0] + 1, word2vec.vectors.shape[1]), dtype="float32")
embeddings[:word2vec.vectors.shape[0]] = word2vec.vectors

HIDDEN_DIM = 100
ENCODE_DIM = 128
LABELS = 104
EPOCHS = 15
BATCH_SIZE = 1
USE_GPU = False
MAX_TOKENS = word2vec.vectors.shape[0]
EMBEDDING_DIM = word2vec.vectors.shape[1]

model = BatchProgramClassifier(EMBEDDING_DIM,HIDDEN_DIM,MAX_TOKENS+1,ENCODE_DIM,LABELS,BATCH_SIZE,
                               USE_GPU, embeddings)
model.load_state_dict(torch.load("/home/david/projects/university/astnn/model.pt"))

<All keys matched successfully>

# Load Data

In [3]:
word2vec = Word2Vec.load('/home/david/projects/university/astnn/data/train/embedding/node_w2v_128').wv
vocab = word2vec.vocab

ast_data = pd.read_pickle(root+'test/test_.pkl')
block_data = pd.read_pickle(root+'test/blocks.pkl')

# Allowed var names

In [4]:
leaf_embed = nn.Sequential(
    model._modules['encoder']._modules['embedding'],
    model._modules['encoder']._modules['W_c']
)

In [5]:
# words we wont allow as variable names
reserved_words = [
    'auto',
    'break',
    'case',
    'char',
    'const',
    'continue',
    'default',
    'do',
    'int',
    'long',
    'register',
    'return',
    'short',
    'sizeof',
    'static',
    'struct',
    'switch',
    'typedef',
    'union',
    'unsigned',
    'void',
    'volatile',
    'while',
    'double',
    'else',
    'enum',
    'extern',
    'float',
    'for',
    'goto',
    'if',
    'printf',
    'scanf',
    'cos',
    'malloc'
]


def allowed_variable(var):
    pattern = re.compile("([a-z]|[A-Z]|_)+([a-z]|[A-Z]|[0-9]|_)*$")
    if (var not in reserved_words) and pattern.match(var):
        return True
    else:
        return False

allowed_variable('scanf')

False

In [6]:
embedding_map = {}

for index in range(len(vocab)):
    if allowed_variable(word2vec.index2word[index]):
        embedding_map[index] = leaf_embed(torch.tensor(index)).detach().numpy()

# Var replace functions

In [7]:
def replace_index(node, old_i, new_i):
    i = node[0]
    if i == old_i:
        result = [new_i]
    else:
        result = [i]
    children = node[1:]
    for child in children:
        result.append(replace_index(child, old_i, new_i))
    return result

def replace_var(x, old_i, new_i):
    mod_blocks = []
    for block in x:
        mod_blocks.append(replace_index(block, old_i, new_i))

    return mod_blocks

# Closest Var functions

In [8]:
def l1_norm(a, b):
    return np.linalg.norm(a-b, 1)

def l2_norm(a, b):
    return np.linalg.norm(a-b)

def cos_sim(a, b):
    return np.inner(a, b) / (np.linalg.norm(a) * np.linalg.norm(b))

def closest_index(embedding, embedding_map, metric):
    embedding = embedding.detach().numpy()
    closest_i = list(embedding_map.keys())[0]
    closest_dist = metric(embedding_map[closest_i], embedding)
    for i, e in embedding_map.items():
        d = metric(embedding_map[i], embedding)
        if d < closest_dist:
            closest_dist = d
            closest_i = i
    return closest_i

def normalize(v):
    norm = np.linalg.norm(v)
    if norm == 0:
        return v
    return v / norm

# Grad locating functions

In [9]:
def get_embedding(indices, node_list):
    '''
    get the embeddings at the index positions in postorder traversal.
    '''
    res = []
    c = 0
    for i in range(node_list.size(0)):
        if not np.all(node_list[i].detach().numpy() == 0):
            if c in indices:
                res.append(node_list[i])
            c += 1
    return res

def post_order_loc(node, var, res, counter):
    '''
    
    '''
    index = node[0]
    children = node[1:]
    for child in children:
        res, counter = post_order_loc(child, var, res, counter)
    if var == index and (not children):
        res.append(counter) 
#         print(counter, word2vec.index2word[index])
    counter += 1
    return res, counter

def get_grad(x, var_index, node_list):
    grads = []
    for i, block in enumerate(x):
        indices, _ = post_order_loc(block, var_index, [], 0)
        grads += get_embedding(indices, node_list.grad[:, i, :])
        try:
            node_embedding = get_embedding(indices, node_list[:, i, :])[0]
        except:
            pass
        
    if len(grads) < 1:
        return None, None
    grad = torch.stack(grads).sum(dim=0)
    return grad, node_embedding

# Var name finder

In [46]:

class declarationFinder(c_ast.NodeVisitor):
    def __init__(self):
        self.names = set()
    
    def visit_Decl(self, node):
        if type(node.type) in [TypeDecl, ArrayDecl] :
            self.names.add(node.name)

def get_var_names(ast):
    declaration_finder = declarationFinder()
    declaration_finder.visit(ast)
    return declaration_finder.names
    
# get_var_names(x)

# FGSM

with vars ordered and early exit

In [47]:
# def gradient_method(x, n_list, var, epsilon, metric):

#     orig_index = vocab[var].index if var in vocab else MAX_TOKEN

#     grad, node_embedding = get_grad(x, orig_index, n_list)
#     if grad is None:
# #         print("no leaf occurences")
#         return None

#     v = node_embedding.detach().numpy()
#     g = torch.sign(grad).detach().numpy()
    

#     v = v + epsilon * g
#     # get the closest emebedding from our map
#     i = closest_index(v, sampled_embedding_map, metric)
# #         print("orig name:", word2vec.index2word[orig_index], "; new name:", word2vec.index2word[i])
#     if i != orig_index:
#         return replace_var(x, orig_index, i)
#     else:
#         return x

In [48]:
MAX_TOKEN = word2vec.vectors.shape[0]

In [70]:
import time
import datetime

def evaluate(epsilon, limit = None, sort_vars = True):
    ast_count = 0
    var_count = 0

    ast_total = 0
    var_total = 0

    start = time.time()
    for code_id in block_data['id'].tolist():
    #     print(code_id)
        x, ast = block_data['code'][code_id], ast_data['code'][code_id]

        _, orig_pred = torch.max(model([x]).data, 1)
        orig_pred = orig_pred.item()

        # get the grad
        loss_function = torch.nn.CrossEntropyLoss()
        labels = torch.LongTensor([orig_pred])
        output = model([x])
        loss = loss_function(output, Variable(labels))
        loss.backward()
        n_list = model._modules['encoder'].node_list

        var_names = get_var_names(ast)
        success = False
        
        var_weighted = []
        for var in list(var_names):
            
            orig_index = vocab[var].index if var in vocab else MAX_TOKEN
            grad, node_embedding = get_grad(x, orig_index, n_list)
            if grad is not None:
                h = abs((grad @ torch.sign(grad)).item())
                var_weighted.append( (h, grad, node_embedding) )
            
        if sort_vars:
            var_weighted = sorted(var_weighted, key=lambda x: x[0], reverse = True)
        
        for h, grad, node_embedding in var_weighted:
            
            v = node_embedding
            g = torch.sign(grad)


            v = v + epsilon * g
            # get the closest emebedding from our map
            i = closest_index(v, sampled_embedding_map, l1_norm)
            if i != orig_index:
                new_x_l2 = replace_var(x, orig_index, i)
            else:
                new_x_l2 = x
            
            if new_x_l2:
                o = model([new_x_l2])
                _, predicted_l2 = torch.max(o.data, 1)

    #             print(orig_pred, predicted_l2.item())
                var_total += 1
                if orig_pred != predicted_l2.item():
                    var_count += 1
                    success = True
                    break

        if success:
            ast_count += 1
        ast_total += 1


        if ast_total % 500 == 499:
            eval_time = time.time() - start
            eval_time = datetime.timedelta(seconds=eval_time)
            print(ast_total, ";", eval_time, ";", ast_count / ast_total, ";", var_count / var_total)
    
        if limit and limit < ast_total:
            break
    return (1-(ast_count / ast_total), 1-(var_count / var_total))


In [71]:
# sample_rate = 0.2
# sample_count = int(len(embedding_map) * sample_rate)
# sampled_embedding_map = {key: embedding_map[key] for key in random.sample(embedding_map.keys(), sample_count)}

sampled_embedding_map = embedding_map

In [72]:
evaluate(10)

499 ; 0:06:11.082216 ; 0.4709418837675351 ; 0.10638297872340426
999 ; 0:12:42.325317 ; 0.44544544544544545 ; 0.09812568908489526
1499 ; 0:19:17.990759 ; 0.43695797198132086 ; 0.09406864857101824
1999 ; 0:25:45.001663 ; 0.43921960980490243 ; 0.09426669529740177
2499 ; 0:30:51.134010 ; 0.43137254901960786 ; 0.0915188046523474
2999 ; 0:35:45.699827 ; 0.43347782594198064 ; 0.09219204311750939
3499 ; 0:40:01.739350 ; 0.43412403543869676 ; 0.09294499173958269
3999 ; 0:44:42.915889 ; 0.43810952738184544 ; 0.09404186795491143
4499 ; 0:49:08.399550 ; 0.43943098466325853 ; 0.0945752009184845
4999 ; 0:54:01.274330 ; 0.4370874174834967 ; 0.09332820775670596
5499 ; 0:58:34.313150 ; 0.43498817966903075 ; 0.09266648587920816
5999 ; 1:03:06.773592 ; 0.43723953992332054 ; 0.09341833463921932
6499 ; 1:07:48.804006 ; 0.4397599630712417 ; 0.0940564733758968
6999 ; 1:12:44.722604 ; 0.43934847835405055 ; 0.0937871717448989
7499 ; 1:17:23.098371 ; 0.44192559007867716 ; 0.09463974640888712
7999 ; 1:21:41.1975

(0.43966926257090666, 0.09413338822560724)

In [None]:
import time

epsilons = np.linspace(1,100,30)
ast_performances = []
var_performances = []
for e in epsilons:
    start = time.time()
    ast_performance, var_performance = evaluate(e, limit=200)
    eval_time = time.time() - start
    
    ast_performances.append(ast_performance)
    var_performances.append(var_performance)
    print(e, eval_time, ast_performance, var_performance)