In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import torch
from torch.utils.data import DataLoader, Dataset, SequentialSampler
from dataclasses import dataclass
from transformers import RobertaTokenizer, RobertaForSequenceClassification, RobertaConfig

In [3]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
n_gpu = torch.cuda.device_count()

In [4]:
from linevul.linevul_model import Model
from linevul.linevul_helpers import TextDataset
from linevul.linevul_extra import extract_line_attention, linevul_predict

In [5]:
from vulexp.data_models.reveal_data import Reveal

In [6]:
import os
import pandas as pd
from tqdm.autonotebook import tqdm

In [7]:
config = RobertaConfig.from_pretrained('microsoft/codebert-base')
config.num_labels = 1
config.num_attention_heads = 12

In [8]:
@dataclass
class Args:
    device = device
    n_gpu = n_gpu
    use_non_pretrained_model = False
    block_size = 512
    code_length=256
    do_local_explanation=True
    reasoning_method='attention'
    seed=27
    num_attention_heads=12
    do_sorting_by_line_scores=False
    do_sorting_by_pred_prob=False
    top_k_constant=10
    use_word_level_tokenizer=False
    eval_batch_size=512
    gtype = 'cpg'

args = Args()

In [9]:
tokenizer = RobertaTokenizer.from_pretrained('microsoft/codebert-base')
encoder = RobertaForSequenceClassification.from_pretrained('microsoft/codebert-base', 
                                                             config=config, 
                                                             ignore_mismatched_sizes=True).to(device)
model = Model(encoder, config, tokenizer, args)

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at microsoft/codebert-base and are newly initialized: ['classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight', 'classifier.dense.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [10]:
checkpoint = '/workspace/12heads_linevul_model.bin'
state_dict = torch.load(checkpoint)
model.load_state_dict(state_dict, strict=False)
model.to(args.device)

Model(
  (roberta): RobertaModel(
    (embeddings): RobertaEmbeddings(
      (word_embeddings): Embedding(50265, 768, padding_idx=1)
      (position_embeddings): Embedding(514, 768, padding_idx=1)
      (token_type_embeddings): Embedding(1, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): RobertaEncoder(
      (layer): ModuleList(
        (0): RobertaLayer(
          (attention): RobertaAttention(
            (self): RobertaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): RobertaSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps

In [11]:
data_dir = '/workspace/data/reveal/'
absolute_path = '/workspace/'
reveal_dataset = Reveal(data_dir, args.gtype, to_undirected=True, seed=args.seed,
                        absolute_path=absolute_path,)
reveal_train, reveal_val, reveal_test = reveal_dataset.generate_train_test()

In [12]:
df = reveal_test.data

In [13]:
# Filter the DataFrame to get rows where 'gt' is 1
filtered_df = df[df['gt'] == 1]
text_data = []

for index, row in filtered_df.iterrows():
    file_path = row['path']
    with open(os.path.join(absolute_path, file_path), 'r') as file:
        data = file.read()
        text_data.append((index, data))

df_text_data = pd.DataFrame(text_data, columns=['id', 'processed_func'])

In [14]:
df_text_data

Unnamed: 0,id,processed_func
0,0,static int execstack2_continue ( i_ctx_t * i_c...
1,1,"static int do_execstack ( i_ctx_t * i_ctx_p , ..."
2,2,static gboolean logcat_dump_text ( wtap_dumper...
3,3,static gchar get_priority ( const guint8 * pri...
4,4,extern int as_mysql_step_start ( mysql_conn_t ...
...,...,...
158,158,"void vp9_fdct4x4_1_c ( const int16_t * input ,..."
159,159,static int compute_rd_thresh_factor ( int qind...
160,160,static void init_bit_trees ( ) {\n init_bit_tr...
161,161,void vp9_iwht4x4_1_add_c ( const int16_t * in ...


In [15]:
from linevul.linevul_helpers import TextDataset, convert_examples_to_features

class ExtendTextDataset(TextDataset):
    def __init__(self, tokenizer, args, data_frame):
        self.examples = []
        funcs = data_frame["processed_func"].tolist()
        for i in tqdm(range(len(funcs)), desc='ExtendTextDataset'):
            self.examples.append(convert_examples_to_features(funcs[i], 1, tokenizer, args))

In [16]:
# multi-gpu evaluate
if args.n_gpu > 1:
    model = torch.nn.DataParallel(model)

In [34]:
def explain(model, tokenizer, data_frame=None): 
    """ 
        return (sample_idx, lines, n_lines)
    """
    if data_frame is not None:
        dataset = ExtendTextDataset(tokenizer, args, data_frame)
    sampler = SequentialSampler(dataset)
    data_loader = DataLoader(dataset, sampler=sampler, batch_size=1, num_workers=0)
    model.eval()
    index = 0
    progress_bar = tqdm(data_loader, total=len(data_loader))
    extract_list = []
    for mini_batch in progress_bar:
        (input_ids, labels) = mini_batch
        ids = input_ids[0].detach().tolist()
        all_tokens = tokenizer.convert_ids_to_tokens(ids)
        all_tokens = [token.replace("Ġ", "") for token in all_tokens]
        all_tokens = [token.replace("ĉ", "Ċ") for token in all_tokens]
        with torch.no_grad():
            prob, attentions = model(input_ids=input_ids, output_attentions=True)
        lines_with_score, n_lines = extract_line_attention(attentions, all_tokens)
        extract_list.append((index, lines_with_score, n_lines, prob))
    return extract_list


In [35]:
extract_list= explain(model, tokenizer, df_text_data)

ExtendTextDataset:   0%|          | 0/163 [00:00<?, ?it/s]

  0%|          | 0/163 [00:00<?, ?it/s]

In [36]:
extract_list

[(0,
  [(2,
    'returndo_execstack(i_ctx_p,op->value.boolval,op-1);',
    1287.717628479004),
   (0, 'staticintexecstack2_continue(i_ctx_t*i_ctx_p){', 1140.957109451294),
   (1, 'os_ptrop=osp;', 452.06596183776855),
   (3, '}', 60.458709716796875)],
  4,
  tensor([[9.9995e-01, 4.8333e-05]], device='cuda:0')),
 (0,
  [(16,
    'if(opidx==0||op_def_is_internal(op_index_def(opidx)))r_clear_attrs(rq,a_executable);',
    685.7184839248657),
   (20,
    'constchar*tname=rq->value.pstruct?gs_struct_type_name_string(gs_object_type(imemory,rq->value.pstruct)):"NULL";',
    675.571328163147),
   (21,
    'make_const_string(rq,a_readonly|avm_foreign,strlen(tname),(constbyte*)tname);',
    495.48153591156006),
   (10,
    'if(r_has_type_attrs(rp,t_null,a_executable)&&!include_marks)continue;',
    440.6631965637207),
   (0,
    'staticintdo_execstack(i_ctx_t*i_ctx_p,boolinclude_marks,os_ptrop1){',
    437.7697401046753),
   (12, 'ref_assign_old(op1,rq,rp,"execstack");', 327.3784227371216),
   (9,

In [62]:
new_text_data = []
for i in extract_list:  # (index, lines_with_score, n_lines, prob)
    nlines = i[2]
    top_10 = max(int(nlines * 0.1), 2 )  # if func less than 3 lines, select top 2
    _t = ' \n '.join(item[1] for item in i[1][:top_10])
    new_text_data.append((i[0], _t))
df_new_text_data = pd.DataFrame(new_text_data, columns=['id', 'processed_func'])

In [63]:
df_new_text_data

Unnamed: 0,id,processed_func
0,0,"returndo_execstack(i_ctx_p,op->value.boolval,o..."
1,0,if(opidx==0||op_def_is_internal(op_index_def(o...
2,0,staticgbooleanlogcat_dump_text(wtap_dumper*wdh...
3,0,if(*priority>=(guint8)sizeof(priorities))retur...
4,0,if(!step_ptr->job_ptr->db_index&&((!step_ptr->...
...,...,...
158,0,"voidvp9_fdct4x4_1_c(constint16_t*input,int16_t..."
159,0,"constintq=(int)(pow(vp9_dc_quant(qindex,0)/4.0..."
160,0,"init_bit_tree(cat2,2); \n init_bit_tree(cat3,3);"
161,0,"voidvp9_iwht4x4_1_add_c(constint16_t*in,uint8_..."


In [64]:
new_extract_list= explain(model, tokenizer, df_new_text_data)

ExtendTextDataset:   0%|          | 0/163 [00:00<?, ?it/s]

  0%|          | 0/163 [00:00<?, ?it/s]

In [65]:
new_extract_list[0]

(0,
 [(0,
   'returndo_execstack(i_ctx_p,op->value.boolval,op-1);',
   1587.131251335144),
  (1, 'staticintexecstack2_continue(i_ctx_t*i_ctx_p){', 1313.7463874816895)],
 2,
 tensor([[9.9993e-01, 7.1108e-05]], device='cuda:0'))

In [66]:
extract_list[0]

(0,
 [(2,
   'returndo_execstack(i_ctx_p,op->value.boolval,op-1);',
   1287.717628479004),
  (0, 'staticintexecstack2_continue(i_ctx_t*i_ctx_p){', 1140.957109451294),
  (1, 'os_ptrop=osp;', 452.06596183776855),
  (3, '}', 60.458709716796875)],
 4,
 tensor([[9.9995e-01, 4.8333e-05]], device='cuda:0'))

In [77]:
original_pred = [i[3].cpu().numpy()[0][1] for i in extract_list]
explain_pred = [i[3].cpu().numpy()[0][1] for i in new_extract_list]

In [79]:
_fidelity = []
for i in zip(original_pred, explain_pred):
    _fidelity.append(abs(i[0] - i[1]))

In [86]:
from statistics import mean, stdev

mean(_fidelity)

0.03446935

In [87]:
stdev(_fidelity)

AssertionError: 

In [89]:
len(_fidelity)

163

In [91]:
_r = pd.DataFrame(_fidelity, columns=['fidelity'])

In [92]:
_r.describe()

Unnamed: 0,fidelity
count,163.0
mean,0.034469
std,0.174842
min,1e-06
25%,1.2e-05
50%,7.7e-05
75%,0.000453
max,0.995099
