In [1]:
# References:
# This source code file refers to:
# https://github.com/ICL-ml4csec/VulBERTa
# https://towardsdatascience.com/text-classification-with-bert-in-pytorch-887965e5820f


In [None]:
ONLY_TEST = True
DATASET_NAME = 'qemu'
COMMIT_PATCH_PROCESSING_MODE = 0


## Pre-requisites stuff

In [2]:
# --- di
import os
os.environ['CUDA_LAUNCH_BLOCKING'] = "1"
from tqdm import tqdm
import sys
# --- di

import pandas as pd
import numpy as np
import csv
import pickle
import re
import torch
import sklearn
import os
import random
import custom
import models
import clang
from clang import *
from clang import cindex
from pathlib import Path
from tokenizers import ByteLevelBPETokenizer
from tokenizers.implementations import ByteLevelBPETokenizer
from tokenizers.processors import BertProcessing
from torch.utils.data import Dataset, DataLoader, IterableDataset
from transformers import RobertaConfig
from transformers import RobertaForMaskedLM, RobertaForSequenceClassification
from transformers import RobertaTokenizerFast
from transformers import DataCollatorForLanguageModeling
from transformers import Trainer, TrainingArguments
from transformers import LineByLineTextDataset
from transformers.modeling_outputs import SequenceClassifierOutput
from custom import CustomDataCollatorForLanguageModeling

## Set default device (GPU or CPU)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(device)

## Deterministic/reproducible flags

seedlist = [42, 834, 692, 489, 901, 408, 819, 808, 531, 166]

seed = seedlist[0]
os.environ['PYTHONHASHSEED'] = str(seed)
torch.manual_seed(seed)
torch.cuda.manual_seed(seed)
torch.cuda.manual_seed_all(seed)
np.random.seed(seed)
random.seed(seed)
torch.backends.cudnn.enabled = True
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False

## Weights and Biases flags

os.environ['WANDB_DISABLED'] = 'true'
os.environ['WANDB_MODE'] = 'dryrun'
# os.environ["CUDA_VISIBLE_DEVICES"]=""
#os.environ['WANDB_NOTEBOOK_NAME'] = 'Pretrain word-level VulBERTa on Draper'
#os.environ['WANDB_NAME'] = 'linux'
#os.environ['WANDB_PROJECT'] = 'projectName'

## Tokenizer

from tokenizers.pre_tokenizers import PreTokenizer
from tokenizers.pre_tokenizers import Whitespace
from tokenizers import NormalizedString,PreTokenizedString
from typing import List 

class MyTokenizer:
    
    cidx = cindex.Index.create()
        

    def clang_split(self, i: int, normalized_string: NormalizedString) -> List[NormalizedString]:
        ## Tokkenize using clang
        tok = []
        tu = self.cidx.parse('tmp.c',
                       args=[''],  
                       unsaved_files=[('tmp.c', str(normalized_string.original))],  
                       options=0)
        for t in tu.get_tokens(extent=tu.cursor.extent):
            spelling = t.spelling.strip()
            
            if spelling == '':
                continue
                
            ## Keyword no need

            ## Punctuations no need

            ## Literal all to BPE
            
            #spelling = spelling.replace(' ', '')
            tok.append(NormalizedString(spelling))

        return(tok)
    
    def pre_tokenize(self, pretok: PreTokenizedString):
        pretok.split(self.clang_split)
        
## Custom tokenizer

from tokenizers import Tokenizer
from tokenizers import normalizers,decoders
from tokenizers.normalizers import StripAccents, unicode_normalizer_from_str, Replace
from tokenizers.processors import TemplateProcessing
from tokenizers import processors,pre_tokenizers
from tokenizers.models import BPE

## Load pre-trained tokenizers
vocab, merges = BPE.read_file(vocab="./tokenizer/drapgh-vocab.json", merges="./tokenizer/drapgh-merges.txt")
my_tokenizer = Tokenizer(BPE(vocab, merges, unk_token="<unk>"))

my_tokenizer.normalizer = normalizers.Sequence([StripAccents(), Replace(" ", "Ä")])
my_tokenizer.pre_tokenizer = PreTokenizer.custom(MyTokenizer())
my_tokenizer.post_processor = processors.ByteLevel(trim_offsets=False)
my_tokenizer.post_processor = TemplateProcessing(
    single="<s> $A </s>",
    special_tokens=[
    ("<s>",0),
    ("<pad>",1),
    ("</s>",2),
    ("<unk>",3),
    ("<mask>",4)
    ]
)


cuda


## Dataset

In [3]:
my_tokenizer.enable_truncation(max_length=1024)
my_tokenizer.enable_padding(direction='right', pad_id=1, pad_type_id=0, pad_token='<pad>', length=None, pad_to_multiple_of=None)

def process_encodings(encodings):
    input_ids=[]
    attention_mask=[]
    for enc in encodings:
        input_ids.append(enc.ids)
        attention_mask.append(enc.attention_mask)
    return {'input_ids':input_ids, 'attention_mask':attention_mask}

commit_patch_train_path = f'/root/autodl-tmp/output_dataset_{COMMIT_PATCH_PROCESSING_MODE}/{DATASET_NAME}/train.json'
commit_patch_val_path = f'/root/autodl-tmp/output_dataset_{COMMIT_PATCH_PROCESSING_MODE}/{DATASET_NAME}/val.json'

if ONLY_TEST:
    m2 = pd.read_json(commit_patch_val_path)

    val_encodings = my_tokenizer.encode_batch(m2.commit_patch)
    val_encodings = process_encodings(val_encodings)
else:
    m1 = pd.read_json(commit_patch_train_path)
    m2 = pd.read_json(commit_patch_val_path)

    train_encodings = my_tokenizer.encode_batch(m1.commit_patch)
    train_encodings = process_encodings(train_encodings)

    val_encodings = my_tokenizer.encode_batch(m2.commit_patch)
    val_encodings = process_encodings(val_encodings)

class MyCustomDataset(Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels
        assert len(self.encodings['input_ids']) == len(self.encodings['attention_mask']) ==  len(self.labels)

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels[idx])
        return item

    def __len__(self):
        return len(self.labels)

if ONLY_TEST:
    val_dataset = MyCustomDataset(val_encodings, m2.label.tolist())
else:
    train_dataset = MyCustomDataset(train_encodings, m1.label.tolist())
    val_dataset = MyCustomDataset(val_encodings, m2.label.tolist())
    train_labels = m1.label.tolist()


## Train the model

In [None]:
## Pre-trained RoBERTa
pretrained_model_path = '/root/autodl-tmp/VulBERTa/'
model = RobertaForSequenceClassification.from_pretrained(pretrained_model_path)
print(model.num_parameters())

try:
    cw = sklearn.utils.class_weight.compute_class_weight(class_weight='balanced',classes=[0,1],y=m1.label.tolist())
except:
    cw = sklearn.utils.class_weight.compute_class_weight(class_weight='balanced',classes=[0,1],y=m1.target.tolist())
    
c_weights = torch.FloatTensor([cw[0], cw[1]])

#criterion = torch.nn.CrossEntropyLoss() 
criterion = torch.nn.CrossEntropyLoss(weight=c_weights)
criterion.to(device)

class MyTrainer(Trainer):
    def compute_loss(self, model, inputs, return_outputs=False):
        labels = inputs.pop("labels")
        outputs = model(**inputs)
        logits = outputs["logits"]
        #logits = outputs[0]        #### USE THIS IF CNN OR LSTM VURLBERTA
        loss = criterion(logits,labels)
        return (loss, outputs) if return_outputs else loss

training_args = TrainingArguments(
        output_dir=f"/root/autodl-tmp/finetuned_models/VB_MLP_{DATASET_NAME}_preprocessMode{COMMIT_PATCH_PROCESSING_MODE}",
        overwrite_output_dir=False,
        per_device_train_batch_size=4,
        num_train_epochs=10,
        evaluation_strategy='epoch',
        save_total_limit=10,
        seed=seed,
        learning_rate=5e-06,
        fp16=True,
        report_to=None,
        load_best_model_at_end =True
)

trainer = MyTrainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=val_dataset 
)

trainer.train()


## Test the model

In [4]:
check_point_files_list = ['checkpoint-2227', 'checkpoint-4454', 'checkpoint-6681', 'checkpoint-8908', 'checkpoint-11135',
                          'checkpoint-13362', 'checkpoint-15589', 'checkpoint-17816', 'checkpoint-20043', 'checkpoint-22270']

for check_point_file in check_point_files_list:
    print(f'\n#######################################{check_point_file}')

    finetuned_model_path = f'/root/autodl-tmp/finetuned_models/VB_MLP_{DATASET_NAME}_preprocessMode{COMMIT_PATCH_PROCESSING_MODE}/{check_point_file}'
    model = RobertaForSequenceClassification.from_pretrained(finetuned_model_path, local_files_only=True)

    test_loader = DataLoader(val_dataset, batch_size=128)

    def softmax_accuracy(probs,all_labels):
        def getClass(x):
            return(x.index(max(x)))

        all_labels = all_labels.tolist()
        probs = pd.Series(probs.tolist())
        all_predicted = probs.apply(getClass)
        all_predicted.reset_index(drop=True, inplace=True)
        vc = pd.value_counts(all_predicted == all_labels)
        try:
            acc = vc[1]/len(all_labels)
        except:
            if(vc.index[0]==False):
                acc = 0
            else:
                acc = 1
        return(acc,all_predicted)

    model.to(device)

    all_pred=[]
    all_labels=[]
    all_probs=[]
    model.eval()
    with torch.no_grad():
        for batch in tqdm(test_loader):
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)
            outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
            loss = outputs[0]
            acc_val,pred = softmax_accuracy(torch.nn.functional.softmax(outputs[1],dim=1),labels)
            all_pred += pred.tolist()
            all_labels += labels.tolist()
            all_probs += outputs[1].tolist()

    confusion = sklearn.metrics.confusion_matrix(y_true=all_labels, y_pred=all_pred)
    print('Confusion matrix: \n',confusion)

    tn, fp, fn, tp = confusion.ravel()
    print('\nTP:',tp)
    print('FP:',fp)
    print('TN:',tn)
    print('FN:',fn)

    probs2=[]
    for x in all_probs:
        probs2.append(x[1])

    ## Performance measure
    print('\nAccuracy: '+ str(sklearn.metrics.accuracy_score(y_true=all_labels, y_pred=all_pred)))
    print('Precision: '+ str(sklearn.metrics.precision_score(y_true=all_labels, y_pred=all_pred)))
    print('Recall: '+ str(sklearn.metrics.recall_score(y_true=all_labels, y_pred=all_pred)))
    print('F-measure: '+ str(sklearn.metrics.f1_score(y_true=all_labels, y_pred=all_pred)))
    print('Precision-Recall AUC: '+ str(sklearn.metrics.average_precision_score(y_true=all_labels, y_score=probs2)))
    print('AUC: '+ str(sklearn.metrics.roc_auc_score(y_true=all_labels, y_score=probs2)))
    print('MCC: '+ str(sklearn.metrics.matthews_corrcoef(y_true=all_labels, y_pred=all_pred)))




#######################################checkpoint-2227


100%|██████████| 24/24 [00:41<00:00,  1.75s/it]


Confusion matrix: 
 [[ 735 1003]
 [ 247  984]]

TP: 984
FP: 1003
TN: 735
FN: 247

Accuracy: 0.578982822499158
Precision: 0.4952189229994967
Recall: 0.7993501218521527
F-measure: 0.6115599751398384
Precision-Recall AUC: 0.5683275454474375
AUC: 0.6624302750483997
MCC: 0.23272391700218606

#######################################checkpoint-4454


100%|██████████| 24/24 [00:42<00:00,  1.76s/it]


Confusion matrix: 
 [[1195  543]
 [ 538  693]]

TP: 693
FP: 543
TN: 1195
FN: 538

Accuracy: 0.6359043448972718
Precision: 0.5606796116504854
Recall: 0.5629569455727051
F-measure: 0.5618159708147548
Precision-Recall AUC: 0.5943364311707018
AUC: 0.6806884669998943
MCC: 0.25038203835236406

#######################################checkpoint-6681


100%|██████████| 24/24 [00:42<00:00,  1.77s/it]


Confusion matrix: 
 [[1271  467]
 [ 574  657]]

TP: 657
FP: 467
TN: 1271
FN: 574

Accuracy: 0.6493768945772987
Precision: 0.5845195729537367
Recall: 0.5337124289195776
F-measure: 0.5579617834394904
Precision-Recall AUC: 0.6045395661112787
AUC: 0.686588036894981
MCC: 0.2691778871812298

#######################################checkpoint-8908


100%|██████████| 24/24 [00:42<00:00,  1.77s/it]


Confusion matrix: 
 [[915 823]
 [384 847]]

TP: 847
FP: 823
TN: 915
FN: 384

Accuracy: 0.5934658134051869
Precision: 0.5071856287425149
Recall: 0.6880584890333062
F-measure: 0.5839365735953119
Precision-Recall AUC: 0.5897308034355702
AUC: 0.6679942490644915
MCC: 0.21304454295898012

#######################################checkpoint-11135


100%|██████████| 24/24 [00:42<00:00,  1.79s/it]


Confusion matrix: 
 [[939 799]
 [375 856]]

TP: 856
FP: 799
TN: 939
FN: 375

Accuracy: 0.6045806668912092
Precision: 0.5172205438066465
Recall: 0.6953696181965882
F-measure: 0.5932085932085932
Precision-Recall AUC: 0.5823303255840101
AUC: 0.6662742033337103
MCC: 0.2337313299345745

#######################################checkpoint-13362


100%|██████████| 24/24 [00:42<00:00,  1.78s/it]


Confusion matrix: 
 [[1050  688]
 [ 444  787]]

TP: 787
FP: 688
TN: 1050
FN: 444

Accuracy: 0.6187268440552375
Precision: 0.5335593220338983
Recall: 0.6393176279447603
F-measure: 0.581670362158167
Precision-Recall AUC: 0.5812318828790624
AUC: 0.6685224152807367
MCC: 0.23988925371523936

#######################################checkpoint-15589


100%|██████████| 24/24 [00:42<00:00,  1.78s/it]


Confusion matrix: 
 [[1077  661]
 [ 492  739]]

TP: 739
FP: 661
TN: 1077
FN: 492

Accuracy: 0.6116537554732233
Precision: 0.5278571428571428
Recall: 0.6003249390739236
F-measure: 0.5617635879893575
Precision-Recall AUC: 0.5615746882294002
AUC: 0.6507339640790885
MCC: 0.21712333053127836

#######################################checkpoint-17816


100%|██████████| 24/24 [00:42<00:00,  1.78s/it]


Confusion matrix: 
 [[1170  568]
 [ 534  697]]

TP: 697
FP: 568
TN: 1170
FN: 534

Accuracy: 0.6288312563152577
Precision: 0.5509881422924902
Recall: 0.5662063363119415
F-measure: 0.5584935897435898
Precision-Recall AUC: 0.5640056925224105
AUC: 0.6531995187611184
MCC: 0.23849921252834855

#######################################checkpoint-20043


100%|██████████| 24/24 [00:42<00:00,  1.78s/it]


Confusion matrix: 
 [[1180  558]
 [ 553  678]]

TP: 678
FP: 558
TN: 1180
FN: 553

Accuracy: 0.6257999326372516
Precision: 0.5485436893203883
Recall: 0.5507717303005687
F-measure: 0.5496554519659507
Precision-Recall AUC: 0.5571581975144058
AUC: 0.6496407067518338
MCC: 0.2295784128404293

#######################################checkpoint-22270


100%|██████████| 24/24 [00:42<00:00,  1.78s/it]

Confusion matrix: 
 [[1040  698]
 [ 465  766]]

TP: 766
FP: 698
TN: 1040
FN: 465

Accuracy: 0.6082856180532166
Precision: 0.523224043715847
Recall: 0.6222583265637693
F-measure: 0.5684601113172543
Precision-Recall AUC: 0.542392561153457
AUC: 0.6429647792592399
MCC: 0.21742711388257585



