In [1]:
# References:
# This source code file refers to:
# https://github.com/ICL-ml4csec/VulBERTa
# https://towardsdatascience.com/text-classification-with-bert-in-pytorch-887965e5820f


In [None]:
ONLY_TEST = True
DATASET_NAME = 'qemu'
COMMIT_PATCH_PROCESSING_MODE = 1


## Pre-requisites stuff

In [2]:
# --- di
import os
os.environ['CUDA_LAUNCH_BLOCKING'] = "1"
from tqdm import tqdm
import sys
# --- di

import pandas as pd
import numpy as np
import csv
import pickle
import re
import torch
import sklearn
import os
import random
import custom
import models
import clang
from clang import *
from clang import cindex
from pathlib import Path
from tokenizers import ByteLevelBPETokenizer
from tokenizers.implementations import ByteLevelBPETokenizer
from tokenizers.processors import BertProcessing
from torch.utils.data import Dataset, DataLoader, IterableDataset
from transformers import RobertaConfig
from transformers import RobertaForMaskedLM, RobertaForSequenceClassification
from transformers import RobertaTokenizerFast
from transformers import DataCollatorForLanguageModeling
from transformers import Trainer, TrainingArguments
from transformers import LineByLineTextDataset
from transformers.modeling_outputs import SequenceClassifierOutput
from custom import CustomDataCollatorForLanguageModeling

## Set default device (GPU or CPU)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(device)

## Deterministic/reproducible flags

seedlist = [42, 834, 692, 489, 901, 408, 819, 808, 531, 166]

seed = seedlist[0]
os.environ['PYTHONHASHSEED'] = str(seed)
torch.manual_seed(seed)
torch.cuda.manual_seed(seed)
torch.cuda.manual_seed_all(seed)
np.random.seed(seed)
random.seed(seed)
torch.backends.cudnn.enabled = True
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False

## Weights and Biases flags

os.environ['WANDB_DISABLED'] = 'true'
os.environ['WANDB_MODE'] = 'dryrun'
# os.environ["CUDA_VISIBLE_DEVICES"]=""
#os.environ['WANDB_NOTEBOOK_NAME'] = 'Pretrain word-level VulBERTa on Draper'
#os.environ['WANDB_NAME'] = 'linux'
#os.environ['WANDB_PROJECT'] = 'projectName'

## Tokenizer

from tokenizers.pre_tokenizers import PreTokenizer
from tokenizers.pre_tokenizers import Whitespace
from tokenizers import NormalizedString,PreTokenizedString
from typing import List 

class MyTokenizer:
    
    cidx = cindex.Index.create()
        

    def clang_split(self, i: int, normalized_string: NormalizedString) -> List[NormalizedString]:
        ## Tokkenize using clang
        tok = []
        tu = self.cidx.parse('tmp.c',
                       args=[''],  
                       unsaved_files=[('tmp.c', str(normalized_string.original))],  
                       options=0)
        for t in tu.get_tokens(extent=tu.cursor.extent):
            spelling = t.spelling.strip()
            
            if spelling == '':
                continue
                
            ## Keyword no need

            ## Punctuations no need

            ## Literal all to BPE
            
            #spelling = spelling.replace(' ', '')
            tok.append(NormalizedString(spelling))

        return(tok)
    
    def pre_tokenize(self, pretok: PreTokenizedString):
        pretok.split(self.clang_split)
        
## Custom tokenizer

from tokenizers import Tokenizer
from tokenizers import normalizers,decoders
from tokenizers.normalizers import StripAccents, unicode_normalizer_from_str, Replace
from tokenizers.processors import TemplateProcessing
from tokenizers import processors,pre_tokenizers
from tokenizers.models import BPE

## Load pre-trained tokenizers
vocab, merges = BPE.read_file(vocab="./tokenizer/drapgh-vocab.json", merges="./tokenizer/drapgh-merges.txt")
my_tokenizer = Tokenizer(BPE(vocab, merges, unk_token="<unk>"))

my_tokenizer.normalizer = normalizers.Sequence([StripAccents(), Replace(" ", "Ä")])
my_tokenizer.pre_tokenizer = PreTokenizer.custom(MyTokenizer())
my_tokenizer.post_processor = processors.ByteLevel(trim_offsets=False)
my_tokenizer.post_processor = TemplateProcessing(
    single="<s> $A </s>",
    special_tokens=[
    ("<s>",0),
    ("<pad>",1),
    ("</s>",2),
    ("<unk>",3),
    ("<mask>",4)
    ]
)


cuda


## Dataset

In [3]:
my_tokenizer.enable_truncation(max_length=1024)
my_tokenizer.enable_padding(direction='right', pad_id=1, pad_type_id=0, pad_token='<pad>', length=None, pad_to_multiple_of=None)

def process_encodings(encodings):
    input_ids=[]
    attention_mask=[]
    for enc in encodings:
        input_ids.append(enc.ids)
        attention_mask.append(enc.attention_mask)
    return {'input_ids':input_ids, 'attention_mask':attention_mask}

commit_patch_train_path = f'/root/autodl-tmp/output_dataset_{COMMIT_PATCH_PROCESSING_MODE}/{DATASET_NAME}/train.json'
commit_patch_val_path = f'/root/autodl-tmp/output_dataset_{COMMIT_PATCH_PROCESSING_MODE}/{DATASET_NAME}/val.json'

if ONLY_TEST:
    m2 = pd.read_json(commit_patch_val_path)

    val_encodings = my_tokenizer.encode_batch(m2.commit_patch)
    val_encodings = process_encodings(val_encodings)
else:
    m1 = pd.read_json(commit_patch_train_path)
    m2 = pd.read_json(commit_patch_val_path)

    train_encodings = my_tokenizer.encode_batch(m1.commit_patch)
    train_encodings = process_encodings(train_encodings)

    val_encodings = my_tokenizer.encode_batch(m2.commit_patch)
    val_encodings = process_encodings(val_encodings)

class MyCustomDataset(Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels
        assert len(self.encodings['input_ids']) == len(self.encodings['attention_mask']) ==  len(self.labels)

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels[idx])
        return item

    def __len__(self):
        return len(self.labels)

if ONLY_TEST:
    val_dataset = MyCustomDataset(val_encodings, m2.label.tolist())
else:
    train_dataset = MyCustomDataset(train_encodings, m1.label.tolist())
    val_dataset = MyCustomDataset(val_encodings, m2.label.tolist())
    train_labels = m1.label.tolist()


## Train the model

In [4]:
## Pre-trained RoBERTa
pretrained_model_path = '/root/autodl-tmp/VulBERTa/'
model = RobertaForSequenceClassification.from_pretrained(pretrained_model_path)
print(model.num_parameters())

try:
    cw = sklearn.utils.class_weight.compute_class_weight(class_weight='balanced',classes=[0,1],y=m1.label.tolist())
except:
    cw = sklearn.utils.class_weight.compute_class_weight(class_weight='balanced',classes=[0,1],y=m1.target.tolist())
    
c_weights = torch.FloatTensor([cw[0], cw[1]])

#criterion = torch.nn.CrossEntropyLoss() 
criterion = torch.nn.CrossEntropyLoss(weight=c_weights)
criterion.to(device)

class MyTrainer(Trainer):
    def compute_loss(self, model, inputs, return_outputs=False):
        labels = inputs.pop("labels")
        outputs = model(**inputs)
        logits = outputs["logits"]
        #logits = outputs[0]        #### USE THIS IF CNN OR LSTM VURLBERTA
        loss = criterion(logits,labels)
        return (loss, outputs) if return_outputs else loss

training_args = TrainingArguments(
        output_dir=f"/root/autodl-tmp/finetuned_models/VB_MLP_{DATASET_NAME}_preprocessMode{COMMIT_PATCH_PROCESSING_MODE}",
        overwrite_output_dir=False,
        per_device_train_batch_size=4,
        num_train_epochs=10,
        evaluation_strategy='epoch',
        save_total_limit=10,
        seed=seed,
        learning_rate=5e-06,
        fp16=True,
        report_to=None,
        load_best_model_at_end =True
)

trainer = MyTrainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=val_dataset 
)

trainer.train()


Some weights of the model checkpoint at /root/autodl-tmp/VulBERTa/ were not used when initializing RobertaForSequenceClassification: ['lm_head.bias', 'lm_head.dense.weight', 'lm_head.dense.bias', 'lm_head.layer_norm.weight', 'lm_head.layer_norm.bias', 'lm_head.decoder.weight', 'lm_head.decoder.bias']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at /root/autodl-tmp/VulBERTa/ and are newly initialized: ['classifier.dense.weight', 'classifier.dense.

124836866


Using the `WAND_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


Epoch,Training Loss,Validation Loss,Runtime,Samples Per Second
1,0.6617,0.667742,48.548,61.156
2,0.632,0.650779,48.5513,61.152
3,0.5948,0.734843,48.5251,61.185
4,0.5246,0.928336,48.5836,61.111
5,0.4682,1.603874,48.4628,61.263
6,0.4565,1.977533,48.6033,61.086
7,0.374,2.224256,48.561,61.14
8,0.2318,2.512748,48.5944,61.098
9,0.2045,2.720431,48.5738,61.123
10,0.1264,2.814312,49.0236,60.563


TrainOutput(global_step=22270, training_loss=0.42043715015783084, metrics={'train_runtime': 6604.6626, 'train_samples_per_second': 3.372, 'total_flos': 6.830881558093824e+16, 'epoch': 10.0, 'init_mem_cpu_alloc_delta': 278380, 'init_mem_gpu_alloc_delta': 499356672, 'init_mem_cpu_peaked_delta': 18258, 'init_mem_gpu_peaked_delta': 0, 'train_mem_cpu_alloc_delta': 1581351, 'train_mem_gpu_alloc_delta': 2028382208, 'train_mem_cpu_peaked_delta': 307561415, 'train_mem_gpu_peaked_delta': 8577718784})

## Test the model

In [4]:
check_point_files_list = ['checkpoint-2227', 'checkpoint-4454', 'checkpoint-6681', 'checkpoint-8908', 'checkpoint-11135',
                          'checkpoint-13362', 'checkpoint-15589', 'checkpoint-17816', 'checkpoint-20043', 'checkpoint-22270']

for check_point_file in check_point_files_list:
    print(f'\n#######################################{check_point_file}')

    finetuned_model_path = f'/root/autodl-tmp/finetuned_models/VB_MLP_{DATASET_NAME}_preprocessMode{COMMIT_PATCH_PROCESSING_MODE}/{check_point_file}'
    model = RobertaForSequenceClassification.from_pretrained(finetuned_model_path, local_files_only=True)

    test_loader = DataLoader(val_dataset, batch_size=128)

    def softmax_accuracy(probs,all_labels):
        def getClass(x):
            return(x.index(max(x)))

        all_labels = all_labels.tolist()
        probs = pd.Series(probs.tolist())
        all_predicted = probs.apply(getClass)
        all_predicted.reset_index(drop=True, inplace=True)
        vc = pd.value_counts(all_predicted == all_labels)
        try:
            acc = vc[1]/len(all_labels)
        except:
            if(vc.index[0]==False):
                acc = 0
            else:
                acc = 1
        return(acc,all_predicted)

    model.to(device)

    all_pred=[]
    all_labels=[]
    all_probs=[]
    model.eval()
    with torch.no_grad():
        for batch in tqdm(test_loader):
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)
            outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
            loss = outputs[0]
            acc_val,pred = softmax_accuracy(torch.nn.functional.softmax(outputs[1],dim=1),labels)
            all_pred += pred.tolist()
            all_labels += labels.tolist()
            all_probs += outputs[1].tolist()

    confusion = sklearn.metrics.confusion_matrix(y_true=all_labels, y_pred=all_pred)
    print('Confusion matrix: \n',confusion)

    tn, fp, fn, tp = confusion.ravel()
    print('\nTP:',tp)
    print('FP:',fp)
    print('TN:',tn)
    print('FN:',fn)

    probs2=[]
    for x in all_probs:
        probs2.append(x[1])

    ## Performance measure
    print('\nAccuracy: '+ str(sklearn.metrics.accuracy_score(y_true=all_labels, y_pred=all_pred)))
    print('Precision: '+ str(sklearn.metrics.precision_score(y_true=all_labels, y_pred=all_pred)))
    print('Recall: '+ str(sklearn.metrics.recall_score(y_true=all_labels, y_pred=all_pred)))
    print('F-measure: '+ str(sklearn.metrics.f1_score(y_true=all_labels, y_pred=all_pred)))
    print('Precision-Recall AUC: '+ str(sklearn.metrics.average_precision_score(y_true=all_labels, y_score=probs2)))
    print('AUC: '+ str(sklearn.metrics.roc_auc_score(y_true=all_labels, y_score=probs2)))
    print('MCC: '+ str(sklearn.metrics.matthews_corrcoef(y_true=all_labels, y_pred=all_pred)))




#######################################


100%|██████████| 24/24 [00:41<00:00,  1.74s/it]


Confusion matrix: 
 [[862 876]
 [315 916]]

TP: 916
FP: 876
TN: 862
FN: 315

Accuracy: 0.5988548332771977
Precision: 0.5111607142857143
Recall: 0.7441104792851341
F-measure: 0.6060205094277209
Precision-Recall AUC: 0.5751559125032494
AUC: 0.6650341812348619
MCC: 0.24180085760270628

#######################################


100%|██████████| 24/24 [00:42<00:00,  1.75s/it]


Confusion matrix: 
 [[1211  527]
 [ 541  690]]

TP: 690
FP: 527
TN: 1211
FN: 541

Accuracy: 0.6402829235432805
Precision: 0.5669679539852095
Recall: 0.5605199025182778
F-measure: 0.5637254901960784
Precision-Recall AUC: 0.600655577336531
AUC: 0.6804388734074386
MCC: 0.25773752816304585

#######################################


100%|██████████| 24/24 [00:42<00:00,  1.77s/it]


Confusion matrix: 
 [[1267  471]
 [ 564  667]]

TP: 667
FP: 471
TN: 1267
FN: 564

Accuracy: 0.6513977770293028
Precision: 0.5861159929701231
Recall: 0.5418359057676686
F-measure: 0.5631067961165049
Precision-Recall AUC: 0.6056356912068408
AUC: 0.6834732584303275
MCC: 0.2744372153612606

#######################################


100%|██████████| 24/24 [00:42<00:00,  1.77s/it]


Confusion matrix: 
 [[936 802]
 [386 845]]

TP: 845
FP: 802
TN: 936
FN: 386

Accuracy: 0.5998652745031997
Precision: 0.5130540376442015
Recall: 0.6864337936636881
F-measure: 0.5872133425990271
Precision-Recall AUC: 0.5881198271063994
AUC: 0.6635562506368375
MCC: 0.22301944565372167

#######################################


100%|██████████| 24/24 [00:42<00:00,  1.78s/it]


Confusion matrix: 
 [[811 927]
 [306 925]]

TP: 925
FP: 927
TN: 811
FN: 306

Accuracy: 0.5847086561131695
Precision: 0.4994600431965443
Recall: 0.7514216084484159
F-measure: 0.6000648718780409
Precision-Recall AUC: 0.5897362958420744
AUC: 0.6731595276978777
MCC: 0.2217495566393777

#######################################


100%|██████████| 24/24 [00:42<00:00,  1.77s/it]


Confusion matrix: 
 [[1245  493]
 [ 560  671]]

TP: 671
FP: 493
TN: 1245
FN: 560

Accuracy: 0.6453351296732907
Precision: 0.5764604810996563
Recall: 0.545085296506905
F-measure: 0.5603340292275575
Precision-Recall AUC: 0.5812061270468031
AUC: 0.6687336817672348
MCC: 0.26380769591093706

#######################################


100%|██████████| 24/24 [00:42<00:00,  1.77s/it]


Confusion matrix: 
 [[1215  523]
 [ 562  669]]

TP: 669
FP: 523
TN: 1215
FN: 562

Accuracy: 0.6345570899292691
Precision: 0.5612416107382551
Recall: 0.5434606011372868
F-measure: 0.5522080066033843
Precision-Recall AUC: 0.5679380355407787
AUC: 0.6528340090433273
MCC: 0.24375607549500272

#######################################


100%|██████████| 24/24 [00:42<00:00,  1.77s/it]


Confusion matrix: 
 [[1226  512]
 [ 563  668]]

TP: 668
FP: 512
TN: 1226
FN: 563

Accuracy: 0.6379252273492758
Precision: 0.5661016949152542
Recall: 0.5426482534524777
F-measure: 0.5541269182911654
Precision-Recall AUC: 0.5554018700359709
AUC: 0.6443964368878765
MCC: 0.24972315960349398

#######################################


100%|██████████| 24/24 [00:42<00:00,  1.78s/it]


Confusion matrix: 
 [[1219  519]
 [ 568  663]]

TP: 663
FP: 519
TN: 1219
FN: 568

Accuracy: 0.6338834624452677
Precision: 0.5609137055837563
Recall: 0.5385865150284321
F-measure: 0.5495234148363033
Precision-Recall AUC: 0.5456485680412365
AUC: 0.6352988906639845
MCC: 0.24151002721295103

#######################################


100%|██████████| 24/24 [00:42<00:00,  1.77s/it]

Confusion matrix: 
 [[1090  648]
 [ 503  728]]

TP: 728
FP: 648
TN: 1090
FN: 503

Accuracy: 0.6123273829572247
Precision: 0.5290697674418605
Recall: 0.5913891145410236
F-measure: 0.5584963559647104
Precision-Recall AUC: 0.538241174727922
AUC: 0.6359593321361565
MCC: 0.21591419429349923



