In [1]:
# References:
# This source code file refers to:
# https://github.com/ICL-ml4csec/VulBERTa
# https://towardsdatascience.com/text-classification-with-bert-in-pytorch-887965e5820f


In [None]:
ONLY_TEST = False
DATASET_NAME = 'ffmpeg'
COMMIT_PATCH_PROCESSING_MODE = 1


## Pre-requisites stuff

In [2]:
# --- di
import os
os.environ['CUDA_LAUNCH_BLOCKING'] = "1"
from tqdm import tqdm
import sys
# --- di

import pandas as pd
import numpy as np
import csv
import pickle
import re
import torch
import sklearn
import os
import random
import custom
import models
import clang
from clang import *
from clang import cindex
from pathlib import Path
from tokenizers import ByteLevelBPETokenizer
from tokenizers.implementations import ByteLevelBPETokenizer
from tokenizers.processors import BertProcessing
from torch.utils.data import Dataset, DataLoader, IterableDataset, ConcatDataset
from transformers import RobertaConfig
from transformers import RobertaForMaskedLM, RobertaForSequenceClassification
from transformers import RobertaTokenizerFast
from transformers import DataCollatorForLanguageModeling
from transformers import Trainer, TrainingArguments
from transformers import LineByLineTextDataset
from transformers.modeling_outputs import SequenceClassifierOutput
from custom import CustomDataCollatorForLanguageModeling

## Set default device (GPU or CPU)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

## Deterministic/reproducible flags

seedlist = [42, 834, 692, 489, 901, 408, 819, 808, 531, 166]

seed = seedlist[0]
os.environ['PYTHONHASHSEED'] = str(seed)
torch.manual_seed(seed)
torch.cuda.manual_seed(seed)
torch.cuda.manual_seed_all(seed)
np.random.seed(seed)
random.seed(seed)
torch.backends.cudnn.enabled = True
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False

## Weights and Biases flags

os.environ['WANDB_DISABLED'] = 'true'
os.environ['WANDB_MODE'] = 'dryrun'
# os.environ["CUDA_VISIBLE_DEVICES"]=""
#os.environ['WANDB_NOTEBOOK_NAME'] = 'Pretrain word-level VulBERTa on Draper'
#os.environ['WANDB_NAME'] = 'linux'
#os.environ['WANDB_PROJECT'] = 'projectName'

## Tokenizer

from tokenizers.pre_tokenizers import PreTokenizer
from tokenizers.pre_tokenizers import Whitespace
from tokenizers import NormalizedString,PreTokenizedString
from typing import List 

class MyTokenizer:
    
    cidx = cindex.Index.create()
        

    def clang_split(self, i: int, normalized_string: NormalizedString) -> List[NormalizedString]:
        ## Tokkenize using clang
        tok = []
        tu = self.cidx.parse('tmp.c',
                       args=[''],  
                       unsaved_files=[('tmp.c', str(normalized_string.original))],  
                       options=0)
        for t in tu.get_tokens(extent=tu.cursor.extent):
            spelling = t.spelling.strip()
            
            if spelling == '':
                continue
                
            ## Keyword no need

            ## Punctuations no need

            ## Literal all to BPE
            
            #spelling = spelling.replace(' ', '')
            tok.append(NormalizedString(spelling))

        return(tok)
    
    def pre_tokenize(self, pretok: PreTokenizedString):
        pretok.split(self.clang_split)
        
## Custom tokenizer

from tokenizers import Tokenizer
from tokenizers import normalizers,decoders
from tokenizers.normalizers import StripAccents, unicode_normalizer_from_str, Replace
from tokenizers.processors import TemplateProcessing
from tokenizers import processors,pre_tokenizers
from tokenizers.models import BPE

## Load pre-trained tokenizers
vocab, merges = BPE.read_file(vocab="./tokenizer/drapgh-vocab.json", merges="./tokenizer/drapgh-merges.txt")
my_tokenizer = Tokenizer(BPE(vocab, merges, unk_token="<unk>"))

my_tokenizer.normalizer = normalizers.Sequence([StripAccents(), Replace(" ", "Ä")])
my_tokenizer.pre_tokenizer = PreTokenizer.custom(MyTokenizer())
my_tokenizer.post_processor = processors.ByteLevel(trim_offsets=False)
my_tokenizer.post_processor = TemplateProcessing(
    single="<s> $A </s>",
    special_tokens=[
    ("<s>",0),
    ("<pad>",1),
    ("</s>",2),
    ("<unk>",3),
    ("<mask>",4)
    ]
)

## mkdir directory if not exist
def mkdir_if_not_exist(directory):
    if not os.path.exists(directory):
        os.mkdir(directory)

MODEL_SAVE_PATH = f'/root/autodl-tmp/finetuned_models/{DATASET_NAME}_{COMMIT_PATCH_PROCESSING_MODE}'
mkdir_if_not_exist('/root/autodl-tmp/finetuned_models')
mkdir_if_not_exist(MODEL_SAVE_PATH)
print('MODEL_SAVE_PATH:', MODEL_SAVE_PATH)
print('Using device:', device)


MODEL_SAVE_PATH: /root/autodl-tmp/finetuned_models/ffmpeg_1
Using device: cuda


## Dataset

In [3]:
my_tokenizer.enable_truncation(max_length=1024)
my_tokenizer.enable_padding(direction='right', pad_id=1, pad_type_id=0, pad_token='<pad>', length=None, pad_to_multiple_of=None)

def process_encodings(encodings):
    input_ids=[]
    attention_mask=[]
    for enc in encodings:
        input_ids.append(enc.ids)
        attention_mask.append(enc.attention_mask)
    return {'input_ids':input_ids, 'attention_mask':attention_mask}

commit_patch_train_path = f'/root/autodl-tmp/output_dataset_{COMMIT_PATCH_PROCESSING_MODE}/{DATASET_NAME}/train.json'
commit_patch_val_path = f'/root/autodl-tmp/output_dataset_{COMMIT_PATCH_PROCESSING_MODE}/{DATASET_NAME}/val.json'

if ONLY_TEST:
    m2 = pd.read_json(commit_patch_val_path)

    val_encodings = my_tokenizer.encode_batch(m2.commit_patch)
    val_encodings = process_encodings(val_encodings)
    
else:
    m1 = pd.read_json(commit_patch_train_path)
    m2 = pd.read_json(commit_patch_val_path)

    train_encodings = my_tokenizer.encode_batch(m1.commit_patch)
    train_encodings = process_encodings(train_encodings)

    val_encodings = my_tokenizer.encode_batch(m2.commit_patch)
    val_encodings = process_encodings(val_encodings)

class MyCustomDataset(Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels
        assert len(self.encodings['input_ids']) == len(self.encodings['attention_mask']) ==  len(self.labels)

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels[idx])
        return item

    def __len__(self):
        return len(self.labels)

if ONLY_TEST:
    val_dataset = MyCustomDataset(val_encodings, m2.label.tolist())
else:
    train_dataset = MyCustomDataset(train_encodings, m1.label.tolist())
    val_dataset = MyCustomDataset(val_encodings, m2.label.tolist())

    train_labels = m1.label.tolist()


## Prepare for training and testing

In [6]:
from torch.optim import Adam
from transformers import RobertaConfig, RobertaModel

try:
    cw = sklearn.utils.class_weight.compute_class_weight(class_weight='balanced',classes=[0,1],y=m1.label.tolist())
except:
    cw = sklearn.utils.class_weight.compute_class_weight(class_weight='balanced',classes=[0,1],y=m1.target.tolist())
    
c_weights = torch.FloatTensor([cw[0], cw[1]])

#criterion = torch.nn.CrossEntropyLoss() 
criterion = torch.nn.CrossEntropyLoss(weight=c_weights)
criterion.to(device)

BATCH_SIZE = 4
EPOCHS = 6
LR = 5e-6

print('BATCH_SIZE:', BATCH_SIZE)
print('EPOCHS:', EPOCHS)
print('LR:', LR)

BATCH_SIZE: 4
EPOCHS: 6
LR: 5e-06


## Train the model

In [4]:
def train(model, train_data, val_data, learning_rate, epochs):
    train_dataloader = torch.utils.data.DataLoader(train_data, batch_size=BATCH_SIZE)
    val_dataloader = torch.utils.data.DataLoader(val_data, batch_size=BATCH_SIZE)

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    criterion = torch.nn.CrossEntropyLoss(weight=c_weights)
    criterion.to(device)
    optimizer = Adam(model.parameters(), lr=learning_rate)

    for epoch_num in range(epochs):
        model.train()
        total_acc_train = 0
        total_loss_train = 0

        for item in tqdm(train_dataloader):
            train_label = item['labels'].to(device)
            mask = item['attention_mask'].to(device)
            input_id = item['input_ids'].squeeze(1).to(device)

            output = model(input_id, mask)
            output = output["logits"]
            output = torch.nn.functional.softmax(output, dim=1)

            batch_loss = criterion(output, train_label)
            total_loss_train += batch_loss.item()

            acc = (output.argmax(dim=1) == train_label).sum().item()
            total_acc_train += acc

            model.zero_grad()
            batch_loss.backward()
            optimizer.step()

        total_acc_val = 0
        total_loss_val = 0

        model.eval()
        with torch.no_grad():
            for item in val_dataloader:
                val_label = item['labels'].to(device)
                mask = item['attention_mask'].to(device)
                input_id = item['input_ids'].squeeze(1).to(device)

                output = model(input_id, mask)
                output = output["logits"]
                output = torch.nn.functional.softmax(output, dim=1)

                batch_loss = criterion(output, val_label)
                total_loss_val += batch_loss.item()

                acc = (output.argmax(dim=1) == val_label).sum().item()
                total_acc_val += acc

        print(
            f'Epochs: {epoch_num + 1} | Train Loss: {total_loss_train / len(train_data): .4f} \
            | Train Accuracy: {total_acc_train / len(train_data): .4f} \
            | Val Loss: {total_loss_val / len(val_data): .4f} \
            | Val Accuracy: {total_acc_val / len(val_data): .4f}')

        val_acc = f'{total_acc_val / len(val_data):.4f}'
        torch.save(model.state_dict(), f'{MODEL_SAVE_PATH}/vulberta_{val_acc}_ep{epoch_num + 1}.pt')

pretrained_model_path = '/root/autodl-tmp/VulBERTa/'
model = RobertaForSequenceClassification.from_pretrained(pretrained_model_path)
print(model.num_parameters())

model.to(device)
train(model, train_dataset, val_dataset, LR, EPOCHS)


Some weights of the model checkpoint at /root/autodl-tmp/VulBERTa/ were not used when initializing RobertaForSequenceClassification: ['lm_head.bias', 'lm_head.dense.weight', 'lm_head.dense.bias', 'lm_head.layer_norm.weight', 'lm_head.layer_norm.bias', 'lm_head.decoder.weight', 'lm_head.decoder.bias']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at /root/autodl-tmp/VulBERTa/ and are newly initialized: ['classifier.dense.weight', 'classifier.dense.

124836866
BATCH_SIZE: 4
EPOCHS: 6
LR: 5e-06


100%|██████████| 2612/2612 [10:55<00:00,  3.98it/s]


Epochs: 1 | Train Loss:  0.1630             | Train Accuracy:  0.6185             | Val Loss:  0.1577             | Val Accuracy:  0.6530


100%|██████████| 2612/2612 [10:58<00:00,  3.97it/s]


Epochs: 2 | Train Loss:  0.1524             | Train Accuracy:  0.6807             | Val Loss:  0.1570             | Val Accuracy:  0.6722


100%|██████████| 2612/2612 [10:56<00:00,  3.98it/s]


Epochs: 3 | Train Loss:  0.1390             | Train Accuracy:  0.7492             | Val Loss:  0.1582             | Val Accuracy:  0.6828


100%|██████████| 2612/2612 [10:54<00:00,  3.99it/s]


Epochs: 4 | Train Loss:  0.1255             | Train Accuracy:  0.8080             | Val Loss:  0.1549             | Val Accuracy:  0.6900


100%|██████████| 2612/2612 [10:54<00:00,  3.99it/s]


Epochs: 5 | Train Loss:  0.1177             | Train Accuracy:  0.8429             | Val Loss:  0.1550             | Val Accuracy:  0.6851


100%|██████████| 2612/2612 [10:53<00:00,  4.00it/s]


Epochs: 6 | Train Loss:  0.1138             | Train Accuracy:  0.8574             | Val Loss:  0.1560             | Val Accuracy:  0.6860


# --------------------------------------

## Test the model

In [7]:
def evaluate(model, test_data):
    test_dataloader = torch.utils.data.DataLoader(test_data, batch_size=BATCH_SIZE)

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    total_acc_test = 0
    predict_all = np.array([], dtype=int)
    labels_all = np.array([], dtype=int)
    model.eval()
    with torch.no_grad():
        for item in tqdm(test_dataloader):
            test_label = item['labels'].to(device)
            mask = item['attention_mask'].to(device)
            input_id = item['input_ids'].squeeze(1).to(device)

            output = model(input_id, mask)
            output = output["logits"]
            output = torch.nn.functional.softmax(output, dim=1)

            acc = (output.argmax(dim=1) == test_label).sum().item()
            total_acc_test += acc

            test_label = test_label.data.cpu().numpy()
            predic = output.argmax(dim=1).data.cpu().numpy()
            labels_all = np.append(labels_all, test_label)
            predict_all = np.append(predict_all, predic)

    report = sklearn.metrics.classification_report(labels_all, predict_all, target_names=['benign', 'vulnerable'], digits=4)
    confusion = sklearn.metrics.confusion_matrix(labels_all, predict_all)
    print(f'Test Accuracy: {total_acc_test / len(test_data): .4f}')
    print(report)
    print(confusion)

saved_model_name = 'vulberta_0.6900_ep4.pt'
saved_ct_model_path = f'{MODEL_SAVE_PATH}/{saved_model_name}'

pretrained_model_path = '/root/autodl-tmp/VulBERTa/'
model = RobertaForSequenceClassification.from_pretrained(pretrained_model_path)
model.to(device)
model.load_state_dict(torch.load(saved_ct_model_path))

print('Testing val dataset:')
evaluate(model, val_dataset)


Some weights of the model checkpoint at /root/autodl-tmp/VulBERTa/ were not used when initializing RobertaForSequenceClassification: ['lm_head.bias', 'lm_head.dense.weight', 'lm_head.dense.bias', 'lm_head.layer_norm.weight', 'lm_head.layer_norm.bias', 'lm_head.decoder.weight', 'lm_head.decoder.bias']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at /root/autodl-tmp/VulBERTa/ and are newly initialized: ['classifier.dense.weight', 'classifier.dense.

Testing val dataset:


100%|██████████| 871/871 [01:01<00:00, 14.07it/s]

Test Accuracy:  0.6900
              precision    recall  f1-score   support

      benign     0.7220    0.7459    0.7337      1995
  vulnerable     0.6437    0.6152    0.6291      1489

    accuracy                         0.6900      3484
   macro avg     0.6828    0.6805    0.6814      3484
weighted avg     0.6885    0.6900    0.6890      3484

[[1488  507]
 [ 573  916]]



