# Global settings

In [None]:
# References:
# This source code file refers to:
# https://github.com/microsoft/CodeBERT
# https://towardsdatascience.com/text-classification-with-bert-in-pytorch-887965e5820f


In [1]:
ONLY_TESTING = False
DATASET_NAME = 'ffmpeg'

BATCH_SIZE = 4
EPOCHS = 10
LR = 1e-6

In [2]:
import os
import random
import numpy as np
import torch
from torch import nn
from torch.optim import Adam
import pandas as pd
from tqdm import tqdm
from sklearn import metrics
from transformers import RobertaTokenizer, RobertaConfig, RobertaModel
import torch.nn.functional as F
import shutil

# The following randomization refers to: https://github.com/ICL-ml4csec/VulBERTa/blob/main/Finetuning_VulBERTa-MLP.ipynb
seed = 42
os.environ['PYTHONHASHSEED'] = str(seed)
torch.manual_seed(seed)
torch.cuda.manual_seed(seed)
torch.cuda.manual_seed_all(seed)
np.random.seed(seed)
random.seed(seed)
torch.backends.cudnn.benchmark = False
torch.backends.cudnn.deterministic = True

DATASET_ROOT_PATH = '/root/autodl-tmp'
DATASET_PATH = f'{DATASET_ROOT_PATH}/output_dataset_1/{DATASET_NAME}'
MODEL_SAVE_PATH = f'{DATASET_ROOT_PATH}/finetuned_models/{DATASET_NAME}'

BERT_CONFIG = 'microsoft/codebert-base' # microsoft/codebert-base
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

print('seed:', seed)
print('MODEL_SAVE_PATH:', MODEL_SAVE_PATH)
print('DATASET_PATH:', DATASET_PATH)
print('BERT_CONFIG:', BERT_CONFIG)

print('ONLY_TESTING:', ONLY_TESTING)
print('DATASET_NAME:', DATASET_NAME)

print('BATCH_SIZE:', BATCH_SIZE)
print('EPOCHS:', EPOCHS)
print('LR:', LR)

print('using device:', device)
print('GPU count:', torch.cuda.device_count())

seed: 42
MODEL_SAVE_PATH: /root/autodl-tmp/finetuned_models/ffmpeg
DATASET_PATH: /root/autodl-tmp/output_dataset_1/ffmpeg
BERT_CONFIG: microsoft/codebert-base
ONLY_TESTING: False
DATASET_NAME: ffmpeg
BATCH_SIZE: 4
EPOCHS: 10
LR: 1e-06
using device: cuda
GPU count: 1


# Definition

In [3]:
def mkdir_if_not_exist(directory):
    if not os.path.exists(directory):
        os.mkdir(directory)

def remove_file_if_exist(path):
    if not path: return
    if os.path.exists(path):
        try:
            os.remove(path)
        except:
            shutil.rmtree(path)

mkdir_if_not_exist(f'{DATASET_ROOT_PATH}/finetuned_models')
# remove_file_if_exist(MODEL_SAVE_PATH)
mkdir_if_not_exist(MODEL_SAVE_PATH)
tokenizer = RobertaTokenizer.from_pretrained(BERT_CONFIG)

def tokenize_helper(x):
    code_tokens = tokenizer.tokenize(x)
    tokens = [tokenizer.cls_token] + code_tokens + [tokenizer.sep_token]
    tokens_ids = tokenizer.convert_tokens_to_ids(tokens)
    tokens_ids = torch.tensor(tokens_ids)
    
    number_to_pad = 512 - len(tokens_ids)
    if number_to_pad > 0:
        zero_pad = torch.zeros(512 - len(tokens_ids), dtype=int)
        tokens_ids = torch.cat((tokens_ids, zero_pad), 0)

    return tokens_ids[:512]

class Dataset(torch.utils.data.Dataset):
    def __init__(self, df):
        self.labels = [label for label in df['label']]
        self.texts = [tokenize_helper(text) for text in df['commit_patch']]
        assert(len(self.labels) == len(self.texts))

    def classes(self):
        return self.labels

    def __len__(self):
        return len(self.labels)

    def get_batch_labels(self, idx):
        # Fetch a batch of labels
        return np.array(self.labels[idx])

    def get_batch_texts(self, idx):
        # Fetch a batch of inputs
        return self.texts[idx]

    def __getitem__(self, idx):
        batch_texts = self.get_batch_texts(idx)
        batch_y = self.get_batch_labels(idx)
        return batch_texts, batch_y

class CodeBertClassifier(nn.Module):
    def __init__(self, dropout=0.1):
        super(CodeBertClassifier, self).__init__()

        self.codebert = RobertaModel.from_pretrained(BERT_CONFIG)
        self.dropout = nn.Dropout(dropout)
        self.fc1 = nn.Linear(768, 768)
        self.out = nn.Linear(768, 2)

    def forward(self, x):
        x = self.codebert(x)[1]
        x = self.dropout(x)
        x = F.relu(self.fc1(x))
        x = self.dropout(x)
        x = self.out(x)
        return x

    def check_parameters(self):
        print('The number of CodeBert parameters:', self.codebert.num_parameters())

def train(model, train_data, val_data, learning_rate, epochs):
    train, val = Dataset(train_data), Dataset(val_data)

    train_dataloader = torch.utils.data.DataLoader(train, batch_size=BATCH_SIZE)
    val_dataloader = torch.utils.data.DataLoader(val, batch_size=BATCH_SIZE)

    use_cuda = torch.cuda.is_available()
    device = torch.device("cuda" if use_cuda else "cpu")

    criterion = nn.CrossEntropyLoss()
    optimizer = Adam(model.parameters(), lr=learning_rate)

    if use_cuda:
        criterion = criterion.cuda()

    for epoch_num in range(epochs):
        total_acc_train = 0
        total_loss_train = 0

        for train_input, train_label in tqdm(train_dataloader):
            train_label = train_label.to(device)
            train_input = train_input.to(device)

            output = model(train_input)
            batch_loss = criterion(output, train_label)
            total_loss_train += batch_loss.item()

            acc = (output.argmax(dim=1) == train_label).sum().item()
            total_acc_train += acc

            model.zero_grad()
            batch_loss.backward()
            optimizer.step()

        total_acc_val = 0
        total_loss_val = 0

        with torch.no_grad():
            for val_input, val_label in val_dataloader:
                val_label = val_label.to(device)
                # val_input = val_input.squeeze(1).to(device)
                val_input = val_input.to(device)

                output = model(val_input)

                batch_loss = criterion(output, val_label)
                total_loss_val += batch_loss.item()

                acc = (output.argmax(dim=1) == val_label).sum().item()
                total_acc_val += acc
        
        print(
            f'Epochs: {epoch_num + 1} | Train Loss: {total_loss_train / len(train_data): .3f} \
            | Train Accuracy: {total_acc_train / len(train_data): .3f} \
            | Val Loss: {total_loss_val / len(val_data): .3f} \
            | Val Accuracy: {total_acc_val / len(val_data): .3f}')

        bert_config = 'base'
        val_acc = f'{total_acc_val / len(val_data):.3f}'
        torch.save(model.state_dict(), f'{MODEL_SAVE_PATH}/codebert_{bert_config}_mlp_eachDropout_{val_acc}_ep{epoch_num + 1}.pt')

def evaluate(model, test_data):
    test = Dataset(test_data)
    test_dataloader = torch.utils.data.DataLoader(test, batch_size=BATCH_SIZE)

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    total_acc_test = 0
    predict_all = np.array([], dtype=int)
    labels_all = np.array([], dtype=int)
    model.eval()
    with torch.no_grad():
        for test_input, test_label in test_dataloader:
            test_label = test_label.to(device)
            # test_input = test_input.squeeze(1).to(device)
            test_input = test_input.to(device)

            output = model(test_input)

            acc = (output.argmax(dim=1) == test_label).sum().item()
            total_acc_test += acc

            test_label = test_label.data.cpu().numpy()
            predic = output.argmax(dim=1).data.cpu().numpy()
            labels_all = np.append(labels_all, test_label)
            predict_all = np.append(predict_all, predic)

    report = metrics.classification_report(labels_all, predict_all, target_names=['benign', 'vulnerable'], digits=4)
    confusion = metrics.confusion_matrix(labels_all, predict_all)
    print(f'Test Accuracy: {total_acc_test / len(test_data): .3f}')
    print(report)
    print(confusion)


# Training

In [4]:
if not ONLY_TESTING:
    model = CodeBertClassifier()
    model.check_parameters()
    model.to(device)

    df_train = pd.read_json(f'{DATASET_PATH}/train.json')
    df_val = pd.read_json(f'{DATASET_PATH}/val.json')
    df_test = pd.read_json(f'{DATASET_PATH}/test.json')
    train(model, df_train, df_val, LR, EPOCHS)
    evaluate(model, df_test)


The number of CodeBert parameters: 124645632


100%|██████████| 2612/2612 [03:41<00:00, 11.79it/s]


Epochs: 1 | Train Loss:  0.169             | Train Accuracy:  0.573             | Val Loss:  0.168             | Val Accuracy:  0.574


100%|██████████| 2612/2612 [03:43<00:00, 11.71it/s]


Epochs: 2 | Train Loss:  0.166             | Train Accuracy:  0.574             | Val Loss:  0.163             | Val Accuracy:  0.596


100%|██████████| 2612/2612 [03:42<00:00, 11.72it/s]


Epochs: 3 | Train Loss:  0.162             | Train Accuracy:  0.611             | Val Loss:  0.160             | Val Accuracy:  0.613


100%|██████████| 2612/2612 [03:42<00:00, 11.73it/s]


Epochs: 4 | Train Loss:  0.158             | Train Accuracy:  0.626             | Val Loss:  0.159             | Val Accuracy:  0.614


100%|██████████| 2612/2612 [03:42<00:00, 11.72it/s]


Epochs: 5 | Train Loss:  0.155             | Train Accuracy:  0.647             | Val Loss:  0.158             | Val Accuracy:  0.621


100%|██████████| 2612/2612 [03:42<00:00, 11.72it/s]


Epochs: 6 | Train Loss:  0.149             | Train Accuracy:  0.668             | Val Loss:  0.156             | Val Accuracy:  0.645


100%|██████████| 2612/2612 [03:42<00:00, 11.73it/s]


Epochs: 7 | Train Loss:  0.143             | Train Accuracy:  0.693             | Val Loss:  0.157             | Val Accuracy:  0.648


100%|██████████| 2612/2612 [03:42<00:00, 11.72it/s]


Epochs: 8 | Train Loss:  0.134             | Train Accuracy:  0.724             | Val Loss:  0.161             | Val Accuracy:  0.648


100%|██████████| 2612/2612 [03:42<00:00, 11.72it/s]


Epochs: 9 | Train Loss:  0.123             | Train Accuracy:  0.760             | Val Loss:  0.167             | Val Accuracy:  0.647


100%|██████████| 2612/2612 [03:42<00:00, 11.74it/s]


Epochs: 10 | Train Loss:  0.111             | Train Accuracy:  0.796             | Val Loss:  0.178             | Val Accuracy:  0.645
Test Accuracy:  0.645
              precision    recall  f1-score   support

      benign     0.6776    0.7258    0.7009      1995
  vulnerable     0.5939    0.5373    0.5642      1489

    accuracy                         0.6452      3484
   macro avg     0.6357    0.6315    0.6325      3484
weighted avg     0.6418    0.6452    0.6424      3484

[[1448  547]
 [ 689  800]]


# Testing

In [4]:
check_point_files_list = ['codebert_base_mlp_eachDropout_0.648_ep8', 'codebert_base_mlp_eachDropout_0.648_ep7',
                          'codebert_base_mlp_eachDropout_0.647_ep9', 'codebert_base_mlp_eachDropout_0.645_ep6',
                          'codebert_base_mlp_eachDropout_0.645_ep10']
print('check_point_files_list:', check_point_files_list)

seed = 42
os.environ['PYTHONHASHSEED'] = str(seed)
torch.manual_seed(seed)
torch.cuda.manual_seed(seed)
torch.cuda.manual_seed_all(seed)
np.random.seed(seed)
random.seed(seed)
torch.backends.cudnn.benchmark = False
torch.backends.cudnn.deterministic = True

df_test = pd.read_json(f'{DATASET_PATH}/val.json')

for check_point_file in check_point_files_list:
    print(f'\n#######################################{check_point_file}')
    check_point_file = f'{MODEL_SAVE_PATH}/{check_point_file}.pt'
    model = CodeBertClassifier()
    model.to(device)
    model.load_state_dict(torch.load(check_point_file))
    evaluate(model, df_test)


check_point_files_list: ['codebert_base_mlp_eachDropout_0.648_ep8', 'codebert_base_mlp_eachDropout_0.648_ep7', 'codebert_base_mlp_eachDropout_0.647_ep9', 'codebert_base_mlp_eachDropout_0.645_ep6', 'codebert_base_mlp_eachDropout_0.645_ep10']

#######################################codebert_base_mlp_eachDropout_0.648_ep8
Test Accuracy:  0.651
              precision    recall  f1-score   support

      benign     0.6804    0.7353    0.7068      1995
  vulnerable     0.6024    0.5373    0.5680      1489

    accuracy                         0.6507      3484
   macro avg     0.6414    0.6363    0.6374      3484
weighted avg     0.6471    0.6507    0.6475      3484

[[1467  528]
 [ 689  800]]

#######################################codebert_base_mlp_eachDropout_0.648_ep7
Test Accuracy:  0.647
              precision    recall  f1-score   support

      benign     0.6815    0.7198    0.7001      1995
  vulnerable     0.5940    0.5494    0.5708      1489

    accuracy                         