# Global settings

In [None]:
# References:
# This source code file refers to:
# https://towardsdatascience.com/text-classification-with-bert-in-pytorch-887965e5820f
# https://huggingface.co/docs/transformers/model_doc/roberta


In [1]:
import pandas as pd
import numpy as np
import torch
from transformers import BertTokenizer
from torch import nn
from transformers import BertModel
from torch.optim import Adam
from tqdm import tqdm
from sklearn import metrics
from torch.nn.parallel import DistributedDataParallel
import os
import random
from transformers import RobertaTokenizerFast, RobertaForSequenceClassification,Trainer, TrainingArguments, RobertaModel

# The following randomization refers to: https://github.com/ICL-ml4csec/VulBERTa/blob/main/Finetuning_VulBERTa-MLP.ipynb
seed = 42
os.environ['PYTHONHASHSEED'] = str(seed)
torch.manual_seed(seed)
torch.cuda.manual_seed(seed)
torch.cuda.manual_seed_all(seed)
np.random.seed(seed)
random.seed(seed)
torch.backends.cudnn.benchmark = False
torch.backends.cudnn.deterministic = True

MODEL_SAVE_PATH = '/root/autodl-tmp/finetuned_models'
DATASET_ROOT_PATH = '/root/autodl-tmp'
BERT_CONFIG = 'roberta-base' # roberta-base, roberta-large
MODEL_NAME = 'fc'
labels = {0:0, 1:1}

# -----------------------------------------------

ONLY_TESTING = False

DATASET_NAME = 'combined'
# DATASET_MASKING = '_masked'
DATASET_MASKING = ''

BATCH_SIZE = 4
EPOCHS = 10
LR = 1e-6


In [2]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
tokenizer = RobertaTokenizerFast.from_pretrained(BERT_CONFIG)
DATASET_PATH = f'{DATASET_ROOT_PATH}/output_dataset_1/{DATASET_NAME}{DATASET_MASKING}'
MODEL_SAVE_PATH = f'{MODEL_SAVE_PATH}/{DATASET_NAME}{DATASET_MASKING}'

print('seed:', seed)
print('MODEL_SAVE_PATH:', MODEL_SAVE_PATH)
print('DATASET_PATH:', DATASET_PATH)
print('BERT_CONFIG:', BERT_CONFIG)

print('ONLY_TESTING:', ONLY_TESTING)
print('DATASET_NAME:', DATASET_NAME)
print('DATASET_MASKING:', DATASET_MASKING)

print('BATCH_SIZE:', BATCH_SIZE)
print('EPOCHS:', EPOCHS)
print('LR:', LR)

print('using device:', device)
print('GPU count:', torch.cuda.device_count())


seed: 42
MODEL_SAVE_PATH: /root/autodl-tmp/finetuned_models/combined
DATASET_PATH: /root/autodl-tmp/output_dataset_1/combined
BERT_CONFIG: roberta-base
ONLY_TESTING: False
DATASET_NAME: combined
DATASET_MASKING: 
BATCH_SIZE: 4
EPOCHS: 10
LR: 1e-06
using device: cuda
GPU count: 1


# Definition

In [3]:
def mkdir_if_not_exist(directory):
    if not os.path.exists(directory):
        os.mkdir(directory)

mkdir_if_not_exist(MODEL_SAVE_PATH)

class Dataset(torch.utils.data.Dataset):
    def __init__(self, df):
        self.labels = [labels[label] for label in df['label']]
        self.texts = [tokenizer(text, padding='max_length', max_length=512, truncation=True,
                                return_tensors="pt") for text in df['commit_message']]

    def classes(self):
        return self.labels

    def __len__(self):
        return len(self.labels)

    def get_batch_labels(self, idx):
        # Fetch a batch of labels
        return np.array(self.labels[idx])

    def get_batch_texts(self, idx):
        # Fetch a batch of inputs
        return self.texts[idx]

    def __getitem__(self, idx):
        batch_texts = self.get_batch_texts(idx)
        batch_y = self.get_batch_labels(idx)
        return batch_texts, batch_y

class BertClassifier(nn.Module):
    def __init__(self, dropout=0.5):
        super(BertClassifier, self).__init__()

        self.bert = RobertaModel.from_pretrained(BERT_CONFIG)
        self.dropout = nn.Dropout(dropout)
        if BERT_CONFIG == 'roberta-large':
            self.linear = nn.Linear(1024, len(labels))
        else:
            self.linear = nn.Linear(768, len(labels))
        self.relu = nn.ReLU()

    def forward(self, input_id, mask):
        _, pooled_output = self.bert(input_ids=input_id, attention_mask=mask, return_dict=False)
        dropout_output = self.dropout(pooled_output)
        linear_output = self.linear(dropout_output)
        # final_layer = self.relu(linear_output) # IMPO CHANGE
        return linear_output

    def check_parameters(self):
        print('The number of Bert parameters:', self.bert.num_parameters())

def train(model, train_data, val_data, learning_rate, epochs):
    train, val = Dataset(train_data), Dataset(val_data)

    train_dataloader = torch.utils.data.DataLoader(train, batch_size=BATCH_SIZE)
    val_dataloader = torch.utils.data.DataLoader(val, batch_size=BATCH_SIZE)

    use_cuda = torch.cuda.is_available()

    criterion = nn.CrossEntropyLoss()
    optimizer = Adam(model.parameters(), lr=learning_rate)

    if use_cuda:
        criterion = criterion.cuda()

    for epoch_num in range(epochs):
        model.train()
        total_acc_train = 0
        total_loss_train = 0

        for train_input, train_label in tqdm(train_dataloader):
            train_label = train_label.to(device)
            mask = train_input['attention_mask'].to(device)
            input_id = train_input['input_ids'].squeeze(1).to(device)

            output = model(input_id, mask)
            batch_loss = criterion(output, train_label)
            total_loss_train += batch_loss.item()

            acc = (output.argmax(dim=1) == train_label).sum().item()
            total_acc_train += acc

            model.zero_grad()
            batch_loss.backward()
            optimizer.step()

        total_acc_val = 0
        total_loss_val = 0

        model.eval()
        with torch.no_grad():
            for val_input, val_label in val_dataloader:
                val_label = val_label.to(device)
                mask = val_input['attention_mask'].to(device)
                input_id = val_input['input_ids'].squeeze(1).to(device)

                output = model(input_id, mask)

                batch_loss = criterion(output, val_label)
                total_loss_val += batch_loss.item()

                acc = (output.argmax(dim=1) == val_label).sum().item()
                total_acc_val += acc
        
        print(
            f'Epochs: {epoch_num + 1} | Train Loss: {total_loss_train / len(train_data): .4f} \
            | Train Accuracy: {total_acc_train / len(train_data): .4f} \
            | Val Loss: {total_loss_val / len(val_data): .4f} \
            | Val Accuracy: {total_acc_val / len(val_data): .4f}')

        bert_config = 'large' if BERT_CONFIG == 'roberta-large' else 'base'
        val_acc = f'{total_acc_val / len(val_data):.4f}'
        torch.save(model.state_dict(), f'{MODEL_SAVE_PATH}/roberta_{bert_config}_{MODEL_NAME}_{val_acc}_ep{epoch_num + 1}.pt')

def evaluate(model, test_data):
    test = Dataset(test_data)
    test_dataloader = torch.utils.data.DataLoader(test, batch_size=BATCH_SIZE)

    total_acc_test = 0
    predict_all = np.array([], dtype=int)
    labels_all = np.array([], dtype=int)
    model.eval()
    with torch.no_grad():
        for test_input, test_label in test_dataloader:
            test_label = test_label.to(device)
            mask = test_input['attention_mask'].to(device)
            input_id = test_input['input_ids'].squeeze(1).to(device)

            output = model(input_id, mask)

            acc = (output.argmax(dim=1) == test_label).sum().item()
            total_acc_test += acc

            test_label = test_label.data.cpu().numpy()
            predic = output.argmax(dim=1).data.cpu().numpy()
            labels_all = np.append(labels_all, test_label)
            predict_all = np.append(predict_all, predic)

    report = metrics.classification_report(labels_all, predict_all, target_names=['benign', 'vulnerable'], digits=4)
    confusion = metrics.confusion_matrix(labels_all, predict_all)
    print(f'Test Accuracy: {total_acc_test / len(test_data): .4f}')
    print(report)
    print(confusion)


# Train the model

In [4]:
if not ONLY_TESTING:
    model = BertClassifier()
    model.to(device)

    df_train = pd.read_json(f'{DATASET_PATH}/train.json')
    df_val = pd.read_json(f'{DATASET_PATH}/val.json')
    train(model, df_train, df_val, LR, EPOCHS)


Some weights of the model checkpoint at roberta-base were not used when initializing RobertaModel: ['lm_head.layer_norm.weight', 'lm_head.decoder.weight', 'lm_head.bias', 'lm_head.layer_norm.bias', 'lm_head.dense.bias', 'lm_head.dense.weight']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
100%|██████████| 4709/4709 [06:58<00:00, 11.25it/s]


Epochs: 1 | Train Loss:  0.1128             | Train Accuracy:  0.7561             | Val Loss:  0.0592             | Val Accuracy:  0.9128


100%|██████████| 4709/4709 [06:59<00:00, 11.23it/s]


Epochs: 2 | Train Loss:  0.0521             | Train Accuracy:  0.9224             | Val Loss:  0.0464             | Val Accuracy:  0.9391


100%|██████████| 4709/4709 [06:58<00:00, 11.25it/s]


Epochs: 3 | Train Loss:  0.0371             | Train Accuracy:  0.9493             | Val Loss:  0.0394             | Val Accuracy:  0.9550


100%|██████████| 4709/4709 [06:58<00:00, 11.25it/s]


Epochs: 4 | Train Loss:  0.0292             | Train Accuracy:  0.9629             | Val Loss:  0.0384             | Val Accuracy:  0.9569


100%|██████████| 4709/4709 [06:58<00:00, 11.25it/s]


Epochs: 5 | Train Loss:  0.0237             | Train Accuracy:  0.9707             | Val Loss:  0.0333             | Val Accuracy:  0.9650


100%|██████████| 4709/4709 [06:59<00:00, 11.22it/s]


Epochs: 6 | Train Loss:  0.0204             | Train Accuracy:  0.9751             | Val Loss:  0.0294             | Val Accuracy:  0.9684


100%|██████████| 4709/4709 [06:58<00:00, 11.25it/s]


Epochs: 7 | Train Loss:  0.0177             | Train Accuracy:  0.9798             | Val Loss:  0.0279             | Val Accuracy:  0.9699


100%|██████████| 4709/4709 [06:58<00:00, 11.25it/s]


Epochs: 8 | Train Loss:  0.0144             | Train Accuracy:  0.9833             | Val Loss:  0.0253             | Val Accuracy:  0.9743


100%|██████████| 4709/4709 [06:59<00:00, 11.23it/s]


Epochs: 9 | Train Loss:  0.0111             | Train Accuracy:  0.9876             | Val Loss:  0.0250             | Val Accuracy:  0.9767


100%|██████████| 4709/4709 [06:59<00:00, 11.23it/s]


Epochs: 10 | Train Loss:  0.0089             | Train Accuracy:  0.9906             | Val Loss:  0.0233             | Val Accuracy:  0.9767


# Test the model

In [6]:
check_point_files_list = ['roberta_base_fc_0.9767_ep10', 'roberta_base_fc_0.9767_ep9']
print('check_point_files_list:', check_point_files_list)

df_val = pd.read_json(f'{DATASET_PATH}/val.json')

def test_model(df_dataset):
    for check_point_file in check_point_files_list:
        print(f'\n#######################################{check_point_file}')
        check_point_file = f'{MODEL_SAVE_PATH}/{check_point_file}.pt'
        model = BertClassifier()
        model.to(device)
        model.load_state_dict(torch.load(check_point_file))
        evaluate(model, df_dataset)

print('Testing val dataset:')
test_model(df_val)


check_point_files_list: ['roberta_base_fc_0.9767_ep10', 'roberta_base_fc_0.9767_ep9']
Testing val dataset:

#######################################roberta_base_fc_0.9767_ep10


Some weights of the model checkpoint at roberta-base were not used when initializing RobertaModel: ['lm_head.layer_norm.weight', 'lm_head.decoder.weight', 'lm_head.bias', 'lm_head.layer_norm.bias', 'lm_head.dense.bias', 'lm_head.dense.weight']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Test Accuracy:  0.9767
              precision    recall  f1-score   support

      benign     0.9766    0.9774    0.9770      2389
  vulnerable     0.9767    0.9759    0.9763      2322

    accuracy                         0.9767      4711
   macro avg     0.9767    0.9766    0.9766      4711
weighted avg     0.9767    0.9767    0.9767      4711

[[2335   54]
 [  56 2266]]

#######################################roberta_base_fc_0.9767_ep9


Some weights of the model checkpoint at roberta-base were not used when initializing RobertaModel: ['lm_head.layer_norm.weight', 'lm_head.decoder.weight', 'lm_head.bias', 'lm_head.layer_norm.bias', 'lm_head.dense.bias', 'lm_head.dense.weight']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Test Accuracy:  0.9767
              precision    recall  f1-score   support

      benign     0.9746    0.9795    0.9770      2389
  vulnerable     0.9788    0.9737    0.9763      2322

    accuracy                         0.9767      4711
   macro avg     0.9767    0.9766    0.9766      4711
weighted avg     0.9767    0.9767    0.9766      4711

[[2340   49]
 [  61 2261]]
