# Global settings

In [None]:
# Reference:
# This source code file refers to: https://towardsdatascience.com/text-classification-with-bert-in-pytorch-887965e5820f


In [1]:
import pandas as pd
import numpy as np
import torch
from transformers import BertTokenizer
from torch import nn
from transformers import BertModel
from torch.optim import Adam
from tqdm import tqdm
from sklearn import metrics
from torch.nn.parallel import DistributedDataParallel
import os
import random

# The following randomization refers to: https://github.com/ICL-ml4csec/VulBERTa/blob/main/Finetuning_VulBERTa-MLP.ipynb
seed = 42
os.environ['PYTHONHASHSEED'] = str(seed)
torch.manual_seed(seed)
torch.cuda.manual_seed(seed)
torch.cuda.manual_seed_all(seed)
np.random.seed(seed)
random.seed(seed)
torch.backends.cudnn.benchmark = False
torch.backends.cudnn.deterministic = True

MODEL_SAVE_PATH = '/root/autodl-tmp/finetuned_models'
DATASET_ROOT_PATH = '/root/autodl-tmp'
BERT_CONFIG = 'bert-large-cased' # bert-large-cased , bert-base-cased
labels = {0:0, 1:1}

# -----------------------------------------------

ONLY_TESTING = False

MODEL_NAME = 'fc'
DATASET_NAME = 'qemu'
# DATASET_MASKING = 'masked_'
DATASET_MASKING = ''

BATCH_SIZE = 4
EPOCHS = 15
LR = 1e-6


In [2]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
tokenizer = BertTokenizer.from_pretrained(BERT_CONFIG)
DATASET_PATH = f'{DATASET_ROOT_PATH}/output_dataset_1/{DATASET_MASKING}{DATASET_NAME}'
MODEL_SAVE_PATH = f'{MODEL_SAVE_PATH}/{DATASET_MASKING}{DATASET_NAME}'

print('seed:', seed)
print('MODEL_SAVE_PATH:', MODEL_SAVE_PATH)
print('DATASET_PATH:', DATASET_PATH)
print('BERT_CONFIG:', BERT_CONFIG)

print('ONLY_TESTING:', ONLY_TESTING)
print('DATASET_NAME:', DATASET_NAME)
print('DATASET_MASKING:', DATASET_MASKING)

print('BATCH_SIZE:', BATCH_SIZE)
print('EPOCHS:', EPOCHS)
print('LR:', LR)

print('using device:', device)
print('GPU count:', torch.cuda.device_count())


seed: 42
MODEL_SAVE_PATH: /root/autodl-tmp/finetuned_models/qemu
DATASET_PATH: /root/autodl-tmp/output_dataset_1/qemu
BERT_CONFIG: bert-large-cased
ONLY_TESTING: False
DATASET_NAME: qemu
DATASET_MASKING: 
BATCH_SIZE: 4
EPOCHS: 15
LR: 1e-06
using device: cuda
GPU count: 1


# Definition

In [3]:
def mkdir_if_not_exist(directory):
    if not os.path.exists(directory):
        os.mkdir(directory)

mkdir_if_not_exist(MODEL_SAVE_PATH)

class Dataset(torch.utils.data.Dataset):
    def __init__(self, df):
        self.labels = [labels[label] for label in df['label']]
        self.texts = [tokenizer(text, padding='max_length', max_length=512, truncation=True,
                                return_tensors="pt") for text in df['commit_message']]

    def classes(self):
        return self.labels

    def __len__(self):
        return len(self.labels)

    def get_batch_labels(self, idx):
        # Fetch a batch of labels
        return np.array(self.labels[idx])

    def get_batch_texts(self, idx):
        # Fetch a batch of inputs
        return self.texts[idx]

    def __getitem__(self, idx):
        batch_texts = self.get_batch_texts(idx)
        batch_y = self.get_batch_labels(idx)
        return batch_texts, batch_y

class BertClassifier(nn.Module):
    def __init__(self, dropout=0.5):
        super(BertClassifier, self).__init__()

        self.bert = BertModel.from_pretrained(BERT_CONFIG)
        self.dropout = nn.Dropout(dropout)
        if BERT_CONFIG == 'bert-large-cased':
            self.linear = nn.Linear(1024, len(labels))
        else:
            self.linear = nn.Linear(768, len(labels))
        self.relu = nn.ReLU()

    def forward(self, input_id, mask):
        _, pooled_output = self.bert(input_ids=input_id, attention_mask=mask, return_dict=False)
        dropout_output = self.dropout(pooled_output)
        linear_output = self.linear(dropout_output)
        # final_layer = self.relu(linear_output) # IMPO CHANGE
        return linear_output

    def check_parameters(self):
        print('The number of Bert parameters:', self.bert.num_parameters())

def train(model, train_data, val_data, learning_rate, epochs):
    train, val = Dataset(train_data), Dataset(val_data)

    # train_dataloader = torch.utils.data.DataLoader(train, batch_size=BATCH_SIZE, shuffle=True) # IMPO CHANGE
    train_dataloader = torch.utils.data.DataLoader(train, batch_size=BATCH_SIZE)
    val_dataloader = torch.utils.data.DataLoader(val, batch_size=BATCH_SIZE)

    use_cuda = torch.cuda.is_available()
    device = torch.device("cuda" if use_cuda else "cpu")

    criterion = nn.CrossEntropyLoss()
    optimizer = Adam(model.parameters(), lr=learning_rate)

    if use_cuda:
        criterion = criterion.cuda()

    for epoch_num in range(epochs):
        total_acc_train = 0
        total_loss_train = 0

        for train_input, train_label in tqdm(train_dataloader):
            train_label = train_label.to(device)
            mask = train_input['attention_mask'].to(device)
            input_id = train_input['input_ids'].squeeze(1).to(device)

            output = model(input_id, mask)
            batch_loss = criterion(output, train_label)
            total_loss_train += batch_loss.item()

            acc = (output.argmax(dim=1) == train_label).sum().item()
            total_acc_train += acc

            model.zero_grad()
            batch_loss.backward()
            optimizer.step()

        total_acc_val = 0
        total_loss_val = 0

        with torch.no_grad():
            for val_input, val_label in val_dataloader:
                val_label = val_label.to(device)
                mask = val_input['attention_mask'].to(device)
                input_id = val_input['input_ids'].squeeze(1).to(device)

                output = model(input_id, mask)

                batch_loss = criterion(output, val_label)
                total_loss_val += batch_loss.item()

                acc = (output.argmax(dim=1) == val_label).sum().item()
                total_acc_val += acc
        
        print(
            f'Epochs: {epoch_num + 1} | Train Loss: {total_loss_train / len(train_data): .3f} \
            | Train Accuracy: {total_acc_train / len(train_data): .3f} \
            | Val Loss: {total_loss_val / len(val_data): .3f} \
            | Val Accuracy: {total_acc_val / len(val_data): .3f}')

        bert_config = 'large' if BERT_CONFIG == 'bert-large-cased' else 'base'
        val_acc = f'{total_acc_val / len(val_data):.3f}'
        torch.save(model.state_dict(), f'{MODEL_SAVE_PATH}/bert_{bert_config}_{MODEL_NAME}_{val_acc}_ep{epoch_num + 1}.pt')

def evaluate(model, test_data):
    test = Dataset(test_data)
    test_dataloader = torch.utils.data.DataLoader(test, batch_size=BATCH_SIZE)

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    total_acc_test = 0
    predict_all = np.array([], dtype=int)
    labels_all = np.array([], dtype=int)
    model.eval()
    with torch.no_grad():
        for test_input, test_label in test_dataloader:
            test_label = test_label.to(device)
            mask = test_input['attention_mask'].to(device)
            input_id = test_input['input_ids'].squeeze(1).to(device)

            output = model(input_id, mask)

            acc = (output.argmax(dim=1) == test_label).sum().item()
            total_acc_test += acc

            test_label = test_label.data.cpu().numpy()
            predic = output.argmax(dim=1).data.cpu().numpy()
            labels_all = np.append(labels_all, test_label)
            predict_all = np.append(predict_all, predic)

    report = metrics.classification_report(labels_all, predict_all, target_names=['benign', 'vulnerable'], digits=4)
    confusion = metrics.confusion_matrix(labels_all, predict_all)
    print(f'Test Accuracy: {total_acc_test / len(test_data): .3f}')
    print(report)
    print(confusion)


# Train the model

In [4]:
if not ONLY_TESTING:
    model = BertClassifier()
    # model = nn.DataParallel(model) # get stuck when using multiple GPUs on AutoDL
    model.check_parameters()
    model.to(device)
    
    df_train = pd.read_json(f'{DATASET_PATH}/train.json')
    df_val = pd.read_json(f'{DATASET_PATH}/val.json')
    df_test = pd.read_json(f'{DATASET_PATH}/test.json')
    train(model, df_train, df_val, LR, EPOCHS)
    evaluate(model, df_test)


Some weights of the model checkpoint at bert-large-cased were not used when initializing BertModel: ['cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.bias', 'cls.predictions.transform.dense.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


The number of Bert parameters: 333579264


100%|██████████| 2227/2227 [09:02<00:00,  4.10it/s]


Epochs: 1 | Train Loss:  0.156             | Train Accuracy:  0.657             | Val Loss:  0.101             | Val Accuracy:  0.841


100%|██████████| 2227/2227 [09:03<00:00,  4.09it/s]


Epochs: 2 | Train Loss:  0.082             | Train Accuracy:  0.877             | Val Loss:  0.069             | Val Accuracy:  0.897


100%|██████████| 2227/2227 [09:05<00:00,  4.08it/s]


Epochs: 3 | Train Loss:  0.050             | Train Accuracy:  0.936             | Val Loss:  0.066             | Val Accuracy:  0.903


100%|██████████| 2227/2227 [09:04<00:00,  4.09it/s]


Epochs: 4 | Train Loss:  0.032             | Train Accuracy:  0.966             | Val Loss:  0.060             | Val Accuracy:  0.918


100%|██████████| 2227/2227 [09:04<00:00,  4.09it/s]


Epochs: 5 | Train Loss:  0.018             | Train Accuracy:  0.982             | Val Loss:  0.051             | Val Accuracy:  0.938


100%|██████████| 2227/2227 [09:04<00:00,  4.09it/s]


Epochs: 6 | Train Loss:  0.013             | Train Accuracy:  0.989             | Val Loss:  0.056             | Val Accuracy:  0.932


100%|██████████| 2227/2227 [09:04<00:00,  4.09it/s]


Epochs: 7 | Train Loss:  0.008             | Train Accuracy:  0.993             | Val Loss:  0.050             | Val Accuracy:  0.944


 29%|██▉       | 642/2227 [02:37<06:28,  4.08it/s]IOPub message rate exceeded.
The Jupyter server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--ServerApp.iopub_msg_rate_limit`.

Current values:
ServerApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
ServerApp.rate_limit_window=3.0 (secs)

100%|██████████| 2227/2227 [09:05<00:00,  4.08it/s]


Epochs: 8 | Train Loss:  0.006             | Train Accuracy:  0.995             | Val Loss:  0.052             | Val Accuracy:  0.952


100%|██████████| 2227/2227 [09:03<00:00,  4.09it/s]


Epochs: 9 | Train Loss:  0.004             | Train Accuracy:  0.995             | Val Loss:  0.057             | Val Accuracy:  0.948


100%|██████████| 2227/2227 [09:04<00:00,  4.09it/s]


Epochs: 10 | Train Loss:  0.003             | Train Accuracy:  0.996             | Val Loss:  0.055             | Val Accuracy:  0.947


100%|██████████| 2227/2227 [09:03<00:00,  4.09it/s]


Epochs: 11 | Train Loss:  0.003             | Train Accuracy:  0.996             | Val Loss:  0.068             | Val Accuracy:  0.937


100%|██████████| 2227/2227 [09:03<00:00,  4.09it/s]


Epochs: 12 | Train Loss:  0.003             | Train Accuracy:  0.996             | Val Loss:  0.059             | Val Accuracy:  0.948


100%|██████████| 2227/2227 [09:04<00:00,  4.09it/s]


Epochs: 13 | Train Loss:  0.001             | Train Accuracy:  0.998             | Val Loss:  0.057             | Val Accuracy:  0.959


100%|██████████| 2227/2227 [09:05<00:00,  4.08it/s]


Epochs: 14 | Train Loss:  0.001             | Train Accuracy:  0.998             | Val Loss:  0.060             | Val Accuracy:  0.955


100%|██████████| 2227/2227 [09:04<00:00,  4.09it/s]


Epochs: 15 | Train Loss:  0.001             | Train Accuracy:  0.999             | Val Loss:  0.065             | Val Accuracy:  0.960
Test Accuracy:  0.960
              precision    recall  f1-score   support

      benign     0.9509    0.9816    0.9660      1738
  vulnerable     0.9728    0.9285    0.9501      1231

    accuracy                         0.9596      2969
   macro avg     0.9619    0.9551    0.9581      2969
weighted avg     0.9600    0.9596    0.9594      2969

[[1706   32]
 [  88 1143]]


# Test the model

In [5]:
check_point_files_list = ['bert_large_fc_0.960_ep15', 'bert_large_fc_0.959_ep13',
                          'bert_large_fc_0.955_ep14']
print('check_point_files_list:', check_point_files_list)

df_test = pd.read_json(f'{DATASET_PATH}/val.json')

for check_point_file in check_point_files_list:
    print(f'\n#######################################{check_point_file}')
    check_point_file = f'{MODEL_SAVE_PATH}/{check_point_file}.pt'
    model = BertClassifier()
    model.to(device)
    model.load_state_dict(torch.load(check_point_file))
    evaluate(model, df_test)


check_point_files_list: ['bert_large_fc_0.960_ep15', 'bert_large_fc_0.959_ep13', 'bert_large_fc_0.955_ep14']

#######################################bert_large_fc_0.960_ep15


Some weights of the model checkpoint at bert-large-cased were not used when initializing BertModel: ['cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.bias', 'cls.predictions.transform.dense.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Test Accuracy:  0.960
              precision    recall  f1-score   support

      benign     0.9509    0.9816    0.9660      1738
  vulnerable     0.9728    0.9285    0.9501      1231

    accuracy                         0.9596      2969
   macro avg     0.9619    0.9551    0.9581      2969
weighted avg     0.9600    0.9596    0.9594      2969

[[1706   32]
 [  88 1143]]

#######################################bert_large_fc_0.959_ep13


Some weights of the model checkpoint at bert-large-cased were not used when initializing BertModel: ['cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.bias', 'cls.predictions.transform.dense.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Test Accuracy:  0.959
              precision    recall  f1-score   support

      benign     0.9469    0.9856    0.9659      1738
  vulnerable     0.9784    0.9220    0.9494      1231

    accuracy                         0.9592      2969
   macro avg     0.9627    0.9538    0.9576      2969
weighted avg     0.9600    0.9592    0.9590      2969

[[1713   25]
 [  96 1135]]

#######################################bert_large_fc_0.955_ep14


Some weights of the model checkpoint at bert-large-cased were not used when initializing BertModel: ['cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.bias', 'cls.predictions.transform.dense.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Test Accuracy:  0.955
              precision    recall  f1-score   support

      benign     0.9526    0.9718    0.9621      1738
  vulnerable     0.9590    0.9318    0.9452      1231

    accuracy                         0.9552      2969
   macro avg     0.9558    0.9518    0.9537      2969
weighted avg     0.9553    0.9552    0.9551      2969

[[1689   49]
 [  84 1147]]
