# Global settings

In [None]:
# Reference:
# This source code file refers to:
# https://towardsdatascience.com/text-classification-with-bert-in-pytorch-887965e5820f


In [1]:
import pandas as pd
import numpy as np
import torch
from transformers import BertTokenizer
from torch import nn
from transformers import BertModel
from torch.optim import Adam
from tqdm import tqdm
from sklearn import metrics
from torch.nn.parallel import DistributedDataParallel
import os
import random

# The following randomization refers to: https://github.com/ICL-ml4csec/VulBERTa/blob/main/Finetuning_VulBERTa-MLP.ipynb
seed = 42
os.environ['PYTHONHASHSEED'] = str(seed)
torch.manual_seed(seed)
torch.cuda.manual_seed(seed)
torch.cuda.manual_seed_all(seed)
np.random.seed(seed)
random.seed(seed)
torch.backends.cudnn.benchmark = False
torch.backends.cudnn.deterministic = True

MODEL_SAVE_PATH = '/root/autodl-tmp/finetuned_models'
DATASET_ROOT_PATH = '/root/autodl-tmp'
BERT_CONFIG = 'bert-large-cased' # bert-large-cased , bert-base-cased
labels = {0:0, 1:1}

# -----------------------------------------------

ONLY_TESTING = False

MODEL_NAME = 'fc'
DATASET_NAME = 'ffmpeg'
# DATASET_MASKING = 'masked_'
DATASET_MASKING = ''

BATCH_SIZE = 4
EPOCHS = 15
LR = 1e-6


In [2]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
tokenizer = BertTokenizer.from_pretrained(BERT_CONFIG)
DATASET_PATH = f'{DATASET_ROOT_PATH}/output_dataset_1/{DATASET_MASKING}{DATASET_NAME}'
MODEL_SAVE_PATH = f'{MODEL_SAVE_PATH}/{DATASET_MASKING}{DATASET_NAME}'

print('seed:', seed)
print('MODEL_SAVE_PATH:', MODEL_SAVE_PATH)
print('DATASET_PATH:', DATASET_PATH)
print('BERT_CONFIG:', BERT_CONFIG)

print('ONLY_TESTING:', ONLY_TESTING)
print('DATASET_NAME:', DATASET_NAME)
print('DATASET_MASKING:', DATASET_MASKING)

print('BATCH_SIZE:', BATCH_SIZE)
print('EPOCHS:', EPOCHS)
print('LR:', LR)

print('using device:', device)
print('GPU count:', torch.cuda.device_count())


seed: 42
MODEL_SAVE_PATH: /root/autodl-tmp/finetuned_models/ffmpeg
DATASET_PATH: /root/autodl-tmp/output_dataset_1/ffmpeg
BERT_CONFIG: bert-large-cased
ONLY_TESTING: False
DATASET_NAME: ffmpeg
DATASET_MASKING: 
BATCH_SIZE: 4
EPOCHS: 15
LR: 1e-06
using device: cuda
GPU count: 1


# Definition

In [3]:
def mkdir_if_not_exist(directory):
    if not os.path.exists(directory):
        os.mkdir(directory)

mkdir_if_not_exist(MODEL_SAVE_PATH)

class Dataset(torch.utils.data.Dataset):
    def __init__(self, df):
        self.labels = [labels[label] for label in df['label']]
        self.texts = [tokenizer(text, padding='max_length', max_length=512, truncation=True,
                                return_tensors="pt") for text in df['commit_message']]

    def classes(self):
        return self.labels

    def __len__(self):
        return len(self.labels)

    def get_batch_labels(self, idx):
        # Fetch a batch of labels
        return np.array(self.labels[idx])

    def get_batch_texts(self, idx):
        # Fetch a batch of inputs
        return self.texts[idx]

    def __getitem__(self, idx):
        batch_texts = self.get_batch_texts(idx)
        batch_y = self.get_batch_labels(idx)
        return batch_texts, batch_y

class BertClassifier(nn.Module):
    def __init__(self, dropout=0.5):
        super(BertClassifier, self).__init__()

        self.bert = BertModel.from_pretrained(BERT_CONFIG)
        self.dropout = nn.Dropout(dropout)
        if BERT_CONFIG == 'bert-large-cased':
            self.linear = nn.Linear(1024, len(labels))
        else:
            self.linear = nn.Linear(768, len(labels))
        self.relu = nn.ReLU()

    def forward(self, input_id, mask):
        _, pooled_output = self.bert(input_ids=input_id, attention_mask=mask, return_dict=False)
        dropout_output = self.dropout(pooled_output)
        linear_output = self.linear(dropout_output)
        # final_layer = self.relu(linear_output) # IMPO CHANGE
        return linear_output

    def check_parameters(self):
        print('The number of Bert parameters:', self.bert.num_parameters())

def train(model, train_data, val_data, learning_rate, epochs):
    train, val = Dataset(train_data), Dataset(val_data)

    train_dataloader = torch.utils.data.DataLoader(train, batch_size=BATCH_SIZE)
    val_dataloader = torch.utils.data.DataLoader(val, batch_size=BATCH_SIZE)

    use_cuda = torch.cuda.is_available()
    device = torch.device("cuda" if use_cuda else "cpu")

    criterion = nn.CrossEntropyLoss()
    optimizer = Adam(model.parameters(), lr=learning_rate)

    if use_cuda:
        criterion = criterion.cuda()

    for epoch_num in range(epochs):
        total_acc_train = 0
        total_loss_train = 0

        for train_input, train_label in tqdm(train_dataloader):
            train_label = train_label.to(device)
            mask = train_input['attention_mask'].to(device)
            input_id = train_input['input_ids'].squeeze(1).to(device)

            output = model(input_id, mask)
            batch_loss = criterion(output, train_label)
            total_loss_train += batch_loss.item()

            acc = (output.argmax(dim=1) == train_label).sum().item()
            total_acc_train += acc

            model.zero_grad()
            batch_loss.backward()
            optimizer.step()

        total_acc_val = 0
        total_loss_val = 0

        with torch.no_grad():
            for val_input, val_label in val_dataloader:
                val_label = val_label.to(device)
                mask = val_input['attention_mask'].to(device)
                input_id = val_input['input_ids'].squeeze(1).to(device)

                output = model(input_id, mask)

                batch_loss = criterion(output, val_label)
                total_loss_val += batch_loss.item()

                acc = (output.argmax(dim=1) == val_label).sum().item()
                total_acc_val += acc
        
        print(
            f'Epochs: {epoch_num + 1} | Train Loss: {total_loss_train / len(train_data): .3f} \
            | Train Accuracy: {total_acc_train / len(train_data): .3f} \
            | Val Loss: {total_loss_val / len(val_data): .3f} \
            | Val Accuracy: {total_acc_val / len(val_data): .3f}')

        bert_config = 'large' if BERT_CONFIG == 'bert-large-cased' else 'base'
        val_acc = f'{total_acc_val / len(val_data):.3f}'
        torch.save(model.state_dict(), f'{MODEL_SAVE_PATH}/bert_{bert_config}_{MODEL_NAME}_{val_acc}_ep{epoch_num + 1}.pt')

def evaluate(model, test_data):
    test = Dataset(test_data)
    test_dataloader = torch.utils.data.DataLoader(test, batch_size=BATCH_SIZE)

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    total_acc_test = 0
    predict_all = np.array([], dtype=int)
    labels_all = np.array([], dtype=int)
    model.eval()
    with torch.no_grad():
        for test_input, test_label in test_dataloader:
            test_label = test_label.to(device)
            mask = test_input['attention_mask'].to(device)
            input_id = test_input['input_ids'].squeeze(1).to(device)

            output = model(input_id, mask)

            acc = (output.argmax(dim=1) == test_label).sum().item()
            total_acc_test += acc

            test_label = test_label.data.cpu().numpy()
            predic = output.argmax(dim=1).data.cpu().numpy()
            labels_all = np.append(labels_all, test_label)
            predict_all = np.append(predict_all, predic)

    report = metrics.classification_report(labels_all, predict_all, target_names=['benign', 'vulnerable'], digits=4)
    confusion = metrics.confusion_matrix(labels_all, predict_all)
    print(f'Test Accuracy: {total_acc_test / len(test_data): .3f}')
    print(report)
    print(confusion)


# Train the model

In [4]:
if not ONLY_TESTING:
    model = BertClassifier()
    model.check_parameters()
    model.to(device)
    
    df_train = pd.read_json(f'{DATASET_PATH}/train.json')
    df_val = pd.read_json(f'{DATASET_PATH}/val.json')
    df_test = pd.read_json(f'{DATASET_PATH}/test.json')
    train(model, df_train, df_val, LR, EPOCHS)
    evaluate(model, df_test)


Some weights of the model checkpoint at bert-large-cased were not used when initializing BertModel: ['cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.weight', 'cls.predictions.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


The number of Bert parameters: 333579264


100%|██████████| 2612/2612 [10:32<00:00,  4.13it/s]


Epochs: 1 | Train Loss:  0.127             | Train Accuracy:  0.738             | Val Loss:  0.070             | Val Accuracy:  0.894


100%|██████████| 2612/2612 [10:33<00:00,  4.12it/s]


Epochs: 2 | Train Loss:  0.045             | Train Accuracy:  0.936             | Val Loss:  0.037             | Val Accuracy:  0.951


100%|██████████| 2612/2612 [10:33<00:00,  4.12it/s]


Epochs: 3 | Train Loss:  0.022             | Train Accuracy:  0.974             | Val Loss:  0.026             | Val Accuracy:  0.965


100%|██████████| 2612/2612 [10:34<00:00,  4.12it/s]


Epochs: 4 | Train Loss:  0.012             | Train Accuracy:  0.987             | Val Loss:  0.024             | Val Accuracy:  0.972


100%|██████████| 2612/2612 [10:33<00:00,  4.12it/s]


Epochs: 5 | Train Loss:  0.006             | Train Accuracy:  0.994             | Val Loss:  0.025             | Val Accuracy:  0.975


100%|██████████| 2612/2612 [10:33<00:00,  4.12it/s]


Epochs: 7 | Train Loss:  0.003             | Train Accuracy:  0.997             | Val Loss:  0.030             | Val Accuracy:  0.974


100%|██████████| 2612/2612 [10:33<00:00,  4.13it/s]


Epochs: 8 | Train Loss:  0.002             | Train Accuracy:  0.998             | Val Loss:  0.033             | Val Accuracy:  0.969


100%|██████████| 2612/2612 [10:33<00:00,  4.12it/s]


Epochs: 9 | Train Loss:  0.001             | Train Accuracy:  0.998             | Val Loss:  0.025             | Val Accuracy:  0.978


100%|██████████| 2612/2612 [10:33<00:00,  4.13it/s]


Epochs: 10 | Train Loss:  0.001             | Train Accuracy:  0.999             | Val Loss:  0.026             | Val Accuracy:  0.978


100%|██████████| 2612/2612 [10:33<00:00,  4.13it/s]


Epochs: 11 | Train Loss:  0.001             | Train Accuracy:  0.999             | Val Loss:  0.028             | Val Accuracy:  0.980


100%|██████████| 2612/2612 [10:33<00:00,  4.13it/s]


Epochs: 12 | Train Loss:  0.001             | Train Accuracy:  0.999             | Val Loss:  0.028             | Val Accuracy:  0.982


100%|██████████| 2612/2612 [10:33<00:00,  4.12it/s]


Epochs: 13 | Train Loss:  0.000             | Train Accuracy:  1.000             | Val Loss:  0.032             | Val Accuracy:  0.980


100%|██████████| 2612/2612 [10:33<00:00,  4.12it/s]


Epochs: 14 | Train Loss:  0.000             | Train Accuracy:  1.000             | Val Loss:  0.032             | Val Accuracy:  0.980


100%|██████████| 2612/2612 [10:34<00:00,  4.12it/s]


Epochs: 15 | Train Loss:  0.000             | Train Accuracy:  1.000             | Val Loss:  0.045             | Val Accuracy:  0.977
Test Accuracy:  0.977
              precision    recall  f1-score   support

      benign     0.9669    0.9945    0.9805      1995
  vulnerable     0.9923    0.9543    0.9730      1489

    accuracy                         0.9773      3484
   macro avg     0.9796    0.9744    0.9767      3484
weighted avg     0.9777    0.9773    0.9773      3484

[[1984   11]
 [  68 1421]]


# Test the model

In [5]:
check_point_files_list = ['bert_large_fc_0.982_ep12', 'bert_large_fc_0.980_ep14',
                          'bert_large_fc_0.980_ep13', 'bert_large_fc_0.980_ep11']
print('check_point_files_list:', check_point_files_list)

df_test = pd.read_json(f'{DATASET_PATH}/val.json')

for check_point_file in check_point_files_list:
    print(f'\n#######################################{check_point_file}')
    check_point_file = f'{MODEL_SAVE_PATH}/{check_point_file}.pt'
    model = BertClassifier()
    model.to(device)
    model.load_state_dict(torch.load(check_point_file))
    evaluate(model, df_test)


check_point_files_list: ['bert_large_fc_0.982_ep12', 'bert_large_fc_0.980_ep14', 'bert_large_fc_0.980_ep13', 'bert_large_fc_0.980_ep11']

#######################################bert_large_fc_0.982_ep12


Some weights of the model checkpoint at bert-large-cased were not used when initializing BertModel: ['cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.weight', 'cls.predictions.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Test Accuracy:  0.981
              precision    recall  f1-score   support

      benign     0.9740    0.9940    0.9839      1995
  vulnerable     0.9917    0.9644    0.9779      1489

    accuracy                         0.9813      3484
   macro avg     0.9828    0.9792    0.9809      3484
weighted avg     0.9816    0.9813    0.9813      3484

[[1983   12]
 [  53 1436]]

#######################################bert_large_fc_0.980_ep14


Some weights of the model checkpoint at bert-large-cased were not used when initializing BertModel: ['cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.weight', 'cls.predictions.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Test Accuracy:  0.980
              precision    recall  f1-score   support

      benign     0.9744    0.9915    0.9829      1995
  vulnerable     0.9883    0.9651    0.9766      1489

    accuracy                         0.9802      3484
   macro avg     0.9813    0.9783    0.9797      3484
weighted avg     0.9803    0.9802    0.9802      3484

[[1978   17]
 [  52 1437]]

#######################################bert_large_fc_0.980_ep13


Some weights of the model checkpoint at bert-large-cased were not used when initializing BertModel: ['cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.weight', 'cls.predictions.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Test Accuracy:  0.981
              precision    recall  f1-score   support

      benign     0.9735    0.9935    0.9834      1995
  vulnerable     0.9910    0.9637    0.9772      1489

    accuracy                         0.9808      3484
   macro avg     0.9822    0.9786    0.9803      3484
weighted avg     0.9810    0.9808    0.9807      3484

[[1982   13]
 [  54 1435]]

#######################################bert_large_fc_0.980_ep11


Some weights of the model checkpoint at bert-large-cased were not used when initializing BertModel: ['cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.weight', 'cls.predictions.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Test Accuracy:  0.980
              precision    recall  f1-score   support

      benign     0.9763    0.9900    0.9831      1995
  vulnerable     0.9863    0.9678    0.9769      1489

    accuracy                         0.9805      3484
   macro avg     0.9813    0.9789    0.9800      3484
weighted avg     0.9806    0.9805    0.9805      3484

[[1975   20]
 [  48 1441]]
