# Global settings

In [None]:
# References:
# This source code file refers to:
# https://towardsdatascience.com/text-classification-with-bert-in-pytorch-887965e5820f
# https://huggingface.co/docs/transformers/model_doc/roberta


In [1]:
import pandas as pd
import numpy as np
import torch
from transformers import BertTokenizer
from torch import nn
from transformers import BertModel
from torch.optim import Adam
from tqdm import tqdm
from sklearn import metrics
from torch.nn.parallel import DistributedDataParallel
import os
import random

from transformers import RobertaTokenizerFast, RobertaForSequenceClassification,Trainer, TrainingArguments, RobertaModel

# The following randomization refers to: https://github.com/ICL-ml4csec/VulBERTa/blob/main/Finetuning_VulBERTa-MLP.ipynb
seed = 42
os.environ['PYTHONHASHSEED'] = str(seed)
torch.manual_seed(seed)
torch.cuda.manual_seed(seed)
torch.cuda.manual_seed_all(seed)
np.random.seed(seed)
random.seed(seed)
torch.backends.cudnn.benchmark = False
torch.backends.cudnn.deterministic = True

MODEL_SAVE_PATH = '/root/autodl-tmp/finetuned_models'
DATASET_ROOT_PATH = '/root/autodl-tmp'
BERT_CONFIG = 'roberta-large' # roberta-base, roberta-large
MODEL_NAME = 'fc'
labels = {0:0, 1:1}

# -----------------------------------------------

ONLY_TESTING = False

DATASET_NAME = 'ffmpeg'
DATASET_MASKING = 'masked_'
# DATASET_MASKING = ''

BATCH_SIZE = 4
EPOCHS = 15
LR = 1e-6


In [2]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
tokenizer = RobertaTokenizerFast.from_pretrained(BERT_CONFIG)
DATASET_PATH = f'{DATASET_ROOT_PATH}/output_dataset_1/{DATASET_MASKING}{DATASET_NAME}'
MODEL_SAVE_PATH = f'{MODEL_SAVE_PATH}/{DATASET_MASKING}{DATASET_NAME}'

print('seed:', seed)
print('MODEL_SAVE_PATH:', MODEL_SAVE_PATH)
print('DATASET_PATH:', DATASET_PATH)
print('BERT_CONFIG:', BERT_CONFIG)

print('ONLY_TESTING:', ONLY_TESTING)
print('DATASET_NAME:', DATASET_NAME)
print('DATASET_MASKING:', DATASET_MASKING)

print('BATCH_SIZE:', BATCH_SIZE)
print('EPOCHS:', EPOCHS)
print('LR:', LR)

print('using device:', device)
print('GPU count:', torch.cuda.device_count())


seed: 42
MODEL_SAVE_PATH: /root/autodl-tmp/finetuned_models/masked_ffmpeg
DATASET_PATH: /root/autodl-tmp/output_dataset_1/masked_ffmpeg
BERT_CONFIG: roberta-large
ONLY_TESTING: False
DATASET_NAME: ffmpeg
DATASET_MASKING: masked_
BATCH_SIZE: 4
EPOCHS: 15
LR: 1e-06
using device: cuda
GPU count: 1


# Definition

In [3]:
def mkdir_if_not_exist(directory):
    if not os.path.exists(directory):
        os.mkdir(directory)

mkdir_if_not_exist(MODEL_SAVE_PATH)

class Dataset(torch.utils.data.Dataset):
    def __init__(self, df):
        self.labels = [labels[label] for label in df['label']]
        self.texts = [tokenizer(text, padding='max_length', max_length=512, truncation=True,
                                return_tensors="pt") for text in df['commit_message']]

    def classes(self):
        return self.labels

    def __len__(self):
        return len(self.labels)

    def get_batch_labels(self, idx):
        # Fetch a batch of labels
        return np.array(self.labels[idx])

    def get_batch_texts(self, idx):
        # Fetch a batch of inputs
        return self.texts[idx]

    def __getitem__(self, idx):
        batch_texts = self.get_batch_texts(idx)
        batch_y = self.get_batch_labels(idx)
        return batch_texts, batch_y

class BertClassifier(nn.Module):
    def __init__(self, dropout=0.5):
        super(BertClassifier, self).__init__()

        self.bert = RobertaModel.from_pretrained(BERT_CONFIG)
        self.dropout = nn.Dropout(dropout)
        if BERT_CONFIG == 'roberta-large':
            self.linear = nn.Linear(1024, len(labels))
        else:
            self.linear = nn.Linear(768, len(labels))
        self.relu = nn.ReLU()

    def forward(self, input_id, mask):
        _, pooled_output = self.bert(input_ids=input_id, attention_mask=mask, return_dict=False)
        dropout_output = self.dropout(pooled_output)
        linear_output = self.linear(dropout_output)
        return linear_output

    def check_parameters(self):
        print('The number of Bert parameters:', self.bert.num_parameters())

def train(model, train_data, val_data, learning_rate, epochs):
    train, val = Dataset(train_data), Dataset(val_data)

    train_dataloader = torch.utils.data.DataLoader(train, batch_size=BATCH_SIZE)
    val_dataloader = torch.utils.data.DataLoader(val, batch_size=BATCH_SIZE)

    use_cuda = torch.cuda.is_available()

    criterion = nn.CrossEntropyLoss()
    optimizer = Adam(model.parameters(), lr=learning_rate)

    if use_cuda:
        criterion = criterion.cuda()

    for epoch_num in range(epochs):
        total_acc_train = 0
        total_loss_train = 0

        for train_input, train_label in tqdm(train_dataloader):
            train_label = train_label.to(device)
            mask = train_input['attention_mask'].to(device)
            input_id = train_input['input_ids'].squeeze(1).to(device)

            output = model(input_id, mask)
            batch_loss = criterion(output, train_label)
            total_loss_train += batch_loss.item()

            acc = (output.argmax(dim=1) == train_label).sum().item()
            total_acc_train += acc

            model.zero_grad()
            batch_loss.backward()
            optimizer.step()

        total_acc_val = 0
        total_loss_val = 0

        with torch.no_grad():
            for val_input, val_label in val_dataloader:
                val_label = val_label.to(device)
                mask = val_input['attention_mask'].to(device)
                input_id = val_input['input_ids'].squeeze(1).to(device)

                output = model(input_id, mask)

                batch_loss = criterion(output, val_label)
                total_loss_val += batch_loss.item()

                acc = (output.argmax(dim=1) == val_label).sum().item()
                total_acc_val += acc
        
        print(
            f'Epochs: {epoch_num + 1} | Train Loss: {total_loss_train / len(train_data): .3f} \
            | Train Accuracy: {total_acc_train / len(train_data): .3f} \
            | Val Loss: {total_loss_val / len(val_data): .3f} \
            | Val Accuracy: {total_acc_val / len(val_data): .3f}')

        bert_config = 'large' if BERT_CONFIG == 'roberta-large' else 'base'
        val_acc = f'{total_acc_val / len(val_data):.3f}'
        torch.save(model.state_dict(), f'{MODEL_SAVE_PATH}/roberta_{bert_config}_{MODEL_NAME}_{val_acc}_ep{epoch_num + 1}.pt')

def evaluate(model, test_data):
    test = Dataset(test_data)
    test_dataloader = torch.utils.data.DataLoader(test, batch_size=BATCH_SIZE)

    total_acc_test = 0
    predict_all = np.array([], dtype=int)
    labels_all = np.array([], dtype=int)
    with torch.no_grad():
        for test_input, test_label in test_dataloader:
            test_label = test_label.to(device)
            mask = test_input['attention_mask'].to(device)
            input_id = test_input['input_ids'].squeeze(1).to(device)

            output = model(input_id, mask)

            acc = (output.argmax(dim=1) == test_label).sum().item()
            total_acc_test += acc

            test_label = test_label.data.cpu().numpy()
            predic = output.argmax(dim=1).data.cpu().numpy()
            labels_all = np.append(labels_all, test_label)
            predict_all = np.append(predict_all, predic)

    report = metrics.classification_report(labels_all, predict_all, target_names=['benign', 'vulnerable'], digits=4)
    confusion = metrics.confusion_matrix(labels_all, predict_all)
    print(f'Test Accuracy: {total_acc_test / len(test_data): .3f}')
    print(report)
    print(confusion)


# Train the model

In [4]:
if not ONLY_TESTING:
    model = BertClassifier()
    model.to(device)
    
    df_train = pd.read_json(f'{DATASET_PATH}/train.json')
    df_val = pd.read_json(f'{DATASET_PATH}/val.json')
    df_test = pd.read_json(f'{DATASET_PATH}/test.json')
    train(model, df_train, df_val, LR, EPOCHS)
    evaluate(model, df_test)


Some weights of the model checkpoint at roberta-large were not used when initializing RobertaModel: ['lm_head.layer_norm.weight', 'lm_head.layer_norm.bias', 'lm_head.dense.weight', 'lm_head.bias', 'lm_head.decoder.weight', 'lm_head.dense.bias']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
100%|██████████| 2612/2612 [10:32<00:00,  4.13it/s]


Epochs: 1 | Train Loss:  0.131             | Train Accuracy:  0.697             | Val Loss:  0.095             | Val Accuracy:  0.771


100%|██████████| 2612/2612 [10:34<00:00,  4.12it/s]


Epochs: 2 | Train Loss:  0.086             | Train Accuracy:  0.796             | Val Loss:  0.086             | Val Accuracy:  0.788


100%|██████████| 2612/2612 [10:33<00:00,  4.12it/s]


Epochs: 3 | Train Loss:  0.077             | Train Accuracy:  0.808             | Val Loss:  0.089             | Val Accuracy:  0.777


100%|██████████| 2612/2612 [10:33<00:00,  4.12it/s]


Epochs: 4 | Train Loss:  0.073             | Train Accuracy:  0.812             | Val Loss:  0.089             | Val Accuracy:  0.792


100%|██████████| 2612/2612 [10:33<00:00,  4.12it/s]


Epochs: 5 | Train Loss:  0.071             | Train Accuracy:  0.813             | Val Loss:  0.093             | Val Accuracy:  0.792


100%|██████████| 2612/2612 [10:33<00:00,  4.12it/s]


Epochs: 6 | Train Loss:  0.070             | Train Accuracy:  0.819             | Val Loss:  0.091             | Val Accuracy:  0.799


100%|██████████| 2612/2612 [10:34<00:00,  4.12it/s]


Epochs: 7 | Train Loss:  0.071             | Train Accuracy:  0.818             | Val Loss:  0.086             | Val Accuracy:  0.803


100%|██████████| 2612/2612 [10:33<00:00,  4.12it/s]


Epochs: 8 | Train Loss:  0.071             | Train Accuracy:  0.814             | Val Loss:  0.092             | Val Accuracy:  0.785


100%|██████████| 2612/2612 [10:33<00:00,  4.12it/s]


Epochs: 9 | Train Loss:  0.071             | Train Accuracy:  0.815             | Val Loss:  0.087             | Val Accuracy:  0.796


100%|██████████| 2612/2612 [10:34<00:00,  4.11it/s]


Epochs: 10 | Train Loss:  0.070             | Train Accuracy:  0.816             | Val Loss:  0.088             | Val Accuracy:  0.794


100%|██████████| 2612/2612 [10:33<00:00,  4.12it/s]


Epochs: 11 | Train Loss:  0.070             | Train Accuracy:  0.820             | Val Loss:  0.086             | Val Accuracy:  0.801


100%|██████████| 2612/2612 [10:33<00:00,  4.12it/s]


Epochs: 12 | Train Loss:  0.069             | Train Accuracy:  0.821             | Val Loss:  0.087             | Val Accuracy:  0.798


100%|██████████| 2612/2612 [10:33<00:00,  4.12it/s]


Epochs: 13 | Train Loss:  0.070             | Train Accuracy:  0.815             | Val Loss:  0.088             | Val Accuracy:  0.799


100%|██████████| 2612/2612 [10:33<00:00,  4.12it/s]


Epochs: 14 | Train Loss:  0.071             | Train Accuracy:  0.810             | Val Loss:  0.088             | Val Accuracy:  0.795


100%|██████████| 2612/2612 [10:34<00:00,  4.12it/s]


Epochs: 15 | Train Loss:  0.070             | Train Accuracy:  0.818             | Val Loss:  0.091             | Val Accuracy:  0.798
Test Accuracy:  0.788
              precision    recall  f1-score   support

      benign     0.7939    0.8516    0.8218      1995
  vulnerable     0.7798    0.7038    0.7399      1489

    accuracy                         0.7885      3484
   macro avg     0.7868    0.7777    0.7808      3484
weighted avg     0.7879    0.7885    0.7868      3484

[[1699  296]
 [ 441 1048]]


# Test the model

In [4]:
check_point_files_list = ['roberta_large_fc_0.803_ep7', 'roberta_large_fc_0.801_ep11',
                          'roberta_large_fc_0.799_ep6', 'roberta_large_fc_0.799_ep13',
                          'roberta_large_fc_0.798_ep15', 'roberta_large_fc_0.798_ep12',
                          'roberta_large_fc_0.796_ep9', 'roberta_large_fc_0.795_ep14',
                          'roberta_large_fc_0.794_ep10']
print('check_point_files_list:', check_point_files_list)

df_test = pd.read_json(f'{DATASET_PATH}/val.json')

for check_point_file in check_point_files_list:
    print(f'\n#######################################{check_point_file}')
    check_point_file = f'{MODEL_SAVE_PATH}/{check_point_file}.pt'
    model = BertClassifier()
    model.eval()
    model.to(device)
    model.load_state_dict(torch.load(check_point_file))
    evaluate(model, df_test)


check_point_files_list: ['roberta_large_fc_0.803_ep7', 'roberta_large_fc_0.801_ep11(masked_ffmpeg_msgTF)', 'roberta_large_fc_0.799_ep6', 'roberta_large_fc_0.799_ep13', 'roberta_large_fc_0.798_ep15', 'roberta_large_fc_0.798_ep12', 'roberta_large_fc_0.796_ep9', 'roberta_large_fc_0.795_ep14', 'roberta_large_fc_0.794_ep10']

#######################################roberta_large_fc_0.803_ep7


Some weights of the model checkpoint at roberta-large were not used when initializing RobertaModel: ['lm_head.decoder.weight', 'lm_head.layer_norm.weight', 'lm_head.dense.bias', 'lm_head.layer_norm.bias', 'lm_head.dense.weight', 'lm_head.bias']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Test Accuracy:  0.816
              precision    recall  f1-score   support

      benign     0.7595    0.9940    0.8611      1995
  vulnerable     0.9863    0.5782    0.7290      1489

    accuracy                         0.8163      3484
   macro avg     0.8729    0.7861    0.7950      3484
weighted avg     0.8564    0.8163    0.8046      3484

[[1983   12]
 [ 628  861]]

#######################################roberta_large_fc_0.801_ep11(masked_ffmpeg_msgTF)


Some weights of the model checkpoint at roberta-large were not used when initializing RobertaModel: ['lm_head.decoder.weight', 'lm_head.layer_norm.weight', 'lm_head.dense.bias', 'lm_head.layer_norm.bias', 'lm_head.dense.weight', 'lm_head.bias']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Test Accuracy:  0.817
              precision    recall  f1-score   support

      benign     0.7609    0.9920    0.8612      1995
  vulnerable     0.9819    0.5823    0.7310      1489

    accuracy                         0.8169      3484
   macro avg     0.8714    0.7871    0.7961      3484
weighted avg     0.8553    0.8169    0.8056      3484

[[1979   16]
 [ 622  867]]

#######################################roberta_large_fc_0.799_ep6


Some weights of the model checkpoint at roberta-large were not used when initializing RobertaModel: ['lm_head.decoder.weight', 'lm_head.layer_norm.weight', 'lm_head.dense.bias', 'lm_head.layer_norm.bias', 'lm_head.dense.weight', 'lm_head.bias']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Test Accuracy:  0.813
              precision    recall  f1-score   support

      benign     0.7584    0.9895    0.8586      1995
  vulnerable     0.9762    0.5776    0.7257      1489

    accuracy                         0.8134      3484
   macro avg     0.8673    0.7835    0.7922      3484
weighted avg     0.8514    0.8134    0.8018      3484

[[1974   21]
 [ 629  860]]

#######################################roberta_large_fc_0.799_ep13


Some weights of the model checkpoint at roberta-large were not used when initializing RobertaModel: ['lm_head.decoder.weight', 'lm_head.layer_norm.weight', 'lm_head.dense.bias', 'lm_head.layer_norm.bias', 'lm_head.dense.weight', 'lm_head.bias']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Test Accuracy:  0.816
              precision    recall  f1-score   support

      benign     0.7608    0.9900    0.8604      1995
  vulnerable     0.9775    0.5829    0.7303      1489

    accuracy                         0.8160      3484
   macro avg     0.8691    0.7865    0.7954      3484
weighted avg     0.8534    0.8160    0.8048      3484

[[1975   20]
 [ 621  868]]

#######################################roberta_large_fc_0.798_ep15


Some weights of the model checkpoint at roberta-large were not used when initializing RobertaModel: ['lm_head.decoder.weight', 'lm_head.layer_norm.weight', 'lm_head.dense.bias', 'lm_head.layer_norm.bias', 'lm_head.dense.weight', 'lm_head.bias']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Test Accuracy:  0.816
              precision    recall  f1-score   support

      benign     0.7610    0.9895    0.8603      1995
  vulnerable     0.9764    0.5836    0.7306      1489

    accuracy                         0.8160      3484
   macro avg     0.8687    0.7865    0.7954      3484
weighted avg     0.8531    0.8160    0.8049      3484

[[1974   21]
 [ 620  869]]

#######################################roberta_large_fc_0.798_ep12


Some weights of the model checkpoint at roberta-large were not used when initializing RobertaModel: ['lm_head.decoder.weight', 'lm_head.layer_norm.weight', 'lm_head.dense.bias', 'lm_head.layer_norm.bias', 'lm_head.dense.weight', 'lm_head.bias']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Test Accuracy:  0.817
              precision    recall  f1-score   support

      benign     0.7611    0.9915    0.8611      1995
  vulnerable     0.9808    0.5829    0.7313      1489

    accuracy                         0.8169      3484
   macro avg     0.8709    0.7872    0.7962      3484
weighted avg     0.8550    0.8169    0.8056      3484

[[1978   17]
 [ 621  868]]

#######################################roberta_large_fc_0.796_ep9


Some weights of the model checkpoint at roberta-large were not used when initializing RobertaModel: ['lm_head.decoder.weight', 'lm_head.layer_norm.weight', 'lm_head.dense.bias', 'lm_head.layer_norm.bias', 'lm_head.dense.weight', 'lm_head.bias']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Test Accuracy:  0.815
              precision    recall  f1-score   support

      benign     0.7596    0.9915    0.8602      1995
  vulnerable     0.9807    0.5796    0.7286      1489

    accuracy                         0.8154      3484
   macro avg     0.8701    0.7855    0.7944      3484
weighted avg     0.8541    0.8154    0.8039      3484

[[1978   17]
 [ 626  863]]

#######################################roberta_large_fc_0.795_ep14


Some weights of the model checkpoint at roberta-large were not used when initializing RobertaModel: ['lm_head.decoder.weight', 'lm_head.layer_norm.weight', 'lm_head.dense.bias', 'lm_head.layer_norm.bias', 'lm_head.dense.weight', 'lm_head.bias']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Test Accuracy:  0.816
              precision    recall  f1-score   support

      benign     0.7610    0.9895    0.8603      1995
  vulnerable     0.9764    0.5836    0.7306      1489

    accuracy                         0.8160      3484
   macro avg     0.8687    0.7865    0.7954      3484
weighted avg     0.8531    0.8160    0.8049      3484

[[1974   21]
 [ 620  869]]

#######################################roberta_large_fc_0.794_ep10


Some weights of the model checkpoint at roberta-large were not used when initializing RobertaModel: ['lm_head.decoder.weight', 'lm_head.layer_norm.weight', 'lm_head.dense.bias', 'lm_head.layer_norm.bias', 'lm_head.dense.weight', 'lm_head.bias']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Test Accuracy:  0.815
              precision    recall  f1-score   support

      benign     0.7595    0.9895    0.8594      1995
  vulnerable     0.9763    0.5803    0.7279      1489

    accuracy                         0.8146      3484
   macro avg     0.8679    0.7849    0.7936      3484
weighted avg     0.8522    0.8146    0.8032      3484

[[1974   21]
 [ 625  864]]
