# Global settings

In [None]:
# References:
# This source code file refers to:
# https://towardsdatascience.com/text-classification-with-bert-in-pytorch-887965e5820f
# https://huggingface.co/docs/transformers/model_doc/roberta
# https://colab.research.google.com/github/dpressel/dlss-tutorial/blob/master/1_pretrained_vectors.ipynb
# https://github.com/dpressel/mead-baseline


In [1]:
import pandas as pd
import numpy as np
import torch
from transformers import BertTokenizer
from torch import nn
from transformers import BertModel
from torch.optim import Adam
from tqdm import tqdm
from sklearn import metrics
from torch.nn.parallel import DistributedDataParallel
import os
import random

from transformers import RobertaTokenizerFast, RobertaForSequenceClassification,Trainer, TrainingArguments, RobertaModel

# The following randomization refers to: https://github.com/ICL-ml4csec/VulBERTa/blob/main/Finetuning_VulBERTa-MLP.ipynb
seed = 42
os.environ['PYTHONHASHSEED'] = str(seed)
torch.manual_seed(seed)
torch.cuda.manual_seed(seed)
torch.cuda.manual_seed_all(seed)
np.random.seed(seed)
random.seed(seed)
torch.backends.cudnn.benchmark = False
torch.backends.cudnn.deterministic = True

MODEL_SAVE_PATH = '/root/autodl-tmp/finetuned_models'
DATASET_ROOT_PATH = '/root/autodl-tmp'
BERT_CONFIG = 'roberta-large' # roberta-base, roberta-large
MODEL_NAME = 'cnn'
labels = {0:0, 1:1}

# -----------------------------------------------

ONLY_TESTING = False

DATASET_NAME = 'ffmpeg'
# DATASET_MASKING = '_masked'
DATASET_MASKING = ''

BATCH_SIZE = 4
EPOCHS = 15
LR = 1e-6


In [2]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
tokenizer = RobertaTokenizerFast.from_pretrained(BERT_CONFIG)
DATASET_PATH = f'{DATASET_ROOT_PATH}/output_dataset_1/{DATASET_NAME}{DATASET_MASKING}'
MODEL_SAVE_PATH = f'{MODEL_SAVE_PATH}/{DATASET_NAME}{DATASET_MASKING}'

print('seed:', seed)
print('MODEL_SAVE_PATH:', MODEL_SAVE_PATH)
print('DATASET_PATH:', DATASET_PATH)
print('BERT_CONFIG:', BERT_CONFIG)

print('ONLY_TESTING:', ONLY_TESTING)
print('DATASET_NAME:', DATASET_NAME)
print('DATASET_MASKING:', DATASET_MASKING)

print('BATCH_SIZE:', BATCH_SIZE)
print('EPOCHS:', EPOCHS)
print('LR:', LR)

print('using device:', device)
print('GPU count:', torch.cuda.device_count())


seed: 42
MODEL_SAVE_PATH: /root/autodl-tmp/finetuned_models/ffmpeg
DATASET_PATH: /root/autodl-tmp/output_dataset_1/ffmpeg
BERT_CONFIG: roberta-large
ONLY_TESTING: False
DATASET_NAME: ffmpeg
DATASET_MASKING: 
BATCH_SIZE: 4
EPOCHS: 15
LR: 1e-06
using device: cuda
GPU count: 1


# Definition

In [3]:
def mkdir_if_not_exist(directory):
    if not os.path.exists(directory):
        os.mkdir(directory)

mkdir_if_not_exist(MODEL_SAVE_PATH)

class Dataset(torch.utils.data.Dataset):
    def __init__(self, df):
        self.labels = [labels[label] for label in df['label']]
        self.texts = [tokenizer(text, padding='max_length', max_length=512, truncation=True,
                                return_tensors="pt") for text in df['commit_message']]

    def classes(self):
        return self.labels

    def __len__(self):
        return len(self.labels)

    def get_batch_labels(self, idx):
        # Fetch a batch of labels
        return np.array(self.labels[idx])

    def get_batch_texts(self, idx):
        # Fetch a batch of inputs
        return self.texts[idx]

    def __getitem__(self, idx):
        batch_texts = self.get_batch_texts(idx)
        batch_y = self.get_batch_labels(idx)
        return batch_texts, batch_y

def train(model, train_data, val_data, learning_rate, epochs):
    train, val = Dataset(train_data), Dataset(val_data)

    train_dataloader = torch.utils.data.DataLoader(train, batch_size=BATCH_SIZE)
    val_dataloader = torch.utils.data.DataLoader(val, batch_size=BATCH_SIZE)

    use_cuda = torch.cuda.is_available()

    criterion = nn.CrossEntropyLoss()
    optimizer = Adam(model.parameters(), lr=learning_rate)

    if use_cuda:
        criterion = criterion.cuda()

    for epoch_num in range(epochs):
        total_acc_train = 0
        total_loss_train = 0

        for train_input, train_label in tqdm(train_dataloader):
            train_label = train_label.to(device)
            mask = train_input['attention_mask'].to(device)
            input_id = train_input['input_ids'].squeeze(1).to(device)

            output = model(input_id, mask)
            batch_loss = criterion(output, train_label)
            total_loss_train += batch_loss.item()

            acc = (output.argmax(dim=1) == train_label).sum().item()
            total_acc_train += acc

            model.zero_grad()
            batch_loss.backward()
            optimizer.step()

        total_acc_val = 0
        total_loss_val = 0

        with torch.no_grad():
            for val_input, val_label in val_dataloader:
                val_label = val_label.to(device)
                mask = val_input['attention_mask'].to(device)
                input_id = val_input['input_ids'].squeeze(1).to(device)

                output = model(input_id, mask)

                batch_loss = criterion(output, val_label)
                total_loss_val += batch_loss.item()

                acc = (output.argmax(dim=1) == val_label).sum().item()
                total_acc_val += acc
        
        print(
            f'Epochs: {epoch_num + 1} | Train Loss: {total_loss_train / len(train_data): .3f} \
            | Train Accuracy: {total_acc_train / len(train_data): .3f} \
            | Val Loss: {total_loss_val / len(val_data): .3f} \
            | Val Accuracy: {total_acc_val / len(val_data): .3f}')

        bert_config = 'large' if BERT_CONFIG == 'roberta-large' else 'base'
        val_acc = f'{total_acc_val / len(val_data):.3f}'
        torch.save(model.state_dict(), f'{MODEL_SAVE_PATH}/roberta_{bert_config}_{MODEL_NAME}_{val_acc}_ep{epoch_num + 1}.pt')

def evaluate(model, test_data):
    test = Dataset(test_data)
    test_dataloader = torch.utils.data.DataLoader(test, batch_size=BATCH_SIZE)

    total_acc_test = 0
    predict_all = np.array([], dtype=int)
    labels_all = np.array([], dtype=int)
    model.eval()
    with torch.no_grad():
        for test_input, test_label in test_dataloader:
            test_label = test_label.to(device)
            mask = test_input['attention_mask'].to(device)
            input_id = test_input['input_ids'].squeeze(1).to(device)

            output = model(input_id, mask)

            acc = (output.argmax(dim=1) == test_label).sum().item()
            total_acc_test += acc

            test_label = test_label.data.cpu().numpy()
            predic = output.argmax(dim=1).data.cpu().numpy()
            labels_all = np.append(labels_all, test_label)
            predict_all = np.append(predict_all, predic)

    report = metrics.classification_report(labels_all, predict_all, target_names=['benign', 'vulnerable'], digits=4)
    confusion = metrics.confusion_matrix(labels_all, predict_all)
    print(f'Test Accuracy: {total_acc_test / len(test_data): .3f}')
    print(report)
    print(confusion)


### CNN

In [4]:
import torch.nn.functional as F

class ParallelConv(nn.Module):

    def __init__(self, input_dims, filters, dropout=0.5):
        super().__init__()
        convs = []        
        self.output_dims = sum([t[1] for t in filters])
        for (filter_length, output_dims) in filters:
            pad = filter_length//2
            conv = nn.Sequential(
                nn.Conv1d(input_dims, output_dims, filter_length, padding=pad),
                nn.ReLU()
            )
            convs.append(conv)
        # Add the module so its managed correctly
        self.convs = nn.ModuleList(convs)
        self.conv_drop = nn.Dropout(dropout)

    def forward(self, input_bct):
        mots = []
        for conv in self.convs:
            # In Conv1d, data BxCxT, max over time
            conv_out = conv(input_bct)
            mot, _ = conv_out.max(2)
            mots.append(mot)
        mots = torch.cat(mots, 1)
        return self.conv_drop(mots)

class ConvClassifier(nn.Module):

    def __init__(self, embed_dims,
                 filters=[(2, 100), (3, 100), (4, 100)],
                 dropout=0.5, hidden_units=[]):
        super().__init__()
        self.bert = RobertaModel.from_pretrained(BERT_CONFIG)
        self.dropout = nn.Dropout(dropout)
        self.convs = ParallelConv(embed_dims, filters, dropout)
        
        input_units = self.convs.output_dims
        output_units = self.convs.output_dims
        sequence = []
        for h in hidden_units:
            sequence.append(self.dropout(nn.Linear(input_units, h)))
            input_units = h
            output_units = h
            
        sequence.append(nn.Linear(output_units, 2))
        self.outputs = nn.Sequential(*sequence)

    def forward(self, input_id, mask):
        x, pooled_output = self.bert(input_ids=input_id, attention_mask=mask, return_dict=False)
        embed = self.dropout(x)
        embed = embed.transpose(1, 2).contiguous()
        hidden = self.convs(embed)
        linear = self.outputs(hidden)
        return F.log_softmax(linear, dim=-1)


# Train the model

In [5]:
if not ONLY_TESTING:
    embed_dim = 1024
    model = ConvClassifier(embed_dim)
    model.to(device)
    
    df_train = pd.read_json(f'{DATASET_PATH}/train.json')
    df_val = pd.read_json(f'{DATASET_PATH}/val.json')
    df_test = pd.read_json(f'{DATASET_PATH}/test.json')
    train(model, df_train, df_val, LR, EPOCHS)
    evaluate(model, df_test)


Some weights of the model checkpoint at roberta-large were not used when initializing RobertaModel: ['lm_head.layer_norm.weight', 'lm_head.dense.bias', 'lm_head.dense.weight', 'lm_head.bias', 'lm_head.decoder.weight', 'lm_head.layer_norm.bias']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
100%|██████████| 2612/2612 [10:45<00:00,  4.05it/s]


Epochs: 1 | Train Loss:  0.139             | Train Accuracy:  0.699             | Val Loss:  0.050             | Val Accuracy:  0.937


100%|██████████| 2612/2612 [10:45<00:00,  4.04it/s]


Epochs: 2 | Train Loss:  0.032             | Train Accuracy:  0.958             | Val Loss:  0.027             | Val Accuracy:  0.973


100%|██████████| 2612/2612 [10:45<00:00,  4.05it/s]


Epochs: 3 | Train Loss:  0.018             | Train Accuracy:  0.980             | Val Loss:  0.026             | Val Accuracy:  0.974


100%|██████████| 2612/2612 [10:43<00:00,  4.06it/s]


Epochs: 4 | Train Loss:  0.011             | Train Accuracy:  0.987             | Val Loss:  0.021             | Val Accuracy:  0.982


100%|██████████| 2612/2612 [10:43<00:00,  4.06it/s]


Epochs: 5 | Train Loss:  0.006             | Train Accuracy:  0.995             | Val Loss:  0.022             | Val Accuracy:  0.984


100%|██████████| 2612/2612 [10:48<00:00,  4.03it/s]


Epochs: 6 | Train Loss:  0.003             | Train Accuracy:  0.997             | Val Loss:  0.025             | Val Accuracy:  0.984


100%|██████████| 2612/2612 [10:49<00:00,  4.02it/s]


Epochs: 7 | Train Loss:  0.002             | Train Accuracy:  0.997             | Val Loss:  0.025             | Val Accuracy:  0.982


100%|██████████| 2612/2612 [10:48<00:00,  4.03it/s]


Epochs: 8 | Train Loss:  0.001             | Train Accuracy:  0.999             | Val Loss:  0.032             | Val Accuracy:  0.982


100%|██████████| 2612/2612 [10:48<00:00,  4.03it/s]


Epochs: 9 | Train Loss:  0.001             | Train Accuracy:  0.999             | Val Loss:  0.029             | Val Accuracy:  0.985


100%|██████████| 2612/2612 [10:49<00:00,  4.02it/s]


Epochs: 10 | Train Loss:  0.001             | Train Accuracy:  0.999             | Val Loss:  0.021             | Val Accuracy:  0.984


100%|██████████| 2612/2612 [10:49<00:00,  4.02it/s]


Epochs: 11 | Train Loss:  0.001             | Train Accuracy:  0.999             | Val Loss:  0.030             | Val Accuracy:  0.985


100%|██████████| 2612/2612 [10:49<00:00,  4.02it/s]


Epochs: 12 | Train Loss:  0.000             | Train Accuracy:  1.000             | Val Loss:  0.029             | Val Accuracy:  0.984


100%|██████████| 2612/2612 [10:49<00:00,  4.02it/s]


Epochs: 13 | Train Loss:  0.001             | Train Accuracy:  0.999             | Val Loss:  0.029             | Val Accuracy:  0.984


100%|██████████| 2612/2612 [10:48<00:00,  4.03it/s]


Epochs: 14 | Train Loss:  0.001             | Train Accuracy:  0.999             | Val Loss:  0.028             | Val Accuracy:  0.984


100%|██████████| 2612/2612 [10:49<00:00,  4.02it/s]


Epochs: 15 | Train Loss:  0.000             | Train Accuracy:  1.000             | Val Loss:  0.028             | Val Accuracy:  0.986
Test Accuracy:  0.985
              precision    recall  f1-score   support

      benign     0.9817    0.9930    0.9873      1995
  vulnerable     0.9905    0.9752    0.9827      1489

    accuracy                         0.9854      3484
   macro avg     0.9861    0.9841    0.9850      3484
weighted avg     0.9854    0.9854    0.9853      3484

[[1981   14]
 [  37 1452]]


# Test the model

In [5]:
check_point_files_list = ['roberta_large_cnn_0.986_ep15', 'roberta_large_cnn_0.985_ep9',
                          'roberta_large_cnn_0.985_ep11', 'roberta_large_cnn_0.984_ep10']
print('check_point_files_list:', check_point_files_list)

df_test = pd.read_json(f'{DATASET_PATH}/val.json')

for check_point_file in check_point_files_list:
    print(f'\n#######################################{check_point_file}')
    check_point_file = f'{MODEL_SAVE_PATH}/{check_point_file}.pt'
    embed_dim = 1024
    model = ConvClassifier(embed_dim)
    model.to(device)
    model.load_state_dict(torch.load(check_point_file))
    evaluate(model, df_test)


check_point_files_list: ['roberta_large_cnn_0.986_ep15', 'roberta_large_cnn_0.985_ep9', 'roberta_large_cnn_0.985_ep11', 'roberta_large_cnn_0.984_ep10']

#######################################roberta_large_cnn_0.986_ep15


Some weights of the model checkpoint at roberta-large were not used when initializing RobertaModel: ['lm_head.decoder.weight', 'lm_head.layer_norm.bias', 'lm_head.bias', 'lm_head.dense.weight', 'lm_head.dense.bias', 'lm_head.layer_norm.weight']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Test Accuracy:  0.985
              precision    recall  f1-score   support

      benign     0.9817    0.9930    0.9873      1995
  vulnerable     0.9905    0.9752    0.9827      1489

    accuracy                         0.9854      3484
   macro avg     0.9861    0.9841    0.9850      3484
weighted avg     0.9854    0.9854    0.9853      3484

[[1981   14]
 [  37 1452]]

#######################################roberta_large_cnn_0.985_ep9


Some weights of the model checkpoint at roberta-large were not used when initializing RobertaModel: ['lm_head.decoder.weight', 'lm_head.layer_norm.bias', 'lm_head.bias', 'lm_head.dense.weight', 'lm_head.dense.bias', 'lm_head.layer_norm.weight']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Test Accuracy:  0.984
              precision    recall  f1-score   support

      benign     0.9802    0.9915    0.9858      1995
  vulnerable     0.9884    0.9731    0.9807      1489

    accuracy                         0.9836      3484
   macro avg     0.9843    0.9823    0.9833      3484
weighted avg     0.9837    0.9836    0.9836      3484

[[1978   17]
 [  40 1449]]

#######################################roberta_large_cnn_0.985_ep11


Some weights of the model checkpoint at roberta-large were not used when initializing RobertaModel: ['lm_head.decoder.weight', 'lm_head.layer_norm.bias', 'lm_head.bias', 'lm_head.dense.weight', 'lm_head.dense.bias', 'lm_head.layer_norm.weight']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Test Accuracy:  0.984
              precision    recall  f1-score   support

      benign     0.9835    0.9880    0.9857      1995
  vulnerable     0.9838    0.9778    0.9808      1489

    accuracy                         0.9836      3484
   macro avg     0.9837    0.9829    0.9833      3484
weighted avg     0.9836    0.9836    0.9836      3484

[[1971   24]
 [  33 1456]]

#######################################roberta_large_cnn_0.984_ep10


Some weights of the model checkpoint at roberta-large were not used when initializing RobertaModel: ['lm_head.decoder.weight', 'lm_head.layer_norm.bias', 'lm_head.bias', 'lm_head.dense.weight', 'lm_head.dense.bias', 'lm_head.layer_norm.weight']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Test Accuracy:  0.985
              precision    recall  f1-score   support

      benign     0.9836    0.9895    0.9865      1995
  vulnerable     0.9858    0.9778    0.9818      1489

    accuracy                         0.9845      3484
   macro avg     0.9847    0.9837    0.9842      3484
weighted avg     0.9845    0.9845    0.9845      3484

[[1974   21]
 [  33 1456]]
