# Global settings

In [1]:
# References:
# This source code file refers to:
# https://github.com/ICL-ml4csec/VulBERTa
# https://towardsdatascience.com/text-classification-with-bert-in-pytorch-887965e5820f
# https://huggingface.co/docs/transformers/model_doc/roberta


In [2]:
import os
os.environ['CUDA_LAUNCH_BLOCKING'] = "1"
import random
import torch
import numpy as np
import shutil

def write_to_file(text, path, mode='a'): # 'a': append; 'w': overwrite
    with open(path, mode) as f:
        f.write(text)

def mkdir_if_not_exist(directory):
    if not directory: return
    if not os.path.exists(directory):
        os.mkdir(directory)

def remove_file_if_exist(path):
    if not path: return
    if os.path.exists(path):
        try:
            os.remove(path)
        except:
            shutil.rmtree(path)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print('using', device)

# The following randomization refers to: https://github.com/ICL-ml4csec/VulBERTa/blob/main/Finetuning_VulBERTa-MLP.ipynb
seed = 42
os.environ['PYTHONHASHSEED'] = str(seed)
torch.manual_seed(seed)
torch.cuda.manual_seed(seed)
torch.cuda.manual_seed_all(seed)
np.random.seed(seed)
random.seed(seed)
torch.backends.cudnn.benchmark = False
torch.backends.cudnn.deterministic = True
os.environ['WANDB_DISABLED'] = 'true'
os.environ['WANDB_MODE'] = 'dryrun'

# -------------------------------------- start

DATASET_NAME = 'combined'
# DATASET_MASKING = 'masked_'
DATASET_MASKING = ''

codeTF_check_point = 'vulberta_ 0.6865_ep2.pt'
msgTF_check_point = 'roberta_base_fc_0.9767_ep9(combined).pt'

# -------------------------------------- end

pretrained_model_path = '/root/autodl-tmp/VulBERTa/'

root_directory = '/root/autodl-tmp'
dataset_directory = f'{root_directory}/output_dataset_1/{DATASET_MASKING}{DATASET_NAME}'
init_train_path = f'{dataset_directory}/train.json'
init_val_path = f'{dataset_directory}/val.json'
init_test_path = f'{dataset_directory}/test.json'
intermediate_directory = f'{root_directory}/intermediate/{DATASET_MASKING}{DATASET_NAME}'
mkdir_if_not_exist(f'{root_directory}/intermediate')
mkdir_if_not_exist(intermediate_directory)

finetuned_ct_model_path = f'{root_directory}/codeTF_check_point/{DATASET_MASKING}{DATASET_NAME}/{codeTF_check_point}'
intermediate_ct_train_path = f'{intermediate_directory}/ct_train.txt'
intermediate_ct_val_path = f'{intermediate_directory}/ct_val.txt'
intermediate_ct_test_path = f'{intermediate_directory}/ct_test.txt'

finetuned_mt_model_path = f'{root_directory}/msgTF_check_point/{DATASET_MASKING}{DATASET_NAME}/{msgTF_check_point}'
intermediate_mt_train_path = f'{intermediate_directory}/mt_train.txt'
intermediate_mt_val_path = f'{intermediate_directory}/mt_val.txt'
intermediate_mt_test_path = f'{intermediate_directory}/mt_test.txt'


using cuda


# CodeTransformer

In [2]:
from tqdm import tqdm
import sys
import pandas as pd
import numpy as np
import csv
import pickle
import re
import torch
import sklearn
import random
import clang
from clang import *
from clang import cindex
from pathlib import Path
from tokenizers import ByteLevelBPETokenizer
from tokenizers.implementations import ByteLevelBPETokenizer
from tokenizers.processors import BertProcessing
from torch.utils.data import Dataset, DataLoader, IterableDataset
from transformers import RobertaConfig
from transformers import RobertaForMaskedLM, RobertaForSequenceClassification
from transformers import RobertaTokenizerFast
from transformers import DataCollatorForLanguageModeling
from transformers import Trainer, TrainingArguments
from transformers import LineByLineTextDataset
from transformers.modeling_outputs import SequenceClassifierOutput
from tokenizers.pre_tokenizers import PreTokenizer
from tokenizers.pre_tokenizers import Whitespace
from tokenizers import NormalizedString,PreTokenizedString
from typing import List
from tokenizers import Tokenizer
from tokenizers import normalizers,decoders
from tokenizers.normalizers import StripAccents, unicode_normalizer_from_str, Replace
from tokenizers.processors import TemplateProcessing
from tokenizers import processors,pre_tokenizers
from tokenizers.models import BPE
from sklearn import metrics

# definitions
class MyTokenizer:
    cidx = cindex.Index.create()

    def clang_split(self, i: int, normalized_string: NormalizedString) -> List[NormalizedString]:
        ## Tokkenize using clang
        tok = []
        tu = self.cidx.parse('tmp.c',
                       args=[''],  
                       unsaved_files=[('tmp.c', str(normalized_string.original))],  
                       options=0)
        for t in tu.get_tokens(extent=tu.cursor.extent):
            spelling = t.spelling.strip()
            if spelling == '': continue
            ## Keyword no need
            ## Punctuations no need
            ## Literal all to BPE
            #spelling = spelling.replace(' ', '')
            tok.append(NormalizedString(spelling))
        return(tok)

    def pre_tokenize(self, pretok: PreTokenizedString):
        pretok.split(self.clang_split)

def process_encodings(encodings):
    input_ids=[]
    attention_mask=[]
    for enc in encodings:
        input_ids.append(enc.ids)
        attention_mask.append(enc.attention_mask)
    return {'input_ids':input_ids, 'attention_mask':attention_mask}

# ------------------------------------------------------------------------------
# tokenize and load dataset
print('Tokenizing dataset...')
vocab, merges = BPE.read_file(vocab="./tokenizer/drapgh-vocab.json", merges="./tokenizer/drapgh-merges.txt")
my_tokenizer = Tokenizer(BPE(vocab, merges, unk_token="<unk>"))

my_tokenizer.normalizer = normalizers.Sequence([StripAccents(), Replace(" ", "Ä")])
my_tokenizer.pre_tokenizer = PreTokenizer.custom(MyTokenizer())
my_tokenizer.post_processor = processors.ByteLevel(trim_offsets=False)
my_tokenizer.post_processor = TemplateProcessing(
    single="<s> $A </s>",
    special_tokens=[
    ("<s>",0),
    ("<pad>",1),
    ("</s>",2),
    ("<unk>",3),
    ("<mask>",4)
    ]
)

my_tokenizer.enable_truncation(max_length=1024)
my_tokenizer.enable_padding(direction='right', pad_id=1, pad_type_id=0, pad_token='<pad>', length=None, pad_to_multiple_of=None)

m1 = pd.read_json(init_train_path)
m2 = pd.read_json(init_val_path)
# m3 = pd.read_json(init_test_path)

train_encodings = my_tokenizer.encode_batch(m1.commit_patch)
train_encodings = process_encodings(train_encodings)

val_encodings = my_tokenizer.encode_batch(m2.commit_patch)
val_encodings = process_encodings(val_encodings)

# test_encodings = my_tokenizer.encode_batch(m3.commit_patch)
# test_encodings = process_encodings(test_encodings)

print('Done')


Tokenizing dataset...
Done


In [3]:
class MyCustomDataset(Dataset):
    def __init__(self, ids, encodings, labels):
        self.ids = ids
        self.encodings = encodings
        self.labels = labels
        assert len(self.encodings['input_ids']) == len(self.encodings['attention_mask']) == len(self.labels) == len(self.ids)

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels[idx])
        item['ids'] = self.ids[idx]
        return item

    def __len__(self):
        return len(self.labels)

train_dataset = MyCustomDataset(m1.id.tolist(), train_encodings, m1.label.tolist())
val_dataset = MyCustomDataset(m2.id.tolist(), val_encodings, m2.label.tolist())
# test_dataset = MyCustomDataset(m3.id.tolist(), test_encodings, m3.label.tolist())


In [4]:
# ------------------------------------------------------------------------------
# generate intermediate data by CodeTransformer
from sklearn import metrics

print('Generating intermediate data...')
model = RobertaForSequenceClassification.from_pretrained(pretrained_model_path)
model.to(device)
model.load_state_dict(torch.load(finetuned_ct_model_path))

def generate_ct_intermediate_dataset(input_data, intermediate_data_path, evaluate=True):
    data_loader = DataLoader(input_data, batch_size=128)
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    
    total_acc = 0
    predict_all = np.array([], dtype=int)
    labels_all = np.array([], dtype=int)

    model.eval()
    with torch.no_grad():
        for batch in tqdm(data_loader):
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)
            ids = batch['ids']
            outputs = model(input_ids, attention_mask=attention_mask)
            # outputs['logits'] is equal to outputs[0]
            outputs = outputs['logits']

            probs = torch.nn.functional.softmax(outputs, dim=1).tolist()
            assert(len(probs) == len(labels))
            pred_list = outputs.argmax(dim=1).data.cpu().numpy().tolist()
            for i in range(len(probs)):
                id_ = ids[i]
                prob = probs[i]
                label = int(labels[i])
                pred_ = int(pred_list[i])
                content = '\t'.join([str(i) for i in [id_] + prob + [pred_] + [label]]) + '\n'
                write_to_file(content, intermediate_data_path)

            if evaluate:
                acc = (outputs.argmax(dim=1) == labels).sum().item()
                total_acc += acc

                labels = labels.data.cpu().numpy()
                predic = outputs.argmax(dim=1).data.cpu().numpy()
                labels_all = np.append(labels_all, labels)
                predict_all = np.append(predict_all, predic)

    if evaluate:
        report = metrics.classification_report(labels_all, predict_all, target_names=['benign', 'vulnerable'], digits=4)
        confusion = metrics.confusion_matrix(labels_all, predict_all)
        print(f'Test Accuracy: {total_acc / len(input_data): .4f}')
        print(report)
        print(confusion)

remove_file_if_exist(intermediate_ct_train_path)
remove_file_if_exist(intermediate_ct_val_path)

generate_ct_intermediate_dataset(train_dataset, intermediate_ct_train_path, False)
generate_ct_intermediate_dataset(val_dataset, intermediate_ct_val_path)



Generating intermediate data...


Some weights of the model checkpoint at /root/autodl-tmp/VulBERTa/ were not used when initializing RobertaForSequenceClassification: ['lm_head.layer_norm.weight', 'lm_head.dense.bias', 'lm_head.layer_norm.bias', 'lm_head.decoder.bias', 'lm_head.decoder.weight', 'lm_head.bias', 'lm_head.dense.weight']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at /root/autodl-tmp/VulBERTa/ and are newly initialized: ['classifier.out_proj.bias', 'classifier.dense

Test Accuracy:  0.6865
              precision    recall  f1-score   support

      benign     0.6913    0.6898    0.6906      2389
  vulnerable     0.6816    0.6830    0.6823      2322

    accuracy                         0.6865      4711
   macro avg     0.6864    0.6864    0.6864      4711
weighted avg     0.6865    0.6865    0.6865      4711

[[1648  741]
 [ 736 1586]]





# MsgTransformer

In [2]:
import pandas as pd
import numpy as np
import torch
from transformers import BertTokenizer
from torch import nn
from transformers import BertModel
from transformers import RobertaModel, RobertaTokenizerFast
from torch.optim import Adam
from tqdm import tqdm
from sklearn import metrics
from torch.nn.parallel import DistributedDataParallel
import os
import random

# definitions
seed = 42
os.environ['PYTHONHASHSEED'] = str(seed)
torch.manual_seed(seed)
torch.cuda.manual_seed(seed)
torch.cuda.manual_seed_all(seed)
np.random.seed(seed)
random.seed(seed)
torch.backends.cudnn.benchmark = False
torch.backends.cudnn.deterministic = True

BERT_CONFIG = 'roberta-base'
labels = {0:0, 1:1}
BATCH_SIZE = 128
tokenizer = RobertaTokenizerFast.from_pretrained(BERT_CONFIG)

class Dataset(torch.utils.data.Dataset):
    def __init__(self, df):
        self.labels = [labels[label] for label in df['label']]
        self.texts = [tokenizer(text, padding='max_length', max_length=512, truncation=True,
                                return_tensors="pt") for text in df['commit_message']]
        self.ids = [id_ for id_ in df['id']]

    def classes(self):
        return self.labels

    def __len__(self):
        return len(self.labels)

    def get_batch_labels(self, idx):
        # Fetch a batch of labels
        return np.array(self.labels[idx])

    def get_batch_texts(self, idx):
        # Fetch a batch of inputs
        return self.texts[idx]

    def __getitem__(self, idx):
        batch_ids = self.ids[idx]
        batch_texts = self.get_batch_texts(idx)
        batch_y = self.get_batch_labels(idx)
        return batch_ids, batch_texts, batch_y

class BertClassifier(nn.Module):
    def __init__(self, dropout=0.5):
        super(BertClassifier, self).__init__()

        self.bert = RobertaModel.from_pretrained(BERT_CONFIG)
        self.dropout = nn.Dropout(dropout)
        if BERT_CONFIG == 'roberta-large':
            self.linear = nn.Linear(1024, len(labels))
        else:
            self.linear = nn.Linear(768, len(labels))
        self.relu = nn.ReLU()

    def forward(self, input_id, mask):
        _, pooled_output = self.bert(input_ids=input_id, attention_mask=mask, return_dict=False)
        dropout_output = self.dropout(pooled_output)
        linear_output = self.linear(dropout_output)
        # final_layer = self.relu(linear_output) # IMPO CHANGE
        return linear_output

    def check_parameters(self):
        print('The number of Bert parameters:', self.bert.num_parameters())

# ------------------------------------------------------------------------------
# generate intermediate data by MsgTransformer and evaluation
model = BertClassifier()
model.to(device)
model.load_state_dict(torch.load(finetuned_mt_model_path))

def generate_mt_intermediate_dataset(input_data, intermediate_data_path, evaluate=True):
    data_loader = torch.utils.data.DataLoader(Dataset(input_data), batch_size=BATCH_SIZE)
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    
    total_acc = 0
    predict_all = np.array([], dtype=int)
    labels_all = np.array([], dtype=int)

    model.eval()
    with torch.no_grad():
        for ids, texts, labels in tqdm(data_loader):
            labels = labels.to(device)
            masks = texts['attention_mask'].to(device)
            input_ids = texts['input_ids'].squeeze(1).to(device)
            outputs = model(input_ids, masks)

            probs = torch.nn.functional.softmax(outputs, dim=1).tolist()
            assert(len(probs) == len(labels))
            pred_list = outputs.argmax(dim=1).data.cpu().numpy().tolist()
            assert(len(probs) == len(pred_list))
            assert(len(probs) == len(ids))
            for i in range(len(probs)):
                id_ = ids[i]
                prob = probs[i]
                label = int(labels[i])
                pred_ = int(pred_list[i])
                content = '\t'.join([str(i) for i in [id_] + prob + [pred_] + [label]]) + '\n'
                write_to_file(content, intermediate_data_path)

            if evaluate:
                acc = (outputs.argmax(dim=1) == labels).sum().item()
                total_acc += acc

                labels = labels.data.cpu().numpy()
                predic = outputs.argmax(dim=1).data.cpu().numpy()
                labels_all = np.append(labels_all, labels)
                predict_all = np.append(predict_all, predic)

    if evaluate:
        report = metrics.classification_report(labels_all, predict_all, target_names=['benign', 'vulnerable'], digits=4)
        confusion = metrics.confusion_matrix(labels_all, predict_all)
        print(f'Test Accuracy: {total_acc / len(input_data): .4f}')
        print(report)
        print(confusion)

df_train = pd.read_json(init_train_path)
df_val = pd.read_json(init_val_path)

remove_file_if_exist(intermediate_mt_train_path)
remove_file_if_exist(intermediate_mt_val_path)

generate_mt_intermediate_dataset(df_train, intermediate_mt_train_path, False)
generate_mt_intermediate_dataset(df_val, intermediate_mt_val_path)


Some weights of the model checkpoint at roberta-base were not used when initializing RobertaModel: ['lm_head.dense.bias', 'lm_head.bias', 'lm_head.layer_norm.weight', 'lm_head.decoder.weight', 'lm_head.layer_norm.bias', 'lm_head.dense.weight']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
100%|██████████| 148/148 [01:43<00:00,  1.43it/s]
100%|██████████| 37/37 [00:26<00:00,  1.42it/s]

Test Accuracy:  0.9767
              precision    recall  f1-score   support

      benign     0.9746    0.9795    0.9770      2389
  vulnerable     0.9788    0.9737    0.9763      2322

    accuracy                         0.9767      4711
   macro avg     0.9767    0.9766    0.9766      4711
weighted avg     0.9767    0.9767    0.9766      4711

[[2340   49]
 [  61 2261]]





# Combine everything into intermediate dataset

In [3]:
import json

intermediate_train_path = f'{intermediate_directory}/train.txt'
intermediate_val_path = f'{intermediate_directory}/val.txt'
intermediate_test_path = f'{intermediate_directory}/test.txt'

def generate_intermediate_dataset(intermediate_mt_data_path, intermediate_ct_data_path, intermediate_data_path, init_data_path):
    with open(intermediate_mt_data_path) as f:
        mt_data_list = f.read().split('\n')
    
    with open(intermediate_ct_data_path) as f:
        ct_data_list = f.read().split('\n')
    
    mt_data_list = mt_data_list[:-1] if not mt_data_list[-1] else mt_data_list
    ct_data_list = ct_data_list[:-1] if not ct_data_list[-1] else ct_data_list

    assert(len(mt_data_list) == len(ct_data_list))

    for i in range(len(mt_data_list)):
        mt_data = mt_data_list[i].split('\t')
        ct_data = ct_data_list[i].split('\t')
        assert(mt_data[0] == ct_data[0])
        assert(mt_data[-1] == ct_data[-1])
        data_id = mt_data[0]
        label = mt_data[-1]
        mt_pred = mt_data[-2]
        ct_pred = ct_data[-2]
        content = '\t'.join([data_id] + mt_data[1:3] + ct_data[1:3] + [str(mt_pred)] + [str(ct_pred)] + [label])
        content = content + '\n' if i < len(mt_data_list) - 1 else content
        write_to_file(content, intermediate_data_path)

remove_file_if_exist(intermediate_train_path)
remove_file_if_exist(intermediate_val_path)

generate_intermediate_dataset(intermediate_mt_train_path, intermediate_ct_train_path, intermediate_train_path, init_train_path)
generate_intermediate_dataset(intermediate_mt_val_path, intermediate_ct_val_path, intermediate_val_path, init_val_path)


# Ensemble learning

In [36]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.optim import Adam
from torch.utils.data import Dataset, DataLoader
from tqdm import tqdm
from sklearn import metrics

EPOCHS = 80
LR = 1e-6
BATCH_SIZE = 4
intermediate_train_path = f'{intermediate_directory}/train.txt'
intermediate_val_path = f'{intermediate_directory}/val.txt'
# intermediate_test_path = f'{intermediate_directory}/test.txt'
MODEL_SAVE_PATH = f'{root_directory}/ensemble_model/{DATASET_MASKING}{DATASET_NAME}'
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

seed = 42
os.environ['PYTHONHASHSEED'] = str(seed)
torch.manual_seed(seed)
torch.cuda.manual_seed(seed)
torch.cuda.manual_seed_all(seed)
np.random.seed(seed)
random.seed(seed)
torch.backends.cudnn.benchmark = False
torch.backends.cudnn.deterministic = True

class MLP(nn.Module):
    def __init__(self, input_dim, output_dim):
        super().__init__()

        self.dropout = nn.Dropout(0.1)
        self.fc1 = nn.Linear(input_dim, 20)
        self.out = nn.Linear(20, output_dim)

    def forward(self, x):
        x = self.fc1(x)
        x = F.relu(x)
        x = self.dropout(x)
        x = self.out(x)
        return x

class MyDataset(Dataset):
    def __init__(self, path):
        with open(path) as f:
            data_list = f.read().split('\n')
        self.labels = [ int(data.split('\t')[-1]) for data in data_list ]
        self.inputs = []
        for data in data_list:
            tmp_list = data.split('\t')[:-1]
            for i in range(1, 5):
                tmp_list[i] = float(tmp_list[i])
            for i in range(5, len(tmp_list)):
                tmp_list[i] = int(tmp_list[i])
            self.inputs.append(tmp_list)
        
        assert(len(self.labels) == len(self.inputs))

    def classes(self):
        return self.labels

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        x = self.inputs[idx]
        y = self.labels[idx]
        return x[0], x[1], x[2], x[3], x[4], x[5], x[6], y

def train(model, train_dataset, val_dataset):
    train_dataloader = DataLoader(train_dataset, batch_size=BATCH_SIZE)
    val_dataloader = DataLoader(val_dataset, batch_size=BATCH_SIZE)

    model = model.to(device)
    optimizer = Adam(model.parameters(), lr=LR)
    criterion = nn.CrossEntropyLoss()
    criterion = criterion.to(device)

    for epoch_num in range(EPOCHS):
        model.train()
        total_acc_train = 0
        total_loss_train = 0
        for x0, x1, x2, x3, x4, x5, x6, y in tqdm(train_dataloader):
            x = torch.transpose(torch.stack([x1, x2, x3, x4]), 0, 1).float().to(device)
            y = y.to(device)
            y_pred = model(x)

            loss = criterion(y_pred, y)
            total_loss_train += loss.item()

            acc = (y_pred.argmax(dim=1) == y).sum().item()
            total_acc_train += acc

            model.zero_grad()
            loss.backward()
            optimizer.step()

        total_acc_val = 0
        total_loss_val = 0
        model.eval()
        with torch.no_grad():
            for x0, x1, x2, x3, x4, x5, x6, y in val_dataloader:
                x = torch.transpose(torch.stack([x1, x2, x3, x4]), 0, 1).float().to(device)
                y = y.to(device)
                y_pred = model(x)

                loss = criterion(y_pred, y)
                total_loss_val += loss.item()

                acc = (y_pred.argmax(dim=1) == y).sum().item()
                total_acc_val += acc

        print(
            f'Epochs: {epoch_num + 1} | Train Loss: {total_loss_train / len(train_dataset): .4f} \
            | Train Accuracy: {total_acc_train / len(train_dataset): .4f} \
            | Val Loss: {total_loss_val / len(val_dataset): .4f} \
            | Val Accuracy: {total_acc_val / len(val_dataset): .4f}')

        val_acc = f'{total_acc_val / len(val_dataset):.4f}'
        torch.save(model.state_dict(), f'{MODEL_SAVE_PATH}/ensemble2_{val_acc}_epoch{epoch_num + 1}.pt')

def evaluate(model, test_dataset):
    test_dataloader = DataLoader(test_dataset, batch_size=1)
    model = model.to(device)
    
    real_sample_count = 0
    total_acc_test = 0
    predict_all = np.array([], dtype=int)
    labels_all = np.array([], dtype=int)
    model.eval()
    with torch.no_grad():
        for x0, x1, x2, x3, x4, x5, x6, y in test_dataloader:
            data_id = x0[0]
            # for only-bigvul testing starts ---- 
#             if not data_id.startswith('bigvul_'): continue
            # for only-bigvul testing ends ---- 

            x = torch.transpose(torch.stack([x1, x2, x3, x4]), 0, 1).float().to(device)
            y = y.to(device)
            y_pred = model(x)
            
            acc = (y_pred.argmax(dim=1) == y).sum().item()
            total_acc_test += acc
            
            y = y.data.cpu().numpy()
            predic = y_pred.argmax(dim=1).data.cpu().numpy()
            predic_n = predic[0]
            mt_pred = x5
            ct_pred = x6

            if mt_pred == 0 and ct_pred == 1 and predic_n == 1 and y.item() == 1:
                print('----data_id (CodeTF true positive):', data_id)
            if mt_pred == 1 and ct_pred == 0 and predic_n == 0 and y.item() == 0:
                print('----data_id (CodeTF true negative):', data_id)
            
            if ct_pred == 1 and predic_n == 1 and data_id.startswith('bigvul_'):
                print('data_id (true on bigvul):', data_id)
            
            if data_id.startswith('bigvul_'):
                real_sample_count += 1
            
            labels_all = np.append(labels_all, y)
            predict_all = np.append(predict_all, predic)

    report = metrics.classification_report(labels_all, predict_all, target_names=['benign', 'vulnerable'], digits=4)
    confusion = metrics.confusion_matrix(labels_all, predict_all)
    print(f'Test Accuracy: {total_acc_test / len(test_dataset): .4f}')
    print(report)
    print(confusion)
    
    print('real_sample_count for bigvul:', real_sample_count)

train_dataset = MyDataset(intermediate_train_path)
val_dataset = MyDataset(intermediate_val_path)
# test_dataset = MyDataset(intermediate_test_path)
print('intermediate_val_path:', intermediate_val_path)


intermediate_val_path: /root/autodl-tmp/intermediate/combined/val.txt


In [32]:
remove_file_if_exist(MODEL_SAVE_PATH)
mkdir_if_not_exist(MODEL_SAVE_PATH)
model = MLP(4, 2)
train(model, train_dataset, val_dataset)


100%|██████████| 4709/4709 [00:07<00:00, 611.78it/s]


Epochs: 1 | Train Loss:  0.1425             | Train Accuracy:  0.8825             | Val Loss:  0.1446             | Val Accuracy:  0.8554


100%|██████████| 4709/4709 [00:07<00:00, 604.96it/s]


Epochs: 2 | Train Loss:  0.1401             | Train Accuracy:  0.8893             | Val Loss:  0.1422             | Val Accuracy:  0.8680


100%|██████████| 4709/4709 [00:07<00:00, 599.66it/s]


Epochs: 3 | Train Loss:  0.1373             | Train Accuracy:  0.8964             | Val Loss:  0.1398             | Val Accuracy:  0.8805


100%|██████████| 4709/4709 [00:08<00:00, 584.85it/s]


Epochs: 4 | Train Loss:  0.1346             | Train Accuracy:  0.9032             | Val Loss:  0.1374             | Val Accuracy:  0.8905


100%|██████████| 4709/4709 [00:07<00:00, 606.50it/s]


Epochs: 5 | Train Loss:  0.1318             | Train Accuracy:  0.9129             | Val Loss:  0.1350             | Val Accuracy:  0.9004


100%|██████████| 4709/4709 [00:07<00:00, 608.62it/s]


Epochs: 6 | Train Loss:  0.1292             | Train Accuracy:  0.9228             | Val Loss:  0.1326             | Val Accuracy:  0.9085


100%|██████████| 4709/4709 [00:07<00:00, 589.69it/s]


Epochs: 7 | Train Loss:  0.1266             | Train Accuracy:  0.9300             | Val Loss:  0.1301             | Val Accuracy:  0.9172


100%|██████████| 4709/4709 [00:07<00:00, 596.53it/s]


Epochs: 8 | Train Loss:  0.1238             | Train Accuracy:  0.9415             | Val Loss:  0.1276             | Val Accuracy:  0.9266


100%|██████████| 4709/4709 [00:07<00:00, 596.53it/s]


Epochs: 9 | Train Loss:  0.1210             | Train Accuracy:  0.9546             | Val Loss:  0.1251             | Val Accuracy:  0.9404


100%|██████████| 4709/4709 [00:07<00:00, 592.18it/s]


Epochs: 10 | Train Loss:  0.1181             | Train Accuracy:  0.9616             | Val Loss:  0.1226             | Val Accuracy:  0.9607


100%|██████████| 4709/4709 [00:07<00:00, 600.02it/s]


Epochs: 11 | Train Loss:  0.1155             | Train Accuracy:  0.9630             | Val Loss:  0.1200             | Val Accuracy:  0.9745


100%|██████████| 4709/4709 [00:07<00:00, 622.41it/s]


Epochs: 12 | Train Loss:  0.1127             | Train Accuracy:  0.9635             | Val Loss:  0.1175             | Val Accuracy:  0.9750


100%|██████████| 4709/4709 [00:07<00:00, 614.30it/s]


Epochs: 13 | Train Loss:  0.1099             | Train Accuracy:  0.9697             | Val Loss:  0.1149             | Val Accuracy:  0.9747


100%|██████████| 4709/4709 [00:07<00:00, 599.12it/s]


Epochs: 14 | Train Loss:  0.1071             | Train Accuracy:  0.9719             | Val Loss:  0.1124             | Val Accuracy:  0.9745


100%|██████████| 4709/4709 [00:07<00:00, 600.76it/s]


Epochs: 15 | Train Loss:  0.1042             | Train Accuracy:  0.9732             | Val Loss:  0.1098             | Val Accuracy:  0.9750


100%|██████████| 4709/4709 [00:08<00:00, 573.98it/s]


Epochs: 16 | Train Loss:  0.1015             | Train Accuracy:  0.9749             | Val Loss:  0.1072             | Val Accuracy:  0.9754


100%|██████████| 4709/4709 [00:07<00:00, 603.50it/s]


Epochs: 17 | Train Loss:  0.0989             | Train Accuracy:  0.9773             | Val Loss:  0.1046             | Val Accuracy:  0.9756


100%|██████████| 4709/4709 [00:07<00:00, 594.06it/s]


Epochs: 18 | Train Loss:  0.0957             | Train Accuracy:  0.9796             | Val Loss:  0.1021             | Val Accuracy:  0.9758


100%|██████████| 4709/4709 [00:07<00:00, 595.32it/s]


Epochs: 19 | Train Loss:  0.0931             | Train Accuracy:  0.9787             | Val Loss:  0.0996             | Val Accuracy:  0.9758


100%|██████████| 4709/4709 [00:07<00:00, 589.77it/s]


Epochs: 20 | Train Loss:  0.0904             | Train Accuracy:  0.9804             | Val Loss:  0.0970             | Val Accuracy:  0.9760


100%|██████████| 4709/4709 [00:07<00:00, 598.18it/s]


Epochs: 21 | Train Loss:  0.0877             | Train Accuracy:  0.9808             | Val Loss:  0.0945             | Val Accuracy:  0.9760


100%|██████████| 4709/4709 [00:07<00:00, 599.07it/s]


Epochs: 22 | Train Loss:  0.0851             | Train Accuracy:  0.9828             | Val Loss:  0.0921             | Val Accuracy:  0.9760


100%|██████████| 4709/4709 [00:07<00:00, 588.70it/s]


Epochs: 23 | Train Loss:  0.0822             | Train Accuracy:  0.9828             | Val Loss:  0.0896             | Val Accuracy:  0.9760


100%|██████████| 4709/4709 [00:08<00:00, 582.33it/s]


Epochs: 24 | Train Loss:  0.0797             | Train Accuracy:  0.9835             | Val Loss:  0.0872             | Val Accuracy:  0.9762


100%|██████████| 4709/4709 [00:07<00:00, 601.36it/s]


Epochs: 25 | Train Loss:  0.0772             | Train Accuracy:  0.9834             | Val Loss:  0.0848             | Val Accuracy:  0.9762


100%|██████████| 4709/4709 [00:08<00:00, 564.24it/s]


Epochs: 26 | Train Loss:  0.0747             | Train Accuracy:  0.9843             | Val Loss:  0.0825             | Val Accuracy:  0.9762


100%|██████████| 4709/4709 [00:08<00:00, 576.52it/s]


Epochs: 27 | Train Loss:  0.0720             | Train Accuracy:  0.9850             | Val Loss:  0.0802             | Val Accuracy:  0.9762


100%|██████████| 4709/4709 [00:08<00:00, 587.99it/s]


Epochs: 28 | Train Loss:  0.0696             | Train Accuracy:  0.9861             | Val Loss:  0.0779             | Val Accuracy:  0.9762


100%|██████████| 4709/4709 [00:07<00:00, 599.34it/s]


Epochs: 29 | Train Loss:  0.0673             | Train Accuracy:  0.9855             | Val Loss:  0.0757             | Val Accuracy:  0.9762


100%|██████████| 4709/4709 [00:07<00:00, 605.18it/s]


Epochs: 30 | Train Loss:  0.0650             | Train Accuracy:  0.9859             | Val Loss:  0.0736             | Val Accuracy:  0.9760


100%|██████████| 4709/4709 [00:07<00:00, 609.76it/s]


Epochs: 31 | Train Loss:  0.0627             | Train Accuracy:  0.9870             | Val Loss:  0.0715             | Val Accuracy:  0.9760


100%|██████████| 4709/4709 [00:08<00:00, 588.09it/s]


Epochs: 32 | Train Loss:  0.0604             | Train Accuracy:  0.9869             | Val Loss:  0.0694             | Val Accuracy:  0.9762


100%|██████████| 4709/4709 [00:07<00:00, 607.82it/s]


Epochs: 33 | Train Loss:  0.0585             | Train Accuracy:  0.9869             | Val Loss:  0.0674             | Val Accuracy:  0.9762


100%|██████████| 4709/4709 [00:07<00:00, 607.11it/s]


Epochs: 34 | Train Loss:  0.0560             | Train Accuracy:  0.9890             | Val Loss:  0.0655             | Val Accuracy:  0.9764


100%|██████████| 4709/4709 [00:08<00:00, 573.06it/s]


Epochs: 35 | Train Loss:  0.0541             | Train Accuracy:  0.9877             | Val Loss:  0.0636             | Val Accuracy:  0.9764


100%|██████████| 4709/4709 [00:07<00:00, 597.56it/s]


Epochs: 36 | Train Loss:  0.0523             | Train Accuracy:  0.9882             | Val Loss:  0.0618             | Val Accuracy:  0.9764


100%|██████████| 4709/4709 [00:07<00:00, 617.39it/s]


Epochs: 37 | Train Loss:  0.0501             | Train Accuracy:  0.9891             | Val Loss:  0.0600             | Val Accuracy:  0.9764


100%|██████████| 4709/4709 [00:07<00:00, 590.27it/s]


Epochs: 38 | Train Loss:  0.0483             | Train Accuracy:  0.9899             | Val Loss:  0.0583             | Val Accuracy:  0.9764


100%|██████████| 4709/4709 [00:07<00:00, 604.31it/s]


Epochs: 39 | Train Loss:  0.0466             | Train Accuracy:  0.9890             | Val Loss:  0.0566             | Val Accuracy:  0.9764


100%|██████████| 4709/4709 [00:07<00:00, 602.16it/s]


Epochs: 40 | Train Loss:  0.0450             | Train Accuracy:  0.9888             | Val Loss:  0.0550             | Val Accuracy:  0.9764


100%|██████████| 4709/4709 [00:07<00:00, 593.01it/s]


Epochs: 41 | Train Loss:  0.0429             | Train Accuracy:  0.9903             | Val Loss:  0.0534             | Val Accuracy:  0.9764


100%|██████████| 4709/4709 [00:07<00:00, 607.04it/s]


Epochs: 42 | Train Loss:  0.0416             | Train Accuracy:  0.9903             | Val Loss:  0.0519             | Val Accuracy:  0.9762


100%|██████████| 4709/4709 [00:08<00:00, 585.93it/s]


Epochs: 43 | Train Loss:  0.0399             | Train Accuracy:  0.9911             | Val Loss:  0.0505             | Val Accuracy:  0.9762


100%|██████████| 4709/4709 [00:07<00:00, 613.16it/s]


Epochs: 44 | Train Loss:  0.0383             | Train Accuracy:  0.9923             | Val Loss:  0.0491             | Val Accuracy:  0.9762


100%|██████████| 4709/4709 [00:07<00:00, 598.39it/s]


Epochs: 45 | Train Loss:  0.0371             | Train Accuracy:  0.9914             | Val Loss:  0.0478             | Val Accuracy:  0.9762


100%|██████████| 4709/4709 [00:08<00:00, 576.40it/s]


Epochs: 46 | Train Loss:  0.0357             | Train Accuracy:  0.9921             | Val Loss:  0.0465             | Val Accuracy:  0.9762


100%|██████████| 4709/4709 [00:08<00:00, 587.61it/s]


Epochs: 47 | Train Loss:  0.0342             | Train Accuracy:  0.9932             | Val Loss:  0.0453             | Val Accuracy:  0.9762


100%|██████████| 4709/4709 [00:07<00:00, 589.65it/s]


Epochs: 48 | Train Loss:  0.0329             | Train Accuracy:  0.9942             | Val Loss:  0.0441             | Val Accuracy:  0.9764


100%|██████████| 4709/4709 [00:08<00:00, 582.69it/s]


Epochs: 49 | Train Loss:  0.0317             | Train Accuracy:  0.9933             | Val Loss:  0.0430             | Val Accuracy:  0.9764


100%|██████████| 4709/4709 [00:07<00:00, 642.00it/s]


Epochs: 50 | Train Loss:  0.0307             | Train Accuracy:  0.9934             | Val Loss:  0.0420             | Val Accuracy:  0.9764


100%|██████████| 4709/4709 [00:07<00:00, 623.30it/s]


Epochs: 51 | Train Loss:  0.0297             | Train Accuracy:  0.9939             | Val Loss:  0.0410             | Val Accuracy:  0.9762


100%|██████████| 4709/4709 [00:07<00:00, 609.38it/s]


Epochs: 52 | Train Loss:  0.0284             | Train Accuracy:  0.9941             | Val Loss:  0.0401             | Val Accuracy:  0.9762


100%|██████████| 4709/4709 [00:07<00:00, 596.19it/s]


Epochs: 53 | Train Loss:  0.0275             | Train Accuracy:  0.9943             | Val Loss:  0.0392             | Val Accuracy:  0.9764


100%|██████████| 4709/4709 [00:08<00:00, 560.32it/s]


Epochs: 54 | Train Loss:  0.0263             | Train Accuracy:  0.9942             | Val Loss:  0.0383             | Val Accuracy:  0.9764


100%|██████████| 4709/4709 [00:08<00:00, 586.06it/s]


Epochs: 55 | Train Loss:  0.0255             | Train Accuracy:  0.9942             | Val Loss:  0.0375             | Val Accuracy:  0.9764


100%|██████████| 4709/4709 [00:08<00:00, 583.58it/s]


Epochs: 56 | Train Loss:  0.0246             | Train Accuracy:  0.9944             | Val Loss:  0.0368             | Val Accuracy:  0.9764


100%|██████████| 4709/4709 [00:07<00:00, 605.73it/s]


Epochs: 57 | Train Loss:  0.0238             | Train Accuracy:  0.9944             | Val Loss:  0.0361             | Val Accuracy:  0.9764


100%|██████████| 4709/4709 [00:07<00:00, 595.96it/s]


Epochs: 58 | Train Loss:  0.0229             | Train Accuracy:  0.9946             | Val Loss:  0.0354             | Val Accuracy:  0.9764


100%|██████████| 4709/4709 [00:07<00:00, 623.56it/s]


Epochs: 59 | Train Loss:  0.0223             | Train Accuracy:  0.9947             | Val Loss:  0.0347             | Val Accuracy:  0.9764


100%|██████████| 4709/4709 [00:07<00:00, 617.88it/s]


Epochs: 60 | Train Loss:  0.0216             | Train Accuracy:  0.9946             | Val Loss:  0.0341             | Val Accuracy:  0.9764


100%|██████████| 4709/4709 [00:07<00:00, 610.22it/s]


Epochs: 61 | Train Loss:  0.0208             | Train Accuracy:  0.9947             | Val Loss:  0.0336             | Val Accuracy:  0.9764


100%|██████████| 4709/4709 [00:07<00:00, 608.00it/s]


Epochs: 62 | Train Loss:  0.0202             | Train Accuracy:  0.9946             | Val Loss:  0.0330             | Val Accuracy:  0.9764


100%|██████████| 4709/4709 [00:07<00:00, 592.30it/s]


Epochs: 63 | Train Loss:  0.0195             | Train Accuracy:  0.9943             | Val Loss:  0.0325             | Val Accuracy:  0.9764


100%|██████████| 4709/4709 [00:07<00:00, 590.81it/s]


Epochs: 64 | Train Loss:  0.0189             | Train Accuracy:  0.9947             | Val Loss:  0.0321             | Val Accuracy:  0.9764


100%|██████████| 4709/4709 [00:08<00:00, 572.23it/s]


Epochs: 65 | Train Loss:  0.0183             | Train Accuracy:  0.9947             | Val Loss:  0.0316             | Val Accuracy:  0.9764


100%|██████████| 4709/4709 [00:07<00:00, 608.05it/s]


Epochs: 66 | Train Loss:  0.0178             | Train Accuracy:  0.9949             | Val Loss:  0.0312             | Val Accuracy:  0.9764


100%|██████████| 4709/4709 [00:07<00:00, 591.83it/s]


Epochs: 67 | Train Loss:  0.0173             | Train Accuracy:  0.9949             | Val Loss:  0.0308             | Val Accuracy:  0.9764


100%|██████████| 4709/4709 [00:07<00:00, 625.04it/s]


Epochs: 68 | Train Loss:  0.0169             | Train Accuracy:  0.9950             | Val Loss:  0.0304             | Val Accuracy:  0.9764


100%|██████████| 4709/4709 [00:07<00:00, 636.51it/s]


Epochs: 69 | Train Loss:  0.0164             | Train Accuracy:  0.9947             | Val Loss:  0.0301             | Val Accuracy:  0.9764


100%|██████████| 4709/4709 [00:07<00:00, 619.91it/s]


Epochs: 70 | Train Loss:  0.0158             | Train Accuracy:  0.9952             | Val Loss:  0.0298             | Val Accuracy:  0.9767


100%|██████████| 4709/4709 [00:07<00:00, 609.64it/s]


Epochs: 71 | Train Loss:  0.0155             | Train Accuracy:  0.9949             | Val Loss:  0.0295             | Val Accuracy:  0.9767


100%|██████████| 4709/4709 [00:08<00:00, 586.64it/s]


Epochs: 72 | Train Loss:  0.0150             | Train Accuracy:  0.9949             | Val Loss:  0.0292             | Val Accuracy:  0.9767


100%|██████████| 4709/4709 [00:07<00:00, 624.11it/s]


Epochs: 73 | Train Loss:  0.0147             | Train Accuracy:  0.9951             | Val Loss:  0.0289             | Val Accuracy:  0.9767


100%|██████████| 4709/4709 [00:07<00:00, 595.73it/s]


Epochs: 74 | Train Loss:  0.0144             | Train Accuracy:  0.9950             | Val Loss:  0.0287             | Val Accuracy:  0.9769


100%|██████████| 4709/4709 [00:07<00:00, 590.36it/s]


Epochs: 75 | Train Loss:  0.0140             | Train Accuracy:  0.9952             | Val Loss:  0.0285             | Val Accuracy:  0.9769


100%|██████████| 4709/4709 [00:08<00:00, 582.88it/s]


Epochs: 76 | Train Loss:  0.0137             | Train Accuracy:  0.9952             | Val Loss:  0.0283             | Val Accuracy:  0.9769


100%|██████████| 4709/4709 [00:07<00:00, 640.58it/s]


Epochs: 77 | Train Loss:  0.0134             | Train Accuracy:  0.9950             | Val Loss:  0.0281             | Val Accuracy:  0.9769


100%|██████████| 4709/4709 [00:07<00:00, 615.32it/s]


Epochs: 78 | Train Loss:  0.0131             | Train Accuracy:  0.9952             | Val Loss:  0.0279             | Val Accuracy:  0.9769


100%|██████████| 4709/4709 [00:07<00:00, 597.06it/s]


Epochs: 79 | Train Loss:  0.0129             | Train Accuracy:  0.9951             | Val Loss:  0.0277             | Val Accuracy:  0.9769


100%|██████████| 4709/4709 [00:07<00:00, 627.86it/s]


Epochs: 80 | Train Loss:  0.0125             | Train Accuracy:  0.9950             | Val Loss:  0.0276             | Val Accuracy:  0.9769


# Test the model

In [37]:
mkdir_if_not_exist(MODEL_SAVE_PATH)
model = MLP(4, 2)
saved_model_name = 'ensemble2_0.9769_epoch80.pt'
model.load_state_dict(torch.load(f'{MODEL_SAVE_PATH}/{saved_model_name}'))
print('model path:', f'{MODEL_SAVE_PATH}/{saved_model_name}')
evaluate(model, val_dataset)


model path: /root/autodl-tmp/ensemble_model/combined/ensemble2_0.9769_epoch80.pt
data_id (true on bigvul): bigvul_18f39e7be0121317550d03e267e3ebd4dbfbb3ce
data_id (true on bigvul): bigvul_643f0fcf3b8ab09a68f0ecd2aa37aafeda3e63ef
data_id (true on bigvul): bigvul_bd23a7269834dc7c1f93e83535d16ebc44b75eba
data_id (true on bigvul): bigvul_bfc81a8bc18e3c4ba0cbaa7666ff76be2f998991
data_id (true on bigvul): bigvul_21fdcdd977e8ab479dd99c6d0d2f562dda98261d
data_id (true on bigvul): bigvul_8e75437a7e43d4c55e861691f74892e666e29b0b
data_id (true on bigvul): bigvul_20ce2fe8e3c211a42fee05a461a5881be9a8790e?w=1
data_id (true on bigvul): bigvul_7e768f8a473409215fe3fff8f6e31f8a3a0103c6
data_id (true on bigvul): bigvul_e9841fbdaf41b4a2baaa413f94d5c0197f9261f4
data_id (true on bigvul): bigvul_e40607cbe270a9e8360907cb1e62ddf0736e4864
data_id (true on bigvul): bigvul_4d06dd537f95683aba3651098ae288b7cbff8274
data_id (true on bigvul): bigvul_8835ba4a39cf53f705417b3b3a94eb067673f2c9
data_id (true on bigvul): b