该 notebook 用于微调 decoder-only 的生成式大模型：通过修改损失函数，将 next token prediction 损失替换为分类交叉熵损失，从而把生成式任务转化为二分类任务。
已分别在均衡和不均衡的数据集上进行了测试。

In [None]:
#!/usr/bin/env python
import os
import pandas as pd
import torch
import deepspeed
import numpy as np
from sklearn.model_selection import train_test_split
from transformers import AutoTokenizer
from torch.utils.data import DataLoader
from torch.utils.tensorboard import SummaryWriter
from sklearn.metrics import accuracy_score, precision_score, recall_score, roc_auc_score
from sklearn.metrics import f1_score

def tokenize_and_split(vul_csv_path="VulSC.csv",
                         nonvul_csv_path="nonVulSC.csv",
                         max_seq_length=512,
                         train_size=0.7, val_size=0.15, test_size=0.15,
                         tokenizer_path="/content/drive/MyDrive/ISSRELLM4SCReplicate/ModelGPTJ4SC/model"):
    """
    Load the two labelled CSV files, tokenize the code and split it three ways.

    Each row's ``text`` column (or ``source_code``, which is renamed to ``text``)
    is expected to hold a code snippet followed by a trailing ``<Vul>`` or
    ``<nonVul>`` marker. Rows that cannot be parsed, or whose marker disagrees
    with the file they came from, are reported on stdout and skipped.

    Args:
        vul_csv_path: CSV whose rows must all be labelled ``<Vul>``.
        nonvul_csv_path: CSV whose rows must all be labelled ``<nonVul>``.
        max_seq_length: Length every sequence is padded/truncated to.
        train_size: Fraction of the samples used for training.
        val_size: Relative weight of the validation part of the remainder.
        test_size: Relative weight of the test part of the remainder.
        tokenizer_path: Location passed to ``AutoTokenizer.from_pretrained``.

    Returns:
        (train_encodings, train_labels, val_encodings, val_labels,
         test_encodings, test_labels, tokenizer)
        Encodings are dicts of tensors; labels are lists of 0 (nonVul) / 1 (Vul).

    Raises:
        ValueError: If a CSV cannot be read or no valid sample remains.
    """
    def load_single_csv(csv_path):
        # Only the read itself is guarded; the original cause is chained so the
        # underlying pandas/OS error stays visible in the traceback.
        try:
            df = pd.read_csv(csv_path, header=0)
        except Exception as e:
            raise ValueError(f"加载 {csv_path} 失败: {str(e)}") from e
        print(f"成功加载 {len(df)} 条数据: {os.path.basename(csv_path)}")
        if "source_code" in df.columns:
            df = df.rename(columns={"source_code": "text"})
        return df

    df_vul = load_single_csv(vul_csv_path)
    df_nonvul = load_single_csv(nonvul_csv_path)

    def parse_text_label(text):
        if not isinstance(text, str):
            raise ValueError(f"非文本输入: {text}")
        if "//" in text:
            # The label is expected after the LAST "//" of the snippet.
            parts = text.rsplit("//", 1)
            code = parts[0].strip()
            label = parts[1].strip()
        else:
            # Fallback: the label is the last whitespace-separated token.
            last_space_idx = text.rfind(' ')
            if last_space_idx == -1:
                raise ValueError(f"无效格式: {text}")
            code = text[:last_space_idx].strip()
            label = text[last_space_idx+1:].strip()
        if "<Vul>" in label:
            label = "<Vul>"
        elif "<nonVul>" in label:
            label = "<nonVul>"
        else:
            raise ValueError(f"未知标签: {label} in '{text}'")
        return code, label

    texts = []
    labels = []
    for df, expected_label in zip([df_nonvul, df_vul], ["<nonVul>", "<Vul>"]):
        for text in df["text"]:
            try:
                code, label = parse_text_label(text)
                if label != expected_label:
                    raise ValueError(f"文件标签不一致: {label} vs {expected_label}")
            except ValueError as e:
                # Best effort: report the bad row and keep going.
                print(f"[!] 数据错误: {str(e)}")
                continue
            texts.append(code)
            labels.append(1 if label == "<Vul>" else 0)

    if not texts:
        # Fail early with a clear message instead of an obscure tokenizer error.
        raise ValueError(
            f"no valid labelled samples found in {vul_csv_path} / {nonvul_csv_path}"
        )

    # Initialize tokenizer and add special tokens.
    tokenizer = AutoTokenizer.from_pretrained(tokenizer_path)
    tokenizer.pad_token = tokenizer.eos_token
    special_tokens = ["<nonVul>", "<Vul>"]
    num_added = tokenizer.add_special_tokens({"additional_special_tokens": special_tokens})
    print(f"添加特殊token: {special_tokens} (已存在{len(special_tokens)-num_added}个)")

    # Tokenize all texts.
    encodings = tokenizer(
        texts,
        max_length=max_seq_length,
        padding="max_length",
        truncation=True,
        return_tensors="pt",
        add_special_tokens=True
    )

    print(f"类别分布: nonVul={labels.count(0)}, Vul={labels.count(1)}")

    # Create indices and perform stratified splits.
    labels_np = np.array(labels)
    indices = np.arange(len(labels_np))
    train_idx, temp_idx = train_test_split(indices, test_size=(1 - train_size), stratify=labels_np, random_state=42)
    if len(temp_idx) > 0:
        val_prop = val_size / (val_size + test_size)
        val_idx, test_idx = train_test_split(temp_idx, test_size=(1 - val_prop), stratify=labels_np[temp_idx], random_state=42)
    else:
        # Integer dtype is required: a default (float64) empty array cannot be
        # used to index the encoding tensors below.
        val_idx, test_idx = np.array([], dtype=int), np.array([], dtype=int)

    def slice_encodings(enc, idx):
        return {k: v[idx] for k, v in enc.items()}

    train_encodings = slice_encodings(encodings, train_idx)
    val_encodings = slice_encodings(encodings, val_idx)
    test_encodings = slice_encodings(encodings, test_idx)

    train_labels = [labels[i] for i in train_idx]
    val_labels = [labels[i] for i in val_idx]
    test_labels = [labels[i] for i in test_idx]

    return train_encodings, train_labels, val_encodings, val_labels, test_encodings, test_labels, tokenizer


def evaluate(model_engine, dataset, criterion, batch_size=4):
    """
    Evaluate the model on ``dataset`` and return loss plus classification metrics.

    Sample-level labels are repeated ``outputs["num_funcs"][i]`` times so that
    they line up with the function-level rows of ``outputs["func_logits"]``.

    Args:
        model_engine: Callable model wrapper exposing ``eval()`` and
            ``local_rank``; it is left in eval mode on return.
        dataset: Dataset yielding dicts with ``"input_ids"`` and ``"labels"``.
        criterion: Loss callable invoked as
            ``criterion(outputs, labels, input_ids, use_contrast_loss=False)``.
        batch_size: Evaluation batch size.

    Returns:
        (avg_loss, accuracy, precision, recall, f1, roc_auc). ``roc_auc`` is 0.0
        when it is undefined (e.g. only one class present in the labels).
    """
    dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=False)
    model_engine.eval()
    total_loss = 0.0
    all_preds = []
    all_scores = []
    all_labels = []

    with torch.no_grad():
        for batch in dataloader:
            input_ids = batch["input_ids"].to(model_engine.local_rank)
            labels_batch = batch["labels"].to(model_engine.local_rank)
            outputs = model_engine(input_ids)
            # Loss is weighted by batch size so the final mean is per sample.
            loss = criterion(outputs, labels_batch, input_ids, use_contrast_loss=False)
            total_loss += loss.item() * labels_batch.size(0)
            # Expand sample-level labels to function-level labels.
            expanded_labels = []
            for i in range(labels_batch.size(0)):
                n_funcs = outputs["num_funcs"][i]
                # labels_batch[i] is a scalar tensor; repeat(n_funcs) yields [n_funcs].
                expanded_labels.append(labels_batch[i].repeat(n_funcs))
            expanded_labels = torch.cat(expanded_labels, dim=0)

            func_logits = outputs["func_logits"]
            preds = torch.argmax(func_logits, dim=1)
            # Probability of class 1, used as the ranking score for ROC-AUC.
            # Assumes func_logits has one column per class with class 1 = Vul,
            # as implied by the argmax/binary-metric usage here — TODO confirm.
            vul_probs = torch.softmax(func_logits.float(), dim=1)[:, 1]

            all_preds.extend(preds.cpu().numpy())
            all_scores.extend(vul_probs.cpu().numpy())
            all_labels.extend(expanded_labels.cpu().numpy())

    avg_loss = total_loss / len(dataset)

    accuracy = accuracy_score(all_labels, all_preds)
    precision = precision_score(all_labels, all_preds, zero_division=0)
    recall = recall_score(all_labels, all_preds, zero_division=0)
    f1 = f1_score(all_labels, all_preds, zero_division=0)
    try:
        # ROC-AUC needs continuous scores; hard 0/1 predictions collapse the
        # curve to a single operating point.
        roc_auc = roc_auc_score(all_labels, all_scores)
    except ValueError:
        # Raised by scikit-learn when the metric is undefined for these labels.
        roc_auc = 0.0

    return avg_loss, accuracy, precision, recall, f1, roc_auc

In [None]:
# Paths of the two CSV inputs read by this cell.
vul_csv_path="VulSC.csv"
nonvul_csv_path="nonVulSC.csv"
def load_single_csv(csv_path):
    """Read one CSV (first row is the header) and normalise its text column.

    A ``source_code`` column, when present, is renamed to ``text``. Any failure
    while loading is re-raised as ``ValueError`` naming the offending path.
    """
    try:
        frame = pd.read_csv(csv_path, header=0)
        print(f"成功加载 {len(frame)} 条数据: {os.path.basename(csv_path)}")
        if "source_code" not in frame.columns:
            return frame
        return frame.rename(columns={"source_code": "text"})
    except Exception as e:
        raise ValueError(f"加载 {csv_path} 失败: {str(e)}")

# Load both CSV files into DataFrames at notebook scope.
df_vul = load_single_csv(vul_csv_path)
df_nonvul = load_single_csv(nonvul_csv_path)

def parse_text_label(text):
    """Split a labelled snippet into ``(code, label)``.

    The label is searched for after the last ``//`` of ``text``; when the text
    contains no ``//`` it is taken from the part after the last space. The
    returned label is normalised to exactly ``"<Vul>"`` or ``"<nonVul>"``.

    Raises:
        ValueError: If ``text`` is not a string, has neither ``//`` nor a
            space, or its trailing part contains no known label.
    """
    if not isinstance(text, str):
        raise ValueError(f"非文本输入: {text}")

    if "//" in text:
        code, _, tail = text.rpartition("//")
    else:
        cut = text.rfind(' ')
        if cut == -1:
            raise ValueError(f"无效格式: {text}")
        code, tail = text[:cut], text[cut + 1:]
    code, tail = code.strip(), tail.strip()

    # "<Vul>" is checked first, matching the original precedence.
    for known in ("<Vul>", "<nonVul>"):
        if known in tail:
            return code, known
    raise ValueError(f"未知标签: {tail} in '{text}'")

# Collect (code, 0/1 label) pairs from both frames; rows that fail to parse or
# whose label disagrees with their source file are reported and skipped.
texts = []
labels = []
for df, expected_label in zip([df_nonvul, df_vul], ["<nonVul>", "<Vul>"]):
    for text in df["text"]:
        try:
            code, label = parse_text_label(text)
            if label != expected_label:
                raise ValueError(f"文件标签不一致: {label} vs {expected_label}")
        except Exception as e:
            print(f"[!] 数据错误: {str(e)}")
        else:
            texts.append(code)
            labels.append(1 if label == "<Vul>" else 0)

In [None]:
# Run the full load / tokenize / split pipeline once at notebook scope so the
# resulting encodings, labels and tokenizer are available to later cells.
train_encodings, train_labels, val_encodings, val_labels, test_encodings, test_labels, tokenizer = tokenize_and_split(
        vul_csv_path="VulSC.csv",
        nonvul_csv_path="nonVulSC.csv",
        max_seq_length=512,
        train_size=0.7, val_size=0.15, test_size=0.15
    )

In [None]:
# Wrap each split in a ContractDataset.
# NOTE(review): ContractDataset is not defined in the visible part of this
# notebook — presumably a torch Dataset over (encodings, labels); TODO confirm.
train_dataset = ContractDataset(train_encodings, train_labels)
val_dataset = ContractDataset(val_encodings, val_labels)
test_dataset = ContractDataset(test_encodings, test_labels)
# Quick sanity check; this requires the dataset to support slice indexing.
print(train_dataset[:10])

In [None]:
def _expand_labels(labels_batch, num_funcs):
    """Repeat each sample-level label once per function of that sample.

    ``num_funcs[i]`` is the repeat count for ``labels_batch[i]``; the repeated
    labels are concatenated in batch order.
    """
    return torch.cat(
        [labels_batch[i].repeat(num_funcs[i]) for i in range(labels_batch.size(0))],
        dim=0,
    )


def train():
    """
    Run the full fine-tuning pipeline: data preparation, DeepSpeed training,
    per-epoch validation, best-checkpoint saving, plotting and a final
    confusion matrix on the test split.

    Relies on ``ContractDataset``, ``QuintupletSampler``, ``VulDetector``,
    ``HybridLoss`` and ``visualization`` being defined/imported elsewhere.
    """
    # Replace the paths below with your own dataset files.
    train_encodings, train_labels, val_encodings, val_labels, test_encodings, test_labels, tokenizer = tokenize_and_split(
        vul_csv_path="VulSC.csv",
        nonvul_csv_path="nonVulSC.csv",
        max_seq_length=512,
        train_size=0.7, val_size=0.15, test_size=0.15
    )

    train_dataset = ContractDataset(train_encodings, train_labels)
    val_dataset = ContractDataset(val_encodings, val_labels)
    test_dataset = ContractDataset(test_encodings, test_labels)

    # Batches are produced by the QuintupletSampler.
    class_counts = [train_labels.count(0), train_labels.count(1)]
    train_sampler = QuintupletSampler(train_labels, class_counts, num_anchors=1)
    train_loader = DataLoader(train_dataset, batch_sampler=train_sampler)

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    # NOTE(review): tokenize_and_split only registers "<nonVul>" and "<Vul>" as
    # special tokens; "<func_sep>" is looked up here without being added there —
    # confirm the tokenizer actually knows this token.
    func_sep_token_id = tokenizer.convert_tokens_to_ids("<func_sep>")
    model = VulDetector(func_sep_token_id=func_sep_token_id).to(device)
    criterion = HybridLoss(margin1=0.2, margin2=0.2, margin3=0.2, lm_weight=0.2)
    writer = SummaryWriter()

    ds_config = "configs/ds_config.json"
    model_engine, _, _, _ = deepspeed.initialize(
        model=model,
        config=ds_config,
        model_parameters=model.parameters()
    )

    num_epochs = 10
    global_step = 0
    best_val_accuracy = 0.0
    best_epoch = -1
    best_checkpoint_dir = "checkpoints/best"

    # Per-epoch metric and loss-component histories (used for plotting).
    epochs_arr = []
    train_losses = []
    val_losses = []
    train_accs = []
    val_accs = []
    train_precs = []
    val_precs = []
    train_recalls = []
    val_recalls = []
    train_f1s = []
    val_f1s = []
    train_roc_aucs = []
    val_roc_aucs = []
    epoch_ce_losses = []
    epoch_contrast_losses = []
    epoch_lm_losses = []
    epoch_total_losses = []

    try:
        for epoch in range(num_epochs):
            model_engine.train()
            epoch_loss = 0.0
            epoch_correct = 0
            epoch_total = 0
            sum_ce, sum_contrast, sum_lm, sum_total = 0.0, 0.0, 0.0, 0.0
            num_batches = 0
            # Function-level ground truth, hard predictions and class-1
            # probabilities, gathered to compute real training metrics.
            train_true, train_pred, train_score = [], [], []

            for batch in train_loader:
                input_ids = batch["input_ids"].to(model_engine.local_rank)
                labels_batch = batch["labels"].to(model_engine.local_rank)

                outputs = model_engine(input_ids)
                # The criterion also returns the individual loss components.
                loss, components = criterion(outputs, labels_batch, input_ids, return_components=True)
                model_engine.backward(loss)
                model_engine.step()

                epoch_loss += loss.item()
                sum_ce += components["ce_loss"].item()
                contrast = components["contrast_loss"]
                # The contrast term may already be a plain float.
                sum_contrast += contrast if isinstance(contrast, float) else contrast.item()
                sum_lm += components["lm_loss"].item()
                sum_total += components["total_loss"].item()

                # Expand labels to function level for the accuracy/metrics.
                expanded_labels = _expand_labels(labels_batch, outputs["num_funcs"]).to(model_engine.local_rank)
                func_logits = outputs["func_logits"].detach()
                preds = torch.argmax(func_logits, dim=1)
                epoch_correct += (preds == expanded_labels).sum().item()
                epoch_total += preds.size(0)

                train_true.extend(expanded_labels.cpu().numpy())
                train_pred.extend(preds.cpu().numpy())
                # Assumes column 1 of func_logits is the Vul class — TODO confirm.
                train_score.extend(torch.softmax(func_logits.float(), dim=1)[:, 1].cpu().numpy())

                global_step += 1
                writer.add_scalar("Batch Loss", loss.item(), global_step)
                num_batches += 1

            # Average over the batches actually processed; guard against an
            # empty loader so the epoch summary cannot divide by zero.
            batches = max(num_batches, 1)
            avg_train_loss = epoch_loss / batches
            train_accuracy = epoch_correct / epoch_total if epoch_total > 0 else 0

            avg_ce = sum_ce / batches
            avg_contrast = sum_contrast / batches
            avg_lm = sum_lm / batches
            avg_total = sum_total / batches

            # Real training metrics instead of re-using accuracy as a placeholder.
            if epoch_total > 0:
                train_precision = precision_score(train_true, train_pred, zero_division=0)
                train_recall = recall_score(train_true, train_pred, zero_division=0)
                train_f1 = f1_score(train_true, train_pred, zero_division=0)
                try:
                    train_roc = roc_auc_score(train_true, train_score)
                except ValueError:
                    # Undefined, e.g. when only one class was seen this epoch.
                    train_roc = 0.0
            else:
                train_precision = train_recall = train_f1 = train_roc = 0.0

            # Validation phase.
            val_loss, val_accuracy, val_precision, val_recall, val_f1, val_roc = evaluate(model_engine, val_dataset, criterion)
            print(f"Epoch {epoch}: Train Loss = {avg_train_loss:.4f}, Train Acc = {train_accuracy:.4f}")
            print(f"          Val Loss = {val_loss:.4f}, Val Acc = {val_accuracy:.4f}, Val Prec = {val_precision:.4f}, Val Recall = {val_recall:.4f}, Val F1 = {val_f1:.4f}, Val ROC-AUC = {val_roc:.4f}")

            if val_accuracy > best_val_accuracy:
                best_val_accuracy = val_accuracy
                best_epoch = epoch
                os.makedirs(best_checkpoint_dir, exist_ok=True)
                model_engine.save_checkpoint(best_checkpoint_dir)
                print(f"Best checkpoint saved at epoch {epoch}")

            epochs_arr.append(epoch)
            train_losses.append(avg_train_loss)
            val_losses.append(val_loss)
            train_accs.append(train_accuracy)
            val_accs.append(val_accuracy)
            train_precs.append(train_precision)
            train_recalls.append(train_recall)
            train_f1s.append(train_f1)
            train_roc_aucs.append(train_roc)
            val_precs.append(val_precision)
            val_recalls.append(val_recall)
            val_f1s.append(val_f1)
            val_roc_aucs.append(val_roc)

            epoch_ce_losses.append(avg_ce)
            epoch_contrast_losses.append(avg_contrast)
            epoch_lm_losses.append(avg_lm)
            epoch_total_losses.append(avg_total)

            writer.add_scalar("Epoch Train Loss", avg_train_loss, epoch)
            writer.add_scalar("Epoch Val Loss", val_loss, epoch)
            writer.add_scalar("Epoch Val Accuracy", val_accuracy, epoch)
    finally:
        # Flush and release the TensorBoard event file even if training fails.
        writer.close()

    # Generate the training charts.
    # NOTE(review): best_epoch is selected by validation accuracy while
    # best_val_loss is the minimum validation loss; they may refer to
    # different epochs — confirm what the plot helper expects.
    visualization.plot_training_validation_loss(epochs_arr, train_losses, val_losses, best_val_loss=min(val_losses), best_epoch=best_epoch)
    visualization.plot_metrics_curves(epochs_arr, train_accs, val_accs,
                                      train_precs, val_precs,
                                      train_recalls, val_recalls,
                                      train_f1s, val_f1s,
                                      train_roc_aucs, val_roc_aucs)
    visualization.plot_loss_components(epochs_arr, epoch_ce_losses, epoch_contrast_losses, epoch_lm_losses, epoch_total_losses)

    # ROC/PR curves, t-SNE, etc. can be added here; below, the test-set
    # predictions are used to draw a confusion matrix.
    test_loader = DataLoader(test_dataset, batch_size=4, shuffle=False)
    preds_list = []
    true_list = []
    # Make inference mode explicit rather than relying on the last evaluate() call.
    model_engine.eval()
    with torch.no_grad():
        for batch in test_loader:
            input_ids = batch["input_ids"].to(model_engine.local_rank)
            labels_batch = batch["labels"].to(model_engine.local_rank)
            outputs = model_engine(input_ids)
            expanded_labels = _expand_labels(labels_batch, outputs["num_funcs"]).to(model_engine.local_rank)
            preds = torch.argmax(outputs["func_logits"], dim=1)
            preds_list.extend(preds.cpu().numpy())
            true_list.extend(expanded_labels.cpu().numpy())
    visualization.plot_confusion_matrix(true_list, preds_list, classes=["nonVul", "Vul"], normalize=True)

    print("Training complete. Final checkpoint saved.")

# Entry point: run the training pipeline when executed as the main module.
if __name__ == "__main__":
    train()

In [None]:
#!/usr/bin/env python3
# -*- coding: utf-8 -*-

import pandas as pd
import pyarrow.parquet as pq
import re
import json

##################################
# Step 1: Merge the three Parquet files
##################################

def merge_parquet_files(train_file, val_file, test_file, output_file):
    """
    Concatenate the train, validation and test Parquet files (in that order)
    into one DataFrame with a fresh index, write it to ``output_file`` and
    return it.
    """
    parts = [pd.read_parquet(path) for path in (train_file, val_file, test_file)]
    df_merged = pd.concat(parts, ignore_index=True)
    df_merged.to_parquet(output_file, index=False)
    print(f"[+] Merged dataset saved to: {output_file}")
    return df_merged

##################################
# Step 2: Parse vulnerabilities
##################################

def parse_vulnerabilities(overlapping_str):
    """
    Collect every line number flagged as vulnerable in ``overlapping_str``.

    ``overlapping_str`` is a JSON string mapping a vulnerability type to a list
    of sub-lists; each sub-list holds per-tool findings that store their
    location either in ``"lines"`` (list of ints) or ``"line"`` (int), e.g.::

        {
          "IOU": [
            [
              {"defect": "...", "lines": [178], "tool": "soldetector"},
              {"error": "Integer Overflow.", "line": 178, "tool": "oyente"}
            ]
          ],
          "NC": []
        }

    Anything that does not follow this shape (non-string input, invalid JSON,
    a non-object root, non-list groups, non-dict findings, non-integer line
    values) is ignored rather than raising.

    Returns:
        set[int]: The flagged line numbers exactly as stored in the JSON.
    """
    vul_lines = set()
    if not isinstance(overlapping_str, str) or not overlapping_str.strip():
        return vul_lines

    try:
        data = json.loads(overlapping_str)
    except json.JSONDecodeError:
        # Malformed JSON means "no usable findings" for this row.
        return vul_lines

    if not isinstance(data, dict):
        return vul_lines

    def _is_line_number(value):
        # bool is a subclass of int; True/False are not line numbers.
        return isinstance(value, int) and not isinstance(value, bool)

    for sublists in data.values():
        if not isinstance(sublists, list):
            continue
        for sublist in sublists:
            if not isinstance(sublist, list):
                continue
            for item in sublist:
                if not isinstance(item, dict):
                    continue
                # A finding may carry "lines", "line", or both.
                lines = item.get("lines")
                if isinstance(lines, list):
                    vul_lines.update(ln for ln in lines if _is_line_number(ln))
                line = item.get("line")
                if _is_line_number(line):
                    vul_lines.add(line)

    return vul_lines

##################################
# Step 3: Extract function ranges
##################################

# A line opens a definition when it starts with `function` or `constructor`.
FUNCTION_RE = re.compile(r"^\s*(function|constructor)\b")

def extract_function_ranges(source_code):
    """
    Return a list of ``(start_line, end_line, function_signature)`` tuples.

    A naive, line-based scanner: a definition starts on a line matching
    ``FUNCTION_RE`` and ends where its curly braces balance again. Line
    numbers are 1-based and inclusive; ``function_signature`` is the stripped
    first line of the definition.

    Handled shapes:
      * body on the same line (``function f() { ... }``) -> one-line range;
      * signature spread over several lines before the opening ``{``;
      * body-less declarations (``function f() external;``) -> range ends on
        the line containing the ``;``;
      * a definition still open at end of input -> range ends on the last line.

    Braces inside strings or comments are not recognised and are counted too.
    """
    lines = source_code.split('\n')

    func_ranges = []
    in_function = False
    body_opened = False   # True once the first '{' of the definition was seen
    brace_depth = 0
    func_start_line = None
    func_sig = None

    for i, line in enumerate(lines):
        if not in_function:
            if not FUNCTION_RE.search(line):
                continue
            in_function = True
            body_opened = False
            brace_depth = 0
            func_start_line = i + 1  # 1-based
            func_sig = line.strip()

        # The first line of a definition is processed like any other line so
        # that a body closing on that very line ends the range there.
        opens = line.count('{')
        brace_depth += opens - line.count('}')
        if opens:
            body_opened = True

        if body_opened:
            finished = brace_depth <= 0
        else:
            # No body yet: a ';' marks a declaration without body, and a stray
            # '}' means the enclosing scope closed before any body appeared.
            finished = ';' in line or brace_depth < 0

        if finished:
            func_ranges.append((func_start_line, i + 1, func_sig))
            in_function = False
            func_start_line = None
            func_sig = None

    # Input ended while a definition was still open.
    if in_function and func_start_line is not None:
        func_ranges.append((func_start_line, len(lines), func_sig))

    return func_ranges

##################################
# Step 4: Label functions with <Vul> or <nonVul>
##################################

def label_functions(source_code, vul_lines):
    """
    Tag every function found by ``extract_function_ranges`` as vulnerable or not.

    A function is vulnerable when any number in ``vul_lines`` lies inside its
    inclusive 1-based line range. The marker ``// <Vul>`` or ``// <nonVul>`` is
    appended (after one space) to the function's last line.

    Returns:
        tuple: ``(labeled_code, function_snippets)`` where ``labeled_code`` is
        the whole source with the markers added and ``function_snippets`` is a
        list of ``{"snippet": <marked function text>, "label": "<Vul>"|"<nonVul>"}``.
    """
    lines = source_code.split('\n')
    function_snippets = []

    for first, last, _signature in extract_function_ranges(source_code):
        flagged = any(first <= ln <= last for ln in vul_lines)
        if flagged:
            marker, label = "// <Vul>", "<Vul>"
        else:
            marker, label = "// <nonVul>", "<nonVul>"

        closing = last - 1  # 0-based index of the function's last line
        if 0 <= closing < len(lines):
            lines[closing] += f" {marker}"

        # The snippet is cut after marking, so it carries the marker as well.
        function_snippets.append({
            "snippet": "\n".join(lines[first - 1:last]),
            "label": label,
        })

    return "\n".join(lines), function_snippets

##################################
# Step 5: Main driver
##################################

def main():
    """
    Build the function-level datasets.

    Merges the three Parquet splits into ``dataset.parquet``, labels every
    function of every contract, and writes the snippets to ``VulSC.csv``
    (vulnerable) and ``nonVulSC.csv`` (non-vulnerable) under a single
    ``source_code`` column.
    """
    train_file = "train.parquet"
    val_file = "validation.parquet"
    test_file = "test.parquet"
    merged_output_file = "dataset.parquet"

    print("[+] Merging Parquet files...")
    df_merged = merge_parquet_files(train_file, val_file, test_file, merged_output_file)

    vul_functions = []
    non_vul_functions = []

    print("[+] Labeling each contract's functions...")

    contracts = zip(df_merged["source_code"], df_merged["overlapping"])
    for source_code, overlapping_str in contracts:
        flagged_lines = parse_vulnerabilities(overlapping_str)
        _, func_snippets = label_functions(source_code, flagged_lines)

        # Route each snippet to the list matching its label.
        for item in func_snippets:
            bucket = vul_functions if item["label"] == "<Vul>" else non_vul_functions
            bucket.append(item["snippet"])

    pd.DataFrame({"source_code": vul_functions}).to_csv("VulSC.csv", index=False)
    pd.DataFrame({"source_code": non_vul_functions}).to_csv("nonVulSC.csv", index=False)

    print("[+] Saved VulSC.csv (vulnerable functions) and nonVulSC.csv (non-vulnerable functions).")

# Entry point: build the labelled CSV files when executed as the main module.
if __name__ == "__main__":
    main()

In [None]:
#!/usr/bin/env python3
# -*- coding: utf-8 -*-

"""
Script Overview:
1. Load `VulSC.csv` and `nonVulSC.csv` containing vulnerable and non-vulnerable functions.
2. Initialize the GPT-J-6B tokenizer with special tokens `<Vul>` and `<nonVul>`.
3. Count tokens in each function.
4. Bin the token counts into specified ranges.
5. Display the distribution of token counts.
6. Optionally, save the enhanced datasets with token counts and bins.
"""

import pandas as pd
import numpy as np
from transformers import GPT2TokenizerFast
from tqdm import tqdm

def load_datasets(vul_csv_path, nonvul_csv_path):
    """
    Read the vulnerable and non-vulnerable function CSV files.

    Args:
        vul_csv_path (str): Path to `VulSC.csv`.
        nonvul_csv_path (str): Path to `nonVulSC.csv`.

    Returns:
        tuple: (df_vul, df_nonvul), one DataFrame per input file.
    """
    print("[+] Loading datasets...")
    df_vul, df_nonvul = (pd.read_csv(path) for path in (vul_csv_path, nonvul_csv_path))
    n_vul, n_nonvul = len(df_vul), len(df_nonvul)
    print(f"[+] Loaded {n_vul} vulnerable functions and {n_nonvul} non-vulnerable functions.")
    return df_vul, df_nonvul

def initialize_tokenizer(special_tokens):
    """
    Load the "EleutherAI/gpt-j-6B" tokenizer and register extra special tokens.

    Args:
        special_tokens (list of str): Tokens registered as additional special tokens.

    Returns:
        GPT2TokenizerFast: The tokenizer with the special tokens added.
    """
    print("[+] Initializing GPT-J-6B tokenizer...")
    tok = GPT2TokenizerFast.from_pretrained("EleutherAI/gpt-j-6B")

    print("[+] Adding special tokens to the tokenizer...")
    extra = {'additional_special_tokens': special_tokens}
    tok.add_special_tokens(extra)
    return tok

def count_tokens(df, tokenizer):
    """
    Add a 'num_tokens' column holding the token count of each function.

    The count is the length of ``tokenizer.encode(code, add_special_tokens=False)``.
    ``df`` is modified in place and also returned.

    Args:
        df (pd.DataFrame): DataFrame containing 'source_code'.
        tokenizer (GPT2TokenizerFast): Configured tokenizer.

    Returns:
        pd.DataFrame: The same DataFrame with the 'num_tokens' column added.
    """
    print("[+] Counting tokens in functions...")

    # Registers `progress_apply` on pandas objects so the pass shows a progress bar.
    tqdm.pandas(desc="Tokenizing Functions")
    df["num_tokens"] = df["source_code"].progress_apply(
        lambda code: len(tokenizer.encode(code, add_special_tokens=False))
    )
    return df

def bin_tokens(df, bins, labels):
    """
    Add a 'token_bin' column that buckets 'num_tokens' into labelled ranges.

    Intervals are closed on the left and open on the right (``right=False``).
    ``df`` is modified in place and also returned.

    Args:
        df (pd.DataFrame): DataFrame with 'num_tokens' column.
        bins (list of int): Bin edges.
        labels (list of str): Labels for the bins.

    Returns:
        pd.DataFrame: The same DataFrame with the 'token_bin' column added.
    """
    print("[+] Binning token counts...")
    buckets = pd.cut(df["num_tokens"], bins=bins, labels=labels, right=False)
    df["token_bin"] = buckets
    return df

def display_distributions(df_vul, df_nonvul, bins, labels):
    """
    Print function counts and the per-bin token distribution for the
    vulnerable set, the non-vulnerable set, and both combined.

    Args:
        df_vul (pd.DataFrame): Vulnerable functions DataFrame.
        df_nonvul (pd.DataFrame): Non-vulnerable functions DataFrame.
        bins (list of int): Bin edges (accepted but not used here).
        labels (list of str): Labels for the bins (accepted but not used here).
    """
    def _bin_table(frame):
        # Counts per bin in bin order, as a two-column table.
        counts = frame["token_bin"].value_counts(sort=False)
        return counts.rename_axis('Token Bin').reset_index(name='Counts')

    n_vul = len(df_vul)
    n_nonvul = len(df_nonvul)

    print("\n===== Function Counts =====")
    print(f"Vulnerable Functions: {n_vul}")
    print(f"Non-Vulnerable Functions: {n_nonvul}")
    print(f"Total Functions: {n_vul + n_nonvul}")

    print("\n===== Token Count Distribution =====")
    print("\n[+] Vulnerable Functions:")
    print(_bin_table(df_vul))

    print("\n[+] Non-Vulnerable Functions:")
    print(_bin_table(df_nonvul))

    print("\n[+] All Functions Combined:")
    print(_bin_table(pd.concat([df_vul, df_nonvul], ignore_index=True)))

def save_enhanced_datasets(df_vul, df_nonvul, vul_out_csv, nonvul_out_csv):
    """
    Write both DataFrames to CSV (without the index column).

    Args:
        df_vul (pd.DataFrame): Vulnerable functions DataFrame.
        df_nonvul (pd.DataFrame): Non-vulnerable functions DataFrame.
        vul_out_csv (str): Output path for vulnerable functions.
        nonvul_out_csv (str): Output path for non-vulnerable functions.
    """
    print("[+] Saving enhanced datasets with token counts and bins...")
    outputs = ((df_vul, vul_out_csv), (df_nonvul, nonvul_out_csv))
    for frame, destination in outputs:
        frame.to_csv(destination, index=False)
    print(f"[+] Saved {vul_out_csv} and {nonvul_out_csv}.")

def main():
    """
    Token-statistics driver: load both CSVs, count and bin tokens per function,
    print the distributions and save the enriched tables as
    ``VulSC_with_tokens.csv`` / ``nonVulSC_with_tokens.csv``.
    """
    # Input and output locations.
    vul_csv_path = "VulSC.csv"
    nonvul_csv_path = "nonVulSC.csv"
    vul_out_csv = "VulSC_with_tokens.csv"
    nonvul_out_csv = "nonVulSC_with_tokens.csv"

    # Bin edges and their display labels.
    bins = [0, 50, 100, 200, 300, 500, 1000, 2000, np.inf]
    labels = ["0-50", "50-100", "100-200", "200-300", "300-500", "500-1000", "1000-2000", ">2000"]

    special_tokens = ["<Vul>", "<nonVul>"]

    df_vul, df_nonvul = load_datasets(vul_csv_path, nonvul_csv_path)
    tokenizer = initialize_tokenizer(special_tokens)

    # Count for both frames first, then bin both, keeping the original order
    # of operations (vulnerable before non-vulnerable in each step).
    df_vul, df_nonvul = [count_tokens(frame, tokenizer) for frame in (df_vul, df_nonvul)]
    df_vul, df_nonvul = [bin_tokens(frame, bins, labels) for frame in (df_vul, df_nonvul)]

    display_distributions(df_vul, df_nonvul, bins, labels)
    save_enhanced_datasets(df_vul, df_nonvul, vul_out_csv, nonvul_out_csv)

# Entry point: run the token-statistics script when executed as the main module.
if __name__ == "__main__":
    main()

In [None]:
#build the balanced dataset
#!/usr/bin/env python3
# -*- coding: utf-8 -*-

"""
Script Overview:
1. Load `VulSC.csv` and `nonVulSC.csv`.
2. Add a `label` column to each dataset (`1` for vulnerable, `0` for non-vulnerable).
3. Randomly select 714 non-vulnerable functions to balance the dataset.
4. Combine the selected non-vulnerable functions with all vulnerable functions.
5. Shuffle the combined dataset.
6. Save the balanced dataset as `BalancedDataset.csv`.
"""

import pandas as pd
from tqdm import tqdm

def _read_csv_or_exit(csv_path, description):
    """Read ``csv_path``; on any failure print the reason and exit with status 1.

    ``description`` ("vulnerable" / "non-vulnerable") is only used in the
    success message.
    """
    # `sys.exit` replaces the `exit` helper, which the `site` module provides
    # for interactive sessions and which is not meant for use in programs.
    import sys

    try:
        frame = pd.read_csv(csv_path)
    except FileNotFoundError:
        print(f"[-] File not found: {csv_path}")
        sys.exit(1)
    except Exception as e:
        print(f"[-] Error loading {csv_path}: {e}")
        sys.exit(1)
    print(f"[+] Loaded {len(frame)} {description} functions from {csv_path}.")
    return frame


def load_datasets(vul_csv_path, nonvul_csv_path):
    """
    Load vulnerable and non-vulnerable function datasets.

    The process exits with status 1 (``SystemExit``) if either file is missing
    or cannot be parsed; the reason is printed first.

    Args:
        vul_csv_path (str): Path to `VulSC.csv`.
        nonvul_csv_path (str): Path to `nonVulSC.csv`.

    Returns:
        tuple: (df_vul, df_nonvul) DataFrames for vulnerable and non-vulnerable functions.
    """
    print("[+] Loading datasets...")
    df_vul = _read_csv_or_exit(vul_csv_path, "vulnerable")
    df_nonvul = _read_csv_or_exit(nonvul_csv_path, "non-vulnerable")
    return df_vul, df_nonvul

def add_labels(df_vul, df_nonvul):
    """
    Return copies of both DataFrames with a constant `label` column:
    1 for the vulnerable frame, 0 for the non-vulnerable frame.
    The inputs themselves are left untouched.

    Args:
        df_vul (pd.DataFrame): Vulnerable functions DataFrame.
        df_nonvul (pd.DataFrame): Non-vulnerable functions DataFrame.

    Returns:
        tuple: (labelled df_vul copy, labelled df_nonvul copy).
    """
    print("[+] Adding label columns...")
    # DataFrame.assign works on a copy, so the callers' frames are not modified.
    labelled_vul = df_vul.assign(label=1)
    labelled_nonvul = df_nonvul.assign(label=0)
    return labelled_vul, labelled_nonvul

def sample_nonvulnerable(df_nonvul, sample_size, random_state=42):
    """
    Randomly sample non-vulnerable functions (without replacement).

    If fewer than ``sample_size`` rows are available, the shortfall is printed
    and the process exits with status 1 (``SystemExit``).

    Args:
        df_nonvul (pd.DataFrame): Non-vulnerable functions DataFrame.
        sample_size (int): Number of samples to select.
        random_state (int): Seed for reproducibility.

    Returns:
        pd.DataFrame: Sampled non-vulnerable functions with a fresh 0..n-1 index.
    """
    # `sys.exit` replaces the `exit` helper, which the `site` module provides
    # for interactive sessions and which is not meant for use in programs.
    import sys

    print(f"[+] Sampling {sample_size} non-vulnerable functions...")

    if len(df_nonvul) < sample_size:
        print(f"[-] Not enough non-vulnerable functions to sample. Available: {len(df_nonvul)}, Required: {sample_size}")
        sys.exit(1)

    df_nonvul_sampled = df_nonvul.sample(n=sample_size, random_state=random_state).reset_index(drop=True)
    print(f"[+] Sampled {len(df_nonvul_sampled)} non-vulnerable functions.")

    return df_nonvul_sampled

def create_balanced_dataset(df_vul, df_nonvul_sampled):
    """
    Stack the vulnerable and sampled non-vulnerable rows and shuffle them.

    The shuffle uses a fixed seed (42), so the row order is reproducible.

    Args:
        df_vul (pd.DataFrame): Vulnerable functions DataFrame.
        df_nonvul_sampled (pd.DataFrame): Sampled non-vulnerable functions DataFrame.

    Returns:
        pd.DataFrame: Concatenated, shuffled dataset with a fresh 0..n-1 index.
    """
    print("[+] Combining vulnerable and non-vulnerable functions...")
    frames = [df_vul, df_nonvul_sampled]
    combined = pd.concat(frames, ignore_index=True)
    print(f"[+] Combined dataset size: {len(combined)} functions.")

    print("[+] Shuffling the combined dataset...")
    # frac=1 draws every row once, i.e. a full permutation.
    shuffled = combined.sample(frac=1, random_state=42)
    shuffled = shuffled.reset_index(drop=True)
    print("[+] Shuffled the combined dataset.")

    return shuffled

def save_balanced_dataset(balanced_df, output_csv_path):
    """
    Write the balanced dataset to disk as CSV (without the index column).

    Any failure is reported on stdout and the process is terminated via
    `exit(1)`.

    Args:
        balanced_df (pd.DataFrame): Balanced dataset DataFrame.
        output_csv_path (str): Destination path of the CSV file.
    """
    print(f"[+] Saving balanced dataset to {output_csv_path}...")
    try:
        balanced_df.to_csv(output_csv_path, index=False)
        success_msg = f"[+] Balanced dataset saved successfully to {output_csv_path}."
        print(success_msg)
    except Exception as err:
        failure_msg = f"[-] Error saving balanced dataset: {err}"
        print(failure_msg)
        exit(1)

def main():
    """
    Build and save the balanced dataset end to end.

    Loads both CSVs, labels them, samples a fixed number of non-vulnerable
    rows, merges and shuffles everything, prints the label distribution,
    and writes the result to "BalancedDataset.csv".
    """
    # Inputs and output of the pipeline.
    vul_csv_path, nonvul_csv_path = "VulSC.csv", "nonVulSC.csv"
    output_balanced_csv = "BalancedDataset.csv"
    # Number of non-vulnerable functions to sample
    # (presumably the size of the vulnerable set -- TODO confirm).
    sample_size = 714

    # Step 1: load, Step 2: label.
    raw_vul, raw_nonvul = load_datasets(vul_csv_path, nonvul_csv_path)
    labelled_vul, labelled_nonvul = add_labels(raw_vul, raw_nonvul)

    # Step 3: down-sample the non-vulnerable class.
    nonvul_subset = sample_nonvulnerable(labelled_nonvul, sample_size, random_state=42)

    # Step 4: merge and shuffle.
    balanced_df = create_balanced_dataset(labelled_vul, nonvul_subset)

    # Sanity check: show how many rows each class contributes.
    print("\n===== Balanced Dataset Summary =====")
    label_counts = balanced_df['label'].value_counts()
    print(label_counts)

    # Step 5: persist.
    save_balanced_dataset(balanced_df, output_balanced_csv)

    print("\n[+] Balanced dataset creation completed successfully.")

# Run the balancing pipeline only when this code is executed as the main
# module (not when it is imported).
if __name__ == "__main__":
    main()

In [None]:
# Initialize the tokenizer and register the task-specific special tokens.
# The GPT-J-6B checkpoint is loaded through the GPT-2 fast tokenizer class;
# '<Vul>', '<nonVul>' and '<EOS>' are added as additional special tokens.

from transformers import GPT2TokenizerFast

# Initialize the tokenizer
tokenizer = GPT2TokenizerFast.from_pretrained("EleutherAI/gpt-j-6B")

# Add special tokens. `num_added_toks` is the value returned by
# `add_special_tokens` and is only used for the log line below.
special_tokens_dict = {'additional_special_tokens': ['<Vul>', '<nonVul>', '<EOS>']}
num_added_toks = tokenizer.add_special_tokens(special_tokens_dict)
print(f"Added {num_added_toks} special tokens.")


In [None]:
# Define the data for the two objectives: next-token prediction (NTP) and
# binary classification (BC).
# NTP: each training example is formatted as function_code + label + <EOS>.
# BC:  each training example is formatted as function_code + <EOS>, with a
#      separate 0/1 label indicating vulnerability.
import pandas as pd
from sklearn.model_selection import train_test_split

# Load the balanced dataset
balanced_df = pd.read_csv("BalancedDataset.csv")

# Shuffle the dataset (fixed seed, so the order is reproducible)
balanced_df = balanced_df.sample(frac=1, random_state=42).reset_index(drop=True)

# Split into train, validation, and test sets: 20% is held out first and
# then halved, giving an 80/10/10 split. Both splits are stratified on
# `label` so the class ratio is kept in every subset.
train_df, temp_df = train_test_split(balanced_df, test_size=0.2, random_state=42, stratify=balanced_df['label'])
val_df, test_df = train_test_split(temp_df, test_size=0.5, random_state=42, stratify=temp_df['label'])

# # Save splits if needed
# train_df.to_csv("train_balanced.csv", index=False)
# val_df.to_csv("validation_balanced.csv", index=False)
# test_df.to_csv("test_balanced.csv", index=False)

#create NTP dataset
def create_ntp_format(df):
    """
    Render every row as a Next Token Prediction training string.

    Each example has the form "<function_code> <label-token> <EOS>", where
    the label token is "<Vul>" for label 1 and "<nonVul>" otherwise.

    Args:
        df (pd.DataFrame): DataFrame with 'source_code' and 'label' columns.

    Returns:
        List[str]: One formatted string per row, in row order.
    """
    def _render(row):
        # Turn one row into "<code> <tag> <EOS>".
        function_code = row['source_code']
        tag = "<Vul>" if row['label'] == 1 else "<nonVul>"
        return f"{function_code} {tag} <EOS>"

    return [_render(row) for _, row in df.iterrows()]

#create BC dataset
def create_bc_format(df):
    """
    Render every row as a Binary Classification training record.

    The text of each example is "<function_code> <EOS>"; the label is
    carried alongside unchanged.

    Args:
        df (pd.DataFrame): DataFrame with 'source_code' and 'label' columns.

    Returns:
        List[dict]: One {'text': ..., 'label': ...} dictionary per row, in row order.
    """
    return [
        {'text': f"{row['source_code']} <EOS>", 'label': row['label']}
        for _, row in df.iterrows()
    ]

# Set up the GPT-J-6B model with LoRA.
# The PEFT library is used to apply LoRA adapters for parameter-efficient
# fine-tuning.
from transformers import GPTJForCausalLM, AutoConfig
from peft import LoraConfig, get_peft_model

# Load model configuration
# NOTE(review): `config` is modified here but is not passed to
# `from_pretrained` below, so `num_labels` has no effect on the loaded model.
config = AutoConfig.from_pretrained("ModelGPTJ4SC/model")
config.num_labels = 1  # For binary classification

# Load the GPT-J-6B model from the local checkpoint directory
model = GPTJForCausalLM.from_pretrained(
    "ModelGPTJ4SC/model",
    load_in_8bit=True,  # Enable int8 quantization
    device_map="auto"
)

# Resize token embeddings to accommodate new special tokens
# (must run after the special tokens were added to `tokenizer`).
model.resize_token_embeddings(len(tokenizer))

# Define LoRA configuration
# NOTE(review): adapters are attached only to q_proj, v_proj and fc_out;
# confirm whether the embedding / output rows of the newly added tokens
# also need to be trainable (e.g. via `modules_to_save`).
lora_config = LoraConfig(
    r=8,
    lora_alpha=32,
    lora_dropout=0.1,
    target_modules = ["q_proj", "v_proj", "fc_out"],
    task_type="CAUSAL_LM"
)

# Apply LoRA to the model; `model` now refers to the PEFT-wrapped model.
model = get_peft_model(model, lora_config)

In [None]:
# Define training arguments and trainers.
# Build the custom datasets for NTP and BC, respectively.
from datasets import Dataset
from transformers import TrainingArguments
import torch
from torch import nn
from transformers import Trainer

# Use the tokenizer's EOS token for padding (needed for padding='max_length'
# below; presumably the tokenizer ships without a pad token -- TODO confirm).
tokenizer.pad_token = tokenizer.eos_token

# Create NTP dataset: one formatted string per row of each split
ntp_train_data = create_ntp_format(train_df)
ntp_val_data = create_ntp_format(val_df)
ntp_test_data = create_ntp_format(test_df)

ntp_train_dataset = Dataset.from_dict({"text": ntp_train_data})
ntp_val_dataset = Dataset.from_dict({"text": ntp_val_data})
ntp_test_dataset = Dataset.from_dict({"text": ntp_test_data})

# Create BC dataset: one {'text', 'label'} record per row of each split
bc_train_data = create_bc_format(train_df)
bc_val_data = create_bc_format(val_df)
bc_test_data = create_bc_format(test_df)

bc_train_dataset = Dataset.from_list(bc_train_data)
bc_val_dataset = Dataset.from_list(bc_val_data)
bc_test_dataset = Dataset.from_list(bc_test_data)

#tokenization functions
#NTP tokenization
def tokenize_ntp_function(examples):
    """Tokenize the 'text' field of an NTP batch, truncated and padded to 1024 tokens."""
    tokenizer_kwargs = dict(truncation=True, padding='max_length', max_length=1024)
    return tokenizer(examples['text'], **tokenizer_kwargs)

#BC tokenization
def tokenize_bc_function(examples):
    """Tokenize the 'text' field of a BC batch, truncated and padded to 1024 tokens."""
    tokenizer_kwargs = dict(truncation=True, padding='max_length', max_length=1024)
    return tokenizer(examples['text'], **tokenizer_kwargs)

# Apply tokenization to every split (batched for speed).
# Tokenize NTP datasets
ntp_train_tokenized = ntp_train_dataset.map(tokenize_ntp_function, batched=True)
ntp_val_tokenized = ntp_val_dataset.map(tokenize_ntp_function, batched=True)
ntp_test_tokenized = ntp_test_dataset.map(tokenize_ntp_function, batched=True)

# Tokenize BC datasets
bc_train_tokenized = bc_train_dataset.map(tokenize_bc_function, batched=True)
bc_val_tokenized = bc_val_dataset.map(tokenize_bc_function, batched=True)
bc_test_tokenized = bc_test_dataset.map(tokenize_bc_function, batched=True)

# Set format for PyTorch.
# NTP needs only the token ids and the attention mask.
ntp_train_tokenized.set_format(type='torch', columns=['input_ids', 'attention_mask'])
ntp_val_tokenized.set_format(type='torch', columns=['input_ids', 'attention_mask'])
ntp_test_tokenized.set_format(type='torch', columns=['input_ids', 'attention_mask'])

# BC additionally keeps the 0/1 'label' column for the custom loss.
bc_train_tokenized.set_format(type='torch', columns=['input_ids', 'attention_mask', 'label'])
bc_val_tokenized.set_format(type='torch', columns=['input_ids', 'attention_mask', 'label'])
bc_test_tokenized.set_format(type='torch', columns=['input_ids', 'attention_mask', 'label'])

# Define training arguments.
# NTP and BC get separate TrainingArguments that differ only in their
# output and logging directories.

# Common training arguments (shared by both runs)
common_training_args = {
    #"output_dir": "./gptj_finetuned",
    "overwrite_output_dir": True,
    "num_train_epochs": 5,
    "per_device_train_batch_size": 1,  # Adjust based on GPU memory
    "per_device_eval_batch_size": 1,
    "gradient_accumulation_steps": 16,  # To simulate larger batch size
    # NOTE(review): newer transformers releases name this option
    # `eval_strategy` -- confirm against the installed version.
    "evaluation_strategy": "epoch",
    "save_strategy": "epoch",
    #"logging_dir": "./logs",
    "logging_steps": 100,
    "fp16": True,  # Enable mixed precision
    #"gradient_checkpointing": True,  # Save memory
    "report_to": "tensorboard",  # Log to TensorBoard; use 'none' to disable reporting integrations
    #"remove_unused_columns": False,
    # "load_best_model_at_end": True,  # Enable loading the best model
    # "metric_for_best_model": "accuracy",  # Specify the metric for best model
    # "greater_is_better": True,  # Whether the metric should be maximized
}

# NTP training arguments
ntp_training_args = TrainingArguments(
    **common_training_args,
    output_dir="./gptj_finetuned_ntp",
    logging_dir="./logs_ntp",
)

# BC training arguments
bc_training_args = TrainingArguments(
    **common_training_args,
    output_dir="./gptj_finetuned_bc",
    logging_dir="./logs_bc",
)

# Define data collators.
# NTP is the model's original next-token-prediction task, so the stock
# language-modeling collator from the library can be used as is.
from transformers import DataCollatorForLanguageModeling

# NTP Data Collator
ntp_data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm=False  # Causal LM
)

# Reference implementation of a BC collator (kept disabled):
# def bc_data_collator(features):
#         batch = {
#             'input_ids': torch.stack([f['input_ids'] for f in features]),
#             'attention_mask': torch.stack([f['attention_mask'] for f in features]),
#             'labels': torch.tensor([f['label'] for f in features])
#         }
#         return batch

# For BC, the Trainer subclass defined next replaces the default loss with
# a binary cross-entropy computed from the 0/1 labels.

class BinaryClassificationTrainer(Trainer):
    """Trainer that trains a causal LM as a binary classifier.

    The loss is the binary cross-entropy between the 0/1 label and the
    probability the model assigns to the '<Vul>' token at the position of the
    last '<EOS>' token of each input. Special-token ids are looked up on the
    module-level `tokenizer`.
    """

    def compute_loss(self, model, inputs, return_outputs=False, num_items_in_batch=None):
        """
        Custom loss computation for Binary Classification.

        Args:
            model: The model to compute the loss with.
            inputs: A batch of inputs; must contain 'input_ids' and 'labels'
                (one 0/1 label per example). 'labels' is removed from the
                dict before the forward pass.
            return_outputs: Whether to return model outputs.
            num_items_in_batch: Accepted for compatibility with the Trainer
                API; not used by this loss.

        Returns:
            The loss, and optionally the model outputs.
        """
        labels = inputs.pop("labels")
        outputs = model(**inputs)
        logits = outputs.logits  # [batch_size, seq_len, vocab_size]

        eos_token_id = tokenizer.convert_tokens_to_ids('<EOS>')
        vul_token_id = tokenizer.convert_tokens_to_ids('<Vul>')

        # Mask marking every <EOS> occurrence in the input.
        eos_mask = (inputs['input_ids'] == eos_token_id)
        # Fallback when <EOS> was truncated away: use the last position.
        fallback_pos = logits.size(1) - 1

        # For each example in the batch, get the logits at the <EOS> position
        eos_logits = []
        for i in range(logits.size(0)):
            eos_positions = torch.where(eos_mask[i])[0]
            if eos_positions.numel() == 0:
                eos_pos = fallback_pos
            else:
                # Take the last occurrence of <EOS>
                eos_pos = int(eos_positions[-1])
            eos_logits.append(logits[i, eos_pos, :])
        # Work in float32: half-precision logits over a large vocabulary
        # lose too much precision for the log-probabilities below.
        eos_logits = torch.stack(eos_logits).float()  # [batch_size, vocab_size]

        # Binary cross-entropy on p = softmax(eos_logits)[<Vul>], evaluated
        # in log space:
        #   log p       = logit[<Vul>] - logsumexp(all logits)
        #   log (1 - p) = logsumexp(all logits except <Vul>) - logsumexp(all logits)
        # This is the same quantity nn.BCELoss computes from the probability
        # (BCELoss additionally clamps its log terms at -100), but it does
        # not saturate for extreme logits and avoids nn.BCELoss, which
        # PyTorch rejects inside autocast regions.
        log_norm = torch.logsumexp(eos_logits, dim=-1)
        log_p_vul = eos_logits[:, vul_token_id] - log_norm

        vul_column = torch.zeros(eos_logits.size(-1), dtype=torch.bool, device=eos_logits.device)
        vul_column[vul_token_id] = True
        other_logits = eos_logits.masked_fill(vul_column, float('-inf'))
        log_p_not_vul = torch.logsumexp(other_logits, dim=-1) - log_norm

        # Labels should be 1 or 0; align dtype/device with the logits.
        labels = labels.to(device=eos_logits.device, dtype=eos_logits.dtype)

        loss = -(labels * log_p_vul + (1.0 - labels) * log_p_not_vul).mean()

        return (loss, outputs) if return_outputs else loss


In [None]:
from sklearn.metrics import roc_auc_score, accuracy_score, precision_recall_fscore_support
from torch import nn
import matplotlib.pyplot as plt
import seaborn as sns
from transformers import EarlyStoppingCallback
import math

def compute_metrics_ntp(eval_pred):
    """
    Compute metrics for NTP evaluation.

    For NTP, perplexity (exp of the evaluation loss) is a standard metric.

    Args:
        eval_pred: Either a mapping with an "eval_loss" entry, or any object
            exposing an `eval_loss` attribute. Objects that provide neither
            (for example a plain (predictions, labels) pair) are accepted
            and treated as "no loss available" instead of raising.

    Returns:
        dict: {"perplexity": value}, where value is exp(eval_loss),
        float("inf") if that overflows, or the string 'N/A' when no loss
        could be found.
    """
    # Not every caller passes a dict: only use `.get` when it exists.
    getter = getattr(eval_pred, "get", None)
    if callable(getter):
        eval_loss = getter("eval_loss", None)
    else:
        eval_loss = getattr(eval_pred, "eval_loss", None)

    if eval_loss is not None:
        try:
            perplexity = math.exp(eval_loss)
        except OverflowError:
            # exp() of a very large loss does not fit in a float.
            perplexity = float("inf")
    else:
        perplexity = 'N/A'
    print(f"[DEBUG] NTP - eval_loss: {eval_loss}, perplexity: {perplexity}")  # Debugging
    return {"perplexity": perplexity}


from sklearn.metrics import roc_auc_score, accuracy_score, precision_recall_fscore_support

def compute_metrics_bc(eval_pred):
    """
    Compute metrics for BC evaluation.

    Includes ROC AUC, Accuracy, Precision, Recall, and F1-Score, using a
    0.5 threshold on the positive-class probability.

    Args:
        eval_pred: A (logits, labels) pair. Either element may be a NumPy
            array, a list, or a torch tensor.

    Returns:
        dict: 'roc_auc', 'accuracy', 'precision', 'recall' and 'f1_score'.
        'roc_auc' is the string 'N/A' when it cannot be computed.
    """
    logits, labels = eval_pred
    # Convert logits to probabilities for the positive class (index 1).
    # NOTE(review): `[:, 1]` assumes logits of shape [n_examples, 2]; confirm
    # this matches what the trainer actually passes in.
    logits = torch.as_tensor(logits).detach().float().cpu()
    probs = torch.softmax(logits, dim=-1)[:, 1].numpy()
    # Labels may already be a NumPy array (which has no `.numpy()` method);
    # go through torch so arrays, lists and tensors are all handled.
    labels = torch.as_tensor(labels).detach().cpu().numpy()

    print(f"[DEBUG] BC - Labels: {labels}")
    print(f"[DEBUG] BC - Probabilities: {probs}")

    # Compute metrics
    try:
        roc_auc = roc_auc_score(labels, probs)
    except ValueError:
        roc_auc = 'N/A'  # Handle cases where ROC AUC cannot be computed

    predicted_labels = (probs >= 0.5).astype(int)
    accuracy = accuracy_score(labels, predicted_labels)
    precision, recall, f1, _ = precision_recall_fscore_support(labels, predicted_labels, average='binary')

    print(f"[DEBUG] BC - ROC AUC: {roc_auc}, Accuracy: {accuracy}, Precision: {precision}, Recall: {recall}, F1-Score: {f1}")

    return {
        'roc_auc': roc_auc,
        'accuracy': accuracy,
        'precision': precision,
        'recall': recall,
        'f1_score': f1,
    }


In [None]:
from transformers import Trainer

# NTP Trainer: stock Trainer with the language-modeling collator.
# NOTE(review): both trainers below receive the same `model` object, so a
# BC run started after the NTP run continues from the NTP-tuned weights --
# confirm this is intended.
ntp_trainer = Trainer(
    model=model,  # GPT-J-6B with LoRA and quantization
    args=ntp_training_args,
    train_dataset=ntp_train_tokenized,
    eval_dataset=ntp_val_tokenized,
    data_collator=ntp_data_collator
    # Disabled; re-enabling also requires a comma after the line above.
    #compute_metrics=compute_metrics_ntp
)

# BC Trainer with custom loss
bc_trainer = BinaryClassificationTrainer(
    model=model,  # GPT-J-6B with LoRA and quantization
    args=bc_training_args,
    train_dataset=bc_train_tokenized,
    eval_dataset=bc_val_tokenized,
    # Inline collator: stacks the per-example tensors and exposes the
    # 'label' column under the 'labels' key that compute_loss pops.
    data_collator=lambda data: {
        'input_ids': torch.stack([f['input_ids'] for f in data]),
        'attention_mask': torch.stack([f['attention_mask'] for f in data]),
        'labels': torch.tensor([f['label'] for f in data])
    }
    # Disabled; re-enabling also requires a comma after the collator above.
    #compute_metrics=compute_metrics_bc
)

In [None]:
# Fine-tune the model with both the NTP and the BC loss functions.
# This part runs the next-token-prediction fine-tuning and saves the
# result to the NTP output directory.
print("=== Starting Fine-Tuning with Next Token Prediction (NTP) ===")
ntp_trainer.train()
ntp_trainer.save_model("./gptj_finetuned_ntp")
print("=== Fine-Tuning with NTP Completed ===")