In [None]:
import json

import torch
from imblearn.metrics import geometric_mean_score
from sklearn.metrics import f1_score, precision_score, recall_score, roc_auc_score, accuracy_score, matthews_corrcoef, \
    cohen_kappa_score
from torch import nn
from torch.utils.data import DataLoader, Dataset, RandomSampler, SequentialSampler
from tqdm import tqdm
from transformers import (AdamW, get_linear_schedule_with_warmup,
                          RobertaTokenizer, RobertaModel)
from src.evalution import evaluate_result


In [None]:
class CodeDataset(Dataset):
    """In-memory dataset of tokenized code snippets read from a JSON Lines file.

    Every non-blank line of ``file_path`` is parsed as a JSON object and must
    provide the keys ``"code_no_comment"`` (the text to tokenize) and
    ``"label"``. Each record becomes an ``(input_ids, label)`` pair of tensors
    where ``input_ids`` is laid out as ``[cls] + tokens + [eos] + padding`` and
    has exactly ``block_size`` entries.

    Args:
        file_path: Path to a UTF-8 encoded JSONL file.
        tokenizer: Object exposing ``tokenize``, ``convert_tokens_to_ids``,
            ``cls_token``, ``eos_token`` and ``pad_token_id``.
        block_size: Fixed length of every ``input_ids`` tensor; must be at
            least 2 so the two special tokens fit.

    Raises:
        ValueError: If ``block_size`` is smaller than 2.
    """

    def __init__(self, file_path, tokenizer, block_size):
        if block_size < 2:
            # With block_size < 2 the slice below would use a negative bound and
            # the padding count would go negative, silently producing
            # sequences that are not block_size long.
            raise ValueError(f"block_size must be >= 2, got {block_size}")

        self.examples = []
        with open(file_path, "r", encoding="utf-8") as f:
            for line in f:
                line = line.strip()
                if not line:
                    # Blank / whitespace-only lines are not JSON documents;
                    # json.loads would raise on them, so skip them instead.
                    continue
                data = json.loads(line)
                # Reserve two positions for the cls and eos tokens.
                code_tokens = tokenizer.tokenize(data["code_no_comment"])[:block_size - 2]
                tokens = [tokenizer.cls_token] + code_tokens + [tokenizer.eos_token]
                input_ids = tokenizer.convert_tokens_to_ids(tokens)
                # Right-pad so every example has the same length.
                padding_length = block_size - len(input_ids)
                input_ids += [tokenizer.pad_token_id] * padding_length
                self.examples.append((torch.tensor(input_ids), torch.tensor(data["label"])))

    def __len__(self):
        """Return the number of loaded examples."""
        return len(self.examples)

    def __getitem__(self, idx):
        """Return the ``(input_ids, label)`` tensor pair stored at ``idx``."""
        return self.examples[idx]


In [None]:
class CodeBERTModel(nn.Module):
    """Sequence classifier: a pretrained RoBERTa-style encoder plus a linear head.

    Args:
        model_name: Identifier or path passed to ``RobertaModel.from_pretrained``.
        num_labels: Number of output classes produced by the linear head.
    """

    def __init__(self, model_name, num_labels):
        super().__init__()
        self.encoder = RobertaModel.from_pretrained(model_name)
        self.dropout = nn.Dropout(0.1)
        self.classifier = nn.Linear(self.encoder.config.hidden_size, num_labels)

    def forward(self, input_ids, labels=None):
        """Classify a batch of token-id sequences.

        Args:
            input_ids: Integer tensor of token ids; positions equal to the
                padding id are masked out of attention.
            labels: Optional tensor of target class indices. When given, the
                cross-entropy loss against the logits is also computed.

        Returns:
            A ``(loss, logits)`` tuple; ``loss`` is ``None`` when ``labels``
            is ``None``.
        """
        # Take the padding id from the encoder's own config rather than
        # hard-coding it; fall back to 1 (the value previously hard-coded
        # here) when the config does not define one.
        pad_token_id = getattr(self.encoder.config, "pad_token_id", None)
        if pad_token_id is None:
            pad_token_id = 1
        attention_mask = input_ids.ne(pad_token_id)

        # The first element of the encoder output is indexed as
        # (batch, position, feature); position 0 is used as the summary vector.
        hidden_states = self.encoder(input_ids, attention_mask=attention_mask)[0]
        cls_output = self.dropout(hidden_states[:, 0, :])
        logits = self.classifier(cls_output)

        loss = None
        if labels is not None:
            loss = nn.CrossEntropyLoss()(logits, labels)
        return loss, logits


In [None]:
def train_model(train_dataset, model, tokenizer, args):
    """Fine-tune ``model`` on ``train_dataset`` and return it.

    Batches are drawn in random order. The optimizer is AdamW with a linear
    learning-rate schedule (no warmup) spanning every optimisation step of
    every epoch. The mean batch loss is printed after each epoch.

    Args:
        train_dataset: Dataset yielding ``(input_ids, label)`` pairs.
        model: Module whose call ``model(input_ids, labels)`` returns a tuple
            starting with the loss.
        tokenizer: Accepted for interface compatibility; not used here.
        args: Mapping providing ``"batch_size"``, ``"lr"``, ``"epochs"`` and
            ``"device"``.

    Returns:
        The same ``model`` object, left in training mode.
    """
    loader = DataLoader(
        train_dataset,
        sampler=RandomSampler(train_dataset),
        batch_size=args["batch_size"],
    )
    optimizer = AdamW(model.parameters(), lr=args["lr"], eps=1e-8)
    # One scheduler step is taken per batch, so the schedule covers
    # (batches per epoch) * (number of epochs) steps in total.
    scheduler = get_linear_schedule_with_warmup(
        optimizer,
        num_warmup_steps=0,
        num_training_steps=len(loader) * args["epochs"],
    )

    model.train()
    for epoch in range(args["epochs"]):
        running_loss = 0
        progress = tqdm(loader, desc=f"Epoch {epoch + 1}")
        for batch in progress:
            input_ids, labels = (tensor.to(args["device"]) for tensor in batch)
            optimizer.zero_grad()
            batch_loss = model(input_ids, labels)[0]
            batch_loss.backward()
            optimizer.step()
            scheduler.step()
            running_loss += batch_loss.item()
        print(f"Epoch {epoch + 1} Loss: {running_loss / len(loader)}")

    return model


In [None]:
def evaluate_model(eval_dataset, model, args):
    """Run ``model`` over ``eval_dataset`` and compute evaluation metrics.

    The model is switched to eval mode and batches are processed in dataset
    order without gradient tracking. The predicted class of each example is
    the argmax of its logits along dimension 1.

    Args:
        eval_dataset: Dataset yielding ``(input_ids, label)`` pairs.
        model: Module whose call ``model(input_ids)`` returns ``(loss, logits)``.
        args: Mapping providing ``"batch_size"`` and ``"device"``.

    Returns:
        Whatever ``evaluate_result`` returns for the collected labels and
        predictions (also printed). Previously this function returned nothing,
        so the metrics could only be read from stdout.
    """
    eval_dataloader = DataLoader(eval_dataset, sampler=SequentialSampler(eval_dataset), batch_size=args["batch_size"])
    model.eval()
    all_preds, all_labels = [], []
    with torch.no_grad():
        for batch in eval_dataloader:
            input_ids, labels = [b.to(args["device"]) for b in batch]
            _, logits = model(input_ids)
            preds = torch.argmax(logits, dim=1).cpu().numpy()
            all_preds.extend(preds)
            all_labels.extend(labels.cpu().numpy())
    # NOTE(review): the third argument is passed as None; presumably it is an
    # optional per-class score/probability input — confirm against
    # evaluate_result's signature.
    metrics = evaluate_result(all_labels, all_preds, None)
    print(metrics)
    return metrics

In [None]:
def main():
    """Fine-tune the classifier, evaluate it, and save its weights.

    Builds the run configuration, loads the tokenizer and both dataset
    splits, trains a 4-class ``CodeBERTModel``, prints evaluation metrics for
    the evaluation split, and finally writes the model's state dict to
    ``codebert_finetuned.bin`` in the current working directory.
    """
    # NOTE(review): the leading "/" makes this path absolute — confirm that
    # is intended rather than a path relative to the notebook.
    DATA = "/../../datasets"
    device = "cuda" if torch.cuda.is_available() else "cpu"
    args = {
        "model_name": "microsoft/codebert-base",
        "train_file": f"{DATA}/data/train_scaled.jsonl",
        "eval_file": f"{DATA}/data/test_scaled.jsonl",
        "block_size": 512,
        "batch_size": 16,
        "epochs": 20,
        "lr": 2e-5,
        "device": device,
    }

    checkpoint = args["model_name"]
    tokenizer = RobertaTokenizer.from_pretrained(checkpoint)

    # Load the training split first, then the evaluation split.
    train_dataset, eval_dataset = (
        CodeDataset(args[split_key], tokenizer, args["block_size"])
        for split_key in ("train_file", "eval_file")
    )

    classifier = CodeBERTModel(checkpoint, num_labels=4)
    classifier = classifier.to(device)

    classifier = train_model(train_dataset, classifier, tokenizer, args)
    evaluate_model(eval_dataset, classifier, args)
    torch.save(classifier.state_dict(), "codebert_finetuned.bin")


# Run the full train/evaluate/save pipeline when this code is executed
# directly (as a script or a notebook cell) rather than imported.
if __name__ == "__main__":
    main()


Epoch 1: 100%|██████████| 151/151 [03:48<00:00,  1.51s/it]


Epoch 1 Loss: 0.9959236851196416


Epoch 2: 100%|██████████| 151/151 [03:47<00:00,  1.50s/it]


Epoch 2 Loss: 0.7903857144298932


Epoch 3: 100%|██████████| 151/151 [03:47<00:00,  1.51s/it]


Epoch 3 Loss: 0.6280412535793733


Epoch 4: 100%|██████████| 151/151 [03:47<00:00,  1.51s/it]


Epoch 4 Loss: 0.4865910896402321


Epoch 5: 100%|██████████| 151/151 [03:47<00:00,  1.50s/it]


Epoch 5 Loss: 0.31222406984461065


Epoch 6: 100%|██████████| 151/151 [03:47<00:00,  1.51s/it]


Epoch 6 Loss: 0.21255303164388958


Epoch 7: 100%|██████████| 151/151 [03:47<00:00,  1.50s/it]


Epoch 7 Loss: 0.1428002743852257


Epoch 8: 100%|██████████| 151/151 [03:46<00:00,  1.50s/it]


Epoch 8 Loss: 0.11299956439136966


Epoch 9: 100%|██████████| 151/151 [03:47<00:00,  1.50s/it]


Epoch 9 Loss: 0.08888575891094493


Epoch 10: 100%|██████████| 151/151 [03:47<00:00,  1.51s/it]


Epoch 10 Loss: 0.06605189614034943


Epoch 11: 100%|██████████| 151/151 [03:47<00:00,  1.51s/it]


Epoch 11 Loss: 0.053451008340713896


Epoch 12: 100%|██████████| 151/151 [03:47<00:00,  1.51s/it]


Epoch 12 Loss: 0.0432295185584723


Epoch 13: 100%|██████████| 151/151 [03:47<00:00,  1.50s/it]


Epoch 13 Loss: 0.035073446555507204


Epoch 14: 100%|██████████| 151/151 [03:47<00:00,  1.50s/it]


Epoch 14 Loss: 0.029018199578581365


Epoch 15: 100%|██████████| 151/151 [03:47<00:00,  1.50s/it]


Epoch 15 Loss: 0.02660638009913463


Epoch 16: 100%|██████████| 151/151 [03:47<00:00,  1.50s/it]


Epoch 16 Loss: 0.026716479555154774


Epoch 17: 100%|██████████| 151/151 [03:47<00:00,  1.50s/it]


Epoch 17 Loss: 0.022253247352613026


Epoch 18: 100%|██████████| 151/151 [03:47<00:00,  1.50s/it]


Epoch 18 Loss: 0.0208825362770701


Epoch 19: 100%|██████████| 151/151 [03:46<00:00,  1.50s/it]


Epoch 19 Loss: 0.0172816794539111


Epoch 20: 100%|██████████| 151/151 [03:47<00:00,  1.50s/it]


Epoch 20 Loss: 0.01422092974855139
{'eval_f1': 0.7268955911596828, 'eval_f1_perclass': [0.6136363636363636, 0.8111455108359134, 0.9361702127659575, 0.4659090909090909], 'eval_acc': 0.7450199203187251, 'eval_precision': 0.7355248824396095, 'eval_recall': 0.7450199203187251, 'eval_ROC-UAC': 0.0, 'eval_mcc': 0.5465303727366377, 'eval_cohen_kappa_score': 0.5318733516923839, 'eval_gmean': 0.7377535169269442}
