# Bug Localization

In [1]:
!pip install datasets transformers sentencepiece



In [2]:
import os

KAGGLE_KERNEL_INTERACTIVE = os.environ["KAGGLE_KERNEL_RUN_TYPE"] == "Interactive"

if KAGGLE_KERNEL_INTERACTIVE:
    !pip install huggingface_hub
    !apt install git-lfs
    !git lfs install

    from huggingface_hub import notebook_login

    notebook_login()

Reading package lists... Done
Building dependency tree       
Reading state information... Done
The following NEW packages will be installed:
  git-lfs
0 upgraded, 1 newly installed, 0 to remove and 75 not upgraded.
Need to get 3316 kB of archives.
After this operation, 11.1 MB of additional disk space will be used.
Get:1 http://archive.ubuntu.com/ubuntu focal/universe amd64 git-lfs amd64 2.9.2-1 [3316 kB]
Fetched 3316 kB in 1s (2267 kB/s)  [0m33m[33m[33m

7[0;23r8[1ASelecting previously unselected package git-lfs.
(Reading database ... 103274 files and directories currently installed.)
Preparing to unpack .../git-lfs_2.9.2-1_amd64.deb ...
7[24;0f[42m[30mProgress: [  0%][49m[39m [..........................................................] 87[24;0f[42m[30mProgress: [ 20%][49m[39m [###########...............................................] 8Unpacking git-lfs (2.9.2-1) ...
7[24;0f[42m[30mProgress: [ 40%][49m[39m [#######################.........................

VBox(children=(HTML(value='<center>\n<img src=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [10]:
import os
import json
import torch

import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from transformers import RobertaTokenizerFast, RobertaForTokenClassification, Trainer, TrainingArguments, DataCollatorForTokenClassification
from datasets import load_metric, load_dataset

from difflib import SequenceMatcher
from tqdm.notebook import tqdm
from IPython.display import HTML
from functools import partial

pd.set_option('max_columns', None)

codenet_root = '/kaggle/input/java-localization/java_localization/'

os.environ["WANDB_DISABLED"] = "true"
os.environ["TOKENIZERS_PARALLELISM"] = "false"

## Preprocess Data

In [12]:
dataset = load_dataset("json", data_files={"train": codenet_root+"java_train.json", "test": codenet_root+"java_test.json"}, field='data')

Downloading and preparing dataset json/default to /root/.cache/huggingface/datasets/json/default-5d3950c02eb04f44/0.0.0/ac0ca5f5289a6cf108e706efcf040422dbbfa8e658dee6a819f20d76bb84d26b...


  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/2 [00:00<?, ?it/s]

Dataset json downloaded and prepared to /root/.cache/huggingface/datasets/json/default-5d3950c02eb04f44/0.0.0/ac0ca5f5289a6cf108e706efcf040422dbbfa8e658dee6a819f20d76bb84d26b. Subsequent calls will reuse this data.


  0%|          | 0/2 [00:00<?, ?it/s]

In [13]:
train_dataset = dataset["train"].filter(lambda example: example["returncode"] != 0).train_test_split(test_size=0.1)
test_dataset = dataset["test"]

  0%|          | 0/8 [00:00<?, ?ba/s]

### Label Tokens


In [16]:
def generate_char_mask(original_src, changed_src):
    s = SequenceMatcher(None, original_src, changed_src)
    opcodes = [x for x in s.get_opcodes() if x[0] != "equal"]
    
    original_labels = np.zeros_like(list(original_src), dtype=np.int32)
    for _, i1, i2, _, _ in opcodes:
        original_labels[i1: max(i1+1, i2)] = 1

    return original_labels.tolist()

def tokenize_and_align_labels(tokenizer, example):
    example = {
        "original_src": example["original_src"] + example["changed_src"], 
        "changed_src": example["changed_src"] + example["changed_src"], 
        "error_class_extra": example["error_class_extra"] + ["Accepted" for _ in example["changed_src"]]
    }
    
    y = [generate_char_mask(x_o, x_c) for (x_o, x_c) in zip(example["original_src"], example["changed_src"])]
    X_tokenized = tokenizer(text=example["error_class_extra"], text_pair=example["original_src"], padding=True, truncation=True)
    
    labels = np.zeros_like(X_tokenized.input_ids, dtype=np.int32) - 100
    for i, y_i in enumerate(y):
        for j, y_i_j in enumerate(y_i):
            idx = X_tokenized.char_to_token(i, j, sequence_index=1)
            if idx is None:
                continue
            if labels[i, idx] == -100:
                labels[i, idx] = y_i_j
            else:
                labels[i, idx] |= y_i_j
            
    X_tokenized["labels"] = labels.tolist()
    return X_tokenized

tokenizer = RobertaTokenizerFast.from_pretrained("microsoft/codebert-base")

train_dataset = train_dataset.map(partial(tokenize_and_align_labels, tokenizer), batched=True, batch_size=4, remove_columns=train_dataset["train"].column_names)
test_dataset = test_dataset.map(partial(tokenize_and_align_labels, tokenizer), batched=True, batch_size=4, remove_columns=test_dataset.column_names)

  0%|          | 0/1679 [00:00<?, ?ba/s]

  0%|          | 0/187 [00:00<?, ?ba/s]

  0%|          | 0/208 [00:00<?, ?ba/s]

## Train



In [7]:
training_args = TrainingArguments(
    output_dir='codebert-base-buggy-token-classification',          # output directory
    num_train_epochs=3,                                             # total number of training epochs
    per_device_train_batch_size=4,                                  # batch size per device during training
    per_device_eval_batch_size=4,                                   # batch size for evaluation
    warmup_steps=500,                                               # number of warmup steps for learning rate scheduler
    weight_decay=0.01,                                              # strength of weight decay
    logging_dir='./logs',                                           # directory for storing logs
    logging_steps=1_000,                                            # Steps to report the loss value
    save_strategy ="no",
    push_to_hub=KAGGLE_KERNEL_INTERACTIVE,
)

model = RobertaForTokenClassification.from_pretrained("microsoft/codeT5-base")
data_collator = DataCollatorForTokenClassification(tokenizer, padding=True)

Using the `WAND_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


Downloading:   0%|          | 0.00/476M [00:00<?, ?B/s]

Some weights of RobertaForTokenClassification were not initialized from the model checkpoint at microsoft/codebert-base and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [10]:
def document_level_metrics(true_predictions, true_labels):
    tp = 0
    fp = 0
    fn = 0
    tn = 0

    for t_pred, t_label in zip(true_predictions, true_labels):
        ref_accepted = 1 in t_pred
        p_accepted = 1 in t_label

        if ref_accepted and p_accepted:
            tp += 1
        if not ref_accepted and p_accepted:
            fp += 1
        if ref_accepted and not p_accepted:
            fn += 1
        if not ref_accepted and not p_accepted:
            tn += 1

    return {
        "document_precision": tp / (tp + fp) if (tp + fp) != 0 else 0,
        "document_recall": tp / (tp + fn) if (tp + fn) != 0 else 0,
        "document_f1": (2 * tp) / (2 * tp + fp + fn) if (2 * tp + fp + fn) != 0 else 0,
        "document_accuracy": (tp + tn) / (tp + fp + fn + tn) if (tp + fp + fn + tn) != 0 else 0,
    }
            

def compute_metrics(p):
    predictions, labels = p
    predictions = np.argmax(predictions, axis=2)

    true_predictionss = [
        [p for (p, l) in zip(prediction, label) if l != -100]
        for prediction, label in zip(predictions, labels)
    ]
    true_labelss = [
        [l for (p, l) in zip(prediction, label) if l != -100]
        for prediction, label in zip(predictions, labels)
    ]
    
    true_predictions = [p for pred in true_predictionss for p in pred]
    true_labels = [p for pred in true_labelss for p in pred]

    
    return {
        "precision": precision_score(true_labels, true_predictions),
        "recall": recall_score(true_labels, true_predictions),
        "f1": f1_score(true_labels, true_predictions),
        "accuracy": accuracy_score(true_labels, true_predictions),
        **document_level_metrics(true_predictionss, true_labelss)
    }

In [11]:
trainer = Trainer(
    model=model,                         
    args=training_args,                  
    train_dataset=train_dataset["train"],         
    eval_dataset=train_dataset["test"],            
    data_collator=data_collator,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics
)

trainer.train()

Cloning https://huggingface.co/sefemna/codebert-base-buggy-token-classification into local empty directory.


Download file pytorch_model.bin:   0%|          | 14.7k/473M [00:00<?, ?B/s]

Download file training_args.bin: 100%|##########| 2.92k/2.92k [00:00<?, ?B/s]

Clean file training_args.bin:  34%|###4      | 1.00k/2.92k [00:00<?, ?B/s]

Clean file pytorch_model.bin:   0%|          | 1.00k/473M [00:00<?, ?B/s]

***** Running training *****
  Num examples = 13428
  Num Epochs = 3
  Instantaneous batch size per device = 4
  Total train batch size (w. parallel, distributed & accumulation) = 4
  Gradient Accumulation steps = 1
  Total optimization steps = 10071


Step,Training Loss
1000,0.299
2000,0.2395
3000,0.2518
4000,0.2143
5000,0.1985
6000,0.1874
7000,0.1682
8000,0.1365
9000,0.128
10000,0.1264




Training completed. Do not forget to share your model on huggingface.co/models =)




TrainOutput(global_step=10071, training_loss=0.1946660041714387, metrics={'train_runtime': 2612.5249, 'train_samples_per_second': 15.42, 'train_steps_per_second': 3.855, 'total_flos': 1.052573763774072e+16, 'train_loss': 0.1946660041714387, 'epoch': 3.0})

In [12]:
trainer.evaluate()

***** Running Evaluation *****
  Num examples = 1494
  Batch size = 4


{'eval_loss': 0.22487878799438477,
 'eval_precision': 0.8139333855587672,
 'eval_recall': 0.6257866206982492,
 'eval_f1': 0.7075662220552321,
 'eval_accuracy': 0.9263208029985238,
 'eval_document_precision': 0.8404558404558404,
 'eval_document_recall': 0.9816971713810316,
 'eval_document_f1': 0.9056024558710668,
 'eval_document_accuracy': 0.9176706827309237,
 'eval_runtime': 32.9536,
 'eval_samples_per_second': 45.336,
 'eval_steps_per_second': 11.349,
 'epoch': 3.0}

In [13]:
model.save_pretrained('/kaggle/working/codet5-base-buggy-error-description')
tokenizer.save_pretrained('/kaggle/working/codet5-base-buggy-error-description')

Configuration saved in /kaggle/working/codet5-base-buggy-error-description/config.json
Model weights saved in /kaggle/working/codet5-base-buggy-error-description/pytorch_model.bin
tokenizer config file saved in /kaggle/working/codet5-base-buggy-error-description/tokenizer_config.json
Special tokens file saved in /kaggle/working/codet5-base-buggy-error-description/special_tokens_map.json


('/kaggle/working/codet5-base-buggy-error-description/tokenizer_config.json',
 '/kaggle/working/codet5-base-buggy-error-description/special_tokens_map.json',
 '/kaggle/working/codet5-base-buggy-error-description/vocab.json',
 '/kaggle/working/codet5-base-buggy-error-description/merges.txt',
 '/kaggle/working/codet5-base-buggy-error-description/added_tokens.json',
 '/kaggle/working/codet5-base-buggy-error-description/tokenizer.json')

In [18]:
trainer.evaluate(test_dataset)

***** Running Evaluation *****
  Num examples = 13498
  Batch size = 4


{'eval_loss': 0.14982274174690247,
 'eval_precision': 0.8040890908192997,
 'eval_recall': 0.32216385203652587,
 'eval_f1': 0.4600182232989822,
 'eval_accuracy': 0.9486452445008716,
 'eval_document_precision': 0.7526340619594276,
 'eval_document_recall': 0.9908902691511388,
 'eval_document_f1': 0.8554830637232996,
 'eval_document_accuracy': 0.880204474736998,
 'eval_runtime': 167.8585,
 'eval_samples_per_second': 80.413,
 'eval_steps_per_second': 20.106,
 'epoch': 3.0}

In [19]:
if KAGGLE_KERNEL_INTERACTIVE:
    trainer.push_to_hub()

Saving model checkpoint to codebert-base-buggy-token-classification
Configuration saved in codebert-base-buggy-token-classification/config.json
Model weights saved in codebert-base-buggy-token-classification/pytorch_model.bin
tokenizer config file saved in codebert-base-buggy-token-classification/tokenizer_config.json
Special tokens file saved in codebert-base-buggy-token-classification/special_tokens_map.json


Upload file pytorch_model.bin:   0%|          | 1.00/473M [00:00<?, ?B/s]

Upload file training_args.bin:   0%|          | 1.00/2.92k [00:00<?, ?B/s]

To https://huggingface.co/sefemna/codebert-base-buggy-token-classification
   3591723..49de560  main -> main

Dropping the following result as it does not have all the necessary fields:
{'task': {'name': 'Token Classification', 'type': 'token-classification'}, 'metrics': [{'name': 'Precision', 'type': 'precision', 'value': 0.8040890908192997}, {'name': 'Recall', 'type': 'recall', 'value': 0.32216385203652587}, {'name': 'F1', 'type': 'f1', 'value': 0.4600182232989822}, {'name': 'Accuracy', 'type': 'accuracy', 'value': 0.9486452445008716}]}
To https://huggingface.co/sefemna/codebert-base-buggy-token-classification
   49de560..46dc4ba  main -> main



In [8]:
from transformers import RobertaTokenizerFast, RobertaForTokenClassification
model_path = '/kaggle/input/java_local/pytorch/default/1/java_local_model'
model = RobertaForTokenClassification.from_pretrained(model_path)
tokenizer = RobertaTokenizerFast.from_pretrained("microsoft/codebert-base")

Downloading:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/878k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/446k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/150 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/498 [00:00<?, ?B/s]

## Inference



In [18]:
def predict(tokenizer, model, error, source):
    if not isinstance(source, list):
        source = [source]
        error = [error]
    
    tokenized_inputs = tokenizer(text=error, text_pair=source, padding=True, truncation=True, return_tensors="pt").to(model.device)
    tokenized_labels = np.argmax(model(**tokenized_inputs)['logits'].cpu().detach().numpy(), 2)
    
    all_labels = []
    for i in range(tokenized_labels.shape[0]):
        labels = [0] * len(source[i])
        for j, label in enumerate(tokenized_labels[i]):
            if tokenized_inputs.token_to_sequence(i, j) != 1:
                continue

            word_id = tokenized_inputs.token_to_word(i, j)
            cs = tokenized_inputs.word_to_chars(i, word_id, sequence_index=1)
            if cs.start == cs.end:
                continue
            labels[cs.start:cs.end] |= tokenized_labels[i, j]
        
        all_labels.append(labels)
    
    return all_labels

def color_source(source_code, mask, color='red'):
    text = ""
    for i, char in enumerate(source_code):
        norm_color = 'black'
        if char == ' ':
            char = "•"
            norm_color = 'lightgrey'
        if char == '\n':
            char = "↵\n"
            norm_color = 'lightgrey'
        text += f'<span style="color:{color if mask[i] == 1 else norm_color};">{char}</span>'
    return "<pre>" + text + "</pre>"

def display_example(source_code, error_class_extra, mask):
    display(HTML("<h2>The source code that is predicted buggy:\n</h2>"))
    display(HTML(color_source(source_code, mask, color='red')))
    
    display(HTML("<h2>The bug assigned to the source code:\n</h2>"))
    display(HTML(f"<pre>{error_class_extra}</pre>"))


viz_data = dataset["test"]

source_code = """private static Date parseUsingMask(final String[] masks, String sDate) {
    sDate = sDate != null ? sDate.trim() : null;
    ParsePosition pp = null;
    Date d = null;
    for (int i = 0; d == null && i < masks.length; i++) {
        final DateFormat df = new SimpleDateFormat(masks[i], Locale.US);
        // df.setLenient(false);
        df.setLenient(true);
        try {
            pp = new ParsePosition(0);
            d = df.parse(sDate, pp);
            if (pp.getIndex() != sDate.length()) {
                d = null;
            }
            // System.out.println("pp[" + pp.getIndex() + "] s[" + sDate + " m[" + masks[i] + "] d[" + d + "]");
        } catch (final Exception ex1) {
            // System.out.println("s: " + sDate + " m: " + masks[i] + " d: " + null);
        }
    }
    return d;
}"""
error_class_extra ="NULL_DEREFERENCE"
mask = predict(tokenizer, model, error_class_extra, source_code)[0]
display(HTML(f"<h1>Example {i}</h1>"))
display_example(source_code, error_class_extra, mask)