<a href="https://colab.research.google.com/github/flaviusfetean/method_name_predictor/blob/main/nlp_codet5.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install accelerate peft bitsandbytes transformers trl

In [None]:
!pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu121

In [None]:
!pip install pytorch-lightning

In [4]:
from datasets import load_dataset
import os
methods_dataset = load_dataset("json", data_files={"train":"json_clean_train.json", "test": "json_clean_test.json"})

Generating train split: 0 examples [00:00, ? examples/s]

Generating test split: 0 examples [00:00, ? examples/s]

In [5]:
print(methods_dataset)

DatasetDict({
    train: Dataset({
        features: ['name', 'body', 'class'],
        num_rows: 99991
    })
    test: Dataset({
        features: ['name', 'body', 'class'],
        num_rows: 33331
    })
})


In [6]:
import random

def get_input_text_from_dict(input_dict, add_details=False):
    rand_seed = random.randint(0, 10)
    input_text = ""
    # only teach the model along with additional details in 70% of the cases, letting
    if rand_seed < 7 or add_details:
        input_text += f"<s>class: {input_dict['class']}</s>" if input_dict.get('class', None) is not None else ""
    input_text += f"<s>body: {input_dict['body']}</s>";
    return input_text

In [7]:
from transformers import RobertaTokenizer

tokenizer = RobertaTokenizer.from_pretrained("Salesforce/codet5-small")

max_input_length = 1024
max_target_length = 128

def convert_dataset(raw_dataset):
    # encode the body-name pairs
    classes = raw_dataset['class']
    bodies = raw_dataset['body']
    names = raw_dataset['name']

    model_inputs = tokenizer(bodies, max_length=max_input_length, padding="max_length", truncation=True)

    # encode the method names
    labels = tokenizer(names, max_length=max_target_length, padding="max_length", truncation=True).input_ids

    # important: we need to replace the index of the padding tokens by -100
    # such that they are not taken into account by the CrossEntropyLoss
    labels_with_ignore_index = []
    for labels_example in labels:
        labels_example = [label if label != 0 else -100 for label in labels_example]
        labels_with_ignore_index.append(labels_example)

    model_inputs["labels"] = labels_with_ignore_index

    return model_inputs

tokenizer_config.json:   0%|          | 0.00/1.48k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/703k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/294k [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/2.00 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/12.5k [00:00<?, ?B/s]

In [8]:
dataset = methods_dataset.map(convert_dataset, batched=True)

Map:   0%|          | 0/99991 [00:00<?, ? examples/s]

Map:   0%|          | 0/33331 [00:00<?, ? examples/s]

In [9]:
print(dataset)

DatasetDict({
    train: Dataset({
        features: ['name', 'body', 'class', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 99991
    })
    test: Dataset({
        features: ['name', 'body', 'class', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 33331
    })
})


In [33]:
#load the dataset in the dataloaders
from torch.utils.data import DataLoader

dataset.set_format(type="torch", columns=['input_ids', 'attention_mask', 'labels'])
train_dataloader = DataLoader(dataset['train'], shuffle=True, batch_size=8)
#test_dataloader = DataLoader(dataset['test'], batch_size=2)

In [27]:
#example input
batch = next(iter(train_dataloader))
print(batch.keys())
tokenizer.decode(batch['input_ids'][0])

dict_keys(['input_ids', 'attention_mask', 'labels'])


'<s>return new TextFieldWithPopupHandlerUI.MouseDragAwareCaret();</s><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><

In [28]:
#example label
labels = batch['labels'][0]
tokenizer.decode([label for label in labels if label != -100])

'<s>createCaret</s>'

In [34]:
from transformers import T5ForConditionalGeneration, AdamW, get_linear_schedule_with_warmup
import pytorch_lightning as pl
import torch

class CodeT5(pl.LightningModule):
    def __init__(self, lr=5e-5, num_train_epochs=10, warmup_steps=1000):
        super().__init__()
        self.model = T5ForConditionalGeneration.from_pretrained("/content/drive/MyDrive/llama_output/codet5_method_predictor", device_map={"": 0})
        self.save_hyperparameters()

    def forward(self, input_ids, attention_mask, labels=None):
        outputs = self.model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
        return outputs

    def common_step(self, batch, batch_idx):
        outputs = self(**batch)
        loss = outputs.loss

        return loss

    def training_step(self, batch, batch_idx):
        torch.set_grad_enabled(True)
        loss = self.common_step(batch, batch_idx)
        # logs metrics for each training_step,
        # and the average across the epoch
        self.log("training_loss", loss)

        return loss

    def validation_step(self, batch, batch_idx):
        loss = self.common_step(batch, batch_idx)
        self.log("validation_loss", loss, on_epoch=True)

        return loss

    def test_step(self, batch, batch_idx):
        loss = self.common_step(batch, batch_idx)

        return loss

    def configure_optimizers(self):
        # create optimizer
        optimizer = AdamW(self.parameters(), lr=self.hparams.lr)
        # create learning rate scheduler
        num_train_optimization_steps = self.hparams.num_train_epochs * len(train_dataloader)
        lr_scheduler = {'scheduler': get_linear_schedule_with_warmup(optimizer,
                                                                     num_warmup_steps=self.hparams.warmup_steps,
                                                                     num_training_steps=num_train_optimization_steps),
                        'name': 'learning_rate',
                        'interval':'step',
                        'frequency': 1}

        return {"optimizer": optimizer, "lr_scheduler": lr_scheduler}

    def train_dataloader(self):
        return train_dataloader


In [35]:
from pytorch_lightning import Trainer
from pytorch_lightning.callbacks import LearningRateMonitor, ModelCheckpoint

model = CodeT5()

lr_monitor = LearningRateMonitor(logging_interval='step')

trainer = Trainer(callbacks=[lr_monitor], max_epochs=5)
trainer.fit(model)

INFO:pytorch_lightning.utilities.rank_zero:GPU available: True (cuda), used: True
INFO:pytorch_lightning.utilities.rank_zero:TPU available: False, using: 0 TPU cores
INFO:pytorch_lightning.utilities.rank_zero:IPU available: False, using: 0 IPUs
INFO:pytorch_lightning.utilities.rank_zero:HPU available: False, using: 0 HPUs
/usr/local/lib/python3.10/dist-packages/pytorch_lightning/trainer/configuration_validator.py:74: You defined a `validation_step` but have no `val_dataloader`. Skipping val loop.
INFO:pytorch_lightning.accelerators.cuda:LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
INFO:pytorch_lightning.callbacks.model_summary:
  | Name  | Type                       | Params
-----------------------------------------------------
0 | model | T5ForConditionalGeneration | 60.5 M
-----------------------------------------------------
60.5 M    Trainable params
0         Non-trainable params
60.5 M    Total params
241.969   Total estimated model params size (MB)
/usr/local/lib/python3.10/dist-pa

Training: |          | 0/? [00:00<?, ?it/s]

INFO:pytorch_lightning.utilities.rank_zero:`Trainer.fit` stopped: `max_epochs=5` reached.


In [36]:
model.model.save_pretrained('/content/drive/MyDrive/llama_output/codet5_method_predictor')

In [37]:
#load the checkpointed model
model = T5ForConditionalGeneration.from_pretrained("/content/drive/MyDrive/llama_output/codet5_method_predictor",
                                                    device_map={"": 0})
original_model = T5ForConditionalGeneration.from_pretrained("Salesforce/codet5-small",  cache_dir=r"/content/drive/MyDrive/llama_output",
                                                                device_map={"": 0})

In [38]:
# prepare for the model
test_dict = methods_dataset['test'][22]
print(f"Inferencing for: {test_dict['body']}")
input_ids = tokenizer(test_dict['body'], return_tensors='pt').input_ids.to('cuda')
input_ids_orig = tokenizer("Generate a method name for the body: " + test_dict['body'], return_tensors='pt').input_ids.to('cuda')
# generate
outputs = model.generate(input_ids)
outputs_original = original_model.generate(input_ids_orig)
print("Generated name by the fine-tuned model:", tokenizer.decode(outputs[0], skip_special_tokens=True))
print("Generated name by the original model:", tokenizer.decode(outputs_original[0], skip_special_tokens=True))
print("Ground truth: ", test_dict['name'])

Inferencing for: setErrorMessage(errorMessage);




Generated name by the fine-tuned model: setErrorMessage
Generated name by the original model:  public void
Ground truth:  errorOccurred


#Testing the result model


1.   **Hard Comparison**:
  The output is tested for an exact match

In [39]:
def compare_outputs(pred, gt):
    return 1 if pred == gt else 0



2.   **Soft comparison (similarity)**: We will count the number of words that appear in both the output and the ground truth, as the output may still have relevance even if not an exact match

In [40]:
def split_camel_case(input_string):
    """
    Method to split a method name which is known to be a camel-case
    into its composing words (Java convention)
    """
    try:
        words = [input_string[0]]

        for char in input_string[1:]:
            if char.isupper():
                words.append(char.lower())
            else:
                words[-1] += char
    except IndexError:
        return ""

    return ' '.join(words)

camel_case_string = "camelCaseExample"
result = split_camel_case(camel_case_string)
print(result)

camel case example


In [41]:

def compare_similarity(pred, gt):
    """Often the method name is not predicted exactly the same as the ground truth
    But it is composed of some words that are also present in the ground truth
    Therefore, we will consider the similarity between the results as the number of words in the ground truth that are also present in the prediction divided by the maximum length of the two strings
    """

    max_similarity = 0
    words = split_camel_case(gt).split()
    for word in words:
        if word in pred.lower():
            max_similarity += len(word)

    return max_similarity / max(len(pred), len(gt))



3.   **ROUGE score**: A generalization of the soft score, will also take into account bigrams and longest-common-sequences



In [43]:
import locale
locale.getpreferredencoding = lambda: "UTF-8"
!pip install evaluate absl-py rouge_score nltk

Collecting evaluate
  Downloading evaluate-0.4.1-py3-none-any.whl (84 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.1/84.1 kB[0m [31m2.2 MB/s[0m eta [36m0:00:00[0m
Collecting rouge_score
  Downloading rouge_score-0.1.2.tar.gz (17 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting responses<0.19 (from evaluate)
  Downloading responses-0.18.0-py3-none-any.whl (38 kB)
Building wheels for collected packages: rouge_score
  Building wheel for rouge_score (setup.py) ... [?25l[?25hdone
  Created wheel for rouge_score: filename=rouge_score-0.1.2-py3-none-any.whl size=24933 sha256=91fa4f6fc2210a1f83ca89d133b64603f9e9efc072eb897c276ccec2e82e89b0
  Stored in directory: /root/.cache/pip/wheels/5f/dd/89/461065a73be61a532ff8599a28e9beef17985c9e9c31e541b4
Successfully built rouge_score
Installing collected packages: rouge_score, responses, evaluate
Successfully installed evaluate-0.4.1 responses-0.18.0 rouge_score-0.1.2


In [44]:
import evaluate

rouge = evaluate.load('rouge')

def compare_rouge(preds, gts):
    """
    Rouge will treat the texts as summaries, so we will have to split
    the method names into composing words and treat them as summaries
    """
    pred_split = [split_camel_case(pred) for pred in preds]
    gt_split = [split_camel_case(gt) for gt in gts]

    return rouge.compute(predictions=pred_split, references=gt_split)

print(compare_rouge(["getTestDefault"], ["myTestNotDefault"])) #Expected 0.57 r1, 0 r2, 0.57 rl

Downloading builder script:   0%|          | 0.00/6.27k [00:00<?, ?B/s]

{'rouge1': 0.5714285714285715, 'rouge2': 0.0, 'rougeL': 0.5714285714285715, 'rougeLsum': 0.5714285714285715}


In [45]:
#test the compare functions
print(compare_outputs("hello", "hello")) #Expected 1
print(compare_outputs("hello", "world")) #Expected 0
print(compare_similarity("hello", "hello")) #Expected 1.0
print(compare_similarity("hello", "hell")) #Expected 0.8
print(compare_similarity("hello", "helll")) #Expected 0.0
print(compare_similarity("getTestDefault", "myTestNotDefault")) #Expected 0.687

1
0
1.0
0.8
0.0
0.6875


Method to predict on all inputs, given a model and a dataset (dataset must be formatted for the model input)

In [51]:
from tqdm import tqdm

def predict_all(model, dataset, tokenizer, max_new_tokens=20):
    #turn of learning functions
    model.eval()
    torch.no_grad()

    #innitialize lists for predictions and ground truths that will be comppared
    predictions = []
    gt = []
    for i, example in enumerate(dataset):
        #prepare input
        input_ids = tokenizer(example['input'], return_tensors='pt').input_ids.to('cuda')
        # generate
        outputs = model.generate(input_ids, max_new_tokens=max_new_tokens)
        #decode predicate and add to list
        pred = tokenizer.decode(outputs[0], skip_special_tokens=True)
        predictions.append(pred)
        gt.append(example['label'])
        if i%1000 == 0:
            print(f"{i}/{len(dataset)}")
    return predictions, gt

Cell to evaluate predictions on soft, hard and rouge scores

In [47]:
#evaluate the predictions

def evaluate_predictions(predictions, gt):
    hard_score = 0
    soft_score = 0

    for i, (pred, gndt) in enumerate(zip(predictions, gt)):
        hard_score += compare_outputs(pred, gndt)
        soft_score += compare_similarity(pred, gndt)

    print("Hard score: ", hard_score / len(predictions))
    print("Soft score: ", soft_score / len(predictions))
    print("Rouge score: ", compare_rouge(predictions, gt))

Evaluate the fine-tuned model

In [52]:
#get all predictions
model_input = [{"input": example['body'], "label": example['name']} for example in methods_dataset['test']]
predictions, gt = predict_all(model, model_input, tokenizer)

0/33331
1000/33331
2000/33331


Token indices sequence length is longer than the specified maximum sequence length for this model (723 > 512). Running this sequence through the model will result in indexing errors


3000/33331
4000/33331
5000/33331
6000/33331
7000/33331
8000/33331
9000/33331
10000/33331
11000/33331
12000/33331
13000/33331
14000/33331
15000/33331
16000/33331
17000/33331
18000/33331
19000/33331
20000/33331
21000/33331
22000/33331
23000/33331
24000/33331
25000/33331
26000/33331
27000/33331
28000/33331
29000/33331
30000/33331
31000/33331
32000/33331
33000/33331


In [53]:
evaluate_predictions(predictions, gt)

[1;30;43mDatele de ieșire de afișat au fost trunchiate la ultimele 5000 linii.[0m
Ground truth:  dispose
Predicted:  isEnabledByDefault
Ground truth:  isWritable
Predicted:  documentChanged
Ground truth:  documentChanged
Predicted:  removeElement
Ground truth:  poll
Predicted:  addListener
Ground truth:  addRootSetChangedListener
Predicted:  isAvailable
Ground truth:  isAvailableOnElementInEditorAndFile
Predicted:  getTree
Ground truth:  createCenterPanel
Predicted:  matchAdjustmentDelimiters
Ground truth:  matchAdjustmentDelimiters
Predicted:  setSelectedModuleJdk
Ground truth:  reset
Predicted:  actionPerformed
Ground truth:  actionPerformed
Predicted:  isEnabledByDefault
Ground truth:  existsFieldWithName
Predicted:  getSubstitutor
Ground truth:  getSubstitutor
Predicted:  getCanonicalText
Ground truth:  getQualifiedName
Predicted:  isAllowedField
Ground truth:  hasSafeType
Predicted:  isEnabledByDefault
Ground truth:  isInComments
Predicted:  setHelpId
Ground truth:  setHelpId
Pr

Evaluate the original model

In [54]:
model_input = [{"input": "Generate a name for the method having the following body: "+example['body'], "label": example['name']} for example in methods_dataset['test']]
predictions_orig, gt = predict_all(original_model, model_input, tokenizer)

0/33331
1000/33331
2000/33331
3000/33331
4000/33331
5000/33331
6000/33331
7000/33331
8000/33331
9000/33331
10000/33331
11000/33331
12000/33331
13000/33331
14000/33331
15000/33331
16000/33331
17000/33331
18000/33331
19000/33331
20000/33331
21000/33331
22000/33331
23000/33331
24000/33331
25000/33331
26000/33331
27000/33331
28000/33331
29000/33331
30000/33331
31000/33331
32000/33331
33000/33331


In [55]:
evaluate_predictions(predictions_orig, gt)

[1;30;43mDatele de ieșire de afișat au fost trunchiate la ultimele 5000 linii.[0m
Predicted:  method
Ground truth:  isTabbingModeAvailable
Predicted:  function
Ground truth:  getClasspathEntry
Predicted:   def
Ground truth:  mergeDistinctPairs
Predicted:  }
Ground truth:  getWordSelectionRange
Predicted:  return;
Ground truth:  actionPerformed
Predicted:  
Ground truth:  findVersion
Predicted:  
Ground truth:  getUniqueId
Predicted:  incrButton.getHeight();
Ground truth:  getIncrementButtonHeight
Predicted:  myNestedResult;
Ground truth:  getNestedResult
Predicted:  =
Ground truth:  merge
Predicted:  method
Ground truth:  context
Predicted:   public static
Ground truth:  hyperlinkActivated
Predicted:  myTextBg
Ground truth:  setTextBg
Predicted:  myMayOverflow;
Ground truth:  mayOverflow
Predicted:   def
Ground truth:  addFilesTo
Predicted:  .
Ground truth:  toString
Predicted:  .
Ground truth:  enableToolsByDefault
Predicted:  
Ground truth:  beDisabled
Predicted:  class
Ground trut