In [1]:
# Install necessary libraries
# !pip install torch
# !pip install transformers
# !pip install datasets
# !pip install sentencepiece

import json
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import T5ForConditionalGeneration, T5Tokenizer, AdamW
from datasets import load_dataset
from sklearn.model_selection import train_test_split
from tqdm import tqdm
from sklearn.metrics import precision_recall_fscore_support

# Load your generated dataset
# Replace 'your_dataset_path' with the actual path to your dataset
dataset_path = 'data/analyzed_dataset.jsonl'

# Load data from the JSONL file
with open(dataset_path, 'r') as file:
    data = [json.loads(line) for line in file]

# Extract input and target values
magic_number_smells = [item['magic_number_smell'] for item in data]
refactored_codes = [item['refactored_code'] for item in data]

# Split the dataset into training and testing sets
train_magic_number_smells, test_magic_number_smells, train_refactored_codes, test_refactored_codes = train_test_split(
    magic_number_smells, refactored_codes, test_size=0.2, random_state=42
)

# Create dictionaries for training and testing datasets
train_dataset = [{'magic_number_smell': magic_number_smell, 'refactored_code': refactored_code} for magic_number_smell, refactored_code in zip(train_magic_number_smells, train_refactored_codes)]
test_dataset = [{'magic_number_smell': magic_number_smell, 'refactored_code': refactored_code} for magic_number_smell, refactored_code in zip(test_magic_number_smells, test_refactored_codes)]

# Save the datasets to JSONL files
train_file_path = 'data/train_dataset.jsonl'
test_file_path = 'data/test_dataset.jsonl'

with open(train_file_path, 'w') as f:
    for item in train_dataset:
        f.write(json.dumps(item) + '\n')

with open(test_file_path, 'w') as f:
    for item in test_dataset:
        f.write(json.dumps(item) + '\n')

# Define a custom dataset class
class CodeDataset(Dataset):
    def __init__(self, data, tokenizer):
        self.data = data
        self.tokenizer = tokenizer

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        item = self.data[idx]
        magic_number_smell = item['magic_number_smell']
        refactored_code = item['refactored_code']

        # Tokenize and convert to PyTorch tensors
        inputs = self.tokenizer.encode_plus(magic_number_smell, return_tensors='pt', padding='max_length', truncation=True, max_length=512)
        targets = self.tokenizer.encode_plus(refactored_code, return_tensors='pt', padding='max_length', truncation=True, max_length=512)

        return {
            'input_ids': inputs['input_ids'].squeeze(),
            'attention_mask': inputs['attention_mask'].squeeze(),
            'labels': targets['input_ids'].squeeze(),
        }

# Initialize the T5 tokenizer and model
tokenizer = T5Tokenizer.from_pretrained('t5-small')
model = T5ForConditionalGeneration.from_pretrained('t5-small')

# Create datasets and dataloaders
train_dataset = CodeDataset(train_dataset, tokenizer)
test_dataset = CodeDataset(test_dataset, tokenizer)

train_dataloader = DataLoader(train_dataset, batch_size=2, shuffle=True)
test_dataloader = DataLoader(test_dataset, batch_size=2, shuffle=False)

# Define training parameters
optimizer = AdamW(model.parameters(), lr=1e-4)

# Training loop
num_epochs = 5
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

for epoch in range(num_epochs):
    model.train()
    for batch in tqdm(train_dataloader, desc=f'Epoch {epoch + 1}/{num_epochs}'):
        inputs = {key: val.to(device) for key, val in batch.items() if key != 'labels'}
        labels = batch['labels'].to(device)

        optimizer.zero_grad()
        outputs = model(**inputs, labels=labels)
        loss = outputs.loss
        loss.backward()
        optimizer.step()

        # Save checkpoint after each epoch
        checkpoint_path = f'magic_smell_model_checkpoint_epoch_{epoch + 1}.pth'
        torch.save({
            'epoch': epoch + 1,
            'model_state_dict': model.state_dict(),
            'optimizer_state_dict': optimizer.state_dict(),
            'loss': loss.item(),
        }, checkpoint_path)


# Save the trained model
model.save_pretrained('magic_smell_model')

# Testing the model
model.eval()
references = []  # List to store reference sequences
predictions = []  # List to store predicted sequences

with torch.no_grad():
    for batch in tqdm(test_dataloader, desc='Evaluating on Test Dataset'):
        inputs = {key: val.to(device) for key, val in batch.items() if key != 'labels'}
        labels = batch['labels'].to(device)

        # Generate predictions
        predicted_ids = model.generate(**inputs, max_length=512)
        predicted_code = tokenizer.decode(predicted_ids[0], skip_special_tokens=True)

        # Append to reference and prediction lists
        references.append(labels.cpu().numpy())
        predictions.append(predicted_ids[0].cpu().numpy())

# Flatten lists for precision_recall_fscore_support
references = [item for sublist in references for item in sublist]
predictions = [item for sublist in predictions for item in sublist]

# Calculate precision, recall, and F1 score
precision, recall, f1, _ = precision_recall_fscore_support(references, predictions, average='micro')

print(f"Precision: {precision * 100:.2f}%")
print(f"Recall: {recall * 100:.2f}%")
print(f"F1 Score: {f1 * 100:.2f}%")

In [None]:
print(len(all_references))
print(len(all_predictions))


112640
66693


In [None]:
# Convert labels to string or integer
all_references = [str(label) for label in all_references]
all_predictions = [str(label) for label in all_predictions]

# Now, you can proceed with the evaluation
precision, recall, f1, _ = precision_recall_fscore_support(all_references, all_predictions, average=None)

# Print results for each class
for i, (p, r, f) in enumerate(zip(precision, recall, f1)):
    print(f"Class {i}: Precision={p:.4f}, Recall={r:.4f}, F1 Score={f:.4f}")


Class 0: Precision=0.0000, Recall=0.0000, F1 Score=0.0000
Class 1: Precision=0.0000, Recall=0.0000, F1 Score=0.0000
Class 2: Precision=0.0000, Recall=0.0000, F1 Score=0.0000
Class 3: Precision=0.0000, Recall=0.0000, F1 Score=0.0000
Class 4: Precision=0.0000, Recall=0.0000, F1 Score=0.0000
Class 5: Precision=0.0000, Recall=0.0000, F1 Score=0.0000
Class 6: Precision=0.0000, Recall=0.0000, F1 Score=0.0000
Class 7: Precision=0.0000, Recall=0.0000, F1 Score=0.0000
Class 8: Precision=0.0000, Recall=0.0000, F1 Score=0.0000
Class 9: Precision=0.0000, Recall=0.0000, F1 Score=0.0000
Class 10: Precision=0.0000, Recall=0.0000, F1 Score=0.0000
Class 11: Precision=0.0000, Recall=0.0000, F1 Score=0.0000
Class 12: Precision=0.0000, Recall=0.0000, F1 Score=0.0000
Class 13: Precision=0.0000, Recall=0.0000, F1 Score=0.0000
Class 14: Precision=0.0000, Recall=0.0000, F1 Score=0.0000
Class 15: Precision=0.0000, Recall=0.0000, F1 Score=0.0000
Class 16: Precision=0.8298, Recall=0.0069, F1 Score=0.0137
Class 1

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [1]:
# Install necessary libraries
# !pip install torch
# !pip install transformers
# !pip install datasets
# !pip install sentencepiece

import json
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import T5ForConditionalGeneration, T5Tokenizer, AdamW
from datasets import load_dataset
from sklearn.model_selection import train_test_split
from tqdm import tqdm
from sklearn.metrics import precision_recall_fscore_support

# Load your generated dataset
# Replace 'your_dataset_path' with the actual path to your dataset
dataset_path = 'data/analyzed_dataset.jsonl'

# Load data from the JSONL file
with open(dataset_path, 'r') as file:
    data = [json.loads(line) for line in file]

# Extract input and target values
magic_number_smells = [item['magic_number_smell'] for item in data]
refactored_codes = [item['refactored_code'] for item in data]

# Split the dataset into training and testing sets
train_magic_number_smells, test_magic_number_smells, train_refactored_codes, test_refactored_codes = train_test_split(
    magic_number_smells, refactored_codes, test_size=0.2, random_state=42
)

# Create dictionaries for training and testing datasets
train_dataset = [{'magic_number_smell': magic_number_smell, 'refactored_code': refactored_code} for magic_number_smell, refactored_code in zip(train_magic_number_smells, train_refactored_codes)]
test_dataset = [{'magic_number_smell': magic_number_smell, 'refactored_code': refactored_code} for magic_number_smell, refactored_code in zip(test_magic_number_smells, test_refactored_codes)]

# Save the datasets to JSONL files
train_file_path = 'data/train_dataset.jsonl'
test_file_path = 'data/test_dataset.jsonl'

with open(train_file_path, 'w') as f:
    for item in train_dataset:
        f.write(json.dumps(item) + '\n')

with open(test_file_path, 'w') as f:
    for item in test_dataset:
        f.write(json.dumps(item) + '\n')

# Define a custom dataset class
class CodeDataset(Dataset):
    def __init__(self, data, tokenizer):
        self.data = data
        self.tokenizer = tokenizer

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        item = self.data[idx]
        magic_number_smell = item['magic_number_smell']
        refactored_code = item['refactored_code']

        # Tokenize and convert to PyTorch tensors
        inputs = self.tokenizer.encode_plus(magic_number_smell, return_tensors='pt', padding='max_length', truncation=True, max_length=512)
        targets = self.tokenizer.encode_plus(refactored_code, return_tensors='pt', padding='max_length', truncation=True, max_length=512)

        return {
            'input_ids': inputs['input_ids'].squeeze(),
            'attention_mask': inputs['attention_mask'].squeeze(),
            'labels': targets['input_ids'].squeeze(),
        }

# Initialize the T5 tokenizer and model
tokenizer = T5Tokenizer.from_pretrained('t5-small')
model = T5ForConditionalGeneration.from_pretrained('t5-small')

# Create datasets and dataloaders
train_dataset = CodeDataset(train_dataset, tokenizer)
test_dataset = CodeDataset(test_dataset, tokenizer)

train_dataloader = DataLoader(train_dataset, batch_size=2, shuffle=True)
test_dataloader = DataLoader(test_dataset, batch_size=2, shuffle=False)

# Define training parameters
optimizer = AdamW(model.parameters(), lr=1e-4)

# Loading onto processor
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)


You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


T5ForConditionalGeneration(
  (shared): Embedding(32128, 512)
  (encoder): T5Stack(
    (embed_tokens): Embedding(32128, 512)
    (block): ModuleList(
      (0): T5Block(
        (layer): ModuleList(
          (0): T5LayerSelfAttention(
            (SelfAttention): T5Attention(
              (q): Linear(in_features=512, out_features=512, bias=False)
              (k): Linear(in_features=512, out_features=512, bias=False)
              (v): Linear(in_features=512, out_features=512, bias=False)
              (o): Linear(in_features=512, out_features=512, bias=False)
              (relative_attention_bias): Embedding(32, 8)
            )
            (layer_norm): T5LayerNorm()
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (1): T5LayerFF(
            (DenseReluDense): T5DenseActDense(
              (wi): Linear(in_features=512, out_features=2048, bias=False)
              (wo): Linear(in_features=2048, out_features=512, bias=False)
              (dropout): Drop

In [4]:
print(refactored_codes)



In [None]:
# Training loop
num_epochs = 5
# device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# model.to(device)

for epoch in range(num_epochs):
    model.train()
    for batch in tqdm(train_dataloader, desc=f'Epoch {epoch + 1}/{num_epochs}'):
        inputs = {key: val.to(device) for key, val in batch.items() if key != 'labels'}
        labels = batch['labels'].to(device)

        optimizer.zero_grad()
        outputs = model(**inputs, labels=labels)
        loss = outputs.loss
        loss.backward()
        optimizer.step()

        # Save checkpoint after each epoch
        checkpoint_path = f'magic_smell_model_checkpoint_epoch_{epoch + 1}.pth'
        torch.save({
            'epoch': epoch + 1,
            'model_state_dict': model.state_dict(),
            'optimizer_state_dict': optimizer.state_dict(),
            'loss': loss.item(),
        }, checkpoint_path)


# Save the trained model
model.save_pretrained('magic_smell_model')

In [None]:
from transformers import T5ForConditionalGeneration

# Initialize the T5 tokenizer and model
model = T5ForConditionalGeneration.from_pretrained('magic_smell_model')

In [8]:
# Testing the model
model.eval()
all_references = []  # List to store reference sequences
all_predictions = []  # List to store predicted sequences
all_prediction_ids = []
all_prediction_ids_labelled = []
all_predictions_decoded = []
all_predictions_decoded_labelled = []

with torch.no_grad():
    for batch in tqdm(test_dataloader, desc='Evaluating on Test Dataset'):
        inputs = {key: val.to(device) for key, val in batch.items() if key != 'labels'}
        labels = batch['labels'].to(device)

        # Generate predictions
        predicted_ids = model.generate(**inputs, max_length=512)
        predicted_code = [tokenizer.decode(ids, skip_special_tokens=True) for ids in predicted_ids]

        # Append to reference and prediction lists
        all_references.extend(labels.cpu().numpy())
        all_predictions.extend(predicted_code)
        
        all_prediction_ids.extend(predicted_ids)
        all_prediction_ids_labelled.extend(predicted_ids.cpu().numpy())

        tokenized_predicted_code = [tokenizer.encode_plus(code, return_tensors='pt', padding='max_length', truncation=True, max_length=512) for code in predicted_code]
        all_predictions_decoded.extend(tokenized_predicted_code)
        labels_predicted = torch.stack([item['input_ids'].squeeze() for item in tokenized_predicted_code])
        # all_predictions_decoded_labelled.extend(labels_predicted.cpu.numpy())
        all_predictions_decoded_labelled.extend(labels_predicted.numpy())
        
        
# Save the results to a text file
with open('test_results.txt', 'w') as file:
    for reference, prediction in zip(all_references, all_predictions):
        file.write(f"Reference: {reference}\n")
        file.write(f"Prediction: {prediction}\n\n")

# # Flatten lists for precision_recall_fscore_support
# all_references = [item for sublist in all_references for item in sublist]
# all_predictions = [item for sublist in all_predictions for item in sublist]


Evaluating on Test Dataset: 100%|██████████| 70/70 [08:28<00:00,  7.26s/it]


In [None]:
from difflib import SequenceMatcher

# Assuming all_references and all_prediction_ids are lists of sequences

# Function to calculate similarity percentage using SequenceMatcher
def similarity_percentage(reference, prediction):
    matcher = SequenceMatcher(None, reference, prediction)
    return matcher.ratio() * 100

def similarity_percentage_two(reference, prediction):
    # Extract text content from tokenized outputs
    reference_text = " ".join(str(token.get("text", "")) if isinstance(token, dict) else str(token) for token in reference)
    prediction_text = " ".join(str(token.get("text", "")) if isinstance(token, dict) else str(token) for token in prediction)

    # Calculate similarity
    matcher = SequenceMatcher(None, reference_text, prediction_text)
    return matcher.ratio() * 100

# Iterate over pairs of reference and prediction
num_pairs = min(len(all_references), len(all_predictions))
total_similarity = 0
for reference, prediction in zip(all_references, all_predictions):
    similarity_percent = similarity_percentage(reference, prediction)
    # print(f"Similarity Percentage: {similarity_percent:.2f}%")
    total_similarity += similarity_percent
mean_similarity = total_similarity / num_pairs
print(f"\nOverall Similarity Percentage: {mean_similarity:.2f}%")


num_pairs = min(len(all_references), len(all_prediction_ids))
total_similarity = 0
for reference, prediction in zip(all_references, all_prediction_ids):
    similarity_percent = similarity_percentage(reference, prediction)
    # print(f"Similarity Percentage: {similarity_percent:.2f}%")
    total_similarity += similarity_percent
mean_similarity = total_similarity / num_pairs
print(f"\nOverall Similarity Percentage: {mean_similarity:.2f}%")


num_pairs = min(len(all_references), len(all_prediction_ids_labelled))
total_similarity = 0
for reference, prediction in zip(all_references, all_prediction_ids_labelled):
    similarity_percent = similarity_percentage(reference, prediction)
    # print(f"Similarity Percentage: {similarity_percent:.2f}%")
    total_similarity += similarity_percent
mean_similarity = total_similarity / num_pairs
print(f"\nOverall Similarity Percentage: {mean_similarity:.2f}%")


num_pairs = min(len(all_references), len(all_predictions_decoded))
total_similarity = 0
for reference, prediction in zip(all_references, all_predictions_decoded):
    similarity_percent = similarity_percentage_two(reference, prediction)
    # print(f"Similarity Percentage: {similarity_percent:.2f}%")
    total_similarity += similarity_percent
mean_similarity = total_similarity / num_pairs
print(f"\nOverall Similarity Percentage: {mean_similarity:.2f}%")


num_pairs = min(len(all_references), len(labels_predicted))
total_similarity = 0
for reference, prediction in zip(all_references, labels_predicted):
    similarity_percent = similarity_percentage_two(reference, prediction)
    # print(f"Similarity Percentage: {similarity_percent:.2f}%")
    total_similarity += similarity_percent
mean_similarity = total_similarity / num_pairs
print(f"\nOverall Similarity Percentage: {mean_similarity:.2f}%")


num_pairs = min(len(all_references), len(all_predictions_decoded_labelled))
total_similarity = 0
for reference, prediction in zip(all_references, all_predictions_decoded_labelled):
    similarity_percent = similarity_percentage_two(reference, prediction)
    # print(f"Similarity Percentage: {similarity_percent:.2f}%")
mean_similarity = total_similarity / num_pairs
print(f"\nOverall Similarity Percentage: {mean_similarity:.2f}%")


# print("All References: ", all_references)
# print("All Predictions: ", all_predictions)
# print("All Prediction IDs: ", all_prediction_ids)
# print("All Prediction IDs Labelled: ", all_prediction_ids_labelled)
# print("All Predictions Decoded: ", all_predictions_decoded)
# # print("Labels Predicted: ", labels_predicted)
# print("All Predictions Decoded Labelled: ", all_predictions_decoded_labelled)


Overall Similarity Percentage: 0.00%

Overall Similarity Percentage: 0.00%

Overall Similarity Percentage: 22.19%

Overall Similarity Percentage: 0.15%

Overall Similarity Percentage: 1.90%

Overall Similarity Percentage: 0.00%


In [None]:
# Convert labels to string or integer
all_references = [str(label) for label in all_references]
all_predictions = [str(label) for label in all_predictions]

# Now, you can proceed with the evaluation
precision, recall, f1, _ = precision_recall_fscore_support(all_references, all_predictions, average=None)

# Print results for each class
for i, (p, r, f) in enumerate(zip(precision, recall, f1)):
    print(f"Class {i}: Precision={p:.4f}, Recall={r:.4f}, F1 Score={f:.4f}")

Class 0: Precision=0.0000, Recall=0.0000, F1 Score=0.0000
Class 1: Precision=0.0000, Recall=0.0000, F1 Score=0.0000
Class 2: Precision=0.0000, Recall=0.0000, F1 Score=0.0000
Class 3: Precision=0.0000, Recall=0.0000, F1 Score=0.0000
Class 4: Precision=0.0000, Recall=0.0000, F1 Score=0.0000
Class 5: Precision=0.0000, Recall=0.0000, F1 Score=0.0000
Class 6: Precision=0.0000, Recall=0.0000, F1 Score=0.0000
Class 7: Precision=0.0000, Recall=0.0000, F1 Score=0.0000
Class 8: Precision=0.0000, Recall=0.0000, F1 Score=0.0000
Class 9: Precision=0.0000, Recall=0.0000, F1 Score=0.0000
Class 10: Precision=0.0000, Recall=0.0000, F1 Score=0.0000
Class 11: Precision=0.0000, Recall=0.0000, F1 Score=0.0000
Class 12: Precision=0.0000, Recall=0.0000, F1 Score=0.0000
Class 13: Precision=0.0000, Recall=0.0000, F1 Score=0.0000
Class 14: Precision=0.0000, Recall=0.0000, F1 Score=0.0000
Class 15: Precision=0.0000, Recall=0.0000, F1 Score=0.0000
Class 16: Precision=0.0000, Recall=0.0000, F1 Score=0.0000
Class 1

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [None]:
# Read test results from the file
all_references = []
all_predictions = []

with open('test_results.txt', 'r') as file:
    lines = file.readlines()
    
    reference_started = False
    prediction_started = False
    
    for line in lines:
        line = line.strip()
        
        if line.startswith("Reference:"):
            reference_started = True
            prediction_started = False
            reference = line.split(': ')[1]
        elif line.startswith("Prediction:"):
            reference_started = False
            prediction_started = True
            prediction = line.split(': ')[1]
        
        if reference_started:
            all_references.append(reference)
        elif prediction_started:
            all_predictions.append(prediction)


In [None]:
# pip3 install rouge_score sacremoses sacrebleu nltk.translate
import sacrebleu
from sacremoses import MosesDetokenizer

# Initialize detokenizer
detokenizer = MosesDetokenizer()

# Check if the lists are not empty
if all_prediction_ids_labelled and all_references:
    bleu = sacrebleu.corpus_bleu(all_prediction_ids_labelled, [all_references])
    print(f"BLEU: {bleu.score}")
else:
    print("Error: Empty prediction or reference list.")


In [10]:
!pip3 install rouge_score

from rouge_score import rouge_scorer

# Check if the lists are not empty
if all_predictions and refactored_codes:
    # ROUGE
    scorer = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'], use_stemmer=True)
    rouge_scores = scorer.score(detokenizer.detokenize(all_predictions), detokenizer.detokenize(refactored_codes))
    print(f"ROUGE-1: {rouge_scores['rouge1'].fmeasure}")
    print(f"ROUGE-2: {rouge_scores['rouge2'].fmeasure}")
    print(f"ROUGE-L: {rouge_scores['rougeL'].fmeasure}")
else:
    print("Error: Empty prediction or reference list.")

Defaulting to user installation because normal site-packages is not writeable
ROUGE-1: 0.07853441894892671
ROUGE-2: 0.05100303501369457
ROUGE-L: 0.03908216136195411


In [9]:
import nltk
from nltk.translate import meteor_score

# Download WordNet data
nltk.download('wordnet')

# Check if the lists are not empty
if refactored_codes and all_predictions:
    # Convert NumPy arrays to strings
    hypothesis_strings = [str(pred) for pred in all_predictions]

    # Preprocess references by converting to strings
    references_strings = []
    for ref in refactored_codes:
        ref_strings = [str(token) for token in ref]
        references_strings.append(ref_strings)

    meteor_avg_score = meteor_score.meteor_score(references_strings, hypothesis_strings)
    print(f"METEOR: {meteor_avg_score}")
else:
    print("Error: Empty prediction or reference list.")


[nltk_data] Downloading package wordnet to
[nltk_data]     /home/daredevil/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


METEOR: 0.0


In [None]:
print("References: ")
print(all_references)
print("Predictions: ")
print(all_predictions)

References: 
['[  452 14491   804    16    17   454 19356  9041 21613   834  8742   196', '[  452 14491   804    16    17   454 19356  9041 21613   834  8742   196', '[  452 14491   804    16    17   454 19356  9041 21613   834  8742   196', '[  452 14491   804    16    17   454 19356  9041 21613   834  8742   196', '[  452 14491   804    16    17   454 19356  9041 21613   834  8742   196', '[  452 14491   804    16    17   454 19356  9041 21613   834  8742   196', '[  452 14491   804    16    17   454 19356  9041 21613   834  8742   196', '[  452 14491   804    16    17   454 19356  9041 21613   834  8742   196', '[  452 14491   804    16    17   454 19356  9041 21613   834  8742   196', '[  452 14491   804    16    17   454 19356  9041 21613   834  8742   196', '[  452 14491   804    16    17   454 19356  9041 21613   834  8742   196', '[  452 14491   804    16    17   454 19356  9041 21613   834  8742   196', '[  452 14491   804    16    17   454 19356  9041 21613   834  8742   196'

In [4]:
import sacrebleu

# Check if the lists are not empty
if all_predictions and refactored_codes:
    # Convert NumPy arrays to Python lists of strings
    references = [str(ref) for ref in refactored_codes]
    predictions = [str(pred) for pred in all_predictions]

    # Calculate CodeBLEU
    codebleu = sacrebleu.corpus_bleu(predictions, [references])
    print(f"CodeBLEU: {codebleu.score}")
    print(refactored_codes)
    print(all_predictions)
else:
    print("Error: Empty prediction or reference list.")


CodeBLEU: 1.3927940507717431
['mBluetoothTethered = info.isConnected()? true', 'mBluetoothTethered = info.isConnected()? true', 'GC!); BinderInternal.forceGc("bg");  //Slog.i(TAG, "**** WE MIGHT WANT TO GC', 'GC!); BinderInternal.forceGc("bg");  //Slog.i(TAG, "**** WE MIGHT WANT TO GC', 'return getInt("rate");  return 5;  /**', 'return getInt("rate");  return 5;  /**', 'IOException  final int version = in.readInt(); switch (version)  case 1', 'IOException  final int version = in.readInt(); switch (version)  case 1', '// track and wait for the wire Component String filterString = "(" + ConfigurationService.KURA_SERVICE_PID + "=" + temporaryName + ")"; ServiceUtil.waitForService(filterString, 60, TimeUnit.SECONDS); comp = createMetatypeOnlyGwtComponentConfiguration(temporaryName); comp = createMetatypeOnlyGwtComponentConfiguration(temporaryName); comp = createMeta', '// track and wait for the wire Component String filterString = "(" + ConfigurationService.KURA_SERVICE_PID + "=" + tempora