1. Load Model
2. Test Model
3. Print Raw Results, i.e., the generated refactorings
4. BLEU Evaluation (corpus BLEU via sacreBLEU)
5. ROUGE-1, ROUGE-2 and ROUGE-L Evaluation
6. METEOR Evaluation
7. Metrics Visualization

In [None]:
# Reload the saved model so that training and testing can be done in
# different sessions.
#
# Run the prep cell first.

from transformers import T5ForConditionalGeneration

# Name or path of the fine-tuned T5 checkpoint to evaluate
checkpoint_name = 'magic_smell_model_s_3lines_700_e40_b4'

# Only the model is restored here; no tokenizer is created in this cell
model = T5ForConditionalGeneration.from_pretrained(checkpoint_name)

In [None]:
# Testing the model
#
# Runs generation over `test_dataloader` and collects, per example:
#   all_references                   - label token ids (NumPy arrays)
#   all_predictions                  - decoded prediction text
#   all_prediction_ids               - generated token id tensors
#   all_prediction_ids_labelled      - generated token ids as NumPy arrays
#   all_predictions_decoded          - predictions re-tokenized to length 512
#   all_predictions_decoded_labelled - input ids of the re-tokenized predictions
# Relies on `model`, `tokenizer`, `device`, `test_dataloader`, `torch` and
# `tqdm` being defined by earlier cells.

model.eval()
all_references = []  # List to store reference sequences
all_predictions = []  # List to store predicted sequences
all_prediction_ids = []
all_prediction_ids_labelled = []
all_predictions_decoded = []
all_predictions_decoded_labelled = []

with torch.no_grad():
    for batch in tqdm(test_dataloader, desc='Evaluating on Test Dataset'):
        inputs = {key: val.to(device) for key, val in batch.items() if key != 'labels'}
        # Labels are only stored, never fed to the model, so there is no
        # need to move them to the device and back.
        labels = batch['labels']

        # Generate predictions
        predicted_ids = model.generate(**inputs, max_length=512)
        predicted_code = [tokenizer.decode(ids, skip_special_tokens=True) for ids in predicted_ids]

        # Append to reference and prediction lists
        all_references.extend(labels.cpu().numpy())
        all_predictions.extend(predicted_code)

        all_prediction_ids.extend(predicted_ids)
        all_prediction_ids_labelled.extend(predicted_ids.cpu().numpy())

        # Re-tokenize the decoded text so every prediction has a fixed length
        tokenized_predicted_code = [
            tokenizer.encode_plus(
                code,
                return_tensors='pt',
                padding='max_length',
                truncation=True,
                max_length=512,
            )
            for code in predicted_code
        ]
        all_predictions_decoded.extend(tokenized_predicted_code)
        labels_predicted = torch.stack([item['input_ids'].squeeze() for item in tokenized_predicted_code])
        all_predictions_decoded_labelled.extend(labels_predicted.numpy())


# Save the results to a text file.
# An explicit encoding keeps the output independent of the platform default.
with open('test_results.txt', 'w', encoding='utf-8') as file:
    for reference, prediction in zip(all_references, all_predictions):
        # Decode the reference so it is readable next to the prediction.
        # Negative ids are dropped first because they cannot be decoded
        # (presumably an ignore-index such as -100 masking padding -- confirm
        # against the prep cell).
        reference_text = tokenizer.decode(reference[reference >= 0], skip_special_tokens=True)
        file.write(f"Reference: {reference_text}\n")
        file.write(f"Prediction: {prediction}\n\n")


In [None]:
# Print raw results: the decoded predictions collected in the testing cell
print(all_predictions)

In [None]:
# BLEU Evaluation (corpus-level BLEU via sacreBLEU)
#
# NOTE: this is plain BLEU as computed by sacrebleu.corpus_bleu, not the
# CodeBLEU metric. The result is still stored in `codebleu` because the
# visualization cell reads `codebleu.score`.

# !pip3 install sacrebleu

import sacrebleu

# Both lists must be non-empty for a corpus score to be computed
if all_predictions and refactored_codes:
    # Make sure every entry is a plain string
    references = [str(ref) for ref in refactored_codes]
    predictions = [str(pred) for pred in all_predictions]

    # corpus_bleu takes the hypotheses and a list of reference streams;
    # there is one reference per prediction, hence a single stream.
    codebleu = sacrebleu.corpus_bleu(predictions, [references])
    print(f"BLEU: {codebleu.score}")
    print(refactored_codes)
    print(all_predictions)
else:
    print("Error: Empty prediction or reference list.")


In [None]:
# ROUGE-1, ROUGE-2 and ROUGE-L Evaluation

from rouge_score import rouge_scorer

# Scorer for the three ROUGE variants, with stemming enabled
scorer = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'], use_stemmer=True)

# Score each (reference, prediction) pair; zip stops at the shorter list
rouge_results = [
    scorer.score(reference, prediction)
    for reference, prediction in zip(refactored_codes, all_predictions)
]

# Per-pair F-measures for each variant
rouge1_scores = [result['rouge1'].fmeasure for result in rouge_results]
rouge2_scores = [result['rouge2'].fmeasure for result in rouge_results]
rougeL_scores = [result['rougeL'].fmeasure for result in rouge_results]

# Arithmetic mean of the per-pair F-measures
pair_count = len(rouge_results)
mean_rouge1 = sum(rouge1_scores) / pair_count
mean_rouge2 = sum(rouge2_scores) / pair_count
mean_rougeL = sum(rougeL_scores) / pair_count

# Report the averages
print("Mean ROUGE-1:", mean_rouge1)
print("Mean ROUGE-2:", mean_rouge2)
print("Mean ROUGE-L:", mean_rougeL)


In [None]:
# METEOR Evaluation
#
# Scores every prediction against its own reference and reports the mean
# of the per-pair scores in `meteor_avg_score`.

import nltk
from nltk.translate import meteor_score

# Download WordNet data (used by METEOR for synonym matching)
nltk.download('wordnet')

# Check if the lists are not empty
if all_predictions and refactored_codes:
    meteor_scores = []
    for ref, pred in zip(refactored_codes, all_predictions):
        # meteor_score expects pre-tokenized input: a list of reference
        # token lists plus a single hypothesis token list. Whitespace
        # splitting is used as the tokenizer here.
        reference_tokens = str(ref).split()
        hypothesis_tokens = str(pred).split()
        meteor_scores.append(
            meteor_score.meteor_score([reference_tokens], hypothesis_tokens)
        )

    # Both lists are non-empty, so at least one pair was scored
    meteor_avg_score = sum(meteor_scores) / len(meteor_scores)
    print(f"METEOR: {meteor_avg_score}")
else:
    print("Error: Empty prediction or reference list.")


In [None]:
# Metrics Visualization
#
# Draws one bar per metric using the values computed in the evaluation
# cells (`codebleu`, `mean_rouge1`, `mean_rouge2`, `mean_rougeL`,
# `meteor_avg_score`).

import matplotlib.pyplot as plt

# Define metrics
metrics = ['BLEU', 'ROUGE-1', 'ROUGE-2', 'ROUGE-L', 'METEOR']

# sacreBLEU reports BLEU on a 0-100 scale, while the ROUGE F-measures and
# METEOR lie in 0-1. Rescale BLEU so all bars share one axis.
final_scores = [codebleu.score / 100, mean_rouge1, mean_rouge2, mean_rougeL, meteor_avg_score]

# Plotting final scores
plt.bar(metrics, final_scores)
plt.ylim(0, 1)

# Add labels and title
plt.xlabel('Metric')
plt.ylabel('Score')
plt.title('Final Evaluation Metrics')

# Show plot
plt.show()