In [7]:
import torch
from rouge_score import rouge_scorer
from tqdm import tqdm
import numpy as np
from utils import OnlineDistilledDataset
from torch.utils.data import DataLoader, SequentialSampler, RandomSampler, ConcatDataset
from transformers import AutoTokenizer

# Choose the compute device in order of preference: CUDA, then Apple MPS,
# and finally the CPU when neither accelerator backend reports availability.
if torch.cuda.is_available():
    device = torch.device("cuda")
elif torch.backends.mps.is_available():
    device = torch.device("mps")
else:
    device = torch.device("cpu")


In [8]:
# Loader settings for the validation pass, named so they are easy to tune.
EVAL_BATCH_SIZE = 16
EVAL_NUM_WORKERS = 8

# Validation split of the project dataset (construction details live in utils).
eval_dataset = OnlineDistilledDataset(split="validation")

# A sequential sampler yields indices in dataset order, so batches are not shuffled.
eval_sampler = SequentialSampler(eval_dataset)

eval_dataloader = DataLoader(
    eval_dataset,
    sampler=eval_sampler,
    batch_size=EVAL_BATCH_SIZE,
    num_workers=EVAL_NUM_WORKERS,
    pin_memory=True,
)

Setting TOKENIZERS_PARALLELISM=false for forked processes.
Map (num_proc=64): 100%|██████████| 1000/1000 [00:02<00:00, 439.56 examples/s]


In [9]:
from mo_distill_utils import load_teacher_model

# Hub id of the tokenizer and local directory handed to the teacher loader.
TOKENIZER_CHECKPOINT = "Salesforce/codet5p-220m"
TEACHER_MODEL_DIR = "teacher_model/"

tokenizer = AutoTokenizer.from_pretrained(TOKENIZER_CHECKPOINT)

# load_teacher_model is a project helper; presumably it restores the teacher
# weights from the directory and places the model on `device` — verify in
# mo_distill_utils.
teacher_model = load_teacher_model(TEACHER_MODEL_DIR, device)

# Switch to inference mode. eval() returns the module itself, so leaving it as
# the last expression makes the cell display the model architecture.
teacher_model.eval()

T5ForConditionalGeneration(
  (shared): Embedding(32100, 768)
  (encoder): T5Stack(
    (embed_tokens): Embedding(32100, 768)
    (block): ModuleList(
      (0): T5Block(
        (layer): ModuleList(
          (0): T5LayerSelfAttention(
            (SelfAttention): T5Attention(
              (q): Linear(in_features=768, out_features=768, bias=False)
              (k): Linear(in_features=768, out_features=768, bias=False)
              (v): Linear(in_features=768, out_features=768, bias=False)
              (o): Linear(in_features=768, out_features=768, bias=False)
              (relative_attention_bias): Embedding(32, 12)
            )
            (layer_norm): T5LayerNorm()
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (1): T5LayerFF(
            (DenseReluDense): T5DenseActDense(
              (wi): Linear(in_features=768, out_features=3072, bias=False)
              (wo): Linear(in_features=3072, out_features=768, bias=False)
              (dropout): Dro

In [None]:
def evaluate(model, device, eval_dataloader, tokenizer=None, **generate_kwargs):
    """Generate a prediction per example and report the mean ROUGE-L F-measure.

    Args:
        model: Object exposing ``eval()`` and ``generate(inputs, **kwargs)``.
        device: Device that ``batch[0]`` is moved to before generation.
        eval_dataloader: Sized iterable of indexable batches. ``batch[0]`` is
            passed to ``model.generate`` and ``batch[4]`` is treated as the
            reference texts; the remaining fields are not read.
        tokenizer: Object with ``batch_decode`` used to turn generated ids into
            strings. When omitted, the notebook-level ``tokenizer`` variable is
            used, which keeps existing three-argument calls working.
        **generate_kwargs: Extra keyword arguments forwarded unchanged to
            ``model.generate`` (for example ``max_new_tokens``). With none
            given, the model's own generation defaults apply.

    Returns:
        float: Mean ROUGE-L F-measure over all examples, or ``nan`` when the
        dataloader yields no examples. The value is also printed.

    Raises:
        NameError: If no tokenizer is passed and no global ``tokenizer`` exists.
        ValueError: If a batch yields a different number of predictions than
            references, which would otherwise misalign the scored pairs.
    """
    if tokenizer is None:
        # Fall back to the notebook-global tokenizer for backward compatibility.
        tokenizer = globals().get("tokenizer")
        if tokenizer is None:
            raise NameError(
                "evaluate() needs a tokenizer: pass tokenizer=... or define a "
                "global `tokenizer` before calling."
            )

    model.eval()
    predict_all = []
    target_all = []
    with torch.no_grad():
        bar = tqdm(eval_dataloader, total=len(eval_dataloader), desc="Evaluation")
        for batch in bar:
            texts = batch[0].to(device)
            references = list(batch[4])
            generated = model.generate(texts, **generate_kwargs)
            decoded = tokenizer.batch_decode(generated, skip_special_tokens=True)

            # zip() would silently drop the surplus, pairing predictions with
            # the wrong references from the next batch onwards.
            if len(decoded) != len(references):
                raise ValueError(
                    f"Got {len(decoded)} predictions for {len(references)} references"
                )

            predict_all.extend(decoded)
            target_all.extend(references)

    # Compute ROUGE-L scores (RougeScorer.score takes the reference first).
    scorer = rouge_scorer.RougeScorer(['rougeL'], use_stemmer=True)
    rouge_scores = [
        scorer.score(target, pred)['rougeL'].fmeasure
        for pred, target in zip(predict_all, target_all)
    ]

    # np.mean([]) emits a RuntimeWarning; report nan explicitly for empty input.
    avg_rouge_l = float(np.mean(rouge_scores)) if rouge_scores else float("nan")
    print(f"Average ROUGE-L: {avg_rouge_l:.4f}")
    return avg_rouge_l

In [27]:
# Score the teacher model on the validation loader; the ROUGE-L average is printed.
evaluate(
    model=teacher_model,
    device=device,
    eval_dataloader=eval_dataloader,
)

Evaluation: 100%|██████████| 63/63 [01:54<00:00,  1.82s/it]
06/16/2025 19:51:20 - INFO - absl - Using default tokenizer.


Average ROUGE-L: 0.3378
