In [1]:
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
from peft import PeftModel, PeftConfig
import torch
from accelerate import infer_auto_device_map, dispatch_model
from datasets import load_from_disk
from tqdm import tqdm
import copy


#from google.colab import drive
#drive.mount('/content/drive')

In [2]:
!pip install -U bitsandbytes



In [2]:
base_model_id = "deepseek-ai/DeepSeek-R1-Distill-Qwen-7B"
adapter_path = "./francesco_lora/checkpoint-675"
# adapter_path = "/content/drive/My Drive/cloning/francesco_lora/checkpoint-200"
offload_dir = "./offload"
# offload_dir = "/content/drive/My Drive/cloning/offload"


tokenizer = AutoTokenizer.from_pretrained(base_model_id)

quantization_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_compute_dtype=torch.float16,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=True,
)

# loading model on CPU first for mapping
base_model = AutoModelForCausalLM.from_pretrained(
    base_model_id,
    torch_dtype=torch.float16,
    device_map=None,                                            # important: don't use "auto" yet
    low_cpu_mem_usage=True,
    quantization_config=quantization_config
)

# get device map
device_map = infer_auto_device_map(
    base_model,
    max_memory={0: "20GiB", "cpu": "28GiB"},  # adjust GPU memory to your GPU (e.g., 12, 24 GiB)
)

# dispatch
base_model = dispatch_model(base_model, device_map=device_map, offload_dir=offload_dir)

#base_model_copy = copy.deepcopy(base_model)

# loading LoRA adapter
finetuned_model = PeftModel.from_pretrained(base_model, adapter_path)

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [3]:
# tokenized_test = load_from_disk('/content/drive/MyDrive/cloning/datasets/tokenized_test')
tokenized_test = load_from_disk('./datasets/tokenized_test')

In [None]:
def calculate_perplexity(model, dataset, tokenizer, device="cuda", print_every=30):
    model = model.to(device)
    model.eval()
    total_loss = 0
    total_tokens_in_loss = 0

    with torch.no_grad():
        for i, batch in enumerate(tqdm(dataset, desc="Calculating Perplexity")):
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)

            outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
            loss = outputs.loss

            num_tokens = (labels != -100).sum().item()

            if num_tokens == 0:
                return float('nan')

            # report metrics only if there is a response
            if num_tokens > 0:
                total_loss += loss.item() * num_tokens
                total_tokens_in_loss += num_tokens

            # Debugging: print prompt and response every 'print_every' batches
            if i % print_every == 0:
                # Decodifica gli input_ids e le labels
                decoded_input = tokenizer.decode(input_ids[0], skip_special_tokens=True)
                # Per le labels, sostituisci i token ignorati (-100) con il token di padding per evitare errori di decodifica
                labels_for_decoding = labels[0].clone()
                labels_for_decoding[labels_for_decoding == -100] = tokenizer.pad_token_id
                decoded_labels = tokenizer.decode(labels_for_decoding, skip_special_tokens=True)

                print(f"\nBatch {i}")
                print(f"Prompt: {decoded_input}")
                print(f"Expected Response: {decoded_labels}")

    avg_loss = total_loss / total_tokens_in_loss
    perplexity = torch.exp(torch.tensor(avg_loss))
    return perplexity.item()

In [22]:
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

def calculate_perplexity(model, dataloader, tokenizer, max_new_tokens=128):
    model.to(device).eval()
    all_results = []

    for batch_idx, batch in enumerate(dataloader):
        # Move the entire batch onto the device
        input_ids  = batch['input_ids'].to(device)   # shape (B, L)
        labels     = batch['labels'].to(device)      # shape (B, L)

        B = input_ids.size(0)
        for i in range(B):
            ids = input_ids[i : i+1]   # shape (1, L)
            lbl = labels[i : i+1]      # shape (1, L)

            # 1) Find prompt length (labels == -100)
            prompt_len = (lbl == -100).sum().item()

            # 2) Split prompt / ground-truth
            prompt_ids = ids[:, :prompt_len]       # (1, P)
            gt_ids     = ids[:, prompt_len:]       # (1, G)

            # 3) Decode strings
            prompt_txt = tokenizer.decode(prompt_ids.squeeze().tolist(),
                                          skip_special_tokens=True)
            gt_txt     = tokenizer.decode(gt_ids.squeeze().tolist(),
                                          skip_special_tokens=True)

            # 4) Generate from the model (greedy)
            with torch.no_grad():
                gen_full = model.generate(
                    prompt_ids,
                    max_new_tokens=max_new_tokens,
                    pad_token_id=tokenizer.eos_token_id
                )
            gen_ids = gen_full[0, prompt_len:]   # remove prompt prefix
            gen_txt = tokenizer.decode(gen_ids.tolist(),
                                       skip_special_tokens=True)

            # 5) PPL on ground truth
            inp_gt  = torch.cat([prompt_ids, gt_ids], dim=1)
            lbls_gt = torch.full_like(inp_gt, -100)
            lbls_gt[:, prompt_len:] = inp_gt[:, prompt_len:]
            with torch.no_grad():
                loss_gt = model(input_ids=inp_gt, labels=lbls_gt).loss
            ppl_gt = torch.exp(loss_gt).item()

            # 6) PPL on generated
            inp_gen  = torch.cat([prompt_ids, gen_ids.unsqueeze(0)], dim=1)
            lbls_gen = torch.full_like(inp_gen, -100)
            lbls_gen[:, prompt_len:] = inp_gen[:, prompt_len:]
            with torch.no_grad():
                loss_gen = model(input_ids=inp_gen, labels=lbls_gen).loss
            ppl_gen = torch.exp(loss_gen).item()

            # 7) Print & store
            print(f"\n— Example (batch {batch_idx}, idx {i}) —")
            print(f"Prompt:                {prompt_txt}")
            print(f"Ground-truth response: {gt_txt}")
            print(f"Generated response:    {gen_txt}")
            print(f"PPL (ground truth):    {ppl_gt:.2f}")
            print(f"PPL (generated):       {ppl_gen:.2f}")

            all_results.append({
                "prompt": prompt_txt,
                "ground_truth": gt_txt,
                "generated": gen_txt,
                "ppl_ground_truth": ppl_gt,
                "ppl_generated": ppl_gen,
            })

    return all_results

# Usage:
# tokenizer = AutoTokenizer.from_pretrained("deepseek-ai/deepseek-llm")
# model     = AutoModelForCausalLM.from_pretrained("deepseek-ai/deepseek-llm")
# results   = calculate_perplexity(model, test_dataloader, tokenizer)


In [11]:
from torch.utils.data import Dataset

class CustomDataset(Dataset):
    def __init__(self, encodings):
        self.encodings = encodings

    def __len__(self):
        return len(self.encodings["input_ids"])

    def __getitem__(self, idx):
        return {
            "input_ids": torch.tensor(self.encodings["input_ids"][idx], dtype=torch.long),
            "attention_mask": torch.tensor(self.encodings["attention_mask"][idx], dtype=torch.long),
            "labels": torch.tensor(self.encodings["labels"][idx], dtype=torch.long)
        }


from torch.utils.data import DataLoader

test_dataloader = CustomDataset(tokenized_test)
test_dataloader = DataLoader(test_dataloader, batch_size=8, shuffle=False)

In [None]:
# pad sequences to same length on the left
def left_pad(sequences, pad_value):
    max_len = max(seq.size(0) for seq in sequences)
    padded = []
    for seq in sequences:
        pad_len = max_len - seq.size(0)
        padded_seq = torch.cat([torch.full((pad_len,), pad_value, dtype=seq.dtype, device=seq.device), seq])
        padded.append(padded_seq)
    return torch.stack(padded)



def convert_label_to_string(label, tokenizer, skip_special_tokens=True):
    valid_token_ids = label[label != -100]
    token_list = valid_token_ids.tolist()
    text = tokenizer.decode(token_list, skip_special_tokens=skip_special_tokens)
    return text



def print_batch_debug(batch_prompts, responses, ground_truths, tokenizer, N=3):
    """
    Prints the first N examples in the batch, showing:
      - the prompt (without response)
      - the generated response
      - the ground truth response
    """

    # Number of examples to print
    to_print = min(N, len(batch_prompts))
    for idx in range(to_print):
        # 1) decode the prompt (batch_prompts[idx] has no padding at front)
        prompt_ids = batch_prompts[idx].tolist()
        prompt_txt = tokenizer.decode(prompt_ids, skip_special_tokens=True)

        # 2) decode the generated response
        gen_ids = responses[idx].tolist()
        gen_txt = tokenizer.decode(gen_ids, skip_special_tokens=True)

        # 3) grab the ground truth from your precomputed list
        gt_txt = ground_truths[idx]

        # 4) print neatly
        print(f"{'-'*10} Example {idx+1} {'-'*10}")
        print(f"Prompt:\n{prompt_txt}")
        print(f"\nGenerated:    {gen_txt}")
        print(f"Ground Truth: {gt_txt}")
        print()



def generated_sentences_perplexity(model, dataset, tokenizer, device="cuda", print_every=10):
    model.eval()
    total_loss = 0
    total_tokens_in_loss = 0

    for batch_idx, batch in enumerate(tqdm(dataset, desc="Calculating Perplexity")):
        input_ids = batch['input_ids'].to(device)
        labels = batch['labels'].to(device)

        ground_truths = []
        for label in labels:
            gt_txt = convert_label_to_string(label, tokenizer)
            ground_truths.append(gt_txt)

        batch_prompts = []
        for i in range(input_ids.size(0)):
            prompt_tokens = input_ids[i][labels[i] == -100]
            batch_prompts.append(prompt_tokens)

        pad_token_id = tokenizer.pad_token_id

        padded_prompts = left_pad(batch_prompts, pad_token_id).to(device)


        # generating responses
        model.eval()
        with torch.no_grad():
            generated = model.generate(
                input_ids=padded_prompts,
                attention_mask=(padded_prompts != pad_token_id).long(),
                max_new_tokens=40,
                pad_token_id=pad_token_id,
                eos_token_id=pad_token_id,
                do_sample=True,
                top_p=0.95,
                temperature=0.4,
                num_return_sequences=1
            )


        # remove prompt to get only responses
        responses = []
        for gen, prompt in zip(generated, padded_prompts):
            gen_response = gen[len(prompt):]  # Slice off the prompt part
            responses.append(gen_response)

        # decoded_responses = [tokenizer.decode(r, skip_special_tokens=True) for r in responses]
        #print(decoded_responses)

        input_ids_not_padded = [torch.cat([prompt, response]) for prompt, response in zip(padded_prompts, responses)]
        input_ids = left_pad(input_ids_not_padded, pad_token_id).to(device)

        #decoded_inputs = [tokenizer.decode(input, skip_special_tokens=False) for input in input_ids]
        #print(decoded_inputs)

        attention_mask = (input_ids != pad_token_id).long()

        # re-computing labels
        labels = []
        for whole_input, only_prompt in zip(input_ids, padded_prompts):

            prompt_length = only_prompt.size(0)

            # concatenate -100 tokens long prompt_length and the response
            label = torch.cat([
                torch.full((prompt_length,), -100, dtype=torch.long, device=device),
                whole_input[prompt_length:]                                                             #response tokens
            ])

            labels.append(label)

        labels = left_pad(labels, pad_value=-100).to(device)

        print_batch_debug(padded_prompts, responses, ground_truths, tokenizer, N=2)

        with torch.no_grad():
            outputs = model(
                input_ids=input_ids,
                attention_mask=attention_mask,
                labels=labels,
            )

        # outputs.loss is averaged over non-ignored tokens, to accumulate total log-prob we multiply by number of contributing tokens:
        ntokens = (labels != -100).sum().item()
        total_loss += outputs.loss.item() * ntokens
        total_tokens_in_loss += ntokens

        if (batch_idx + 1) % print_every == 0:
            avg_loss = total_loss / total_tokens_in_loss
            ppl = torch.exp(torch.tensor(avg_loss))
            print(f"  Batch {batch_idx+1:4d}: ppl = {ppl:.2f}")

            # print_batch_debug(padded_prompts, responses, ground_truths, tokenizer, N=2)

    # 10) final perplexity
    avg_nll = total_loss / total_tokens_in_loss
    perplexity = torch.exp(torch.tensor(avg_nll))
    return perplexity.item()


In [None]:
from torch.nn.functional import log_softmax
from torch.nn.utils.rnn import pad_sequence

def convert_label_to_string(label, tokenizer, skip_special_tokens=True):
    valid_token_ids = label[label != -100]
    token_list = valid_token_ids.tolist()
    text = tokenizer.decode(token_list, skip_special_tokens=skip_special_tokens)
    return text



def print_batch_debug(batch_prompts, responses, ground_truths, tokenizer, N=3):
    """
    Prints the first N examples in the batch, showing:
      - the prompt (without response)
      - the generated response
      - the ground truth response
    """

    # Number of examples to print
    to_print = min(N, len(batch_prompts))
    for idx in range(to_print):
        # 1) decode the prompt (batch_prompts[idx] has no padding at front)
        prompt_ids = batch_prompts[idx].tolist()
        prompt_txt = tokenizer.decode(prompt_ids, skip_special_tokens=True)

        # 2) decode the generated response
        gen_ids = responses[idx].tolist()
        gen_txt = tokenizer.decode(gen_ids, skip_special_tokens=True)

        # 3) grab the ground truth from your precomputed list
        gt_txt = ground_truths[idx]

        # 4) print neatly
        print(f"{'-'*10} Example {idx+1} {'-'*10}")
        print(f"Prompt:\n{prompt_txt}")
        print(f"\nGenerated:    {gen_txt}")
        print(f"Ground Truth: {gt_txt}")
        print()



def generated_sentences_perplexity(model, dataloader, tokenizer, device="cuda", print_every=10):
    model = model.to(device)
    model.eval()
    total_nll = 0
    total_tokens = 0

    for batch_idx, batch in enumerate(tqdm(dataloader, desc="Calculating Perplexity")):

        input_ids = batch['input_ids'].to(device)
        labels = batch['labels'].to(device)

        # ground_truths = []
        # for label in labels:
        #     gt_txt = convert_label_to_string(label, tokenizer)
        #     ground_truths.append(gt_txt)

        prompt_mask = (labels == -100)
        flat_ids    = input_ids.view(-1)                    # (B*L,)
        flat_mask   = prompt_mask.view(-1)                  # (B*L,)
        prompt_tokens_concatenated = flat_ids[flat_mask]
        prompt_lengths = prompt_mask.sum(dim=1).tolist()

        split_prompts = prompt_tokens_concatenated.split(prompt_lengths) 
        pad_token_id = tokenizer.pad_token_id

        reversed_prompts   = [seq.flip(0) for seq in split_prompts]
        padded_reversed    = pad_sequence(reversed_prompts, batch_first=True, padding_value=pad_token_id)
        padded_prompts     = padded_reversed.flip(1)

        # generating responses
        model.eval()
        with torch.no_grad():
            generated = model.generate(
                input_ids=padded_prompts,
                attention_mask=(padded_prompts != pad_token_id).long(),
                max_new_tokens=40,
                pad_token_id=pad_token_id,
                eos_token_id=pad_token_id,
                do_sample=True,
                top_p=0.95,
                temperature=0.4,
                return_dict_in_generate=True,
                output_scores=True,
            )
        
        sequences = generated.sequences             # (B, prompt_length + response_length)
        scores = generated.scores                   # response_length scores of shape (B, vocabulary_size)
        response_length = len(scores)
        prompt_length = padded_prompts.size(1)

        responses = sequences[:, prompt_length:]

        # decoded_responses = [tokenizer.decode(r, skip_special_tokens=True) for r in responses]
        #print(decoded_responses)

        #input_ids_not_padded = [torch.cat([prompt, response]) for prompt, response in zip(padded_prompts, responses)]
        #input_ids = left_pad(input_ids_not_padded, pad_token_id).to(device)
        #decoded_inputs = [tokenizer.decode(input, skip_special_tokens=False) for input in input_ids]
        #print(decoded_inputs)

        # computing perplexity
        scores_tensor = torch.stack(scores, dim=0)                      # (response_length, B, V)
        log_probs_tensor = torch.log_softmax(scores_tensor, dim=-1)

        # Gather the log‐prob of the actually generated token at each time step:
        #   - first, we need `generated_tokens` as shape (gen_len, B) so we can gather easily:
        responses_T = responses.transpose(0, 1)   # (gen_len, B)

        # For each time step `t`, and each batch index `b`:
        #    token_id = generated_tokens_t[t, b],
        #    its log_prob = log_probs_tensor[t, b, token_id].
        #
        # We can gather with:
        gen_log_probs = torch.gather(
            log_probs_tensor,                   # shape: (gen_len, B, V)
            dim=2,                              # gather out of the V dimension
            index=responses_T.unsqueeze(2)  # shape = (gen_len, B, 1)
        ).squeeze(2)                            # result: (gen_len, B)

        #negative log likelihood
        batch_nll = -gen_log_probs.sum()   # scalar
        batch_tokens = sequences.shape[0] * response_length

        total_nll += batch_nll.item()
        total_tokens += batch_tokens

        #print_batch_debug(padded_prompts, responses, ground_truths, tokenizer, N=2)

    # Final perplexity:
    avg_nll = total_nll / total_tokens
    return torch.exp(torch.tensor(avg_nll)).item()


In [18]:
p = generated_sentences_perplexity(finetuned_model, test_dataloader, tokenizer)

Calculating Perplexity:   1%|          | 1/137 [02:24<5:27:35, 144.53s/it]

---------- Example 1 ----------
Prompt:
You are Francesco Brigante, a 22 years old Italian Computer Science student in Rome. 
Respond naturally as him in Italian, maintaining his characteristic communication style. 
Keep responses concise and contextual.

Continue this conversation with the User, who is a friend of Francesco:
<｜User｜>Ormai il basilico non c'è più. Se vuoi te lo compro iun boccaccino<|turn_end|>
<｜Assistant｜>Come vuoi<|turn_end|>
<｜User｜>Amore mio chiudi il rubinetto del gas e Controlla che la porta sia chiusa<|turn_end|>
<｜Assistant｜>Sii
Il linfonodo è ancora gonfio<|turn_end|>
<｜User｜>Lo faremo controllare<|turn_end|>
<｜Assistant｜>

Generated:    Si
Ground Truth: Ho quasi finito

---------- Example 2 ----------
Prompt:
You are Francesco Brigante, a 22 years old Italian Computer Science student in Rome. 
Respond naturally as him in Italian, maintaining his characteristic communication style. 
Keep responses concise and contextual.

Continue this conversation with the U

Calculating Perplexity:   1%|▏         | 2/137 [03:52<4:10:11, 111.20s/it]

---------- Example 1 ----------
Prompt:
You are Francesco Brigante, a 22 years old Italian Computer Science student in Rome. 
Respond naturally as him in Italian, maintaining his characteristic communication style. 
Keep responses concise and contextual.

Continue this conversation with the User, who is a friend of Francesco:
<｜Assistant｜>Awwwwwww
C’è spazio sul divanetto??<|turn_end|>
<｜User｜>Sì vieni?<|turn_end|>
<｜Assistant｜>Ahhhh poi l’altro giorno ho detto ad alessia che ci stiamo sentendo
SIIIIIII CORRO<|turn_end|>
<｜User｜>Ahaha ma perché ti ha chiesto lei qualcosa?<|turn_end|>
<｜Assistant｜>

Generated:    Perché mi ha chiesto di andare a mangiare da alessia
Ground Truth: Nono

---------- Example 2 ----------
Prompt:
You are Francesco Brigante, a 22 years old Italian Computer Science student in Rome. 
Respond naturally as him in Italian, maintaining his characteristic communication style. 
Keep responses concise and contextual.

Continue this conversation with the User, who is a 

Calculating Perplexity:   2%|▏         | 3/137 [04:48<3:12:30, 86.19s/it] 

---------- Example 1 ----------
Prompt:
You are Francesco Brigante, a 22 years old Italian Computer Science student in Rome. 
Respond naturally as him in Italian, maintaining his characteristic communication style. 
Keep responses concise and contextual.

Continue this conversation with the User, who is a friend of Francesco:
<｜User｜>Come va il raffredd?<|turn_end|>
<｜Assistant｜>Così così<|turn_end|>
<｜User｜>‎Missed voice call<|turn_end|>
<｜Assistant｜>Sono in pale<|turn_end|>
<｜User｜>Zeppole fatte per la festa del papà/San Giuseppe.Avrei voluto che le assaggiassi
anche tu<|turn_end|>
<｜Assistant｜>

Generated:    Si
Ground Truth: Che carina❤️❤️❤️

---------- Example 2 ----------
Prompt:
You are Francesco Brigante, a 22 years old Italian Computer Science student in Rome. 
Respond naturally as him in Italian, maintaining his characteristic communication style. 
Keep responses concise and contextual.

Continue this conversation with the User, who is a friend of Francesco:
<｜Assistant｜>Xke?

Calculating Perplexity:   2%|▏         | 3/137 [05:19<3:57:37, 106.40s/it]


KeyboardInterrupt: 

In [11]:
perplexity = calculate_perplexity(base_model, test_dataloader)
print(f"Perplexity: {perplexity}")

Calculating Perplexity: 100%|██████████| 137/137 [10:49<00:00,  4.74s/it]

Perplexity: 9286.6884765625





In [23]:
perplexity = calculate_perplexity(finetuned_model, test_dataloader, tokenizer)
print(f"Perplexity: {perplexity}")

The attention mask is not set and cannot be inferred from input because pad token is same as eos token. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.



— Example (batch 0, idx 0) —
Prompt:                You are Francesco Brigante, a 22 years old Italian Computer Science student in Rome. 
Respond naturally as him in Italian, maintaining his characteristic communication style. 
Keep responses concise and contextual.

Continue this conversation with the User, who is a friend of Francesco:
<｜User｜>Ormai il basilico non c'è più. Se vuoi te lo compro iun boccaccino<|turn_end|>
<｜Assistant｜>Come vuoi<|turn_end|>
<｜User｜>Amore mio chiudi il rubinetto del gas e Controlla che la porta sia chiusa<|turn_end|>
<｜Assistant｜>Sii
Il linfonodo è ancora gonfio<|turn_end|>
<｜User｜>Lo faremo controllare<|turn_end|>
<｜Assistant｜>
Ground-truth response: Ho quasi finito
Generated response:    Si
PPL (ground truth):    32.98
PPL (generated):       4.74

— Example (batch 0, idx 1) —
Prompt:                You are Francesco Brigante, a 22 years old Italian Computer Science student in Rome. 
Respond naturally as him in Italian, maintaining his characteristic 

KeyboardInterrupt: 

In [None]:
# sviluppare la logica per separare da input ids, le labels che rappresentano la risposta reale e il prompt, poi dare il prompt al modello
# mettere tutto su github
# dataset migliorare messaggi vicini nel tempo

In [29]:
from evaluate import load
import numpy as np

def left_pad(sequences, pad_value):
    max_len = max(seq.size(0) for seq in sequences)
    padded = []
    for seq in sequences:
        pad_len = max_len - seq.size(0)
        padded_seq = torch.cat([torch.full((pad_len,), pad_value, dtype=seq.dtype, device=seq.device), seq])
        padded.append(padded_seq)
    return torch.stack(padded)


def convert_label_to_string(label, tokenizer, skip_special_tokens=True):
    valid_token_ids = label[label != -100]
    token_list = valid_token_ids.tolist()
    text = tokenizer.decode(token_list, skip_special_tokens=skip_special_tokens)
    return text






def evaluate_chat_model(model, tokenizer, dataloader):
    model.eval()
    results = {
        'meteor': [],
        'bertscore_f1': [],
        'perplexity': [],
        'semantic_similarity': []
    }
    
    with torch.no_grad():
        for batch in tqdm(dataloader, desc="Evaluating"):

            # --- Separate Prompt from Response ---
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)

            # getting the prompts only (no responses)
            batch_prompts = []
            for i in range(input_ids.size(0)):
                prompt_tokens = input_ids[i][labels[i] == -100]
                batch_prompts.append(prompt_tokens)

            pad_token_id = tokenizer.pad_token_id
            padded_prompts = left_pad(batch_prompts, pad_token_id).to(device)
            
            # Extract ground truth responses (excluding -100 labels)
            ground_truths = []
            for label in labels:
                gt_txt = convert_label_to_string(label, tokenizer)
                ground_truths.append(gt_txt)

            # --- Generate Responses ---
            generated = model.generate(
                input_ids=padded_prompts,
                attention_mask=(padded_prompts != pad_token_id).long(),
                max_new_tokens=40,
                pad_token_id=pad_token_id,
                eos_token_id=pad_token_id,
                do_sample=False,
                top_p=0.95,
                temperature=0.4,
                return_dict_in_generate=True,
                output_scores=True,
            )

            sequences = generated.sequences             # (B, prompt_length + response_length)
            prompt_length = padded_prompts.size(1)
            responses_tokens = sequences[:, prompt_length:]
            pred_responses = tokenizer.batch_decode(responses_tokens, skip_special_tokens=True)


            
            # --- Calculate Metrics ---
            # METEOR
            meteor_results = meteor.compute(predictions=pred_responses, references=ground_truths)
            results['meteor'].append(meteor_results['meteor'])
            
            # BERTScore
            bert_results = bertscore.compute(
                predictions=pred_responses, 
                references=ground_truths, 
                lang='it',
                model_type='roberta-large'
            )
            results['bertscore_f1'].extend(bert_results['f1'])
            
            # Semantic Similarity
            pred_embeddings = semantic_model.encode(pred_responses)
            gt_embeddings = semantic_model.encode(ground_truths)
            similarities = np.diag(np.inner(pred_embeddings, gt_embeddings))
            results['semantic_similarity'].extend(similarities)
            
            # --- Perplexity (Response-Only) ---
            # Get logits for response tokens
            #outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
            
            # Mask out prompt tokens
            # loss_mask = (labels != -100).float()
            # per_token_loss = torch.nn.functional.cross_entropy(
            #     outputs.logits.view(-1, outputs.logits.size(-1)),
            #     labels.view(-1),
            #     reduction='none'
            # ).view_as(labels)
            
            # valid_loss = (per_token_loss * loss_mask).sum() / loss_mask.sum()
            # results['perplexity'].append(torch.exp(valid_loss).item())
    
    # Aggregate results
    return {
        'meteor': np.mean(results['meteor']),
        'bertscore_f1': np.mean(results['bertscore_f1']),
        'semantic_similarity': np.mean(results['semantic_similarity']),
        # 'perplexity': np.mean(results['perplexity']),
        'predictions': pred_responses,
        'references': ground_truths
    }

In [None]:
# Load your model and tokenizer first
results = evaluate_chat_model(finetuned_model, tokenizer, test_dataloader)

print(f"""
Evaluation Results:
- METEOR: {results['meteor']:.3f} (0-1, higher=better)
- BERTScore F1: {results['bertscore_f1']:.3f} (0-1, higher=better)
- Semantic Similarity: {results['semantic_similarity']:.3f} (0-1 cosine)
- Perplexity: {results['perplexity']:.1f} (lower=better)
""")