In [1]:
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
from peft import PeftModel, PeftConfig
import torch
from accelerate import infer_auto_device_map, dispatch_model
from datasets import load_from_disk
from tqdm import tqdm
import copy


from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
!pip install -U bitsandbytes

In [2]:
from google.colab import userdata

hf_token = userdata.get('HF_TOKEN')

# Now you can use hf_token to log in:
from huggingface_hub import login
login(token=hf_token)

In [3]:
base_model_id = "deepseek-ai/DeepSeek-R1-Distill-Qwen-7B"
#adapter_path = "./francesco_lora/checkpoint-675"
adapter_path = "/content/drive/My Drive/Digital-Self-Replica/francesco_lora/checkpoint-675"
# offload_dir = "./offload"
offload_dir = "/content/drive/My Drive/Digital-Self-Replica/offload"


tokenizer = AutoTokenizer.from_pretrained(base_model_id)

quantization_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_compute_dtype=torch.float16,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=True,
)

# loading model on CPU first for mapping
base_model = AutoModelForCausalLM.from_pretrained(
    base_model_id,
    torch_dtype=torch.float16,
    device_map=None,                                            # important: don't use "auto" yet
    low_cpu_mem_usage=True,
    quantization_config=quantization_config
)

# get device map
device_map = infer_auto_device_map(
    base_model,
    max_memory={0: "20GiB", "cpu": "28GiB"},  # adjust GPU memory to your GPU (e.g., 12, 24 GiB)
)

# dispatch
base_model = dispatch_model(base_model, device_map=device_map, offload_dir=offload_dir)

#base_model_copy = copy.deepcopy(base_model)

# loading LoRA adapter
finetuned_model = PeftModel.from_pretrained(base_model, adapter_path)

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [4]:
tokenized_test = load_from_disk('/content/drive/MyDrive/Digital-Self-Replica/datasets/tokenized_test')
# tokenized_test = load_from_disk('./datasets/tokenized_test')

In [None]:
def calculate_perplexity(model, dataset, tokenizer, device="cuda"):
    model = model.to(device)
    model.eval()
    total_loss = 0
    total_tokens_in_loss = 0

    with torch.no_grad():
        for i, batch in enumerate(tqdm(dataset, desc="Calculating Perplexity")):
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)

            outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
            loss = outputs.loss

            num_tokens = (labels != -100).sum().item()

            if num_tokens == 0:
                return float('nan')

            # report metrics only if there is a response
            if num_tokens > 0:
                total_loss += loss.item() * num_tokens
                total_tokens_in_loss += num_tokens

    avg_loss = total_loss / total_tokens_in_loss
    perplexity = torch.exp(torch.tensor(avg_loss))
    return perplexity.item()

In [5]:
from torch.utils.data import Dataset

class CustomDataset(Dataset):
    def __init__(self, encodings):
        self.encodings = encodings

    def __len__(self):
        return len(self.encodings["input_ids"])

    def __getitem__(self, idx):
        return {
            "input_ids": torch.tensor(self.encodings["input_ids"][idx], dtype=torch.long),
            "attention_mask": torch.tensor(self.encodings["attention_mask"][idx], dtype=torch.long),
            "labels": torch.tensor(self.encodings["labels"][idx], dtype=torch.long)
        }


from torch.utils.data import DataLoader

test_dataloader = CustomDataset(tokenized_test)
test_dataloader = DataLoader(test_dataloader, batch_size=16, shuffle=False)

In [None]:
perplexity = calculate_perplexity(finetuned_model, test_dataloader, tokenizer)
print(f"Perplexity: {perplexity}")

Calculating Perplexity: 100%|██████████| 69/69 [12:08<00:00, 10.56s/it]

Perplexity: 27.985580444335938





In [None]:
perplexity = calculate_perplexity(base_model, test_dataloader, tokenizer)
print(f"Perplexity: {perplexity}")

Calculating Perplexity: 100%|██████████| 69/69 [11:16<00:00,  9.81s/it]

Perplexity: 9286.6884765625





In [None]:
# pad sequences to same length on the left
def left_pad(sequences, pad_value):
    max_len = max(seq.size(0) for seq in sequences)
    padded = []
    for seq in sequences:
        pad_len = max_len - seq.size(0)
        padded_seq = torch.cat([torch.full((pad_len,), pad_value, dtype=seq.dtype, device=seq.device), seq])
        padded.append(padded_seq)
    return torch.stack(padded)



def convert_label_to_string(label, tokenizer, skip_special_tokens=True):
    valid_token_ids = label[label != -100]
    token_list = valid_token_ids.tolist()
    text = tokenizer.decode(token_list, skip_special_tokens=skip_special_tokens)
    return text



def print_batch_debug(batch_prompts, responses, ground_truths, tokenizer, N=3):
    """
    Prints the first N examples in the batch, showing:
      - the prompt (without response)
      - the generated response
      - the ground truth response
    """

    # Number of examples to print
    to_print = min(N, len(batch_prompts))
    for idx in range(to_print):
        # 1) decode the prompt (batch_prompts[idx] has no padding at front)
        prompt_ids = batch_prompts[idx].tolist()
        prompt_txt = tokenizer.decode(prompt_ids, skip_special_tokens=True)

        # 2) decode the generated response
        gen_ids = responses[idx].tolist()
        gen_txt = tokenizer.decode(gen_ids, skip_special_tokens=True)

        # 3) grab the ground truth from your precomputed list
        gt_txt = ground_truths[idx]

        # 4) print neatly
        print(f"{'-'*10} Example {idx+1} {'-'*10}")
        print(f"Prompt:\n{prompt_txt}")
        print(f"\nGenerated:    {gen_txt}")
        print(f"Ground Truth: {gt_txt}")
        print()



def generated_sentences_perplexity(model, dataset, tokenizer, device="cuda", print_every=10):
    model.eval()
    total_loss = 0
    total_tokens_in_loss = 0

    for batch_idx, batch in enumerate(tqdm(dataset, desc="Calculating Perplexity")):
        input_ids = batch['input_ids'].to(device)
        labels = batch['labels'].to(device)

        # ground_truths = []
        # for label in labels:
        #     gt_txt = convert_label_to_string(label, tokenizer)
        #     ground_truths.append(gt_txt)

        batch_prompts = []
        for i in range(input_ids.size(0)):
            prompt_tokens = input_ids[i][labels[i] == -100]
            batch_prompts.append(prompt_tokens)

        pad_token_id = tokenizer.pad_token_id

        padded_prompts = left_pad(batch_prompts, pad_token_id).to(device)


        # generating responses
        model.eval()
        with torch.no_grad():
            generated = model.generate(
                input_ids=padded_prompts,
                attention_mask=(padded_prompts != pad_token_id).long(),
                max_new_tokens=40,
                pad_token_id=pad_token_id,
                eos_token_id=pad_token_id,
                do_sample=True,
                top_p=0.95,
                temperature=0.4,
                num_return_sequences=1
            )


        # remove prompt to get only responses
        responses = []
        for gen, prompt in zip(generated, padded_prompts):
            gen_response = gen[len(prompt):]  # Slice off the prompt part
            responses.append(gen_response)

        # decoded_responses = [tokenizer.decode(r, skip_special_tokens=True) for r in responses]
        #print(decoded_responses)
        # input_ids_not_padded = [torch.cat([prompt, response]) for prompt, response in zip(padded_prompts, responses)]
        # input_ids = left_pad(input_ids_not_padded, pad_token_id).to(device)

        #decoded_inputs = [tokenizer.decode(input, skip_special_tokens=False) for input in input_ids]
        #print(decoded_inputs)

        attention_mask = (input_ids != pad_token_id).long()

        # re-computing labels
        labels = []
        for whole_input, only_prompt in zip(input_ids, padded_prompts):

            prompt_length = only_prompt.size(0)

            # concatenate -100 tokens long prompt_length and the response
            label = torch.cat([
                torch.full((prompt_length,), -100, dtype=torch.long, device=device),
                whole_input[prompt_length:]                                                             #response tokens
            ])

            labels.append(label)

        labels = left_pad(labels, pad_value=-100).to(device)

        # print_batch_debug(padded_prompts, responses, ground_truths, tokenizer, N=2)

        with torch.no_grad():
            outputs = model(
                input_ids=input_ids,
                attention_mask=attention_mask,
                labels=labels,
            )

        # outputs.loss is averaged over non-ignored tokens, to accumulate total log-prob we multiply by number of contributing tokens:
        ntokens = (labels != -100).sum().item()
        total_loss += outputs.loss.item() * ntokens
        total_tokens_in_loss += ntokens

        if (batch_idx + 1) % print_every == 0:
            avg_loss = total_loss / total_tokens_in_loss
            ppl = torch.exp(torch.tensor(avg_loss))
            print(f"  Batch {batch_idx+1:4d}: ppl = {ppl:.2f}")

            # print_batch_debug(padded_prompts, responses, ground_truths, tokenizer, N=2)

    # 10) final perplexity
    avg_nll = total_loss / total_tokens_in_loss
    perplexity = torch.exp(torch.tensor(avg_nll))
    return perplexity.item()


In [None]:
p = generated_sentences_perplexity(finetuned_model, test_dataloader, tokenizer)
p

Calculating Perplexity:  14%|█▍        | 10/69 [03:17<20:47, 21.14s/it]

  Batch   10: ppl = 7.09


Calculating Perplexity:  29%|██▉       | 20/69 [06:44<16:09, 19.79s/it]

  Batch   20: ppl = 9.11


Calculating Perplexity:  43%|████▎     | 30/69 [09:56<12:10, 18.73s/it]

  Batch   30: ppl = 9.59


Calculating Perplexity:  58%|█████▊    | 40/69 [13:16<10:22, 21.46s/it]

  Batch   40: ppl = 9.70


Calculating Perplexity:  72%|███████▏  | 50/69 [16:28<06:17, 19.85s/it]

  Batch   50: ppl = 8.97


Calculating Perplexity:  87%|████████▋ | 60/69 [19:46<02:51, 19.03s/it]

  Batch   60: ppl = 8.76


Calculating Perplexity: 100%|██████████| 69/69 [22:15<00:00, 19.35s/it]


8.51552963256836

In [None]:
perplexity = calculate_perplexity(base_model, test_dataloader)
print(f"Perplexity: {perplexity}")

Calculating Perplexity: 100%|██████████| 137/137 [10:49<00:00,  4.74s/it]

Perplexity: 9286.6884765625





In [None]:
perplexity = calculate_perplexity(finetuned_model, test_dataloader, tokenizer)
print(f"Perplexity: {perplexity}")

Calculating Perplexity:   0%|          | 0/69 [00:09<?, ?it/s]


OutOfMemoryError: CUDA out of memory. Tried to allocate 2.32 GiB. GPU 0 has a total capacity of 14.74 GiB of which 1.61 GiB is free. Process 27695 has 13.12 GiB memory in use. Of the allocated memory 9.70 GiB is allocated by PyTorch, and 3.29 GiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)

In [None]:
# sviluppare la logica per separare da input ids, le labels che rappresentano la risposta reale e il prompt, poi dare il prompt al modello
# mettere tutto su github
# dataset migliorare messaggi vicini nel tempo

In [None]:
!pip install evaluate
!pip install sentence_transformers
!pip install bert_score

In [10]:
from evaluate import load
import numpy as np
from sentence_transformers import SentenceTransformer, util

def left_pad(sequences, pad_value):
    max_len = max(seq.size(0) for seq in sequences)
    padded = []
    for seq in sequences:
        pad_len = max_len - seq.size(0)
        padded_seq = torch.cat([torch.full((pad_len,), pad_value, dtype=seq.dtype, device=seq.device), seq])
        padded.append(padded_seq)
    return torch.stack(padded)


def convert_label_to_string(label, tokenizer, skip_special_tokens=True):
    valid_token_ids = label[label != -100]
    token_list = valid_token_ids.tolist()
    text = tokenizer.decode(token_list, skip_special_tokens=skip_special_tokens)
    return text





def evaluate_chat_model(model, tokenizer, dataloader):
    model.eval()
    results = {
        'meteor': [],
        'bertscore_f1': [],
        'semantic_similarity': [],
        'prompt_ground_truth_similarity': [],
        'prompt_pred_similarity': []
    }

    device = 'cuda' if torch.cuda.is_available() else 'cpu'

    bertscore = load("bertscore")
    meteor = load("meteor")

    #semantic_model = SentenceTransformer('paraphrase-MiniLM-L6-v2')
    semantic_model = SentenceTransformer(
        'nickprock/sentence-bert-base-italian-uncased',
        device='cpu'
    )


    with torch.no_grad():
        for batch in tqdm(dataloader, desc="Evaluating"):

            input_ids = batch['input_ids'].to(device)
            labels = batch['labels'].to(device)

            # getting the prompts only (no responses)
            batch_prompts = []
            for i in range(input_ids.size(0)):
                prompt_tokens = input_ids[i][labels[i] == -100]
                batch_prompts.append(prompt_tokens)

            pad_token_id = tokenizer.pad_token_id
            padded_prompts = left_pad(batch_prompts, pad_token_id).to(device)

            # Extract ground truth responses (excluding -100 labels)
            ground_truths = []
            for label in labels:
                gt_txt = convert_label_to_string(label, tokenizer)
                ground_truths.append(gt_txt)

            # --- Generate Responses ---
            generated = model.generate(
                input_ids=padded_prompts,
                attention_mask=(padded_prompts != pad_token_id).long(),
                max_new_tokens=40,
                pad_token_id=pad_token_id,
                eos_token_id=pad_token_id,
                do_sample=True,
                top_p=0.95,
                temperature=0.4,
                return_dict_in_generate=True,
                output_scores=True,
            )

            sequences = generated.sequences             # (B, prompt_length + response_length)
            prompt_length = padded_prompts.size(1)
            responses_tokens = sequences[:, prompt_length:]
            pred_responses = tokenizer.batch_decode(responses_tokens, skip_special_tokens=True)



            # --- Calculate Metrics ---
            # METEOR
            meteor_results = meteor.compute(predictions=pred_responses, references=ground_truths)
            results['meteor'].append(meteor_results['meteor'])

            # BERTScore
            bert_results = bertscore.compute(
                predictions=pred_responses,
                references=ground_truths,
                lang='it',
                model_type='dbmdz/bert-base-italian-xxl-cased',
                num_layers=12,
                device = 'cpu',
                verbose = True,
            )
            results['bertscore_f1'].extend(bert_results['f1'])

            # Semantic Similarity
            prompt_texts = tokenizer.batch_decode(padded_prompts, skip_special_tokens=True)

            pred_embeddings = semantic_model.encode(
                pred_responses,
                convert_to_tensor=True,
                normalize_embeddings = True,
                show_progress_bar=False
            )

            gt_embeddings = semantic_model.encode(
                ground_truths,
                convert_to_tensor = True,
                normalize_embeddings = True,
                show_progress_bar=False
            )

            prompt_embeddings = semantic_model.encode(
                prompt_texts,
                convert_to_tensor = True,
                normalize_embeddings = True,
                show_progress_bar=False
            )

            similarities = np.diag(np.inner(pred_embeddings, gt_embeddings))
            results['semantic_similarity'].extend(util.cos_sim(pred_embeddings, gt_embeddings).diag().cpu().numpy())
            results['prompt_pred_similarity'].extend(util.cos_sim(prompt_embeddings, pred_embeddings).diag().cpu().numpy())
            results['prompt_ground_truth_similarity'].extend(util.cos_sim(prompt_embeddings, gt_embeddings).diag().cpu().numpy())



    # Aggregate results
    return {
        'meteor': np.mean(results['meteor']),
        'bertscore_f1': np.mean(results['bertscore_f1']),
        'semantic_similarity': np.mean(results['semantic_similarity']),
        'prompt_alignment': {
            'predicted': np.mean(results['prompt_pred_similarity']),
            'ground_truth': np.mean(results['prompt_ground_truth_similarity']),
            'ratio': (np.mean(results['prompt_pred_similarity']) /
                     np.mean(results['prompt_ground_truth_similarity']))
        },
        'predictions': pred_responses,
        'references': ground_truths
    }

In [11]:
results = evaluate_chat_model(base_model, tokenizer, test_dataloader)

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


KeyboardInterrupt: 

In [13]:
print(f"""
Evaluation Results:
- METEOR: {results['meteor']:.3f} (0-1, higher=better)
- BERTScore F1: {results['bertscore_f1']:.3f} (0-1, higher=better)
- Semantic Similarity: {results['semantic_similarity']:.3f} (0-1 cosine)
Prompt Alignment:
- Model Responses: {results['prompt_alignment']['predicted']:.3f}
- Ground Truth: {results['prompt_alignment']['ground_truth']:.3f}
- Alignment Ratio: {results['prompt_alignment']['ratio']:.1%}
""")

#- Perplexity: {results['perplexity']:.1f} (lower=better)


Evaluation Results:
- METEOR: 0.012 (0-1, higher=better)
- BERTScore F1: 0.275 (0-1, higher=better)
- Semantic Similarity: 0.221 (0-1 cosine)
Prompt Alignment:
- Model Responses: 0.636
- Ground Truth: 0.245
- Alignment Ratio: 260.0%



In [12]:
results = evaluate_chat_model(finetuned_model, tokenizer, test_dataloader)

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
Evaluating:   0%|          | 0/69 [00:00<?, ?it/s]

calculating scores...
computing bert embedding.


  0%|          | 0/1 [00:00<?, ?it/s]

computing greedy matching.


  0%|          | 0/1 [00:00<?, ?it/s]

done in 11430.84 seconds, 0.00 sentences/sec


  similarities = np.diag(np.inner(pred_embeddings, gt_embeddings))
Evaluating:   1%|▏         | 1/69 [00:30<34:25, 30.37s/it]

calculating scores...
computing bert embedding.


  0%|          | 0/1 [00:00<?, ?it/s]

computing greedy matching.


  0%|          | 0/1 [00:00<?, ?it/s]

done in 11451.81 seconds, 0.00 sentences/sec


Evaluating:   3%|▎         | 2/69 [00:50<26:58, 24.16s/it]

calculating scores...
computing bert embedding.


  0%|          | 0/1 [00:00<?, ?it/s]

computing greedy matching.


  0%|          | 0/1 [00:00<?, ?it/s]

done in 11471.88 seconds, 0.00 sentences/sec


Evaluating:   4%|▍         | 3/69 [01:09<24:18, 22.10s/it]

calculating scores...
computing bert embedding.


  0%|          | 0/1 [00:00<?, ?it/s]

computing greedy matching.


  0%|          | 0/1 [00:00<?, ?it/s]

done in 11491.07 seconds, 0.00 sentences/sec


Evaluating:   6%|▌         | 4/69 [01:30<23:25, 21.62s/it]

calculating scores...
computing bert embedding.


  0%|          | 0/1 [00:00<?, ?it/s]

computing greedy matching.


  0%|          | 0/1 [00:00<?, ?it/s]

done in 11512.00 seconds, 0.00 sentences/sec


Evaluating:   7%|▋         | 5/69 [01:50<22:17, 20.91s/it]

calculating scores...
computing bert embedding.


  0%|          | 0/1 [00:00<?, ?it/s]

computing greedy matching.


  0%|          | 0/1 [00:00<?, ?it/s]

done in 11531.68 seconds, 0.00 sentences/sec


Evaluating:   9%|▊         | 6/69 [02:10<21:46, 20.73s/it]

calculating scores...
computing bert embedding.


  0%|          | 0/1 [00:00<?, ?it/s]

computing greedy matching.


  0%|          | 0/1 [00:00<?, ?it/s]

done in 11553.77 seconds, 0.00 sentences/sec


Evaluating:  10%|█         | 7/69 [02:33<22:04, 21.36s/it]

calculating scores...
computing bert embedding.


  0%|          | 0/1 [00:00<?, ?it/s]

computing greedy matching.


  0%|          | 0/1 [00:00<?, ?it/s]

done in 11577.20 seconds, 0.00 sentences/sec


Evaluating:  12%|█▏        | 8/69 [02:57<22:39, 22.28s/it]

calculating scores...
computing bert embedding.


  0%|          | 0/1 [00:00<?, ?it/s]

computing greedy matching.


  0%|          | 0/1 [00:00<?, ?it/s]

done in 11607.22 seconds, 0.00 sentences/sec


Evaluating:  13%|█▎        | 9/69 [03:26<24:11, 24.20s/it]

calculating scores...
computing bert embedding.


  0%|          | 0/1 [00:00<?, ?it/s]

computing greedy matching.


  0%|          | 0/1 [00:00<?, ?it/s]

done in 11627.69 seconds, 0.00 sentences/sec


Evaluating:  14%|█▍        | 10/69 [03:46<22:38, 23.03s/it]

calculating scores...
computing bert embedding.


  0%|          | 0/1 [00:00<?, ?it/s]

computing greedy matching.


  0%|          | 0/1 [00:00<?, ?it/s]

done in 11647.14 seconds, 0.00 sentences/sec


Evaluating:  16%|█▌        | 11/69 [04:05<21:05, 21.81s/it]

calculating scores...
computing bert embedding.


  0%|          | 0/1 [00:00<?, ?it/s]

computing greedy matching.


  0%|          | 0/1 [00:00<?, ?it/s]

done in 11675.91 seconds, 0.00 sentences/sec


Evaluating:  17%|█▋        | 12/69 [04:36<23:13, 24.45s/it]

calculating scores...
computing bert embedding.


  0%|          | 0/1 [00:00<?, ?it/s]

computing greedy matching.


  0%|          | 0/1 [00:00<?, ?it/s]

done in 11707.13 seconds, 0.00 sentences/sec


Evaluating:  19%|█▉        | 13/69 [05:08<25:08, 26.94s/it]

calculating scores...
computing bert embedding.


  0%|          | 0/1 [00:00<?, ?it/s]

computing greedy matching.


  0%|          | 0/1 [00:00<?, ?it/s]

done in 11737.99 seconds, 0.00 sentences/sec


Evaluating:  20%|██        | 14/69 [05:38<25:24, 27.72s/it]

calculating scores...
computing bert embedding.


  0%|          | 0/1 [00:00<?, ?it/s]

computing greedy matching.


  0%|          | 0/1 [00:00<?, ?it/s]

done in 11760.54 seconds, 0.00 sentences/sec


Evaluating:  22%|██▏       | 15/69 [05:58<23:03, 25.62s/it]

calculating scores...
computing bert embedding.


  0%|          | 0/1 [00:00<?, ?it/s]

computing greedy matching.


  0%|          | 0/1 [00:00<?, ?it/s]

done in 11788.87 seconds, 0.00 sentences/sec


Evaluating:  23%|██▎       | 16/69 [06:29<23:49, 26.97s/it]

calculating scores...
computing bert embedding.


  0%|          | 0/1 [00:00<?, ?it/s]

computing greedy matching.


  0%|          | 0/1 [00:00<?, ?it/s]

done in 11809.26 seconds, 0.00 sentences/sec


Evaluating:  25%|██▍       | 17/69 [06:47<21:01, 24.27s/it]

calculating scores...
computing bert embedding.


  0%|          | 0/1 [00:00<?, ?it/s]

computing greedy matching.


  0%|          | 0/1 [00:00<?, ?it/s]

done in 11827.75 seconds, 0.00 sentences/sec


Evaluating:  26%|██▌       | 18/69 [07:06<19:26, 22.87s/it]

calculating scores...
computing bert embedding.


  0%|          | 0/1 [00:00<?, ?it/s]

computing greedy matching.


  0%|          | 0/1 [00:00<?, ?it/s]

done in 11847.83 seconds, 0.00 sentences/sec


Evaluating:  28%|██▊       | 19/69 [07:25<18:10, 21.81s/it]

calculating scores...
computing bert embedding.


  0%|          | 0/1 [00:00<?, ?it/s]

computing greedy matching.


  0%|          | 0/1 [00:00<?, ?it/s]

done in 11871.32 seconds, 0.00 sentences/sec


Evaluating:  29%|██▉       | 20/69 [07:52<18:56, 23.20s/it]

calculating scores...
computing bert embedding.


  0%|          | 0/1 [00:00<?, ?it/s]

computing greedy matching.


  0%|          | 0/1 [00:00<?, ?it/s]

done in 11893.33 seconds, 0.00 sentences/sec


Evaluating:  30%|███       | 21/69 [08:11<17:33, 21.95s/it]

calculating scores...
computing bert embedding.


  0%|          | 0/1 [00:00<?, ?it/s]

computing greedy matching.


  0%|          | 0/1 [00:00<?, ?it/s]

done in 11916.50 seconds, 0.00 sentences/sec


Evaluating:  32%|███▏      | 22/69 [08:35<17:38, 22.51s/it]

calculating scores...
computing bert embedding.


  0%|          | 0/1 [00:00<?, ?it/s]

computing greedy matching.


  0%|          | 0/1 [00:00<?, ?it/s]

done in 11941.78 seconds, 0.00 sentences/sec


Evaluating:  33%|███▎      | 23/69 [09:00<17:50, 23.27s/it]

calculating scores...
computing bert embedding.


  0%|          | 0/1 [00:00<?, ?it/s]

computing greedy matching.


  0%|          | 0/1 [00:00<?, ?it/s]

done in 11970.75 seconds, 0.00 sentences/sec


Evaluating:  35%|███▍      | 24/69 [09:33<19:38, 26.19s/it]

calculating scores...
computing bert embedding.


  0%|          | 0/1 [00:00<?, ?it/s]

computing greedy matching.


  0%|          | 0/1 [00:00<?, ?it/s]

done in 12003.13 seconds, 0.00 sentences/sec


Evaluating:  36%|███▌      | 25/69 [10:04<20:16, 27.65s/it]

calculating scores...
computing bert embedding.


  0%|          | 0/1 [00:00<?, ?it/s]

computing greedy matching.


  0%|          | 0/1 [00:00<?, ?it/s]

done in 12024.40 seconds, 0.00 sentences/sec


Evaluating:  38%|███▊      | 26/69 [10:23<17:55, 25.01s/it]

calculating scores...
computing bert embedding.


  0%|          | 0/1 [00:00<?, ?it/s]

computing greedy matching.


  0%|          | 0/1 [00:00<?, ?it/s]

done in 12053.84 seconds, 0.00 sentences/sec


Evaluating:  39%|███▉      | 27/69 [10:53<18:37, 26.61s/it]

calculating scores...
computing bert embedding.


  0%|          | 0/1 [00:00<?, ?it/s]

computing greedy matching.


  0%|          | 0/1 [00:00<?, ?it/s]

done in 12075.04 seconds, 0.00 sentences/sec


Evaluating:  41%|████      | 28/69 [11:12<16:41, 24.43s/it]

calculating scores...
computing bert embedding.


  0%|          | 0/1 [00:00<?, ?it/s]

computing greedy matching.


  0%|          | 0/1 [00:00<?, ?it/s]

done in 12095.67 seconds, 0.00 sentences/sec


Evaluating:  42%|████▏     | 29/69 [11:33<15:35, 23.38s/it]

calculating scores...
computing bert embedding.


  0%|          | 0/1 [00:00<?, ?it/s]

computing greedy matching.


  0%|          | 0/1 [00:00<?, ?it/s]

done in 12115.63 seconds, 0.00 sentences/sec


Evaluating:  43%|████▎     | 30/69 [11:53<14:26, 22.21s/it]

calculating scores...
computing bert embedding.


  0%|          | 0/1 [00:00<?, ?it/s]

computing greedy matching.


  0%|          | 0/1 [00:00<?, ?it/s]

done in 12134.40 seconds, 0.00 sentences/sec


Evaluating:  45%|████▍     | 31/69 [12:13<13:36, 21.48s/it]

calculating scores...
computing bert embedding.


  0%|          | 0/1 [00:00<?, ?it/s]

computing greedy matching.


  0%|          | 0/1 [00:00<?, ?it/s]

done in 12155.20 seconds, 0.00 sentences/sec


Evaluating:  46%|████▋     | 32/69 [12:33<12:59, 21.07s/it]

calculating scores...
computing bert embedding.


  0%|          | 0/1 [00:00<?, ?it/s]

computing greedy matching.


  0%|          | 0/1 [00:00<?, ?it/s]

done in 12174.89 seconds, 0.00 sentences/sec


Evaluating:  48%|████▊     | 33/69 [12:55<12:46, 21.29s/it]

calculating scores...
computing bert embedding.


  0%|          | 0/1 [00:00<?, ?it/s]

computing greedy matching.


  0%|          | 0/1 [00:00<?, ?it/s]

done in 12204.39 seconds, 0.00 sentences/sec


Evaluating:  49%|████▉     | 34/69 [13:24<13:48, 23.67s/it]

calculating scores...
computing bert embedding.


  0%|          | 0/1 [00:00<?, ?it/s]

computing greedy matching.


  0%|          | 0/1 [00:00<?, ?it/s]

done in 12233.56 seconds, 0.00 sentences/sec


Evaluating:  51%|█████     | 35/69 [13:53<14:23, 25.41s/it]

calculating scores...
computing bert embedding.


  0%|          | 0/1 [00:00<?, ?it/s]

computing greedy matching.


  0%|          | 0/1 [00:00<?, ?it/s]

done in 12263.65 seconds, 0.00 sentences/sec


Evaluating:  52%|█████▏    | 36/69 [14:23<14:40, 26.68s/it]

calculating scores...
computing bert embedding.


  0%|          | 0/1 [00:00<?, ?it/s]

computing greedy matching.


  0%|          | 0/1 [00:00<?, ?it/s]

done in 12293.27 seconds, 0.00 sentences/sec


Evaluating:  54%|█████▎    | 37/69 [14:51<14:31, 27.23s/it]

calculating scores...
computing bert embedding.


  0%|          | 0/1 [00:00<?, ?it/s]

computing greedy matching.


  0%|          | 0/1 [00:00<?, ?it/s]

done in 12313.67 seconds, 0.00 sentences/sec


Evaluating:  55%|█████▌    | 38/69 [15:13<13:10, 25.49s/it]

calculating scores...
computing bert embedding.


  0%|          | 0/1 [00:00<?, ?it/s]

computing greedy matching.


  0%|          | 0/1 [00:00<?, ?it/s]

done in 12333.39 seconds, 0.00 sentences/sec


Evaluating:  57%|█████▋    | 39/69 [15:32<11:48, 23.61s/it]

calculating scores...
computing bert embedding.


  0%|          | 0/1 [00:00<?, ?it/s]

computing greedy matching.


  0%|          | 0/1 [00:00<?, ?it/s]

done in 12353.79 seconds, 0.00 sentences/sec


Evaluating:  58%|█████▊    | 40/69 [15:53<11:00, 22.77s/it]

calculating scores...
computing bert embedding.


  0%|          | 0/1 [00:00<?, ?it/s]

computing greedy matching.


  0%|          | 0/1 [00:00<?, ?it/s]

done in 12374.02 seconds, 0.00 sentences/sec


Evaluating:  59%|█████▉    | 41/69 [16:11<10:02, 21.51s/it]

calculating scores...
computing bert embedding.


  0%|          | 0/1 [00:00<?, ?it/s]

computing greedy matching.


  0%|          | 0/1 [00:00<?, ?it/s]

done in 12398.12 seconds, 0.00 sentences/sec


Evaluating:  61%|██████    | 42/69 [16:36<10:06, 22.45s/it]

calculating scores...
computing bert embedding.


  0%|          | 0/1 [00:00<?, ?it/s]

computing greedy matching.


  0%|          | 0/1 [00:00<?, ?it/s]

done in 12417.45 seconds, 0.00 sentences/sec


Evaluating:  62%|██████▏   | 43/69 [16:55<09:18, 21.46s/it]

calculating scores...
computing bert embedding.


  0%|          | 0/1 [00:00<?, ?it/s]

computing greedy matching.


  0%|          | 0/1 [00:00<?, ?it/s]

done in 12437.59 seconds, 0.00 sentences/sec


Evaluating:  64%|██████▍   | 44/69 [17:16<08:54, 21.37s/it]

calculating scores...
computing bert embedding.


  0%|          | 0/1 [00:00<?, ?it/s]

computing greedy matching.


  0%|          | 0/1 [00:00<?, ?it/s]

done in 12457.19 seconds, 0.00 sentences/sec


Evaluating:  65%|██████▌   | 45/69 [17:35<08:13, 20.57s/it]

calculating scores...
computing bert embedding.


  0%|          | 0/1 [00:00<?, ?it/s]

computing greedy matching.


  0%|          | 0/1 [00:00<?, ?it/s]

done in 12482.74 seconds, 0.00 sentences/sec


Evaluating:  67%|██████▋   | 46/69 [18:01<08:30, 22.21s/it]

calculating scores...
computing bert embedding.


  0%|          | 0/1 [00:00<?, ?it/s]

computing greedy matching.


  0%|          | 0/1 [00:00<?, ?it/s]

done in 12512.10 seconds, 0.00 sentences/sec


Evaluating:  68%|██████▊   | 47/69 [18:31<08:58, 24.49s/it]

calculating scores...
computing bert embedding.


  0%|          | 0/1 [00:00<?, ?it/s]

computing greedy matching.


  0%|          | 0/1 [00:00<?, ?it/s]

done in 12541.03 seconds, 0.00 sentences/sec


Evaluating:  70%|██████▉   | 48/69 [19:01<09:12, 26.30s/it]

calculating scores...
computing bert embedding.


  0%|          | 0/1 [00:00<?, ?it/s]

computing greedy matching.


  0%|          | 0/1 [00:00<?, ?it/s]

done in 12571.30 seconds, 0.00 sentences/sec


Evaluating:  71%|███████   | 49/69 [19:31<09:05, 27.26s/it]

calculating scores...
computing bert embedding.


  0%|          | 0/1 [00:00<?, ?it/s]

computing greedy matching.


  0%|          | 0/1 [00:00<?, ?it/s]

done in 12594.18 seconds, 0.00 sentences/sec


Evaluating:  72%|███████▏  | 50/69 [19:53<08:07, 25.68s/it]

calculating scores...
computing bert embedding.


  0%|          | 0/1 [00:00<?, ?it/s]

computing greedy matching.


  0%|          | 0/1 [00:00<?, ?it/s]

done in 12623.13 seconds, 0.00 sentences/sec


Evaluating:  74%|███████▍  | 51/69 [20:23<08:06, 27.04s/it]

calculating scores...
computing bert embedding.


  0%|          | 0/1 [00:00<?, ?it/s]

computing greedy matching.


  0%|          | 0/1 [00:00<?, ?it/s]

done in 12643.83 seconds, 0.00 sentences/sec


Evaluating:  75%|███████▌  | 52/69 [20:42<06:57, 24.58s/it]

calculating scores...
computing bert embedding.


  0%|          | 0/1 [00:00<?, ?it/s]

computing greedy matching.


  0%|          | 0/1 [00:00<?, ?it/s]

done in 12664.71 seconds, 0.00 sentences/sec


Evaluating:  77%|███████▋  | 53/69 [21:04<06:19, 23.72s/it]

calculating scores...
computing bert embedding.


  0%|          | 0/1 [00:00<?, ?it/s]

computing greedy matching.


  0%|          | 0/1 [00:00<?, ?it/s]

done in 12685.04 seconds, 0.00 sentences/sec


Evaluating:  78%|███████▊  | 54/69 [21:22<05:32, 22.14s/it]

calculating scores...
computing bert embedding.


  0%|          | 0/1 [00:00<?, ?it/s]

computing greedy matching.


  0%|          | 0/1 [00:00<?, ?it/s]

done in 12712.95 seconds, 0.00 sentences/sec


Evaluating:  80%|███████▉  | 55/69 [21:54<05:51, 25.08s/it]

calculating scores...
computing bert embedding.


  0%|          | 0/1 [00:00<?, ?it/s]

computing greedy matching.


  0%|          | 0/1 [00:00<?, ?it/s]

done in 12735.66 seconds, 0.00 sentences/sec


Evaluating:  81%|████████  | 56/69 [22:14<05:04, 23.42s/it]

calculating scores...
computing bert embedding.


  0%|          | 0/1 [00:00<?, ?it/s]

computing greedy matching.


  0%|          | 0/1 [00:00<?, ?it/s]

done in 12754.89 seconds, 0.00 sentences/sec


Evaluating:  83%|████████▎ | 57/69 [22:33<04:25, 22.13s/it]

calculating scores...
computing bert embedding.


  0%|          | 0/1 [00:00<?, ?it/s]

computing greedy matching.


  0%|          | 0/1 [00:00<?, ?it/s]

done in 12783.37 seconds, 0.00 sentences/sec


Evaluating:  84%|████████▍ | 58/69 [23:04<04:33, 24.88s/it]

calculating scores...
computing bert embedding.


  0%|          | 0/1 [00:00<?, ?it/s]

computing greedy matching.


  0%|          | 0/1 [00:00<?, ?it/s]

done in 12813.16 seconds, 0.00 sentences/sec


Evaluating:  86%|████████▌ | 59/69 [23:34<04:24, 26.46s/it]

calculating scores...
computing bert embedding.


  0%|          | 0/1 [00:00<?, ?it/s]

computing greedy matching.


  0%|          | 0/1 [00:00<?, ?it/s]

done in 12844.15 seconds, 0.00 sentences/sec


Evaluating:  87%|████████▋ | 60/69 [24:05<04:08, 27.63s/it]

calculating scores...
computing bert embedding.


  0%|          | 0/1 [00:00<?, ?it/s]

computing greedy matching.


  0%|          | 0/1 [00:00<?, ?it/s]

done in 12866.83 seconds, 0.00 sentences/sec


Evaluating:  88%|████████▊ | 61/69 [24:26<03:26, 25.83s/it]

calculating scores...
computing bert embedding.


  0%|          | 0/1 [00:00<?, ?it/s]

computing greedy matching.


  0%|          | 0/1 [00:00<?, ?it/s]

done in 12886.79 seconds, 0.00 sentences/sec


Evaluating:  90%|████████▉ | 62/69 [24:44<02:44, 23.46s/it]

calculating scores...
computing bert embedding.


  0%|          | 0/1 [00:00<?, ?it/s]

computing greedy matching.


  0%|          | 0/1 [00:00<?, ?it/s]

done in 12905.78 seconds, 0.00 sentences/sec


Evaluating:  91%|█████████▏| 63/69 [25:05<02:15, 22.62s/it]

calculating scores...
computing bert embedding.


  0%|          | 0/1 [00:00<?, ?it/s]

computing greedy matching.


  0%|          | 0/1 [00:00<?, ?it/s]

done in 12925.69 seconds, 0.00 sentences/sec


Evaluating:  93%|█████████▎| 64/69 [25:23<01:47, 21.43s/it]

calculating scores...
computing bert embedding.


  0%|          | 0/1 [00:00<?, ?it/s]

computing greedy matching.


  0%|          | 0/1 [00:00<?, ?it/s]

done in 12946.78 seconds, 0.00 sentences/sec


Evaluating:  94%|█████████▍| 65/69 [25:46<01:27, 21.83s/it]

calculating scores...
computing bert embedding.


  0%|          | 0/1 [00:00<?, ?it/s]

computing greedy matching.


  0%|          | 0/1 [00:00<?, ?it/s]

done in 12976.00 seconds, 0.00 sentences/sec


Evaluating:  96%|█████████▌| 66/69 [26:17<01:13, 24.41s/it]

calculating scores...
computing bert embedding.


  0%|          | 0/1 [00:00<?, ?it/s]

computing greedy matching.


  0%|          | 0/1 [00:00<?, ?it/s]

done in 13008.09 seconds, 0.00 sentences/sec


Evaluating:  97%|█████████▋| 67/69 [26:46<00:51, 25.97s/it]

calculating scores...
computing bert embedding.


  0%|          | 0/1 [00:00<?, ?it/s]

computing greedy matching.


  0%|          | 0/1 [00:00<?, ?it/s]

done in 13033.13 seconds, 0.00 sentences/sec


Evaluating:  99%|█████████▊| 68/69 [27:10<00:25, 25.36s/it]

calculating scores...
computing bert embedding.


  0%|          | 0/1 [00:00<?, ?it/s]

computing greedy matching.


  0%|          | 0/1 [00:00<?, ?it/s]

done in 13054.91 seconds, 0.00 sentences/sec


Evaluating: 100%|██████████| 69/69 [27:14<00:00, 23.69s/it]


In [13]:
print(f"""
Evaluation Results:
- METEOR: {results['meteor']:.3f} (0-1, higher=better)
- BERTScore F1: {results['bertscore_f1']:.3f} (0-1, higher=better)
- Semantic Similarity: {results['semantic_similarity']:.3f} (0-1 cosine)
Prompt Alignment:
- Model Responses: {results['prompt_alignment']['predicted']:.3f}
- Ground Truth: {results['prompt_alignment']['ground_truth']:.3f}
- Alignment Ratio: {results['prompt_alignment']['ratio']:.1%}
""")


Evaluation Results:
- METEOR: 0.031 (0-1, higher=better)
- BERTScore F1: 0.418 (0-1, higher=better)
- Semantic Similarity: 0.288 (0-1 cosine)
Prompt Alignment:
- Model Responses: 0.250
- Ground Truth: 0.245
- Alignment Ratio: 102.1%



In [None]:
print(f"""
Evaluation Results:
- METEOR: {results['meteor']:.3f} (0-1, higher=better)
- BERTScore F1: {results['bertscore_f1']:.3f} (0-1, higher=better)
- Semantic Similarity: {results['semantic_similarity']:.3f} (0-1 cosine)
Prompt Alignment:
- Model Responses: {results['prompt_alignment']['predicted']:.3f}
- Ground Truth: {results['prompt_alignment']['ground_truth']:.3f}
- Alignment Ratio: {results['prompt_alignment']['ratio']:.1%}
""")

#- Perplexity: {results['perplexity']:.1f} (lower=better)


Evaluation Results:
- METEOR: 0.012 (0-1, higher=better)
- BERTScore F1: 0.275 (0-1, higher=better)
- Semantic Similarity: 0.221 (0-1 cosine)
Prompt Alignment:
- Model Responses: 0.636
- Ground Truth: 0.245
- Alignment Ratio: 260.0%

