In [None]:
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
from peft import PeftModel, PeftConfig
import torch
from accelerate import infer_auto_device_map, dispatch_model
from datasets import load_from_disk
try:
    # `load_metric` no longer exists in recent `datasets` releases (metrics moved to the
    # `evaluate` package, which is what the cells below use via `evaluate.load`).
    # Keep the name when the installed version still has it, without breaking the
    # whole import cell when it does not.
    from datasets import load_metric
except ImportError:
    load_metric = None
from tqdm import tqdm
import copy
from torch.utils.data import Dataset, DataLoader
from evaluate import load
import numpy as np
from sentence_transformers import SentenceTransformer, util


# Colab-only: mount Google Drive, where the adapter checkpoints and the
# tokenized test split referenced below are stored.
from google.colab import drive, userdata
drive.mount('/content/drive')

In [None]:
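# Run these once (uncommented) BEFORE the import cell above if the packages are missing;
# that cell already imports `evaluate` and `sentence_transformers`.
# !pip install evaluate
# !pip install sentence_transformers
# !pip install bert_score
# !pip install -U bitsandbytes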

In [None]:
from huggingface_hub import login

# Read the Hugging Face access token stored under the Colab secret name 'HF_TOKEN'
# and authenticate this session with it.
hf_token = userdata.get('HF_TOKEN')
login(token=hf_token)

### Loading model and data

In [None]:
# Where the base checkpoint, the fine-tuned adapter and the offload folder live.
base_model_id = "deepseek-ai/DeepSeek-R1-Distill-Qwen-7B"
adapter_path = "/content/drive/My Drive/Digital-Self-Replica/francesco_lora/checkpoint_600"
offload_dir = "/content/drive/My Drive/Digital-Self-Replica/offload"

# 4-bit NF4 weights with double quantization; compute dtype is float16.
quantization_config = BitsAndBytesConfig(
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=True,
    bnb_4bit_compute_dtype=torch.float16,
    load_in_4bit=True,
)

tokenizer = AutoTokenizer.from_pretrained(base_model_id)

# No device map is requested at load time; placement is decided explicitly below.
base_model = AutoModelForCausalLM.from_pretrained(
    base_model_id,
    quantization_config=quantization_config,
    low_cpu_mem_usage=True,
    device_map=None,
    torch_dtype=torch.float16,
)

# Let accelerate plan module placement within 15 GiB on GPU 0 and 28 GiB of CPU RAM,
# then apply that plan; `offload_dir` is the folder handed to accelerate for offloaded weights.
device_map = infer_auto_device_map(base_model, max_memory={0: "15GiB", "cpu": "28GiB"})
base_model = dispatch_model(base_model, offload_dir=offload_dir, device_map=device_map)

# Attach the LoRA adapter stored at `adapter_path`.
# NOTE(review): PEFT presumably injects the adapter layers into `base_model` itself, so
# `base_model` may no longer be a pure baseline after this line — confirm.
finetuned_model = PeftModel.from_pretrained(base_model, adapter_path)

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [None]:
# Same loading sequence as the first "Loading model and data" cell (identical ids,
# paths and settings): re-running it reloads the base model and the adapter.
base_model_id = "deepseek-ai/DeepSeek-R1-Distill-Qwen-7B"
adapter_path = "/content/drive/My Drive/Digital-Self-Replica/francesco_lora/checkpoint_600"
offload_dir = "/content/drive/My Drive/Digital-Self-Replica/offload"

# 4-bit NF4 weights with double quantization; compute dtype is float16.
quantization_config = BitsAndBytesConfig(
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=True,
    bnb_4bit_compute_dtype=torch.float16,
    load_in_4bit=True,
)

tokenizer = AutoTokenizer.from_pretrained(base_model_id)

# No device map is requested at load time; placement is decided explicitly below.
base_model = AutoModelForCausalLM.from_pretrained(
    base_model_id,
    quantization_config=quantization_config,
    low_cpu_mem_usage=True,
    device_map=None,
    torch_dtype=torch.float16,
)

# Let accelerate plan module placement within 15 GiB on GPU 0 and 28 GiB of CPU RAM,
# then apply that plan; `offload_dir` is the folder handed to accelerate for offloaded weights.
device_map = infer_auto_device_map(base_model, max_memory={0: "15GiB", "cpu": "28GiB"})
base_model = dispatch_model(base_model, offload_dir=offload_dir, device_map=device_map)

# Attach the LoRA adapter stored at `adapter_path`.
# NOTE(review): PEFT presumably injects the adapter layers into `base_model` itself, so
# `base_model` may no longer be a pure baseline after this line — confirm.
finetuned_model = PeftModel.from_pretrained(base_model, adapter_path)

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [None]:
# Same loading sequence as the first "Loading model and data" cell (identical ids,
# paths and settings): re-running it reloads the base model and the adapter.
base_model_id = "deepseek-ai/DeepSeek-R1-Distill-Qwen-7B"
adapter_path = "/content/drive/My Drive/Digital-Self-Replica/francesco_lora/checkpoint_600"
offload_dir = "/content/drive/My Drive/Digital-Self-Replica/offload"

# 4-bit NF4 weights with double quantization; compute dtype is float16.
quantization_config = BitsAndBytesConfig(
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=True,
    bnb_4bit_compute_dtype=torch.float16,
    load_in_4bit=True,
)

tokenizer = AutoTokenizer.from_pretrained(base_model_id)

# No device map is requested at load time; placement is decided explicitly below.
base_model = AutoModelForCausalLM.from_pretrained(
    base_model_id,
    quantization_config=quantization_config,
    low_cpu_mem_usage=True,
    device_map=None,
    torch_dtype=torch.float16,
)

# Let accelerate plan module placement within 15 GiB on GPU 0 and 28 GiB of CPU RAM,
# then apply that plan; `offload_dir` is the folder handed to accelerate for offloaded weights.
device_map = infer_auto_device_map(base_model, max_memory={0: "15GiB", "cpu": "28GiB"})
base_model = dispatch_model(base_model, offload_dir=offload_dir, device_map=device_map)

# Attach the LoRA adapter stored at `adapter_path`.
# NOTE(review): PEFT presumably injects the adapter layers into `base_model` itself, so
# `base_model` may no longer be a pure baseline after this line — confirm.
finetuned_model = PeftModel.from_pretrained(base_model, adapter_path)

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [None]:
# Same loading sequence as the first "Loading model and data" cell (identical ids,
# paths and settings): re-running it reloads the base model and the adapter.
base_model_id = "deepseek-ai/DeepSeek-R1-Distill-Qwen-7B"
adapter_path = "/content/drive/My Drive/Digital-Self-Replica/francesco_lora/checkpoint_600"
offload_dir = "/content/drive/My Drive/Digital-Self-Replica/offload"

# 4-bit NF4 weights with double quantization; compute dtype is float16.
quantization_config = BitsAndBytesConfig(
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=True,
    bnb_4bit_compute_dtype=torch.float16,
    load_in_4bit=True,
)

tokenizer = AutoTokenizer.from_pretrained(base_model_id)

# No device map is requested at load time; placement is decided explicitly below.
base_model = AutoModelForCausalLM.from_pretrained(
    base_model_id,
    quantization_config=quantization_config,
    low_cpu_mem_usage=True,
    device_map=None,
    torch_dtype=torch.float16,
)

# Let accelerate plan module placement within 15 GiB on GPU 0 and 28 GiB of CPU RAM,
# then apply that plan; `offload_dir` is the folder handed to accelerate for offloaded weights.
device_map = infer_auto_device_map(base_model, max_memory={0: "15GiB", "cpu": "28GiB"})
base_model = dispatch_model(base_model, offload_dir=offload_dir, device_map=device_map)

# Attach the LoRA adapter stored at `adapter_path`.
# NOTE(review): PEFT presumably injects the adapter layers into `base_model` itself, so
# `base_model` may no longer be a pure baseline after this line — confirm.
finetuned_model = PeftModel.from_pretrained(base_model, adapter_path)

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [16]:
class ChatDataset(Dataset):
    """Map-style dataset exposing pre-tokenized examples as ``torch.long`` tensors.

    ``encodings`` must support ``encodings[field][idx]`` for the fields
    ``"input_ids"``, ``"attention_mask"`` and ``"labels"``.
    """

    # Fields returned for every example, in this order.
    _FIELDS = ("input_ids", "attention_mask", "labels")

    def __init__(self, encodings):
        self.encodings = encodings

    def __len__(self):
        # The number of examples is taken from the "input_ids" field.
        return len(self.encodings["input_ids"])

    def __getitem__(self, idx):
        return {
            field: torch.tensor(self.encodings[field][idx], dtype=torch.long)
            for field in self._FIELDS
        }

# Load the pre-tokenized test split from Drive (the commented line is the local-path variant).
tokenized_test = load_from_disk('/content/drive/MyDrive/Digital-Self-Replica/datasets/tokenized_test')
#tokenized_test = load_from_disk('./datasets/tokenized_test')

# Keep the dataset and the loader under separate names so `test_dataloader` only ever
# refers to a DataLoader. shuffle=False keeps the example order fixed across runs.
test_dataset = ChatDataset(tokenized_test)
test_dataloader = DataLoader(test_dataset, batch_size=16, shuffle=False)

### Perplexity

In [None]:
def calculate_perplexity(model, dataset, tokenizer, device="cuda"):
    """Compute token-level perplexity of a causal language model over batches.

    Args:
        model: Model called as ``model(input_ids=..., attention_mask=..., labels=...)``
            whose output exposes ``.loss``. The loss is assumed to follow the
            Hugging Face causal-LM convention: mean cross-entropy over the
            shifted labels that are not -100.
        dataset: Iterable of batches (e.g. a ``DataLoader``). Each batch is a mapping
            with tensors under ``'input_ids'``, ``'attention_mask'`` and ``'labels'``;
            label value -100 marks positions excluded from the loss.
        tokenizer: Unused; kept so existing call sites keep working.
        device: Device the model and every batch are moved to.

    Returns:
        float: ``exp`` of the average loss per supervised token, or ``nan`` when
        no batch contains a supervised token.
    """
    model = model.to(device)
    model.eval()
    total_loss = 0.0
    total_tokens_in_loss = 0

    with torch.no_grad():
        for batch in tqdm(dataset, desc="Calculating Perplexity"):
            labels = batch['labels'].to(device)

            # The loss predicts token t+1 from token t, i.e. it is computed on labels
            # shifted left by one: position 0 never contributes. Count exactly the
            # tokens the mean was taken over so the re-weighting below is exact.
            num_tokens = (labels[..., 1:] != -100).sum().item()

            # Nothing to score in this batch: skip it before paying for a forward pass
            # (the mean loss over zero tokens would be NaN anyway).
            if num_tokens == 0:
                continue

            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)

            outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)

            # Undo the per-batch mean so batches with more supervised tokens weigh more.
            total_loss += outputs.loss.item() * num_tokens
            total_tokens_in_loss += num_tokens

    if total_tokens_in_loss == 0:
        return float("nan")

    avg_loss = total_loss / total_tokens_in_loss
    # A scalar exp does not need the accelerator; float32 matches the previous precision.
    return torch.exp(torch.tensor(avg_loss)).item()

In [None]:
# NOTE(review): the label below assumes `finetuned_model` currently holds the rank-16
# adapter; the loading cells only show a single adapter path — confirm before trusting it.
perplexity = calculate_perplexity(model=finetuned_model, dataset=test_dataloader, tokenizer=tokenizer)
print(f"Perplexity of Fine-Tuned Model with rank = 16: {perplexity}")

Calculating Perplexity: 100%|██████████| 69/69 [12:14<00:00, 10.65s/it]

Perplexity of Fine-Tuned Model with rank = 16: 28.44940185546875





In [None]:
# NOTE(review): the label below assumes `finetuned_model` currently holds the rank-32
# adapter; the loading cells only show a single adapter path — confirm before trusting it.
perplexity = calculate_perplexity(model=finetuned_model, dataset=test_dataloader, tokenizer=tokenizer)
print(f"Perplexity of Fine-Tuned Model with rank = 32: {perplexity}")

Calculating Perplexity: 100%|██████████| 69/69 [12:09<00:00, 10.58s/it]

Perplexity of Fine-Tuned Model with rank = 32: 27.985580444335938





In [None]:
# `PeftModel.from_pretrained` injects the LoRA layers into `base_model` in place, so the
# adapter has to be switched off explicitly to score the un-finetuned baseline.
with finetuned_model.disable_adapter():
    perplexity = calculate_perplexity(base_model, test_dataloader, tokenizer)
print(f"Perplexity of Base Model: {perplexity}")

Calculating Perplexity: 100%|██████████| 69/69 [11:06<00:00,  9.67s/it]

Perplexity of Base Model: 9286.6884765625





In [None]:
# NOTE(review): the label below assumes `finetuned_model` currently holds the
# rank-64 / alpha-128 adapter; the loading cells only show a single adapter path — confirm.
perplexity = calculate_perplexity(model=finetuned_model, dataset=test_dataloader, tokenizer=tokenizer)
print(f"Perplexity of Fine-Tuned Model with rank = 64 and alpha = 128: {perplexity}")

Calculating Perplexity: 100%|██████████| 69/69 [11:56<00:00, 10.38s/it]

Perplexity of Fine-Tuned Model with rank = 64: 31.331012725830078





In [None]:
# NOTE(review): the label below assumes `finetuned_model` currently holds the
# rank-64 / alpha-64 adapter; the loading cells only show a single adapter path — confirm.
perplexity = calculate_perplexity(model=finetuned_model, dataset=test_dataloader, tokenizer=tokenizer)
print(f"Perplexity of Fine-Tuned Model with rank = 64 and alpha = 64: {perplexity}")

Calculating Perplexity: 100%|██████████| 69/69 [12:04<00:00, 10.50s/it]

Perplexity of Fine-Tuned Model with rank = 64 and alpha = 64: 31.680635452270508





#### Best Model

In [None]:
# NOTE(review): the label below assumes `finetuned_model` currently holds the
# rank-64 / alpha-32 adapter; the loading cells only show a single adapter path — confirm.
perplexity = calculate_perplexity(model=finetuned_model, dataset=test_dataloader, tokenizer=tokenizer)
print(f"Perplexity of Fine-Tuned Final Model with rank = 64 and alpha = 32: {perplexity}")

Calculating Perplexity: 100%|██████████| 69/69 [12:33<00:00, 10.92s/it]

Perplexity of Fine-Tuned Final Model with rank = 64 and alpha = 32: 17.99485206604004





### Generation analysis

In [None]:
def left_pad(sequences, pad_value):
    """Stack 1-D tensors into one 2-D tensor, padding shorter ones on the left.

    Every sequence is extended at its front with ``pad_value`` (same dtype and
    device as the sequence) until it is as long as the longest one.
    """
    target_len = max(seq.size(0) for seq in sequences)
    return torch.stack([
        torch.cat([
            torch.full((target_len - seq.size(0),), pad_value, dtype=seq.dtype, device=seq.device),
            seq,
        ])
        for seq in sequences
    ])



def convert_label_to_string(label, tokenizer, skip_special_tokens=True):
    """Decode the label ids that are not the ignore value (-100) into text."""
    kept_ids = label[label != -100].tolist()
    return tokenizer.decode(kept_ids, skip_special_tokens=skip_special_tokens)



def print_batch_debug(batch_prompts, responses, ground_truths, tokenizer, N=3):
    """Print up to ``N`` (prompt, generated response, ground truth) triples of a batch.

    ``batch_prompts`` and ``responses`` hold token-id tensors that are decoded with
    special tokens skipped; ``ground_truths`` holds already-decoded strings.
    """
    def _decode(token_ids):
        return tokenizer.decode(token_ids.tolist(), skip_special_tokens=True)

    shown = min(N, len(batch_prompts))
    for idx in range(shown):
        prompt_txt = _decode(batch_prompts[idx])
        gen_txt = _decode(responses[idx])
        gt_txt = ground_truths[idx]

        # One print call; the trailing empty string yields the blank separator line.
        print(
            f"{'-'*10} Example {idx+1} {'-'*10}",
            f"Prompt:\n{prompt_txt}",
            f"\nGenerated:    {gen_txt}",
            f"Ground Truth: {gt_txt}",
            "",
            sep="\n",
        )


def analyze_generation(model, dataset, tokenizer, device="cuda", print_every=10):
    """Sample a response for every prompt in ``dataset`` and periodically print examples.

    For each batch the prompt of an example is taken to be the input ids at positions
    whose label is -100, and the ground truth is the decoded remainder of the labels.
    Prompts are left-padded to a common length, one response per prompt is sampled
    with ``model.generate``, and every ``print_every``-th batch the first three
    (prompt, generated, ground truth) triples are printed.

    Args:
        model: Model exposing ``eval()`` and ``generate(...)``.
        dataset: Iterable of batches with tensor entries ``'input_ids'`` and ``'labels'``.
        tokenizer: Provides ``pad_token_id`` and ``decode``.
        device: Device the batch tensors are moved to (the model is not moved here).
        print_every: Print examples for every ``print_every``-th batch.

    Returns:
        None. The function only prints.
    """
    model.eval()
    pad_token_id = tokenizer.pad_token_id

    for batch_idx, batch in enumerate(tqdm(dataset, desc="Generation: ")):
        input_ids = batch['input_ids'].to(device)
        labels = batch['labels'].to(device)

        ground_truths = [convert_label_to_string(label, tokenizer) for label in labels]

        # Prompt = every position that is excluded from the loss (label == -100).
        batch_prompts = [ids[lab == -100] for ids, lab in zip(input_ids, labels)]
        padded_prompts = left_pad(batch_prompts, pad_token_id).to(device)

        # NOTE(review): eos_token_id is set to the pad token id, which presumably relies on
        # the tokenizer using the same token for padding and end-of-sequence — confirm.
        with torch.no_grad():
            generated = model.generate(
                input_ids=padded_prompts,
                attention_mask=(padded_prompts != pad_token_id).long(),
                max_new_tokens=90,
                pad_token_id=pad_token_id,
                eos_token_id=pad_token_id,
                do_sample=True,
                top_p=0.95,
                temperature=0.4,
                num_return_sequences=1
            )

        # Every row starts with the (padded) prompt; keep only what was generated after it.
        prompt_len = padded_prompts.size(1)
        responses = [gen[prompt_len:] for gen in generated]

        if (batch_idx + 1) % print_every == 0:
            print_batch_debug(padded_prompts, responses, ground_truths, tokenizer, N=3)


### Actual Evaluation

In [None]:
def _collect_generations(model, tokenizer, dataloader, device):
    """Sample one response per example; return decoded (prompts, predictions, references)."""
    pad_token_id = tokenizer.pad_token_id

    all_prompts:    list[str] = []
    all_preds:      list[str] = []
    all_references: list[str] = []

    with torch.no_grad():
        for batch in tqdm(dataloader, desc="Generating & Collecting"):
            input_ids = batch['input_ids'].to(device)
            labels    = batch['labels'].to(device)

            # Prompt = every position that is excluded from the loss (label == -100).
            batch_prompts_ids = [ids[lab == -100] for ids, lab in zip(input_ids, labels)]
            padded_prompts = left_pad(batch_prompts_ids, pad_token_id).to(device)

            all_prompts.extend(
                tokenizer.batch_decode(padded_prompts, skip_special_tokens=True)
            )
            all_references.extend(
                convert_label_to_string(label, tokenizer) for label in labels
            )

            # NOTE(review): eos_token_id is set to the pad token id, which presumably relies
            # on the tokenizer using the same token for padding and end-of-sequence — confirm.
            generated = model.generate(
                input_ids=padded_prompts,
                attention_mask=(padded_prompts != pad_token_id).long(),
                max_new_tokens=90,
                pad_token_id=pad_token_id,
                eos_token_id=pad_token_id,
                do_sample=True,
                top_p=0.95,
                temperature=0.4,
                return_dict_in_generate=True,
                output_scores=False
            )

            # Drop the (padded) prompt prefix so only the generated response is decoded.
            gen_tokens = generated.sequences[:, padded_prompts.size(1):]
            all_preds.extend(
                tokenizer.batch_decode(gen_tokens, skip_special_tokens=True)
            )

    return all_prompts, all_preds, all_references


def _mean_rowwise_cosine(left, right):
    """Mean cosine similarity between matching rows of two 2-D embedding tensors."""
    # Only pair i-with-i is needed, so avoid building the full N x N similarity matrix.
    sims = torch.nn.functional.cosine_similarity(left, right, dim=1)
    return float(np.mean(sims.cpu().numpy()))


def evaluate_chat_model(model, tokenizer, dataloader, device=None):
    """Generate responses for every example, then score them all in one pass.

    Generation happens first (one sampled response per prompt); afterwards METEOR,
    BERTScore and sentence-embedding similarities are computed over the complete
    lists, which is faster than scoring batch by batch.

    Args:
        model: Model exposing ``to``, ``eval`` and ``generate``.
        tokenizer: Provides ``pad_token_id``, ``decode`` and ``batch_decode``.
        dataloader: Iterable of batches with tensor entries ``'input_ids'`` and
            ``'labels'``; positions labelled -100 are treated as the prompt.
        device: Target device; defaults to ``'cuda'`` when available, else ``'cpu'``.

    Returns:
        dict with keys ``"meteor"``, ``"bertscore_f1"``, ``"semantic_similarity"``
        (mean cosine between prediction and reference embeddings),
        ``"prompt_alignment"`` (mean prompt/prediction cosine, mean prompt/reference
        cosine, and their ratio), ``"predictions"`` and ``"references"``.
    """
    if device is None:
        device = 'cuda' if torch.cuda.is_available() else 'cpu'
    model = model.to(device)
    model.eval()

    # Load every scorer up front so a missing metric dependency fails before the
    # long generation loop rather than after it.
    meteor = load("meteor")
    bertscore = load("bertscore")
    semantic_model = SentenceTransformer(
        'nickprock/sentence-bert-base-italian-uncased',
        device='cpu'    # cuda already full
    )

    all_prompts, all_preds, all_references = _collect_generations(
        model, tokenizer, dataloader, device
    )

    # METEOR
    avg_meteor = meteor.compute(
        predictions=all_preds,
        references=all_references
    )["meteor"]

    # BERTScore
    bert_res = bertscore.compute(
        predictions=all_preds,
        references=all_references,
        lang="it",
        model_type="dbmdz/bert-base-italian-xxl-cased",
        num_layers=12,
        device="cpu",
        batch_size=16,
        rescale_with_baseline=False
    )
    avg_bertscore_f1 = float(np.mean(bert_res["f1"]))

    # Sentence-embedding similarities
    encode_kwargs = dict(
        convert_to_tensor=True,
        normalize_embeddings=True,
        batch_size=64,
        show_progress_bar=True,
    )
    prompt_embeds = semantic_model.encode(all_prompts, **encode_kwargs)
    pred_embeds = semantic_model.encode(all_preds, **encode_kwargs)
    ref_embeds = semantic_model.encode(all_references, **encode_kwargs)

    avg_semantic_similarity = _mean_rowwise_cosine(pred_embeds, ref_embeds)
    avg_prompt_pred = _mean_rowwise_cosine(prompt_embeds, pred_embeds)
    avg_prompt_ref = _mean_rowwise_cosine(prompt_embeds, ref_embeds)
    # The tiny epsilon keeps the ratio finite if the reference alignment is exactly 0.
    prompt_alignment_ratio = avg_prompt_pred / (avg_prompt_ref + 1e-12)

    return {
        "meteor": avg_meteor,
        "bertscore_f1": avg_bertscore_f1,
        "semantic_similarity": avg_semantic_similarity,
        "prompt_alignment": {
            "predicted": avg_prompt_pred,
            "ground_truth": avg_prompt_ref,
            "ratio": prompt_alignment_ratio
        },
        "predictions": all_preds,
        "references": all_references
    }


In [None]:
# `PeftModel.from_pretrained` injects the LoRA layers into `base_model` in place, so the
# adapter has to be switched off explicitly to evaluate the un-finetuned baseline.
with finetuned_model.disable_adapter():
    results = evaluate_chat_model(base_model, tokenizer, test_dataloader)

print(f"""
    Baseline Evaluation Results:
    - METEOR: {results['meteor']:.3f} (0-1, higher=better)
    - BERTScore F1: {results['bertscore_f1']:.3f} (0-1, higher=better)
    - Semantic Similarity: {results['semantic_similarity']:.3f} (0-1 cosine)
    Prompt Alignment:
    - Model Responses: {results['prompt_alignment']['predicted']:.3f}
    - Ground Truth: {results['prompt_alignment']['ground_truth']:.3f}
    - Alignment Ratio: {results['prompt_alignment']['ratio']:.1%}
""")

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
Generating & Collecting: 100%|██████████| 69/69 [21:14<00:00, 18.47s/it]


Batches:   0%|          | 0/18 [00:00<?, ?it/s]

Batches:   0%|          | 0/18 [00:00<?, ?it/s]

Batches:   0%|          | 0/18 [00:00<?, ?it/s]


    Baseline Evaluation Results:
    - METEOR: 0.011 (0-1, higher=better)
    - BERTScore F1: 0.275 (0-1, higher=better)
    - Semantic Similarity: 0.220 (0-1 cosine)
    Prompt Alignment:
    - Model Responses: 0.638
    - Ground Truth: 0.245
    - Alignment Ratio: 260.8%



In [None]:
# NOTE(review): the title below assumes `finetuned_model` currently holds the r=32
# adapter; the loading cells only show a single adapter path — confirm.
results = evaluate_chat_model(model=finetuned_model, tokenizer=tokenizer, dataloader=test_dataloader)

print(f"""
    Finetuned model with r=32 Evaluation Results:
    - METEOR: {results['meteor']:.3f} (0-1, higher=better)
    - BERTScore F1: {results['bertscore_f1']:.3f} (0-1, higher=better)
    - Semantic Similarity: {results['semantic_similarity']:.3f} (0-1 cosine)
    Prompt Alignment:
    - Model Responses: {results['prompt_alignment']['predicted']:.3f}
    - Ground Truth: {results['prompt_alignment']['ground_truth']:.3f}
    - Alignment Ratio: {results['prompt_alignment']['ratio']:.1%}
""")

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
Generating & Collecting: 100%|██████████| 69/69 [17:26<00:00, 15.17s/it]


Batches:   0%|          | 0/35 [00:00<?, ?it/s]

Batches:   0%|          | 0/35 [00:00<?, ?it/s]

Batches:   0%|          | 0/35 [00:00<?, ?it/s]


    Finetuned model with r=32 Evaluation Results:
    - METEOR: 0.027 (0-1, higher=better)
    - BERTScore F1: 0.409 (0-1, higher=better)
    - Semantic Similarity: 0.281 (0-1 cosine)
    Prompt Alignment:
    - Model Responses: 0.251
    - Ground Truth: 0.245
    - Alignment Ratio: 102.7%



In [None]:
# NOTE(review): the title below assumes `finetuned_model` currently holds the r=16
# adapter; the loading cells only show a single adapter path — confirm.
results = evaluate_chat_model(model=finetuned_model, tokenizer=tokenizer, dataloader=test_dataloader)

print(f"""
    Finetuned model with r=16 Evaluation Results:
    - METEOR: {results['meteor']:.3f} (0-1, higher=better)
    - BERTScore F1: {results['bertscore_f1']:.3f} (0-1, higher=better)
    - Semantic Similarity: {results['semantic_similarity']:.3f} (0-1 cosine)
    Prompt Alignment:
    - Model Responses: {results['prompt_alignment']['predicted']:.3f}
    - Ground Truth: {results['prompt_alignment']['ground_truth']:.3f}
    - Alignment Ratio: {results['prompt_alignment']['ratio']:.1%}
""")

Downloading builder script:   0%|          | 0.00/7.02k [00:00<?, ?B/s]

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


Downloading builder script:   0%|          | 0.00/7.95k [00:00<?, ?B/s]

Generating & Collecting: 100%|██████████| 69/69 [16:16<00:00, 14.16s/it]


tokenizer_config.json:   0%|          | 0.00/59.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/433 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/235k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/445M [00:00<?, ?B/s]

Batches:   0%|          | 0/18 [00:00<?, ?it/s]

Batches:   0%|          | 0/18 [00:00<?, ?it/s]

Batches:   0%|          | 0/18 [00:00<?, ?it/s]


    Finetuned model with r=16 Evaluation Results:
    - METEOR: 0.028 (0-1, higher=better)
    - BERTScore F1: 0.421 (0-1, higher=better)
    - Semantic Similarity: 0.278 (0-1 cosine)
    Prompt Alignment:
    - Model Responses: 0.249
    - Ground Truth: 0.245
    - Alignment Ratio: 102.0%



In [None]:
# NOTE(review): the title below assumes `finetuned_model` currently holds the
# r=64 / alpha=128 adapter; the loading cells only show a single adapter path — confirm.
results = evaluate_chat_model(model=finetuned_model, tokenizer=tokenizer, dataloader=test_dataloader)

print(f"""
    Finetuned model with r=64 alpha=128 Evaluation Results:
    - METEOR: {results['meteor']:.3f} (0-1, higher=better)
    - BERTScore F1: {results['bertscore_f1']:.3f} (0-1, higher=better)
    - Semantic Similarity: {results['semantic_similarity']:.3f} (0-1 cosine)
    Prompt Alignment:
    - Model Responses: {results['prompt_alignment']['predicted']:.3f}
    - Ground Truth: {results['prompt_alignment']['ground_truth']:.3f}
    - Alignment Ratio: {results['prompt_alignment']['ratio']:.1%}
""")

Downloading builder script:   0%|          | 0.00/7.02k [00:00<?, ?B/s]

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...


Downloading builder script:   0%|          | 0.00/7.95k [00:00<?, ?B/s]

modules.json:   0%|          | 0.00/229 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/118 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/5.02k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/637 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/432 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/243k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/732k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/125 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Generating & Collecting: 100%|██████████| 69/69 [20:53<00:00, 18.17s/it]


tokenizer_config.json:   0%|          | 0.00/59.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/433 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/235k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/445M [00:00<?, ?B/s]

Batches:   0%|          | 0/18 [00:00<?, ?it/s]

Batches:   0%|          | 0/18 [00:00<?, ?it/s]

Batches:   0%|          | 0/18 [00:00<?, ?it/s]


    Finetuned model with r=64 Evaluation Results:
    - METEOR: 0.029 (0-1, higher=better)
    - BERTScore F1: 0.400 (0-1, higher=better)
    - Semantic Similarity: 0.271 (0-1 cosine)
    Prompt Alignment:
    - Model Responses: 0.254
    - Ground Truth: 0.245
    - Alignment Ratio: 103.9%



In [None]:
# NOTE(review): the title below assumes `finetuned_model` currently holds the
# r=64 / alpha=64 adapter; the loading cells only show a single adapter path — confirm.
results = evaluate_chat_model(model=finetuned_model, tokenizer=tokenizer, dataloader=test_dataloader)

print(f"""
    Finetuned model with r=64 and alpha=64 Evaluation Results:
    - METEOR: {results['meteor']:.3f} (0-1, higher=better)
    - BERTScore F1: {results['bertscore_f1']:.3f} (0-1, higher=better)
    - Semantic Similarity: {results['semantic_similarity']:.3f} (0-1 cosine)
    Prompt Alignment:
    - Model Responses: {results['prompt_alignment']['predicted']:.3f}
    - Ground Truth: {results['prompt_alignment']['ground_truth']:.3f}
    - Alignment Ratio: {results['prompt_alignment']['ratio']:.1%}
""")

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
Generating & Collecting: 100%|██████████| 69/69 [14:37<00:00, 12.72s/it]


tokenizer_config.json:   0%|          | 0.00/59.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/433 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/235k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/445M [00:00<?, ?B/s]

Batches:   0%|          | 0/18 [00:00<?, ?it/s]

Batches:   0%|          | 0/18 [00:00<?, ?it/s]

Batches:   0%|          | 0/18 [00:00<?, ?it/s]


    Finetuned model with r=64 and alpha=64 Evaluation Results:
    - METEOR: 0.024 (0-1, higher=better)
    - BERTScore F1: 0.411 (0-1, higher=better)
    - Semantic Similarity: 0.267 (0-1 cosine)
    Prompt Alignment:
    - Model Responses: 0.244
    - Ground Truth: 0.245
    - Alignment Ratio: 99.8%



#### Best Model

In [None]:
# NOTE(review): the title below assumes `finetuned_model` currently holds the
# r=64 / alpha=32 adapter; the loading cells only show a single adapter path — confirm.
results = evaluate_chat_model(model=finetuned_model, tokenizer=tokenizer, dataloader=test_dataloader)

print(f"""
    Finetuned model with r=64 and alpha=32 Evaluation Results:
    - METEOR: {results['meteor']:.3f} (0-1, higher=better)
    - BERTScore F1: {results['bertscore_f1']:.3f} (0-1, higher=better)
    - Semantic Similarity: {results['semantic_similarity']:.3f} (0-1 cosine)
    Prompt Alignment:
    - Model Responses: {results['prompt_alignment']['predicted']:.3f}
    - Ground Truth: {results['prompt_alignment']['ground_truth']:.3f}
    - Alignment Ratio: {results['prompt_alignment']['ratio']:.1%}
""")

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
Generating & Collecting: 100%|██████████| 69/69 [18:31<00:00, 16.11s/it]


tokenizer_config.json:   0%|          | 0.00/59.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/433 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/235k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/445M [00:00<?, ?B/s]

Batches:   0%|          | 0/18 [00:00<?, ?it/s]

Batches:   0%|          | 0/18 [00:00<?, ?it/s]

Batches:   0%|          | 0/18 [00:00<?, ?it/s]


    Finetuned model with r=64 Evaluation Results:
    - METEOR: 0.033 (0-1, higher=better)
    - BERTScore F1: 0.414 (0-1, higher=better)
    - Semantic Similarity: 0.285 (0-1 cosine)
    Prompt Alignment:
    - Model Responses: 0.256
    - Ground Truth: 0.245
    - Alignment Ratio: 104.5%

