In [None]:
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
from peft import PeftModel, PeftConfig
import torch
from accelerate import infer_auto_device_map, dispatch_model
from datasets import load_from_disk
from tqdm import tqdm
import copy
from torch.utils.data import Dataset, DataLoader
from evaluate import load
import numpy as np
from sentence_transformers import SentenceTransformer, util


# Colab-only imports: `drive` is used right here to mount Google Drive, and
# `userdata` is used in a later cell to read the `HF_TOKEN` value.
from google.colab import drive, userdata
# Mount point for Google Drive; the adapter, offload and dataset paths used
# later in this notebook all live under /content/drive.
drive.mount('/content/drive')

In [None]:
!pip install -U bitsandbytes

In [None]:
!pip install evaluate
!pip install sentence_transformers
!pip install bert_score

In [None]:
from huggingface_hub import login

# Fetch the token stored under 'HF_TOKEN' through google.colab's `userdata`
# and authenticate this session against the Hugging Face Hub with it.
hf_token = userdata.get('HF_TOKEN')
login(token=hf_token)

### Loading model and data

In [None]:
# Identifier of the base checkpoint passed to `from_pretrained` for both the
# tokenizer and the model.
base_model_id = "deepseek-ai/DeepSeek-R1-Distill-Qwen-7B"
# Alternative relative path kept for reference:
#adapter_path = "./francesco_lora/checkpoint-675"
# NOTE(review): the two Drive paths below use "My Drive" while the dataset cell
# further down uses "MyDrive" — confirm both spellings resolve on the mounted drive.
adapter_path = "/content/drive/My Drive/Digital-Self-Replica/francesco_lora/checkpoint-675"
# Alternative relative path kept for reference:
# offload_dir = "./offload"
offload_dir = "/content/drive/My Drive/Digital-Self-Replica/offload"


tokenizer = AutoTokenizer.from_pretrained(base_model_id)

# 4-bit NF4 quantization with double quantization; float16 is used as the compute dtype.
quantization_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_compute_dtype=torch.float16,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=True,
)

# loading model on CPU first for mapping
# NOTE(review): the stated intent is to load on CPU and place the weights afterwards;
# confirm that `device_map=None` really keeps the model on CPU when a bitsandbytes
# `quantization_config` is passed.
base_model = AutoModelForCausalLM.from_pretrained(
    base_model_id,
    torch_dtype=torch.float16,
    device_map=None,                                            # important: don't use "auto" yet
    low_cpu_mem_usage=True,
    quantization_config=quantization_config
)

# get device map: let accelerate decide module placement under the given
# per-device memory budgets (GPU 0 and CPU).
device_map = infer_auto_device_map(
    base_model,
    max_memory={0: "20GiB", "cpu": "28GiB"},  # adjust GPU memory to your GPU (e.g., 12, 24 GiB)
)

# dispatch: apply the computed placement; `offload_dir` is handed to accelerate
# as the offload location.
base_model = dispatch_model(base_model, device_map=device_map, offload_dir=offload_dir)

#base_model_copy = copy.deepcopy(base_model)

# loading LoRA adapter
# NOTE(review): PEFT is understood to insert the adapter layers into the wrapped model
# in place, so after this call `base_model` may no longer behave like the untuned
# model — confirm before using `base_model` as a baseline.
finetuned_model = PeftModel.from_pretrained(base_model, adapter_path)

In [None]:
class ChatDataset(Dataset):
    """Map-style dataset exposing pre-tokenized examples as ``torch.long`` tensors.

    ``encodings`` must support ``encodings[column][idx]`` for the columns
    ``"input_ids"``, ``"attention_mask"`` and ``"labels"``.
    """

    def __init__(self, encodings):
        # Kept as-is; columns are looked up again on every item access.
        self.encodings = encodings

    def __len__(self):
        # The number of examples is taken from the "input_ids" column.
        input_id_column = self.encodings["input_ids"]
        return len(input_id_column)

    def __getitem__(self, idx):
        """Return example ``idx`` as a dict of three int64 tensors."""
        return {
            column: torch.tensor(self.encodings[column][idx], dtype=torch.long)
            for column in ("input_ids", "attention_mask", "labels")
        }
        
# Load the pre-tokenized test split from Drive.
tokenized_test = load_from_disk('/content/drive/MyDrive/Digital-Self-Replica/datasets/tokenized_test')
# tokenized_test = load_from_disk('./datasets/tokenized_test')

# Keep the Dataset and the DataLoader under separate names so `test_dataloader`
# never refers to two different kinds of object.
test_dataset = ChatDataset(tokenized_test)
# shuffle=False keeps the evaluation order fixed across runs.
test_dataloader = DataLoader(test_dataset, batch_size=16, shuffle=False)

### Perplexity

In [None]:
def calculate_perplexity(model, dataset, tokenizer, device="cuda"):
    """Compute the token-level perplexity of ``model`` over all batches in ``dataset``.

    Each batch's ``outputs.loss`` is weighted by the number of label tokens that
    are actually scored, the weighted mean is taken over the whole dataset, and
    its exponential is returned.

    Args:
        model: model called as ``model(input_ids=..., attention_mask=..., labels=...)``
            whose output has a ``loss`` attribute. Assumed to be a Hugging Face
            causal LM, i.e. ``loss`` is the mean cross-entropy over shifted labels.
        dataset: iterable of batches with ``'input_ids'``, ``'attention_mask'``
            and ``'labels'`` tensors (labels use ``-100`` for ignored positions).
        tokenizer: unused; kept so existing call sites keep working.
        device: device the model and the batch tensors are moved to.

    Returns:
        float: the perplexity, or ``nan`` when no label token was scored
        (empty dataset or every label masked).
    """
    # NOTE(review): `.to()` on a quantized and/or accelerate-dispatched model may warn
    # or be rejected depending on library versions — confirm for the models passed in.
    model = model.to(device)
    model.eval()
    total_loss = 0.0
    total_tokens_in_loss = 0

    with torch.no_grad():
        for batch in tqdm(dataset, desc="Calculating Perplexity"):
            labels = batch['labels'].to(device)

            # A causal LM predicts token t+1 from token t, so the label at
            # position 0 never contributes to the loss. Count only shifted labels
            # so the weight matches what the mean loss was averaged over.
            num_tokens = (labels[..., 1:] != -100).sum().item()

            # Nothing is scored in this batch: the loss would be NaN, and the
            # forward pass can be skipped entirely.
            if num_tokens == 0:
                continue

            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)

            outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)

            total_loss += outputs.loss.item() * num_tokens
            total_tokens_in_loss += num_tokens

    # Perplexity is undefined without any scored token; avoid ZeroDivisionError.
    if total_tokens_in_loss == 0:
        return float("nan")

    avg_loss = total_loss / total_tokens_in_loss
    # A Python float needs no round trip through the GPU to be exponentiated.
    return torch.exp(torch.tensor(avg_loss)).item()

In [None]:
# Perplexity of the LoRA-adapted model on the test dataloader.
perplexity = calculate_perplexity(
    model=finetuned_model,
    dataset=test_dataloader,
    tokenizer=tokenizer,
)
print(f"Perplexity of Fine-Tuned Model: {perplexity}")

In [None]:
# `finetuned_model` was built with PeftModel.from_pretrained(base_model, ...), which
# inserts the LoRA layers into `base_model` in place. Calling `base_model` directly
# therefore still runs the adapter, so switch the adapter off while measuring the baseline.
with finetuned_model.disable_adapter():
    perplexity = calculate_perplexity(base_model, test_dataloader, tokenizer)
print(f"Perplexity of Base Model: {perplexity}")

### Generation analysis

In [None]:
# pad sequences to same length on the left
def left_pad(sequences, pad_value):
    """Stack 1-D tensors into one 2-D tensor, padding shorter ones on the left.

    Every sequence is prefixed with ``pad_value`` until it is as long as the
    longest one; the padding keeps each sequence's own dtype and device.
    """
    target_len = max(item.size(0) for item in sequences)
    rows = [
        torch.cat([item.new_full((target_len - item.size(0),), pad_value), item])
        for item in sequences
    ]
    return torch.stack(rows)



def convert_label_to_string(label, tokenizer, skip_special_tokens=True):
    """Decode the entries of ``label`` that are not ``-100`` into a string.

    ``-100`` marks positions without a target; the remaining ids are passed to
    ``tokenizer.decode`` together with ``skip_special_tokens``.
    """
    keep_mask = label != -100
    kept_ids = label[keep_mask].tolist()
    return tokenizer.decode(kept_ids, skip_special_tokens=skip_special_tokens)



# function to print N prompts, responses and ground truths, given a batch of prompts, responses, and ground truths

def print_batch_debug(batch_prompts, responses, ground_truths, tokenizer, N=3):
    """Print up to ``N`` examples of a batch for manual inspection.

    For each shown example the decoded prompt, the decoded generated response
    and the reference text are printed. ``batch_prompts`` and ``responses``
    hold token-id sequences exposing ``.tolist()``; ``ground_truths`` already
    holds strings.
    """
    for idx in range(min(N, len(batch_prompts))):
        # Token ids -> text; special tokens are dropped by the tokenizer.
        prompt_txt = tokenizer.decode(batch_prompts[idx].tolist(), skip_special_tokens=True)
        gen_txt = tokenizer.decode(responses[idx].tolist(), skip_special_tokens=True)
        # The reference is already text.
        gt_txt = ground_truths[idx]

        print(f"{'-'*10} Example {idx+1} {'-'*10}")
        print(f"Prompt:\n{prompt_txt}")
        print(f"\nGenerated:    {gen_txt}")
        print(f"Ground Truth: {gt_txt}")
        print()


# function to print prompts, responses, and ground truths


def analyze_generation(model, dataset, tokenizer, device="cuda", print_every=10):
    """Sample responses from ``model`` and print a few of them for inspection.

    On every ``print_every``-th batch, the prompt of each example is recovered
    as the ``input_ids`` positions whose label is ``-100``, the prompts are
    left-padded into one tensor, up to 40 new tokens are sampled, and the first
    three prompts are printed with the generated and the reference response.
    Batches that are not printed are skipped, since nothing generated for them
    would be used.

    Args:
        model: object exposing ``eval()`` and ``generate(...)``; it is not moved
            here, so it is expected to already live on ``device``.
        dataset: iterable of batches with ``'input_ids'`` and ``'labels'`` tensors.
        tokenizer: provides ``pad_token_id`` and ``decode``.
        device: device the batch tensors are moved to.
        print_every: print samples for every ``print_every``-th batch.

    Returns:
        None; all output is printed.
    """
    model.eval()
    pad_token_id = tokenizer.pad_token_id

    for batch_idx, batch in enumerate(tqdm(dataset, desc="Analyzing generation")):
        # Only every `print_every`-th batch is shown; generating for the others
        # would be discarded work.
        if (batch_idx + 1) % print_every != 0:
            continue

        input_ids = batch['input_ids'].to(device)
        labels = batch['labels'].to(device)

        # Reference responses: the label positions that are not masked out.
        ground_truths = [convert_label_to_string(label, tokenizer) for label in labels]

        # Prompt tokens: the positions excluded from the loss (label == -100).
        # NOTE(review): if the stored examples are padded with label -100, those
        # pad positions are selected here as well — confirm against the dataset.
        batch_prompts = [
            input_ids[i][labels[i] == -100] for i in range(input_ids.size(0))
        ]
        padded_prompts = left_pad(batch_prompts, pad_token_id).to(device)

        # generating responses
        with torch.no_grad():
            generated = model.generate(
                input_ids=padded_prompts,
                # NOTE(review): the mask is derived by value, so any genuine
                # prompt token equal to pad_token_id is masked too — confirm.
                attention_mask=(padded_prompts != pad_token_id).long(),
                max_new_tokens=40,
                pad_token_id=pad_token_id,
                eos_token_id=pad_token_id,
                do_sample=True,
                top_p=0.95,
                temperature=0.4,
                num_return_sequences=1
            )

        # Everything after the (padded) prompt length is newly generated text.
        prompt_len = padded_prompts.size(1)
        responses = [sequence[prompt_len:] for sequence in generated]

        # Pass the unpadded prompts, which is what print_batch_debug documents it expects.
        print_batch_debug(batch_prompts, responses, ground_truths, tokenizer, N=3)


In [None]:
# Print sample generations of the LoRA-adapted model on the test dataloader.
analyze_generation(
    model=finetuned_model,
    dataset=test_dataloader,
    tokenizer=tokenizer,
)

### Actual Evaluation

In [None]:
# function to evaluate the model

def evaluate_chat_model(model, tokenizer, dataloader):
    """Generate a response per example and score it batch by batch.

    NOTE: a later cell of this notebook defines another ``evaluate_chat_model``;
    when the cells are run top to bottom, that later definition replaces this one.

    For every batch the prompt is taken as the ``input_ids`` positions whose
    label is ``-100``, up to 40 new tokens are sampled, and the decoded
    responses are compared with the references decoded from ``labels`` using
    METEOR, BERTScore F1 and sentence-embedding cosine similarity.

    Args:
        model: object exposing ``eval()`` and ``generate(...)``; it is not moved
            here, so it is expected to already be on the device chosen below.
        tokenizer: provides ``pad_token_id``, ``decode`` and ``batch_decode``.
        dataloader: iterable of batches with ``'input_ids'`` and ``'labels'`` tensors.

    Returns:
        dict with the keys ``'meteor'``, ``'bertscore_f1'``,
        ``'semantic_similarity'``, ``'prompt_alignment'`` (a dict with
        ``'predicted'``, ``'ground_truth'`` and ``'ratio'``), ``'predictions'``
        and ``'references'``. The last two hold the texts of *all* examples.
    """
    model.eval()
    results = {
        'meteor': [],
        'bertscore_f1': [],
        'semantic_similarity': [],
        'prompt_ground_truth_similarity': [],
        'prompt_pred_similarity': []
    }
    # METEOR is computed per batch; remember the batch sizes so a smaller final
    # batch does not get the same weight as a full one.
    batch_sizes = []
    # Collected across batches so the returned texts cover the whole dataloader
    # (and exist even when the dataloader is empty).
    all_predictions = []
    all_references = []

    device = 'cuda' if torch.cuda.is_available() else 'cpu'

    bertscore = load("bertscore")
    meteor = load("meteor")

    #semantic_model = SentenceTransformer('paraphrase-MiniLM-L6-v2')
    semantic_model = SentenceTransformer(
        'nickprock/sentence-bert-base-italian-uncased',
        device='cpu'
    )

    pad_token_id = tokenizer.pad_token_id

    with torch.no_grad():
        for batch in tqdm(dataloader, desc="Evaluating"):

            input_ids = batch['input_ids'].to(device)
            labels = batch['labels'].to(device)

            # getting the prompts only (no responses)
            batch_prompts = [
                input_ids[i][labels[i] == -100] for i in range(input_ids.size(0))
            ]
            padded_prompts = left_pad(batch_prompts, pad_token_id).to(device)

            # Extract ground truth responses (excluding -100 labels)
            ground_truths = [convert_label_to_string(label, tokenizer) for label in labels]

            # --- Generate Responses ---
            generated = model.generate(
                input_ids=padded_prompts,
                attention_mask=(padded_prompts != pad_token_id).long(),
                max_new_tokens=40,
                pad_token_id=pad_token_id,
                eos_token_id=pad_token_id,
                do_sample=True,
                top_p=0.95,
                temperature=0.4,
                return_dict_in_generate=True,
                output_scores=False,   # per-step scores were never read
            )

            sequences = generated.sequences             # (B, prompt_length + response_length)
            prompt_length = padded_prompts.size(1)
            responses_tokens = sequences[:, prompt_length:]
            pred_responses = tokenizer.batch_decode(responses_tokens, skip_special_tokens=True)

            all_predictions.extend(pred_responses)
            all_references.extend(ground_truths)
            batch_sizes.append(len(pred_responses))

            # --- Calculate Metrics ---
            # METEOR
            meteor_results = meteor.compute(predictions=pred_responses, references=ground_truths)
            results['meteor'].append(meteor_results['meteor'])

            # BERTScore
            bert_results = bertscore.compute(
                predictions=pred_responses,
                references=ground_truths,
                lang='it',
                model_type='dbmdz/bert-base-italian-xxl-cased',
                num_layers=12,
                device = 'cpu',
                verbose = True,
            )
            results['bertscore_f1'].extend(bert_results['f1'])

            # Semantic Similarity
            prompt_texts = tokenizer.batch_decode(padded_prompts, skip_special_tokens=True)

            pred_embeddings = semantic_model.encode(
                pred_responses,
                convert_to_tensor=True,
                normalize_embeddings = True,
                show_progress_bar=False
            )

            gt_embeddings = semantic_model.encode(
                ground_truths,
                convert_to_tensor = True,
                normalize_embeddings = True,
                show_progress_bar=False
            )

            prompt_embeddings = semantic_model.encode(
                prompt_texts,
                convert_to_tensor = True,
                normalize_embeddings = True,
                show_progress_bar=False
            )

            # Diagonal of the pairwise matrix = similarity of each example with
            # its own counterpart.
            results['semantic_similarity'].extend(util.cos_sim(pred_embeddings, gt_embeddings).diag().cpu().numpy())
            results['prompt_pred_similarity'].extend(util.cos_sim(prompt_embeddings, pred_embeddings).diag().cpu().numpy())
            results['prompt_ground_truth_similarity'].extend(util.cos_sim(prompt_embeddings, gt_embeddings).diag().cpu().numpy())

    # Aggregate results
    # Weighting the per-batch METEOR means by batch size gives the per-example mean.
    if batch_sizes:
        meteor_mean = np.average(results['meteor'], weights=batch_sizes)
    else:
        meteor_mean = float('nan')

    prompt_pred_mean = np.mean(results['prompt_pred_similarity'])
    prompt_gt_mean = np.mean(results['prompt_ground_truth_similarity'])

    return {
        'meteor': meteor_mean,
        'bertscore_f1': np.mean(results['bertscore_f1']),
        'semantic_similarity': np.mean(results['semantic_similarity']),
        'prompt_alignment': {
            'predicted': prompt_pred_mean,
            'ground_truth': prompt_gt_mean,
            'ratio': prompt_pred_mean / prompt_gt_mean
        },
        'predictions': all_predictions,
        'references': all_references
    }

In [None]:
import torch
import numpy as np
from tqdm import tqdm
# `datasets.load_metric` no longer exists in datasets >= 3.0; the `evaluate`
# package (installed earlier in this notebook) provides the metric loader instead.
from evaluate import load
from sentence_transformers import SentenceTransformer, util

def evaluate_chat_model(model, tokenizer, dataloader, device=None):
    """Generate a response for every example, then score all of them at once.

    1) For each batch, generate and append prompts / predictions / references
       to Python lists.
    2) After the loop, score everything in one shot (METEOR once, BERTScore
       once, sentence embeddings once).

    Args:
        model: object exposing ``to()``, ``eval()`` and ``generate(...)``.
        tokenizer: provides ``pad_token_id``, ``decode`` and ``batch_decode``.
        dataloader: iterable of batches with ``'input_ids'`` and ``'labels'``
            tensors; prompt tokens are the positions whose label is ``-100``.
        device: device for the model and the batch tensors; defaults to
            ``'cuda'`` when available, else ``'cpu'``.

    Returns:
        dict with the keys ``"meteor"``, ``"bertscore_f1"``,
        ``"semantic_similarity"``, ``"prompt_alignment"`` (a dict with
        ``"predicted"``, ``"ground_truth"`` and ``"ratio"``), ``"predictions"``
        and ``"references"``.

    Raises:
        ValueError: if ``dataloader`` yields no examples.
    """

    if device is None:
        device = 'cuda' if torch.cuda.is_available() else 'cpu'
    # NOTE(review): `.to()` on a quantized and/or accelerate-dispatched model may warn
    # or be rejected depending on library versions — confirm for the models passed in.
    model = model.to(device)
    model.eval()

    # Load the metrics once.
    meteor = load("meteor")
    bertscore = load("bertscore")

    # The sentence encoder stays on CPU; it is run once at the end with an
    # explicit batch_size.
    semantic_model = SentenceTransformer(
        'nickprock/sentence-bert-base-italian-uncased',
        device='cpu'
    )

    all_prompts:    list[str] = []
    all_preds:      list[str] = []
    all_references: list[str] = []

    pad_token_id = tokenizer.pad_token_id

    # 1) GENERATION LOOP: just collect strings
    with torch.no_grad():
        for batch in tqdm(dataloader, desc="Generating & Collecting"):
            input_ids = batch['input_ids'].to(device)
            labels    = batch['labels'].to(device)

            # a) prompt tokens are the positions excluded from the loss (label == -100)
            batch_prompts_ids = [
                input_ids[i][labels[i] == -100] for i in range(input_ids.size(0))
            ]

            # b) left-pad the prompts so they form one (B, max_prompt_len) batch
            padded_prompts = left_pad(batch_prompts_ids, pad_token_id).to(device)

            # c) keep the prompt text for the prompt-alignment scores
            all_prompts.extend(
                tokenizer.batch_decode(padded_prompts, skip_special_tokens=True)
            )

            # d) reference responses: label tokens that are not -100
            all_references.extend(
                convert_label_to_string(label, tokenizer) for label in labels
            )

            # e) sample the model's responses
            generated = model.generate(
                input_ids=padded_prompts,
                attention_mask=(padded_prompts != pad_token_id).long(),
                max_new_tokens=40,
                pad_token_id=pad_token_id,
                eos_token_id=pad_token_id,
                do_sample=True,
                top_p=0.95,
                temperature=0.4,
                return_dict_in_generate=True,
                output_scores=False   # scores are not needed
            )

            # Everything after the padded prompt length is the response.
            gen_tokens = generated.sequences[:, padded_prompts.size(1):]
            all_preds.extend(
                tokenizer.batch_decode(gen_tokens, skip_special_tokens=True)
            )

    # all_prompts[i], all_preds[i] and all_references[i] now describe example i.
    if not all_preds:
        raise ValueError("dataloader yielded no examples; nothing to evaluate")

    # 2) METEOR (computed once on the full lists)
    meteor_res = meteor.compute(
        predictions=all_preds,
        references=all_references
    )
    avg_meteor = meteor_res["meteor"]

    # 3) BERTScore (computed once; batch_size bounds the per-step memory)
    bert_res = bertscore.compute(
        predictions=all_preds,
        references=all_references,
        lang="it",
        model_type="dbmdz/bert-base-italian-xxl-cased",
        num_layers=12,
        device="cpu",
        batch_size=16,
        rescale_with_baseline=False
    )
    avg_bertscore_f1 = float(np.mean(bert_res["f1"]))

    # 4) SEMANTIC SIMILARITY + PROMPT ALIGNMENT
    # a) encode each list once, in mini-batches of 64
    prompt_embeds = semantic_model.encode(
        all_prompts,
        convert_to_tensor=True,
        normalize_embeddings=True,
        batch_size=64,
        show_progress_bar=True
    )
    pred_embeds = semantic_model.encode(
        all_preds,
        convert_to_tensor=True,
        normalize_embeddings=True,
        batch_size=64,
        show_progress_bar=True
    )
    ref_embeds = semantic_model.encode(
        all_references,
        convert_to_tensor=True,
        normalize_embeddings=True,
        batch_size=64,
        show_progress_bar=True
    )

    # b) Only the similarity of example i with its own counterpart is needed,
    #    so compute it row-wise instead of building an N x N matrix and taking
    #    the diagonal.
    def paired_cosine(left, right):
        return torch.nn.functional.cosine_similarity(left, right, dim=1).cpu().numpy()

    sim_pred_ref = paired_cosine(pred_embeds, ref_embeds)
    sim_prompt_pred = paired_cosine(prompt_embeds, pred_embeds)
    sim_prompt_ref = paired_cosine(prompt_embeds, ref_embeds)

    avg_semantic_similarity = float(np.mean(sim_pred_ref))
    avg_prompt_pred       = float(np.mean(sim_prompt_pred))
    avg_prompt_ref        = float(np.mean(sim_prompt_ref))
    # The small epsilon guards against a zero denominator.
    prompt_alignment_ratio = avg_prompt_pred / (avg_prompt_ref + 1e-12)

    # 5) Return the aggregated metrics plus the collected texts
    return {
        "meteor": avg_meteor,
        "bertscore_f1": avg_bertscore_f1,
        "semantic_similarity": avg_semantic_similarity,
        "prompt_alignment": {
            "predicted": avg_prompt_pred,
            "ground_truth": avg_prompt_ref,
            "ratio": prompt_alignment_ratio
        },
        "predictions": all_preds,
        "references": all_references
    }


In [None]:
# `finetuned_model` was built with PeftModel.from_pretrained(base_model, ...), which
# inserts the LoRA layers into `base_model` in place. Calling `base_model` directly
# therefore still runs the adapter, so switch the adapter off for the baseline run.
with finetuned_model.disable_adapter():
    results = evaluate_chat_model(base_model, tokenizer, test_dataloader)

print(f"""
    Baseline Evaluation Results:
    - METEOR: {results['meteor']:.3f} (0-1, higher=better)
    - BERTScore F1: {results['bertscore_f1']:.3f} (0-1, higher=better)
    - Semantic Similarity: {results['semantic_similarity']:.3f} (0-1 cosine)
    Prompt Alignment:
    - Model Responses: {results['prompt_alignment']['predicted']:.3f}
    - Ground Truth: {results['prompt_alignment']['ground_truth']:.3f}
    - Alignment Ratio: {results['prompt_alignment']['ratio']:.1%}
""")

In [None]:
# Score the LoRA-adapted model on the test dataloader and print the summary.
results = evaluate_chat_model(
    model=finetuned_model,
    tokenizer=tokenizer,
    dataloader=test_dataloader,
)

print(f"""
    Finetuned model Evaluation Results:
    - METEOR: {results['meteor']:.3f} (0-1, higher=better)
    - BERTScore F1: {results['bertscore_f1']:.3f} (0-1, higher=better)
    - Semantic Similarity: {results['semantic_similarity']:.3f} (0-1 cosine)
    Prompt Alignment:
    - Model Responses: {results['prompt_alignment']['predicted']:.3f}
    - Ground Truth: {results['prompt_alignment']['ground_truth']:.3f}
    - Alignment Ratio: {results['prompt_alignment']['ratio']:.1%}
""")