In [None]:
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM
from datasets import load_dataset
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

# Hub identifier of the reference (non-fine-tuned) model; the script also loads
# the tokenizer from this name.
base_model_name = "microsoft/phi-4-mini-instruct"
# Local directory holding the fine-tuned checkpoint that is compared to the base model.
fine_tuned_model_path = "C:/users/emir/models/mymodel"
# JSON-lines hold-out file with the dialogs to evaluate on.
# NOTE(review): this looks like a placeholder path — point it at the real file before running.
data_path = "path/to/personachat_holdout.jsonl"

def _get_sentence_embedding(model, tokenizer, text, device):
    """Return a mean-pooled final-layer embedding of *text* as a NumPy array.

    The model is called with ``output_hidden_states=True`` and the last entry
    of ``outputs.hidden_states`` is averaged over dimension 1 (the token axis
    for the usual ``(batch, seq, hidden)`` layout). Causal-LM outputs do not
    carry a ``last_hidden_state`` attribute, so the hidden states have to be
    requested explicitly.
    """
    inputs = tokenizer(text, return_tensors="pt").to(device)
    with torch.no_grad():
        outputs = model(**inputs, output_hidden_states=True)
    final_layer = outputs.hidden_states[-1]
    # .float() keeps the NumPy conversion working for half/bfloat16 checkpoints.
    return final_layer.mean(dim=1).float().cpu().numpy()


def calculate_cosine_similarity(base_model, fine_tuned_model, tokenizer, dataset, device="cuda", max_length=50):
    """Compare generations of a base and a fine-tuned model by cosine similarity.

    For every ``turn["text"]`` of every dialog in *dataset*, both models
    generate a continuation, each decoded generation is embedded with the
    model that produced it, and the cosine similarity of the two embeddings
    is accumulated.

    Args:
        base_model: Reference causal LM; moved to *device* and put in eval mode.
        fine_tuned_model: Fine-tuned causal LM; moved to *device* and put in
            eval mode.
        tokenizer: Tokenizer used for both models (encoding and decoding).
        dataset: Iterable of dialogs, each a mapping with a ``"turns"``
            sequence whose items have a ``"text"`` field.
        device: Device both models and all inputs are moved to.
        max_length: Forwarded to ``generate(max_length=...)``.
            NOTE(review): per the transformers docs this budget includes the
            prompt tokens, so long turns leave little or no room for new
            tokens — consider ``max_new_tokens`` if that is not intended.

    Returns:
        A tuple ``(average_similarity, base_average_similarity,
        fine_tuned_average_similarity)``. All three are ``0`` when the
        dataset contains no turns.
    """
    base_model.to(device)
    fine_tuned_model.to(device)
    base_model.eval()
    fine_tuned_model.eval()

    total_similarity = 0
    num_comparisons = 0
    base_total_similarity = 0
    fine_tuned_total_similarity = 0

    for dialog in dataset:
        for turn in dialog["turns"]:
            text = turn["text"]

            input_ids = tokenizer.encode(text, return_tensors="pt").to(device)
            with torch.no_grad():
                base_output = base_model.generate(input_ids, max_length=max_length, num_return_sequences=1)
                fine_tuned_output = fine_tuned_model.generate(input_ids, max_length=max_length, num_return_sequences=1)
            base_sentence = tokenizer.decode(base_output[0], skip_special_tokens=True)
            fine_tuned_sentence = tokenizer.decode(fine_tuned_output[0], skip_special_tokens=True)

            base_embedding = _get_sentence_embedding(base_model, tokenizer, base_sentence, device)
            fine_tuned_embedding = _get_sentence_embedding(fine_tuned_model, tokenizer, fine_tuned_sentence, device)

            # The actual cross-model metric: base generation vs. fine-tuned generation.
            total_similarity += cosine_similarity(base_embedding, fine_tuned_embedding)[0][0]
            num_comparisons += 1

            # NOTE(review): each embedding is compared with itself here, so these
            # two values are ~1.0 for any non-zero vector and carry no information
            # about the models. Kept only so the returned tuple keeps its shape.
            base_total_similarity += cosine_similarity(base_embedding, base_embedding)[0][0]
            fine_tuned_total_similarity += cosine_similarity(fine_tuned_embedding, fine_tuned_embedding)[0][0]

    if num_comparisons == 0:
        return 0, 0, 0

    return (
        total_similarity / num_comparisons,
        base_total_similarity / num_comparisons,
        fine_tuned_total_similarity / num_comparisons,
    )

if __name__ == "__main__":
    # Script entry point: load both models and the hold-out dialogs, then
    # report the averaged cosine similarities.
    tokenizer = AutoTokenizer.from_pretrained(base_model_name)
    base_model = AutoModelForCausalLM.from_pretrained(base_model_name)
    try:
        fine_tuned_model = AutoModelForCausalLM.from_pretrained(fine_tuned_model_path)
    except Exception as e:
        print(
            f"Ошибка при загрузке дообученной модели из {fine_tuned_model_path}. "
            f"Убедитесь, что путь к модели правильный, и модель там есть.\n"
            f"Ошибка: {e}"
        )
        # Exit with a non-zero status so callers can detect the failure; the
        # bare exit() used before reported success and relies on the site module.
        raise SystemExit(1) from e
    holdout_dataset = load_dataset("json", data_files=data_path, split="train")

    # Fall back to CPU instead of failing when no CUDA device is available.
    device = "cuda" if torch.cuda.is_available() else "cpu"
    average_similarity, base_average_similarity, fine_tuned_average_similarity = calculate_cosine_similarity(
        base_model, fine_tuned_model, tokenizer, holdout_dataset, device=device
    )

    # The base-vs-fine-tuned value is the only one that actually compares the two models.
    print(f"Среднее косинусное сходство между базовой и дообученной моделями: {average_similarity:.4f}")
    # NOTE(review): the two messages below say "distance" but the values are
    # similarities; the wording is left unchanged.
    print(f"Среднее косинусное расстояние для базовой модели: {base_average_similarity:.4f}")
    print(f"Среднее косинусное расстояние для дообученной модели: {fine_tuned_average_similarity:.4f}")

Среднее косинусное расстояние для базовой модели: 0.5238
Среднее косинусное расстояние для дообученной модели: 0.7396
