In [1]:
import warnings

warnings.filterwarnings('ignore')

In [2]:
from dialogue2graph.pipelines.model_storage import ModelStorage
from dialogue2graph.metrics.validators import START_TURNS, END_TURNS

from langchain.evaluation import load_evaluator

2025-04-07 13:19:57,305 - datasets - INFO - PyTorch version 2.6.0+cu118 available.


In [3]:
# Registry that the rest of the notebook uses to look models up by key.
ms = ModelStorage()

# Embedding model, registered under the key "my_emb".
# NOTE(review): the device is hard-coded to 'cuda'; presumably this fails on a
# machine without a GPU — confirm before sharing the notebook.
emb_config = {"model_name": 'BAAI/bge-m3', "device": 'cuda'}
ms.add("my_emb", config=emb_config, model_type="emb")

# Chat model, registered under the key "my_llm".
llm_config = {"name": "gpt-4o-mini"}
ms.add("my_llm", config=llm_config, model_type="llm")

2025-04-07 13:19:57,806 - sentence_transformers.SentenceTransformer - INFO - Load pretrained SentenceTransformer: BAAI/bge-m3
2025-04-07 13:20:05,142 - dialogue2graph.pipelines.model_storage - INFO - Added emb model 'my_emb' to storage
2025-04-07 13:20:05,759 - dialogue2graph.pipelines.model_storage - INFO - Added llm model 'my_llm' to storage


In [4]:
from datasets import load_dataset

# Load the "DeepPavlov/d2g_generated" dataset; later cells read its 'train' split.
# NOTE(review): token=True presumably makes `datasets` use the locally stored
# Hugging Face credentials — confirm that a login is required for this dataset.
dataset = load_dataset("DeepPavlov/d2g_generated", token=True)

## Measure pairwise distance between chosen start turns and dialog turns

In [5]:
from tqdm import tqdm

In [6]:
# Fetch the embedding model registered under "my_emb" and wrap it in a
# LangChain pairwise-embedding-distance evaluator used by the cells below.
embedder_model = ms.storage["my_emb"].model
EVAL = load_evaluator(
    "pairwise_embedding_distance",
    embeddings=embedder_model,
)

def get_distance(messages, start_turn, evaluator=None):
    """Yield the distance between ``start_turn`` and each assistant message.

    Args:
        messages: Sequence of dicts; each is read via its ``'participant'`` and
            ``'text'`` keys.
        start_turn: Reference start phrase to compare every assistant message with.
        evaluator: Object exposing
            ``evaluate_string_pairs(prediction=..., prediction_b=...)`` that returns
            a mapping with a ``'score'`` entry. Defaults to the notebook-level
            ``EVAL`` so existing two-argument calls behave as before.

    Yields:
        A single-key dict per assistant message: ``{"start_distance": score}`` when
        the message sits at index 0 of ``messages``, otherwise
        ``{"non-start_distance": score}``. Messages from other participants are
        skipped.
    """
    # Resolved on first iteration, so a later rebinding of EVAL is still honoured
    # and no global is needed at all when an evaluator is passed explicitly.
    scorer = EVAL if evaluator is None else evaluator
    for i, message in enumerate(messages):
        if message['participant'] != 'assistant':
            continue
        score = scorer.evaluate_string_pairs(
            prediction=start_turn, prediction_b=message['text']
        )['score']
        # NOTE(review): only index 0 counts as the start, so a dialog opening with
        # a non-assistant message yields no "start_distance" — confirm intended.
        label = "start_distance" if i == 0 else "non-start_distance"
        yield {label: score}

In [7]:
# One pair of score buckets per candidate start turn: distances to first
# messages ("start_distance") and to all other messages ("non-start_distance").
start_distance_storage = {
    start_turn: {"start_distance": [], "non-start_distance": []}
    for start_turn in START_TURNS
}

In [8]:
from itertools import islice

# Score every candidate start turn against the assistant messages of the first
# 5 dialogues of the first 100 training graphs.
train_split = dataset['train']
# islice stops after 100 rows instead of materialising the whole split as a list;
# `total` keeps the progress bar's percentage/ETA.
for graph in tqdm(islice(train_split, 100), total=min(100, len(train_split))):
    for dialog in graph['dialogues'][:5]:
        for turn in START_TURNS:
            for d in get_distance(dialog['messages'], turn):
                # Each yielded dict has exactly one key ("start_distance" or
                # "non-start_distance"). Route by key rather than by truthiness
                # of the value, so a legitimate score of 0.0 is not discarded.
                for label, score in d.items():
                    start_distance_storage[turn][label].append(score)

100%|██████████| 100/100 [39:40<00:00, 23.81s/it]


In [9]:
import pandas as pd
import numpy as np

In [10]:
# Collapse each start turn's collected scores into a (mean start, mean non-start)
# pair, then tabulate one row per start turn.
data = [
    (
        np.mean(buckets['start_distance']),
        np.mean(buckets['non-start_distance']),
    )
    for buckets in start_distance_storage.values()
]
df = pd.DataFrame(
    data=data,
    columns=['start_distance', 'non-start_distance'],
    index=START_TURNS,
)

In [11]:
# Each row of df holds one start turn's mean distances, so these summary
# statistics are computed across start turns (a summary of per-turn means).
df.describe()

Unnamed: 0,start_distance,non-start_distance
count,16.0,16.0
mean,0.260126,0.435214
std,0.052633,0.011161
min,0.184069,0.415298
25%,0.204727,0.426467
50%,0.268403,0.435617
75%,0.304425,0.443379
max,0.333301,0.453565


## Measure pairwise distance between chosen end turns and dialog turns

In [15]:
# One pair of score buckets per candidate end turn: distances to last messages
# ("end_distance") and to all other messages ("non-end_distance").
end_distance_storage = {
    end_turn: {"end_distance": [], "non-end_distance": []}
    for end_turn in END_TURNS
}

In [18]:
# Rebuild the pairwise-embedding-distance evaluator from the embedding model
# registered under "my_emb"; get_distance_end below reads it as EVAL.
embedder_model = ms.storage["my_emb"].model
EVAL = load_evaluator(
    "pairwise_embedding_distance",
    embeddings=embedder_model,
)

def get_distance_end(messages, end_turn, evaluator=None):
    """Yield the distance between ``end_turn`` and each assistant message.

    Args:
        messages: Sequence of dicts; each is read via its ``'participant'`` and
            ``'text'`` keys.
        end_turn: Reference closing phrase to compare every assistant message with.
        evaluator: Object exposing
            ``evaluate_string_pairs(prediction=..., prediction_b=...)`` that returns
            a mapping with a ``'score'`` entry. Defaults to the notebook-level
            ``EVAL`` so existing two-argument calls behave as before.

    Yields:
        A single-key dict per assistant message: ``{"end_distance": score}`` when
        the message is the last element of ``messages``, otherwise
        ``{"non-end_distance": score}``. Messages from other participants are
        skipped.
    """
    # Resolved on first iteration, so a later rebinding of EVAL is still honoured
    # and no global is needed at all when an evaluator is passed explicitly.
    scorer = EVAL if evaluator is None else evaluator
    last_turn_idx = len(messages) - 1
    for i, message in enumerate(messages):
        if message['participant'] != 'assistant':
            continue
        score = scorer.evaluate_string_pairs(
            prediction=end_turn, prediction_b=message['text']
        )['score']
        # NOTE(review): if the final message is not from the assistant, no
        # "end_distance" is yielded for this dialog — confirm intended.
        label = "end_distance" if i == last_turn_idx else "non-end_distance"
        yield {label: score}

In [19]:
from itertools import islice

# Score every candidate end turn against the assistant messages of the first
# 4 dialogues of the first 100 training graphs.
train_split = dataset['train']
# islice stops after 100 rows instead of materialising the whole split as a list;
# `total` keeps the progress bar's percentage/ETA.
for graph in tqdm(islice(train_split, 100), total=min(100, len(train_split))):
    for dialog in graph['dialogues'][:4]:
        for turn in END_TURNS:
            for d in get_distance_end(dialog['messages'], turn):
                # Each yielded dict has exactly one key ("end_distance" or
                # "non-end_distance"). Route by key rather than by truthiness
                # of the value, so a legitimate score of 0.0 is not discarded.
                for label, score in d.items():
                    end_distance_storage[turn][label].append(score)

100%|██████████| 100/100 [13:35<00:00,  8.15s/it]


In [20]:
# Collapse each end turn's collected scores into a (mean end, mean non-end)
# pair, then tabulate one row per end turn.
data = [
    (
        np.mean(buckets['end_distance']),
        np.mean(buckets['non-end_distance']),
    )
    for buckets in end_distance_storage.values()
]
df_end = pd.DataFrame(
    data=data,
    columns=['end_distance', 'non-end_distance'],
    index=END_TURNS,
)

In [21]:
# Each row of df_end holds one end turn's mean distances, so these summary
# statistics are computed across end turns (a summary of per-turn means).
df_end.describe()

Unnamed: 0,end_distance,non-end_distance
count,10.0,10.0
mean,0.270836,0.469646
std,0.054072,0.021954
min,0.218979,0.419719
25%,0.23289,0.463616
50%,0.26357,0.470336
75%,0.275185,0.47774
max,0.404094,0.498146


## Try the new validators

In [4]:
from dialogue2graph.metrics.validators import is_greeting_repeated_emb_llm, is_dialog_closed_too_early_emb_llm
from dialogue2graph import Dialogue

In [5]:
dialogue_template = Dialogue()


def _dialogues_of(row_idx):
    """Build one object per dialogue of ``dataset['train'][row_idx]``.

    Each dialogue's ``'messages'`` list is passed to
    ``dialogue_template.from_list``; the results are returned as a list.
    """
    row = dataset['train'][row_idx]
    return [dialogue_template.from_list(dialog['messages']) for dialog in row['dialogues']]


# Three hand-picked training rows; the cells below record the validator result
# the author expects for each of them.
good_graph = _dialogues_of(10)
hello_x2_graph = _dialogues_of(350)
bye_x2_graph = _dialogues_of(161)

In [6]:
# Repeated-greeting check on the dialogues stored in good_graph.
is_greeting_repeated_emb_llm(good_graph, ms, "my_emb", "my_llm") # expected result: False

False

In [7]:
# Repeated-greeting check on the dialogues stored in hello_x2_graph.
is_greeting_repeated_emb_llm(hello_x2_graph, ms, "my_emb", "my_llm") # expected result: True

2025-04-07 11:26:48,935 - httpx - INFO - HTTP Request: POST http://193.187.173.33:8002/api/providers/openai/v1/chat/completions "HTTP/1.1 200 OK"


True

In [6]:
# Closed-too-early check on the dialogues stored in good_graph.
is_dialog_closed_too_early_emb_llm(good_graph, ms, "my_emb", "my_llm") # expected result: False

False

In [7]:
# Closed-too-early check on the dialogues stored in bye_x2_graph.
is_dialog_closed_too_early_emb_llm(bye_x2_graph, ms, "my_emb", "my_llm") # expected result: True

2025-04-07 11:30:09,144 - httpx - INFO - HTTP Request: POST http://193.187.173.33:8002/api/providers/openai/v1/chat/completions "HTTP/1.1 200 OK"


Understood. If you change your mind, feel free to reach out. Have a good day! True


True