In [1]:
import json
import logging
from datasets import load_dataset

%load_ext autoreload
%autoreload 2

In [3]:
from chatsky_llm_autoconfig.settings import EnvSettings
from chatsky_llm_autoconfig.algorithms.three_stages_1i import (
    ThreeStagesGraphGenerator as GeneratorI,
)
from chatsky_llm_autoconfig.algorithms.three_stages_1 import (
    ThreeStagesGraphGenerator as Generator,
)
from chatsky_llm_autoconfig.algorithms.three_stages_0i import (
    ThreeStagesGraphGenerator as GeneratorI0,
)
from chatsky_llm_autoconfig.dialogue import Dialogue
from chatsky_llm_autoconfig.graph import Graph
from chatsky_llm_autoconfig.algorithms.dialogue_generation import (
    RecursiveDialogueSampler,
)
from chatsky_llm_autoconfig.metrics.automatic_metrics import all_utterances_present

In [4]:
# Raise the threshold of the langchain vector-store base logger to ERROR so
# lower-severity records from it do not clutter the cell outputs.
logging.getLogger("langchain_core.vectorstores.base").setLevel(logging.ERROR)
# Instantiate the project settings object.
# NOTE(review): presumably this loads API keys / environment configuration — confirm.
env_settings = EnvSettings()

In [None]:
# Generator used to build the first graph from a single dialogue.
graph_generator = Generator("o3-mini")

# Incremental generator: adds a dialogue to an existing graph,
# creating nodes with the LLM and edges with the embedder.
graph_generator_i = GeneratorI()

# Incremental generator: adds a dialogue to an existing graph with the LLM only.
graph_generator_i0 = GeneratorI0()

## SCHEMA Dataset

In [18]:
# Load the GEM Schema-Guided Dialog dataset.
# NOTE(review): trust_remote_code=True lets the dataset repository's own loading
# script execute locally; keep it only for sources you trust.
dataset = load_dataset("GEM/schema_guided_dialog", trust_remote_code=True)

In [None]:
dataset

In [184]:
# Number of test samples that belong to the RentalCars_3 service.
sum(1 for sample in dataset["test"] if sample["service"] == "RentalCars_3")

932

In [19]:
# Turn every RentalCars_3 test sample into a flat list of utterance dicts:
# a synthetic assistant greeting, then the (user, assistant) pairs taken from
# "context", and finally the "prompt"/"target" pair of the sample itself.
new_data = []
for sample in dataset["test"]:
    if sample["service"] != "RentalCars_3":
        continue
    history = sample["context"]
    turns = [{"text": "Hello! How can I help you?", "participant": "assistant"}]
    # Even positions are paired with the following odd positions; the slice
    # bounds mirror the pairing of context entries into user/assistant turns.
    for user_text, assistant_text in zip(history[0:-1:2], history[1::2]):
        turns.append({"text": user_text, "participant": "user"})
        turns.append({"text": assistant_text, "participant": "assistant"})
    turns.append({"text": sample["prompt"], "participant": "user"})
    turns.append({"text": sample["target"], "participant": "assistant"})
    new_data.append(turns)

In [None]:
new_data[:1]

#### 1 dialogue

In [None]:
# Build the initial graph from the first dialogue only, then run the
# all_utterances_present metric on it.
test_data = list(map(Dialogue.from_list, new_data[:1]))
graph = graph_generator.invoke(test_data)
all_utterances_present(graph, test_data)

In [None]:
graph.visualise_short("1 rent a car dialogue")

In [None]:
graph.graph_dict["nodes"]

In [None]:
graph.graph_dict["edges"]

In [None]:
new_data[1]

##### <span style="color:green">Result: OK, but nodes 3 and 4 could be combined</span>

#### 2 dialogues

In [None]:
# Add the second dialogue to the one-dialogue graph; the generator returns the
# extended graph together with dialogues, which are fed to the metric.
test_data = [Dialogue.from_list(new_data[1])]
graph_2, dialogues = graph_generator_i.invoke(test_data, graph)
all_utterances_present(graph_2, dialogues)

In [None]:
graph_2.visualise_short("2 rent a car dialogues")

In [None]:
graph_2.graph_dict["nodes"]

In [None]:
graph_2.graph_dict["edges"]

##### <span style="color:yellow">Result: node 8 is wrongly combined</span>

In [None]:
new_data[2]

#### Adding 3rd dialogue with pure LLM

In [None]:
# Add the third dialogue to graph_2 with the LLM-only incremental generator.
test_data = [Dialogue.from_list(new_data[2])]
graph_3 = graph_generator_i0.invoke(test_data, graph_2)

In [None]:
graph_3.visualise_short("3 rent a car dialogues")

##### <span style="color:red">Result: nodes are separated</span>

#### 3 dialogues

In [None]:
new_data[2]

In [None]:
# Add the third dialogue to graph_2 with the LLM + embedder incremental
# generator, then run the metric on the returned dialogues.
test_data = [Dialogue.from_list(new_data[2])]
graph_3, dialogues = graph_generator_i.invoke(test_data, graph_2)
all_utterances_present(graph_3, dialogues)

##### <span style="color:red">Result: LLM lost nodes</span>

## MSR-E2E dataset

In [6]:
import pandas as pd

In [7]:
# Read the tab-separated MSR-E2E taxi dialogue table.
e2e_data = pd.read_csv("../../../datasets/e2e_dialog_challenge/data/taxi_all.tsv", sep="\t")

In [8]:
# Split the flat MSR-E2E message table into one utterance list per session.
# Every dialogue opens with a synthetic assistant greeting; rows sent by "agent"
# are relabelled "assistant", any other sender label is kept unchanged.
new_data = []
session = None  # ID of the session being collected; None until the first row
cur = []
for idx, row in e2e_data.iterrows():
    if row["session.ID"] != session:
        # A new session starts: store the finished one (if any) and reset.
        # Starting from None instead of a hard-coded ID avoids emitting a
        # greeting-only dialogue when the first session ID is not 1.
        if cur:
            new_data.append(cur)
        cur = [{"text": "Hello! How can I help you?", "participant": "assistant"}]
        session = row["session.ID"]
    if row["Message.From"] == "agent":
        participant = "assistant"
    else:
        participant = row["Message.From"]
    cur.append({"text": row["Message.Text"], "participant": participant})
# Without this flush the last session of the file would never reach new_data.
if cur:
    new_data.append(cur)

In [None]:
new_data[:10]

#### 1 dialogue

In [None]:
# Build the initial taxi graph from the first dialogue only, then run the
# all_utterances_present metric on it.
test_data = list(map(Dialogue.from_list, new_data[:1]))
graph = graph_generator.invoke(test_data)
all_utterances_present(graph, test_data)

In [None]:
graph.visualise_short("1 taxi dialogues")

In [None]:
graph.graph_dict["nodes"]

In [None]:
graph.graph_dict["edges"]

##### <span style="color:green">Result: OK</span>

#### 2 dialogues

In [None]:
# Extend the one-dialogue taxi graph with the second dialogue and run the
# metric on the dialogues returned by the generator.
test_data = [Dialogue.from_list(new_data[1])]
graph_2, dialogues = graph_generator_i.invoke(test_data, graph)
all_utterances_present(graph_2, dialogues)

In [None]:
graph_2.visualise_short("2 taxi dialogues")

In [None]:
graph_2.graph_dict["nodes"]

In [152]:
graph_2.graph_dict["edges"]

[{'source': 2, 'target': 5, 'utterances': ['tomorrow']},
 {'source': 5, 'target': 9, 'utterances': ['seattle wa']},
 {'source': 9, 'target': 9, 'utterances': ['try anyplace close to that']},
 {'source': 1,
  'target': 2,
  'utterances': ['i need a cars for a group of 3 if possible. they will be going to 3456 executive row from 45621 chatter way at 3pm.',
   'i would like to book a cab please']},
 {'source': 2, 'target': 3, 'utterances': ['today']},
 {'source': 3, 'target': 4, 'utterances': ['2']},
 {'source': 4, 'target': 5, 'utterances': ['11:00 pm']},
 {'source': 5, 'target': 6, 'utterances': ['new york, ny']},
 {'source': 6, 'target': 7, 'utterances': ['statue of liberty']},
 {'source': 7, 'target': 8, 'utterances': ['yes, thanks']}]

##### <span style="color:green">Result: OK</span>

#### 3 dialogues

In [None]:
# Extend graph_2 with the third taxi dialogue and run the metric on the
# dialogues returned by the generator.
test_data = [Dialogue.from_list(new_data[2])]
graph_3, dialogues = graph_generator_i.invoke(test_data, graph_2)
all_utterances_present(graph_3, dialogues)

In [None]:
graph_3.visualise_short("3 taxi dialogues")

In [None]:
graph_3.graph_dict["nodes"]

In [None]:
graph_3.graph_dict["edges"]

##### <span style="color:green">Result: OK</span>

#### 4 dialogues

In [None]:
# Extend graph_3 with the fourth taxi dialogue and run the metric on the
# dialogues returned by the generator.
test_data = [Dialogue.from_list(new_data[3])]
graph_4, dialogues = graph_generator_i.invoke(test_data, graph_3)
all_utterances_present(graph_4, dialogues)

In [None]:
graph_4.visualise_short("4 taxi dialogues")

In [None]:
graph_4.graph_dict["nodes"]

In [None]:
graph_4.graph_dict["edges"]

##### <span style="color:yellow">Result: edge 8->8 is wrong</span>

#### 5 dialogues

In [None]:
# Extend graph_4 with the fifth taxi dialogue and run the metric on the
# dialogues returned by the generator.
test_data = [Dialogue.from_list(new_data[4])]
graph_5, dialogues = graph_generator_i.invoke(test_data, graph_4)
all_utterances_present(graph_5, dialogues)

In [None]:
graph_5.visualise_short("5 taxi dialogues")

In [None]:
graph_5.graph_dict["nodes"]

In [None]:
graph_5.graph_dict["edges"]

##### <span style="color:yellow">Result: edge 8->8 is wrong</span>

#### 6 dialogues

In [None]:
# Extend graph_5 with the sixth taxi dialogue and run the metric on the
# dialogues returned by the generator.
test_data = [Dialogue.from_list(new_data[5])]
graph_6, dialogues = graph_generator_i.invoke(test_data, graph_5)
all_utterances_present(graph_6, dialogues)

In [None]:
graph_6.visualise_short("6 taxi dialogues")

In [None]:
graph_6.graph_dict["nodes"]

In [None]:
graph_6.graph_dict["edges"]

##### <span style="color:yellow">Result: edge 8->8 is wrong</span>

#### 7 dialogues

In [None]:
# Extend graph_6 with the seventh taxi dialogue and run the metric on the
# dialogues returned by the generator.
test_data = [Dialogue.from_list(new_data[6])]
graph_7, dialogues = graph_generator_i.invoke(test_data, graph_6)
all_utterances_present(graph_7, dialogues)

In [None]:
graph_7.visualise_short("7 taxi dialogues")

In [None]:
graph_7.graph_dict["nodes"]

In [None]:
graph_7.graph_dict["edges"]

##### <span style="color:yellow">Result: edge 8->8 is wrong</span>

#### 8 dialogues

In [None]:
# Extend graph_7 with the eighth taxi dialogue and run the metric on the
# dialogues returned by the generator.
test_data = [Dialogue.from_list(new_data[7])]
graph_8, dialogues = graph_generator_i.invoke(test_data, graph_7)
all_utterances_present(graph_8, dialogues)

In [None]:
graph_8.visualise_short("8 taxi dialogues")

In [None]:
graph_8.graph_dict["nodes"]

In [None]:
graph_8.graph_dict["edges"]

##### <span style="color:yellow">Result: edge 8->8 is wrong</span>

#### Extending 8 dialogues graph to 9 with pure LLM method

In [9]:
graph_8_dict = {
    "nodes": [
        {
            "id": 1,
            "label": "start",
            "is_start": True,
            "utterances": ["Hello! How can I help you?"],
        },
        {
            "id": 2,
            "label": "ask_date",
            "is_start": False,
            "utterances": [
                "On what date would you like a taxi?",
                "What day do you need the taxi?",
                "At what date would you like to reserve?",
            ],
        },
        {
            "id": 3,
            "label": "ask_num",
            "is_start": False,
            "utterances": [
                "How many are going?",
                "Hello, how many guests will you have?",
                "How many passengers will there be?",
            ],
        },
        {
            "id": 4,
            "label": "ask_time",
            "is_start": False,
            "utterances": [
                "What time would you like to be picked up?",
                "Is that 6AM or 6PM?",
            ],
        },
        {
            "id": 5,
            "label": "ask_city_state",
            "is_start": False,
            "utterances": [
                "Sure. what city are you located in?",
                "What city are you located in?",
                "What City and State would you like me to look?",
                "Hi. What city are you located in?",
                "Could you please provide a city and state?",
            ],
        },
        {
            "id": 6,
            "label": "ask_destination",
            "is_start": False,
            "utterances": [
                "What are you pick up and drop off addresses?",
                "I'll need you pick up and drop off locations.",
                "Hi, I also need to know your drop off location.",
                "Where would you like to go?",
                "What are the addresses of your pick up and drop off locations?",
            ],
        },
        {
            "id": 7,
            "label": "ask_price_confirmation",
            "is_start": False,
            "utterances": [
                "UberX would cost $17-23. Would you like to book that, or explore other Uber options?",
                "$34-35 is the price for an uberX. Would you like this option?",
                "$53-64 is the price for an uberX. Would you like this option?",
                "That ride would cost $29-39 on UberX. Would you like to book that now or explore other Uber options?",
                "$10-13 is the price for an uberX. Would you like this option?",
                "$5-6 is the price for an uberX. Would you like this option?",
                "$11-15 is the price for an uberX. Would you like this option?",
            ],
        },
        {
            "id": 8,
            "label": "confirmation",
            "is_start": False,
            "utterances": [
                "I booked your UberX ride as requested, have a safe trip!",
                "I have successfully booked your uber. Enjoy your ride.",
                "Your UberX ride was confirmed, have a safe trip!",
                "I have confirmed your uberX. Thank you and have a safe trip.",
                "Your reservation was confirmed, enjoy your ride with UberX!",
                "Thanks for using our service!",
                "Your reservation is confirmed! Thank you for using our service.",
            ],
        },
        {
            "id": 9,
            "label": "ask_address_verification",
            "is_start": False,
            "utterances": [
                "I'm sorry, I cannot find a Chatter Way in seattle at all. Perhaps you could provide a nearby street name?",
                "I found many Elm Streets in New York. Could you please specify your location?",
                "I am not seeing a Hilton downtown. I do see a Doubletree by Hilton, however. Could you verify what is correct?",
                "What's the address or city of the superdome?",
                "I can't seem to find 45621 chatter way in Seattle. Is this address correct?",
            ],
        },
        {
            "id": 10,
            "label": "thank_you",
            "is_start": False,
            "utterances": ["Thank you."],
        },
        {
            "id": 11,
            "label": "ask_uber_estimate",
            "is_start": False,
            "utterances": [
                "We're currently only booking through uber. Would you like an estimate for that?"
            ],
        },
        {
            "id": 12,
            "label": "ask_time_and_passengers",
            "is_start": False,
            "utterances": ["I also need a time along with number of passengers."],
        },
        {
            "id": 13,
            "label": "ask_name",
            "is_start": False,
            "utterances": ["And may I have your name?"],
        },
    ],
    "edges": [
        {
            "source": 1,
            "target": 6,
            "utterances": ["i need a taxi for 5 people at 54 elm street at 6 pm."],
        },
        {"source": 2, "target": 9, "utterances": ["friday."]},
        {"source": 5, "target": 2, "utterances": ["new york."]},
        {
            "source": 6,
            "target": 5,
            "utterances": ["i need to be dropped at 450 beacon street."],
        },
        {"source": 9, "target": 13, "utterances": ["brooklyn."]},
        {"source": 13, "target": 7, "utterances": ["mike."]},
        {
            "source": 1,
            "target": 11,
            "utterances": ["what is the highest rated taxi service in wilmington, nc"],
        },
        {"source": 8, "target": 10, "utterances": ["thanks.", "thanks!"]},
        {"source": 11, "target": 6, "utterances": ["yes please"]},
        {
            "source": 7,
            "target": 8,
            "utterances": [
                "yes.",
                "yes that is fine",
                "yes",
                "please book it now.",
                "yes, please",
                "yes, thanks",
            ],
        },
        {"source": 3, "target": 7, "utterances": ["2", "1"]},
        {"source": 2, "target": 3, "utterances": ["saturday at 3pm", "today"]},
        {
            "source": 6,
            "target": 2,
            "utterances": [
                "hilton downtown to the melting pot restaurant",
                "i'm at the superdome and need to get to louis armstrong airport",
            ],
        },
        {"source": 2, "target": 5, "utterances": ["tomorrow"]},
        {"source": 5, "target": 9, "utterances": ["seattle wa"]},
        {
            "source": 9,
            "target": 7,
            "utterances": [
                "that is probably iot",
                "1500 sugar bowl dr, new orleans, la 70112",
            ],
        },
        {
            "source": 1,
            "target": 2,
            "utterances": [
                "i need a cars for a group of 3 if possible. they will be going to 3456 executive row from 45621 chatter way at 3pm.",
                "i would like to book a cab please",
            ],
        },
        {"source": 2, "target": 12, "utterances": ["next monday"]},
        {"source": 12, "target": 9, "utterances": ["6pm for 4 people"]},
        {
            "source": 1,
            "target": 3,
            "utterances": [
                "i need to go to denver airport. can you get a taxi to meet at coors field in denver in an hour?"
            ],
        },
        {"source": 3, "target": 4, "utterances": ["2"]},
        {"source": 4, "target": 3, "utterances": ["6pm"]},
        {
            "source": 1,
            "target": 4,
            "utterances": [
                "need a cab to take me to the uncw baseball stadium from days inn on market street in wilmington, nc tomorrow at 6"
            ],
        },
        {"source": 4, "target": 5, "utterances": ["11:00 pm"]},
        {"source": 9, "target": 9, "utterances": ["try anyplace close to that"]},
        {
            "source": 1,
            "target": 5,
            "utterances": [
                "i would like to book a cab for tomorrow night at 6 please",
                "hi! i need a taxi.",
            ],
        },
        {
            "source": 6,
            "target": 3,
            "utterances": [
                "pick up at the north carolina museum of art and drop off at rdu"
            ],
        },
        {"source": 3, "target": 3, "utterances": ["just myself"]},
        {"source": 8, "target": 8, "utterances": ["thank you!"]},
        {"source": 3, "target": 9, "utterances": ["3"]},
        {"source": 6, "target": 7, "utterances": ["statue of liberty"]},
        {
            "source": 5,
            "target": 6,
            "utterances": ["raliegh", "new orleans", "new york, ny"],
        },
    ],
}

In [10]:
# Rebuild the eight-dialogue taxi graph from the literal graph_8_dict instead of
# re-running the generation cells.
graph_8 = Graph(graph_8_dict)

In [11]:

# Sampler used further down to draw dialogues from a generated graph.
dialogue_sampler = RecursiveDialogueSampler()

In [None]:
# Add the ninth taxi dialogue to graph_8 with the LLM-only incremental generator.
test_data = [Dialogue.from_list(new_data[8])]
graph_9 = graph_generator_i0.invoke(test_data, graph_8)

In [14]:
graph_9

Graph(graph_dict={}, graph=None, node_mapping=None)

In [None]:
dialogues = dialogue_sampler.invoke(graph_9, 5)
all_utterances_present(graph_9, dialogues)

In [None]:
graph_9.visualise_short("9 dialogues")

##### <span style="color:red">Result: nodes are separated</span>

#### 9 dialogues

In [None]:
# Add the ninth taxi dialogue to graph_8 with the LLM + embedder incremental
# generator, then dump the metric result, nodes, edges and the visualisation.
test_data = [Dialogue.from_list(new_data[8])]
graph_9, dialogues = graph_generator_i.invoke(test_data, graph_8)
# The metric call is not the last expression of this cell, so its result would
# be silently discarded; print it explicitly.
print(all_utterances_present(graph_9, dialogues))
print(graph_9.graph_dict["nodes"])
print("\n")
print(graph_9.graph_dict["edges"])
print("\n")
graph_9.visualise_short("9 taxi dialogues")

In [None]:
graph_9.visualise_short("9 taxi dialogues")

In [None]:
graph_9.graph_dict["nodes"]

In [None]:
graph_9.graph_dict["edges"]

##### <span style="color:red">Result: nodes are lost</span>

#### 10 dialogues

In [None]:
# Extend graph_9 with the tenth taxi dialogue and run the metric on the
# dialogues returned by the generator.
test_data = [Dialogue.from_list(new_data[9])]
graph_10, dialogues = graph_generator_i.invoke(test_data, graph_9)
all_utterances_present(graph_10, dialogues)

In [None]:
graph_10.visualise_short("10 taxi dialogues")

In [None]:
graph_10.graph_dict["nodes"]

In [None]:
graph_10.graph_dict["edges"]

## Frames dataset

In [126]:
# Load the Frames dataset. JSON files are UTF-8 by specification, so the
# encoding is stated explicitly instead of relying on the platform locale
# default (which is not UTF-8 on every system).
with open("../../../datasets/frames/data.json", encoding="utf-8") as f:
    dataset = json.load(f)

In [None]:
dataset

In [127]:
# Convert every Frames record into a flat utterance list: a synthetic assistant
# greeting followed by its turns, where any author other than "user" is
# labelled "assistant".
data = []
for record in dataset:
    turns = [{"text": "Hello! How can I help you?", "participant": "assistant"}]
    for turn in record["turns"]:
        speaker = "user" if turn["author"] == "user" else "assistant"
        turns.append({"text": turn["text"], "participant": speaker})
    data.append(turns)

In [None]:
data[0]

#### 1 dialogue

In [None]:
# Build the initial trip graph from the first Frames dialogue only, then run
# the all_utterances_present metric on it.
test_data = list(map(Dialogue.from_list, data[:1]))
graph = graph_generator.invoke(test_data)
all_utterances_present(graph, test_data)

In [None]:
graph.visualise_short("1 trip dialogue")

In [None]:
graph.graph_dict["nodes"]

In [None]:
graph.graph_dict["edges"]

##### <span style="color:green">Result: OK</span>

#### 2 dialogues

In [None]:
# Extend the one-dialogue trip graph with the second Frames dialogue and run
# the metric on the dialogues returned by the generator.
test_data = [Dialogue.from_list(data[1])]
graph_2, dialogues = graph_generator_i.invoke(test_data, graph)
all_utterances_present(graph_2, dialogues)

In [None]:
graph_2.visualise_short("2 trip dialogues")

In [None]:
graph_2.graph_dict["nodes"]

In [None]:
graph_2.graph_dict["edges"]

##### <span style="color:red">Result: nodes 2 and 4 wrongly combined</span>

#### 3 dialogues

In [None]:
# Preview the third Frames dialogue (the one added to the graph right after).
# `new_data` holds the taxi dialogues at this point, so indexing it here would
# display a dialogue from the wrong dataset.
data[2]

In [None]:
# Extend graph_2 with the third Frames dialogue and run the metric on the
# dialogues returned by the generator.
test_data = [Dialogue.from_list(data[2])]
graph_3, dialogues = graph_generator_i.invoke(test_data, graph_2)
all_utterances_present(graph_3, dialogues)

In [None]:
graph_3.visualise_short("3 trip dialogues")

In [None]:
graph_3.graph_dict["nodes"]

In [None]:
graph_3.graph_dict["edges"]

##### <span style="color:red">Result: nodes 2 and 4 wrongly combined</span>