In [1]:
from datasets import load_dataset
import json
from collections import Counter
from itertools import product

In [2]:
# Fetch the generated dialog-graph dataset from the Hugging Face Hub.
# NOTE(review): token=True presumably reuses the locally stored Hub login — confirm it is needed.
dataset = load_dataset(
    path="DeepPavlov/d2g_generated",
    token=True,
)

## Find common start messages

In [3]:
# Tally every text that opens a dialog when the assistant is the first speaker.
first_messages = Counter()

for graph in dataset["train"]:
    # The first message of each dialog in this graph.
    openers = [dialog["messages"][0] for dialog in graph["dialogs"]]
    first_messages.update(
        turn["text"] for turn in openers if turn["participant"] == "assistant"
    )

In [4]:
# Show the ten most frequent assistant opening messages together with their counts.
first_messages.most_common(10)

[('Hello! How can I assist you today?', 1393),
 ('Hello! How can I help you today?', 107),
 ("I'm sorry to hear you'd like to cancel your subscription. Can you please provide your account number and the reason for cancellation?",
  88),
 ("I'm sorry to hear you want to cancel your subscription. Can you please tell me why?",
  69),
 ('Hello, how can I assist you today?', 66),
 ('Hello! How can I assist you with our refund policy today?', 64),
 ('Hello! How can I assist you with your presentation today?', 47),
 ("I'm sorry to hear you're having trouble. Can you please describe the issue with the search filter?",
  43),
 ("I'm sorry to hear you want to cancel your subscription. Could you please let me know the reason for your cancellation?",
  40),
 ("Welcome! Let's get your printer installed on your home network. What is the model of your printer?",
  33)]

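The most common opening phrases are too specific to reuse as-is, so we look for recurring patterns (a greeting part and an interrogative part) and build generic start messages from their combinations.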

In [6]:
def _split_opening(message):
    """Split an opening message into ``(greeting, remainder)``.

    The message is cut at the FIRST occurrence of "! " if present, otherwise
    at the first ", ". Everything after that separator is kept intact, so a
    message with several separators is not truncated.

    Returns:
        A ``(greeting, remainder)`` tuple, or ``None`` when the message
        contains neither separator.
    """
    # "! " is checked first so it takes priority over ", ".
    for separator in ("! ", ", "):
        head, found, tail = message.partition(separator)
        if found:
            return head, tail
    return None


# Split every distinct opening message once and reuse the result for both lists.
opening_splits = [
    parts for parts in map(_split_opening, first_messages) if parts is not None
]

# Unique greeting prefixes, alphabetically sorted.
greeting_parts = sorted({greeting for greeting, _ in opening_splits})

# Unique remainders (typically the question that follows the greeting), sorted.
interrogative_parts = sorted({remainder for _, remainder in opening_splits})

In [7]:
# Preview the first 15 entries of the sorted greeting candidates.
greeting_parts[:15]

['Certainly',
 'Greetings',
 'Hello',
 'Hi',
 'Hi there',
 'Sure',
 'Thank you for wanting to provide suggestions to improve our product. What aspect would you like to address first? Features',
 'Thank you for your recent purchase',
 'To enable crossfade',
 'Welcome',
 'Welcome to our Corporate Rewards Program',
 'Welcome to our online store',
 'Welcome to our streaming service',
 'Would you like to add any special requests to your booking']

In [16]:
# Preview three slices of the sorted interrogative candidates, displayed as one tuple.
interrogative_parts[:5], interrogative_parts[70:75], interrogative_parts[82:87]

(['How can I assist you during this natural disaster today?',
  'How can I assist you today regarding GDPR and your data?',
  'How can I assist you today regarding NFT and digital asset policies?',
  'How can I assist you today regarding course refunds?',
  'How can I assist you today regarding delays due to weather disruptions?'],
 ['How can I help you with dark web monitoring today?',
  'How can I help you with maintenance services?',
  'How can I help you with your product?',
  'How may I help you today?',
  'How may I help you with disability support services today?'],
 ['I can help you enable data saver mode. Would you like to enable it for all your apps or select specific ones?',
  'I can help you enable screen mirroring for your presentation. Would you like to proceed?',
  'I can help you request a copy of your signed contract. Could you please provide your contract number?',
  'I can help you review our Terms of Service regarding data usage. Would you like to proceed?',
  'I ca

In [None]:
# Hand-picked generic greetings and questions used to build the start messages.
chosen_greeting_parts = [
    "Greetings",
    "Hello",
    "Hi",
    "Welcome to our assistant service",
]
chosen_interrogative_parts = [
    "How can I assist you?",
    "How can I help you?",
    "Would you like to do this?",
    "Could you tell me this?",
]

# Every greeting paired with every question, greetings varying slowest.
start_nodes = [
    "! ".join((greeting, question))
    for greeting in chosen_greeting_parts
    for question in chosen_interrogative_parts
]

# Write the combinations to start_turns.json as an indented JSON array.
with open("start_turns.json", "w", encoding="utf-8") as sink:
    json.dump(start_nodes, sink, ensure_ascii=False, indent=4)

## Find common end messages

In [17]:
# Lazily walk the last message of every dialog in the training split.
final_turns = (
    dialog["messages"][-1]
    for graph in dataset["train"]
    for dialog in graph["dialogs"]
)

# Tally the closing texts, keeping only dialogs that the assistant ends.
end_messages = Counter(
    turn["text"] for turn in final_turns if turn["participant"] == "assistant"
)

In [18]:
# Keep only the text of the ten most frequent assistant closing messages (counts dropped).
closing_phrases = [entry[0] for entry in end_messages.most_common(10)]
closing_phrases

['Thank you for contacting us. Have a great day!',
 "You're welcome! Have a great day.",
 "Request confirmed. We're here to help if you have any other needs.",
 "You're welcome! Have a great day!",
 'Alright, if you need any further assistance, feel free to reach out. Have a great day!',
 'Alright, feel free to reach out if you need anything else. Have a great day!',
 'Alright, if you need anything else, feel free to reach out. Have a great day!',
 "I'm sorry to see you go. Your subscription has been canceled. If you have any feedback, feel free to reach out to us.",
 'Alright, if you have any other questions in the future, feel free to reach out. Have a great day!',
 'Alright, if you need any further assistance, feel free to reach out. Have a great presentation!']

In [None]:
# Write the closing phrases to end_turns.json as an indented JSON array.
serialized_closings = json.dumps(closing_phrases, ensure_ascii=False, indent=4)
with open("end_turns.json", "w", encoding="utf-8") as sink:
    sink.write(serialized_closings)