In [2]:
import warnings

warnings.filterwarnings("ignore")

In [3]:
from datasets import load_dataset

dataset = load_dataset("DeepPavlov/d2g_generated", token=True)

In [4]:
import re

def match_greeting(message: dict):
    return bool(
        re.match(r'^hello|^hi',
                 message['text'],
                 flags=re.IGNORECASE
        )
    ) and message['participant'] == 'assistant'

In [5]:
idx_duplicates = []
for i, ex in enumerate(dataset['train']):
    for j, dia in enumerate(ex['dialogues']):
        num_greeting = sum([match_greeting(mes) for mes in dia['messages']])
        if num_greeting > 1:
            idx_duplicates.append((i, j))

In [6]:
len(idx_duplicates)

112

In [7]:
class DuplicateDialogues:

    def __init__(self, data: list, 
                 idx_duplicates: list):
        self.data = data
        self.idx_duplicates = idx_duplicates
        self.idx_cycled_graph = list(set([i[0] for i in idx_duplicates]))
        self.topic2id = {self.data[i]['topic']: i for i in self.idx_cycled_graph}

    def get_duplicate_dialogue(self, idx):
        idx_topic, idx_dialogue = idx_duplicates[idx]
        return self.data[idx_topic]['dialogues'][idx_dialogue]['messages']
    
    def check_cycle(self, idx):
        idx_graph = self.idx_cycled_graph[idx]
        cycle_edges = [edge for edge in self.data[idx_graph]['graph']['edges'] if edge['target'] == 1]
        if cycle_edges:
            return True
        return False

    def find_imposter_transitions(self):
        transition_storage = []
        for i in self.idx_cycled_graph:
            for edge in self.data[i]['graph']['edges']:
                if edge['target'] == 1:
                    transition_storage.extend(edge['utterances'])
        return list(set(transition_storage))
    
    def find_imposter_labels(self):
        strange_assistant_answers_labels = []
        for i in self.idx_cycled_graph:
            strange_assistant_answers = [e['source'] for e in self.data[i]['graph']['edges'] if e['target'] == 1]
            s_answers_labels = [n['label'] for n in self.data[i]['graph']['nodes'] if n['id'] in strange_assistant_answers]
            strange_assistant_answers_labels.extend(s_answers_labels)
        return list(set(strange_assistant_answers_labels))
    
    def find_damaged_topics(self):
        damaged_topics = [self.data[i]['topic'] for i in self.idx_cycled_graph]
        return damaged_topics
    
    def strange_dialogues_on_topic(self, topic: str, dia_idx: int):
        topic_id = self.topic2id[topic]
        dia_idx = [j for i, j in self.idx_duplicates if i == topic_id][dia_idx]
        dia_count = len([i for i, _ in self.idx_duplicates if i == topic_id])
        print('There are', dia_count, 'damaged dialogues')
        strange_edges = [(edge['source'], edge['utterances']) for edge in self.data[topic_id]['graph']['edges'] if edge['target'] == 1]
        dialogue = [mes['text'] for mes in self.data[topic_id]['dialogues'][dia_idx]['messages']]
        source_node = [node for node, utt in strange_edges if set(utt).intersection(set(dialogue))]
        return (self.data[topic_id]['graph']['nodes'][source_node[0]],
                self.data[topic_id]['dialogues'][dia_idx]['messages'])

In [8]:
dd = DuplicateDialogues(dataset['train'], idx_duplicates)

In [9]:
dd.get_duplicate_dialogue(0)

[{'participant': 'assistant', 'text': 'Hello! How can I assist you today?'},
 {'participant': 'user', 'text': 'I need help with chatbot limitations.'},
 {'participant': 'assistant',
  'text': "Sure, could you please specify the chatbot limitations you're facing?"},
 {'participant': 'user', 'text': 'Actually, I want to change my topic.'},
 {'participant': 'assistant',
  'text': 'Of course! What would you like to discuss instead?'},
 {'participant': 'user', 'text': 'I have another question.'},
 {'participant': 'assistant', 'text': 'Hello! How can I assist you today?'},
 {'participant': 'user',
  'text': 'I need assistance managing my review responses on Google/Yelp.'},
 {'participant': 'assistant',
  'text': 'Certainly! Are you looking to respond to positive or negative reviews?'},
 {'participant': 'user', 'text': 'Positive reviews.'},
 {'participant': 'assistant',
  'text': 'Acknowledge positive feedback and thank customers for their support to encourage loyalty.'},
 {'participant': 'us

In [10]:
len(dd.idx_cycled_graph)

25

In [11]:
imposter_trans = dd.find_imposter_transitions()

In [12]:
len(imposter_trans)

35

In [13]:
for trans1, trans2 in zip(imposter_trans[:17], imposter_trans[17:]):
    print(trans1, '\t\t\t\t\t', trans2)

I'd like to start over. 					 Actually, can I change the application?
I have another question. 					 Actually, can I change my request?
I'd like to choose Vimeo 					 Thanks for your help.
Got it, thanks. 					 I want to see running shoes instead.
Actually, I need help with something else. 					 Actually, I changed my mind
I need some time to decide. 					 I reconsidered, please keep my subscription.
Yes, please continue. 					 Yes, I changed my mind.
Actually, I want to change my earlier request. 					 Actually, I want to ask something else.
I changed my mind, I need more help. 					 I'd like to discuss something else.
Here is my updated name: Jane Smith. 					 Can we start over?
Actually, I need help with another eBook. 					 Actually, I changed my mind and don't need to cancel.
I need to make another reservation. 					 Thank you, that's all.
I'd like to wait a bit longer. 					 I want to send a new email.
Actually, I changed my mind. 					 I want to adjust the font size.
Actually, I

In [14]:
imposter_labels = dd.find_imposter_labels()

In [15]:
len(imposter_labels)

36

In [16]:
for label1, label2 in zip(imposter_labels[:17], imposter_labels[17:]):
    print(label1, '\t\t', label2)

data_transfer 		 modify_platform_selection
modify_action_prompt 		 manage_change_topic
cancel_after_moving 		 recovery_pane_not_visible
alternative_assistance 		 farewell
assistant_recovery_success 		 secure_account
restored 		 recovery_google_docs
alternative_options_exit 		 fees_calculation
restart 		 apologize_ask_details
information_type 		 protection_fee
modify_digital_asset_topic 		 confirmation_cancelled
assistant_exit 		 opt_protection
alternative_help 		 change_mind
guide_reporting 		 closing
AdditionalAssistance 		 assist_change_topic
complete_request 		 explain_overdraft
ask_product_type_mod 		 reduce_fees
modification_prompt 		 paraphrased_confirmation


In [17]:
dd.data[dd.idx_cycled_graph[8]]['dialogues'][11], dd.data[dd.idx_cycled_graph[8]]['graph']['nodes'][3]

({'id': 'Adjusting font size/contrast on apps._1_11',
  'messages': [{'participant': 'assistant',
    'text': 'Hello! How can I assist you with your app today?'},
   {'participant': 'user',
    'text': 'Actually, I need help with something else.'},
   {'participant': 'assistant',
    'text': 'Sure, feel free to ask any other questions you have.'},
   {'participant': 'user', 'text': 'I want to adjust the font size.'},
   {'participant': 'assistant',
    'text': 'Hello! How can I assist you with your app today?'}]},
 {'id': 4,
  'is_start': False,
  'label': 'alternative_help',
  'utterances': ['Sure, feel free to ask any other questions you have.']})

In [18]:
dd.data[dd.idx_cycled_graph[20]]['dialogues'][2], dd.data[dd.idx_cycled_graph[20]]['graph']['nodes'][5]

({'id': 'Recovering unsaved document data after a crash_1_2',
  'messages': [{'participant': 'assistant',
    'text': "Hello! I'm sorry to hear about your crash. Let's try to recover your unsaved document. Which application were you using?"},
   {'participant': 'user', 'text': 'I was using Microsoft Word.'},
   {'participant': 'assistant',
    'text': 'Microsoft Word has an AutoRecover feature. Would you like me to guide you through retrieving the unsaved document?'},
   {'participant': 'user', 'text': 'Yes, please guide me.'},
   {'participant': 'assistant',
    'text': 'Please open Microsoft Word, and you should see a Document Recovery pane on the left. Do you see it?'},
   {'participant': 'user', 'text': "No, I don't see it."},
   {'participant': 'assistant',
    'text': "If the Document Recovery pane isn't visible, would you like instructions on how to manually search for AutoRecover files?"},
   {'participant': 'user', 'text': 'Actually, can I change the application?'},
   {'parti

In [19]:
damaged_topics = dd.find_damaged_topics()

In [20]:
dd.strange_dialogues_on_topic(damaged_topics[2], 8)

There are 16 damaged dialogues


({'id': 11,
  'is_start': False,
  'label': 'cancel_after_moving',
  'utterances': ['Understood. Your subscription has been canceled. If you have any further questions, feel free to reach out.',
   'Your cancellation is complete. Let us know if you need anything else.']},
 [{'participant': 'assistant',
   'text': "Hello! I'm sorry to hear you want to cancel your subscription. Could you please share the reason for your cancellation?"},
  {'participant': 'user', 'text': "I'm moving to another provider."},
  {'participant': 'assistant',
   'text': 'I understand. Would you like assistance in transferring your data before cancelling?'},
  {'participant': 'user', 'text': 'Yes, please.'},
  {'participant': 'assistant',
   'text': 'Your data transfer is underway. Your subscription will be canceled soon.'},
  {'participant': 'user', 'text': 'Actually, I changed my mind.'},
  {'participant': 'assistant',
   'text': "Hi there! Sorry to see you're considering cancelling. Can you let us know why?"}

In [78]:
len(set([dd.data[i]['topic'] for i in range(402)]))

386

In [79]:
import json

with open('damaged_topics.json', 'w', encoding='utf-8') as file:
    json.dump(damaged_topics, file, ensure_ascii=False, indent=4)

In [21]:
damaged_topics[1]

'Assisting with chatbot limitations.Addressing fake social media accounts.Managing review responses on Google/Yelp.'