In [1]:
import json

In [2]:
import torch
from transformers import BartForConditionalGeneration, BartTokenizer

In [3]:
import functools

@functools.lru_cache(2048)
def create_paraphrases(input_sentence, num):
    model = BartForConditionalGeneration.from_pretrained('eugenesiow/bart-paraphrase')
    device = torch.device("cuda")
    model = model.to(device)
    tokenizer = BartTokenizer.from_pretrained('eugenesiow/bart-paraphrase')
    batch = tokenizer(input_sentence, return_tensors='pt')
    generated_ids = model.generate(batch['input_ids'].to("cuda"), num_beams=num, num_return_sequences=num)
    generated_sentences = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)
    return generated_sentences

In [4]:
create_paraphrases("You are a bot designed to help schedule meetings on a calendar.", num=5)

['You are a bot designed to help schedule meetings on a calendar.',
 'You are a bot that is designed to help schedule meetings on a calendar.',
 'You are a bot designed to schedule meetings on a calendar.',
 'You are a bot designed to help set up meetings on a calendar.',
 'You are a bot designed to help schedule meeting on a calendar.']

In [5]:
tasks = [json.loads(item) for item in open('../data/METALWOZ/train/tasks.txt', encoding='utf8').readlines()]
tasks.extend([json.loads(item) for item in open('../data/METALWOZ/test/tasks.txt', encoding='utf8').readlines()])
tasks.extend([json.loads(item) for item in open('../data/METALWOZ/test/task_multiwoz.txt', encoding='utf8').readlines()])

In [6]:
tasks

[{'task_id': '4a06139e',
  'domain': 'UPDATE_CALENDAR',
  'bot_prompt': "Schedule the user's meeting request",
  'bot_role': 'You are a bot designed to help schedule meetings on a calendar. ',
  'user_prompt': ' You have a meeting saved for March 24th. Ask the chatbot to delete the meeting',
  'user_role': 'You are interacting with a meeting scheduling bot'},
 {'task_id': '8557b6a5',
  'domain': 'UPDATE_CALENDAR',
  'bot_prompt': "Before fulfilling the user's request, ask them to clarify which meeting they are talking about.",
  'bot_role': 'You are a bot designed to help schedule meetings on a calendar. ',
  'user_prompt': ' You have a meeting saved for March 24th. Ask the chatbot to move the meeting to March 30th',
  'user_role': 'You are interacting with a meeting scheduling bot'},
 {'task_id': '663021ef',
  'domain': 'UPDATE_CALENDAR',
  'bot_prompt': " Inform the user that the new meeting they're trying to schedule conflicts with an existing meeting",
  'bot_role': 'You are a bot 

In [7]:
def load_dialogue_as_dict(filename):
    dialogue_list = [json.loads(item) for item in open(filename).readlines()]
    dialogue_dict = {}
    for item in dialogue_list:
        task_id = item["task_id"]
        sentence = item["turns"][1]
        dialogue_dict.setdefault(task_id, [])
        dialogue_dict[task_id].append(sentence)
    return dialogue_dict

In [8]:
import os
path = "../data/METALWOZ/train/dialogues/"
dialogues = [{"domain": item.replace('.txt', ''),
                   "path": os.path.join(path, item),
                   "triggers": load_dialogue_as_dict(os.path.join(path, item))
                  }
                  for item in os.listdir(path)
                  if '.txt' in item and 'task' not in item]

path = "../data/METALWOZ/test/dialogues/"
dialogues.extend([{"domain": item.replace('.txt', ''),
                        "path": os.path.join(path, item),
                        "triggers": load_dialogue_as_dict(os.path.join(path, item))
                       }
                      for item in os.listdir(path)
                      if '.txt' in item and 'task' not in item])

In [9]:
import random

from tqdm import tqdm

random.seed(42)

dataset = []

for item in tqdm(tasks):
    task_type = item["domain"]
    task_id = item["task_id"]
    bot_role = item["bot_role"]
    paraphrases = create_paraphrases(bot_role, num=10)
    for dialogue in dialogues:
        if dialogue["domain"] != task_type:
            continue
        
        for trigger_id, sentences in dialogue["triggers"].items():
            if task_id != trigger_id:
                continue
            
            for sentence in sentences:
                role_text = random.choice(paraphrases)
                if "hello" in sentence.lower() and len(sentence) < len("hello") + 5:
                    continue
                    
                if len(sentence) < 4:
                    continue
                    
                dataset.append({
                    "sentence": sentence,
                    "role": role_text,
                })

100%|█████████████████████████████████████████████████████████████████████████████████████████████████| 248/248 [07:50<00:00,  1.90s/it]


In [16]:
[item for item in dataset if "how are you" in item["sentence"].lower()]

[{'sentence': 'Hey, how are you, did you know I can communicate telepathically?',
  'role': 'You are a bot designed to agree with everything the user says,'},
 {'sentence': 'How are you?',
  'role': 'You are a bot designed to agree with everything the user says'},
 {'sentence': 'hello sorry My internet is very slow. How are you?',
  'role': 'You are a bot that gives information about time zones.'},
 {'sentence': 'how are you', 'role': 'You are a bot that sort things out.'},
 {'sentence': 'hello how are you too',
  'role': 'You are a bot designed to provide updated information about world records.'},
 {'sentence': 'Hi How are you?',
  'role': 'You are a bot that helps schedule shows during a theater festival.'},
 {'sentence': 'Hi , how are you ?   Can you help me find an expensive place to stay that includes free wifi ?',
  'role': 'How do I ask someone to help me?'},
 {'sentence': 'hello, how are you',
  'role': 'You are a bot that greets the user'}]

In [17]:
dataset.append({
    "sentence" : "Hi",
    "role" : "You are a bot that greets the user",
})

dataset.append({
    "sentence" : "Hello",
    "role" : "You are a bot that greets the user",
})

dataset.append({
    "sentence" : "hello, how are you",
    "role" : "You are a bot that greets the user",
})

dataset.append({
    "sentence" : "Hi there!",
    "role" : "You are a bot that greets the user",
})

dataset.append({
    "sentence" : "Nice to meet you",
    "role" : "You are a bot that greets the user",
})

In [18]:
dataset.append({
    "sentence" : "add this item to the list",
    "role" : "You are a bot that manages lists for the user",
})

dataset.append({
    "sentence" : "delete bananas from the list",
    "role" : "You are a bot that manages lists for the user",
})

dataset.append({
    "sentence" : "what is in the list?",
    "role" : "You are a bot that manages lists for the user",
})

dataset.append({
    "sentence" : "Please list the items to shop",
    "role" : "You are a bot that manages lists for the user",
})

dataset.append({
    "sentence" : "What my list of tasks",
    "role" : "You are a bot that manages lists for the user",
})

In [19]:
len(dataset)

42066

In [20]:
def create_text(sentence, role):
    return f"""
The following utterance:
{sentence}

defines this type of role:
{role}
    """.strip()

In [21]:
texts = [
    create_text(item["sentence"], item["role"])
    for item in dataset
]

In [22]:
with open('../data/roles.json', 'w') as f:
    json.dump(texts, f)