In [9]:
# DATA names the output JSONL file; DATATYPE is the "datatype" option handed
# to the ParlAI teacher and is also embedded in the per-dialogue ids.
DATA, DATATYPE = "train", "train:ordered"


In [10]:
# Download files: run the ParlAI build step of the ConvAI2 task and of the MSC
# task, each with "./data" as the datapath.
from parlai.tasks.convai2.build import build as convai2_build
from parlai.tasks.msc.build import build as msc_build

# Each call receives its own freshly created options dict.
convai2_build(opt={"datapath": "./data"})
# Last expression of the cell; the recorded cell output is './data/msc'.
msc_build(opt={"datapath": "./data"})


'./data/msc'

In [11]:
import copy
from parlai.core.teachers import FbDeprecatedDialogTeacher
from parlai.tasks.convai2.agents import _path
from parlai.tasks.msc.agents import Session1NormalizedTrait


class BothRevisedTeacher(FbDeprecatedDialogTeacher):
    """Dialog teacher whose data file is ConvAI2's "both_revised" file.

    The data file is resolved with ``_path(opt, "both_revised", use_cands)``.
    Candidates are requested unless the third ":"-separated field of
    ``opt["task"]`` is exactly ``"no_cands"``; if that field cannot be read
    for any reason, candidates are requested.
    """

    def __init__(self, opt, shared=None):
        # Work on a private copy so the caller's options are left untouched.
        options = copy.deepcopy(opt)
        try:
            candidate_field = options["task"].split(":")[2]
        except Exception:
            # Missing/short/unsplittable task string: default to candidates.
            with_candidates = True
        else:
            with_candidates = candidate_field != "no_cands"
        options["datafile"] = _path(options, "both_revised", with_candidates)
        super().__init__(options, shared)


class Session1BothTeacher(Session1NormalizedTrait, BothRevisedTeacher):
    """Session1NormalizedTrait mixed over BothRevisedTeacher; adds nothing itself."""


# Options for the teacher. The task string "::no_cands" puts "no_cands" in the
# third ":"-separated field, which BothRevisedTeacher reads to disable
# candidates.
opt = dict(
    max_num_turns=-1,
    your_persona_first=True,
    datapath="./data",
    datatype=DATATYPE,
    task="::no_cands",
)
data = Session1BothTeacher(opt)


16:37:30 | loading normalized fbdialog data: ./data/ConvAI2/valid_both_revised_no_cands.txt
16:37:30 | loading fbdialog data: ./data/ConvAI2/valid_both_revised_no_cands.txt


In [12]:
# Episode count first, example count second, one per line.
print(data.num_episodes(), data.num_examples(), sep="\n")


1000
7801


In [13]:
from tqdm import tqdm

def _gather_episodes(teacher):
    """Iterate over ``teacher`` once and group its turns into episodes.

    For every example with a non-empty "text", the text and then the first
    label ("labels", or "eval_labels" when "labels" is absent) are appended to
    the current episode. An episode is stored when an example has
    "episode_done" set and at least one turn was collected.

    Iteration stops entirely at the first text containing "__silence__"; the
    turns collected for that unfinished episode are not stored.

    Returns:
        list[list[str]]: one flat list of alternating utterances per episode.
    """
    episodes = []
    turns = []
    examples = iter(teacher)

    for _ in tqdm(range(teacher.num_examples())):
        example = next(examples)
        utterance = example["text"]
        if utterance:
            if "__silence__" in utterance:
                break
            turns.append(utterance)
            try:
                reply = example["labels"][0]
            except KeyError:
                reply = example["eval_labels"][0]
            turns.append(reply)

        if example["episode_done"] and turns:
            episodes.append(turns)
            turns = []

    return episodes


dataset = _gather_episodes(data)


100%|██████████| 7801/7801 [00:00<00:00, 336716.63it/s]


In [14]:
# Number of episodes, then total number of utterances across all of them.
episode_count = len(dataset)
utterance_count = sum(map(len, dataset))
print(episode_count, utterance_count)
dataset[0]


1000 15602


["your persona: I've been know to finish almost two dozen novels in a twelve month period.\nyour persona: My part time gig has me doing some pretty brave things.\nyour persona: I have special dietary restrictions.\nyour persona: My mom and dad divorced when I was young.\npartner's persona: Bride of chucky is the best film out there.\npartner's persona: My wife works, so I take care of our children.\npartner's persona: I dad was an employee of a big diy store.\npartner's persona: For ten years I was employed in meeting peoples needs.\npartner's persona: My kid finished his elementary education last year.\nHello what are doing today?",
 'I am good, I just got off work and tired, I have two jobs.',
 'I just got done watching a horror movie',
 "I rather read, I've read about 20 books this year.",
 'Wow! I do love a good horror movie. Loving this cooler weather',
 'But a good movie is always good.',
 'Yes! My son is in junior high and I just started letting him watch them too',
 'I work in 

In [15]:
from itertools import cycle

def _persona_sentences(header_lines, prefix):
    """Return the persona sentences among ``header_lines`` tagged with ``prefix``.

    Args:
        header_lines: lines of the first utterance of an episode.
        prefix: tag that marks a persona line, e.g. ``"your persona:"``.

    Returns:
        list[str]: text following ``prefix`` on each matching line, with
        surrounding whitespace removed.
    """
    # Slice the prefix off instead of calling str.strip(prefix): str.strip
    # treats its argument as a *set of characters* and would also eat leading
    # and trailing letters of the sentence itself (e.g. a final "s" or "n").
    return [
        line[len(prefix):].strip()
        for line in header_lines
        if line.startswith(prefix)
    ]


new_dataset = []

# total= lets tqdm render a real progress bar; enumerate() has no length.
for i, episode in tqdm(enumerate(dataset), total=len(dataset)):
    # The first utterance carries the persona lines followed by the opening
    # message on its last line.
    header_lines = episode[0].split("\n")

    sample = {}
    # Order matters: the partner's persona comes first, then "your" persona.
    sample["personas"] = [
        _persona_sentences(header_lines, "partner's persona:"),
        _persona_sentences(header_lines, "your persona:"),
    ]

    text = [header_lines[-1]] + episode[1:]
    # Turns alternate between the two speakers, starting with "Speaker 1".
    sample["dialog"] = [
        {"text": t, "id": speaker, "convai2_id": f"{DATATYPE}_{i}"}
        for t, speaker in zip(text, cycle(["Speaker 1", "Speaker 2"]))
    ]

    sample["metadata"] = {"initial_data_id": f"{DATATYPE}_{i}", "session_id": 0}

    new_dataset.append(sample)


1000it [00:00, 20509.94it/s]


In [16]:
# Save dataset as JSONL file
import json

# One JSON document per line (JSONL), in the order of new_dataset.
with open(f"./data/msc/msc/msc_dialogue/session_1/{DATA}.jsonl", "w") as out_file:
    out_file.writelines(json.dumps(record) + "\n" for record in new_dataset)
