In [69]:
import json
import os
import warnings

path = "data/msc/msc/msc_dialogue"
mode = "test"

# Load data: one list of parsed JSON records per session file found,
# appended in session order (session_1 first).
data = []

for i in range(5):
    # Try both extensions and keep the first file that exists.
    for ext in ["jsonl", "txt"]:
        try:
            # Decode explicitly as UTF-8 instead of relying on the platform's
            # default encoding.
            with open(
                f"{path}/session_{i+1}/{mode}.{ext}", "r", encoding="utf-8"
            ) as f:
                # Blank lines (e.g. a trailing newline) are not valid JSON
                # documents, so skip them instead of crashing in json.loads.
                data.append([json.loads(line) for line in f if line.strip()])
                break
        except FileNotFoundError:
            continue
    else:
        # Neither extension exists. Loading stays best-effort, but the skip is
        # reported because later cells index `data` by position.
        warnings.warn(
            f"No '{mode}' file found for session_{i+1} under '{path}'; "
            "session skipped."
        )


In [70]:
from tqdm import tqdm

# Sort by ids
def _numeric_id(record):
    """Return the integer after the last "_" in the record's initial_data_id."""
    id_parts = record["metadata"]["initial_data_id"].split("_")
    return int(id_parts[-1])


# Each session list is reordered in place, ascending by that numeric id.
for session_records in tqdm(data):
    session_records.sort(key=_numeric_id)


100%|██████████| 5/5 [00:00<00:00, 3134.76it/s]


In [71]:
# Gather dialogues of different sessions
#
# For every record of the first session, collect the records that share its
# "initial_data_id" in each of the following sessions. The result is one list
# per first-session record: [first_session_record, match_or_None, ...], with
# None standing in for a session that has no matching record.

# Index each follow-up session once by id so every lookup is a dict access
# instead of a linear scan. setdefault keeps the FIRST record per id, which is
# the record a front-to-back scan with an early break would have picked.
session_indexes = []
for session in data[1:]:
    by_id = {}
    for d in session:
        by_id.setdefault(d["metadata"]["initial_data_id"], d)
    session_indexes.append(by_id)

new_data = []

for dialogue in tqdm(data[0]):
    dialogue_id = dialogue["metadata"]["initial_data_id"]
    # dict.get returns None when the session has no record with this id.
    dialogues = [dialogue] + [by_id.get(dialogue_id) for by_id in session_indexes]
    new_data.append(dialogues)


100%|██████████| 501/501 [00:00<00:00, 7889.27it/s]


In [72]:
from itertools import cycle

# Clean data
#
# Turn each group of per-session records into a single dict with the keys
# "id", "init_personas" and "sessions".
clean_data = []

for dialogues in tqdm(new_data):
    # Id: numeric suffix of the first record's "initial_data_id"
    clean_dialogues = {
        "id": int(dialogues[0]["metadata"]["initial_data_id"].split("_")[-1])
    }

    # Initial personas are read from the second record of the group. If that
    # record is absent or None, or the field is missing or too short, fall
    # back to an empty list.
    try:
        init_personas = dialogues[1]["init_personas"]
        clean_dialogues["init_personas"] = [
            {"speaker": "Speaker 1", "text": init_personas[0]},
            {"speaker": "Speaker 2", "text": init_personas[1]},
        ]
    except (IndexError, KeyError, TypeError):
        clean_dialogues["init_personas"] = []

    # Dialogues
    clean_dialogues["sessions"] = []

    for dialogue in dialogues:
        # None marks a session that has no record for this conversation.
        if dialogue is None:
            continue

        session_id = dialogue["metadata"]["session_id"]
        persona1, persona2 = dialogue["personas"]

        # Turns are labelled alternately, starting with "Speaker 1".
        turns = [
            {"speaker": speaker, "text": turn["text"]}
            for speaker, turn in zip(
                cycle(["Speaker 1", "Speaker 2"]), dialogue["dialog"]
            )
        ]

        # Time elapsed comes from the first entry of "previous_dialogs"; a
        # record without that key (or with an empty list) gets "0".
        # NOTE(review): index 0 is kept from the original code - confirm it is
        # the intended entry when several previous dialogs are listed.
        previous_dialogs = dialogue.get("previous_dialogs")
        if previous_dialogs:
            time_elapsed = previous_dialogs[0]["time_back"].rstrip()
            # str.rstrip(" ago") removes any trailing run of the characters
            # ' ', 'a', 'g', 'o' rather than the literal suffix, so it can eat
            # into the preceding word. Drop the exact suffix instead.
            if time_elapsed.endswith(" ago"):
                time_elapsed = time_elapsed[: -len(" ago")].rstrip()
        else:
            time_elapsed = "0"

        clean_dialogues["sessions"].append(
            {
                "session_id": session_id,
                "personas": [
                    {"speaker": "Speaker 1", "text": persona1},
                    {"speaker": "Speaker 2", "text": persona2},
                ],
                "dialogue": turns,
                "time_elapsed": time_elapsed,
            }
        )

    clean_data.append(clean_dialogues)


100%|██████████| 501/501 [00:00<00:00, 53155.58it/s]


In [73]:
import json

# Pretty-print the third cleaned conversation as a quick visual sanity check.
sample_preview = json.dumps(clean_data[2], indent=4)
print(sample_preview)


{
    "id": 6,
    "init_personas": [
        {
            "speaker": "Speaker 1",
            "text": [
                "I'm a painter.",
                "I like making music.",
                "I don't want a boss.",
                "I try to love everyone.",
                "I live within my means."
            ]
        },
        {
            "speaker": "Speaker 2",
            "text": [
                "I like to eat fruit.",
                "I never break wind in public.",
                "I am the ancestor of an american civil war general.",
                "My father rarely smiles.",
                "I am an avid horse enthusiast."
            ]
        }
    ],
    "sessions": [
        {
            "session_id": 0,
            "personas": [
                {
                    "speaker": "Speaker 1",
                    "text": [
                        "I like to paint. I paint people playing music.",
                        "I hate my boss."
                    ]
     

In [74]:
# Save dataset as JSONL file
import json

# Create the output directory first: open(..., "w") does not create missing
# parent directories, so the save would otherwise fail on a fresh checkout.
output_dir = "./data/processed"
os.makedirs(output_dir, exist_ok=True)

# Write one JSON document per line (JSONL), one line per cleaned conversation.
with open(f"{output_dir}/{mode}.jsonl", "w", encoding="utf-8") as f:
    for sample in clean_data:
        f.write(json.dumps(sample) + "\n")
