In [4]:
import json
from pathlib import Path
from typing import List, Tuple
import json

def read_MixSNIPs_file(filePath) -> List[Tuple[str, List[str]]]:
    """Parse a MixSNIPs-style annotation file into (sentence, labels) pairs.

    The file is read line by line and each line is split on whitespace:

    * two fields  -> a token line; the first field is added to the sentence
      being built (the second field is not part of the result);
    * one field   -> an intent line; it closes the sentence being built.
      If the field contains "/", only the segment at index 1 of the
      "/"-split is kept. The intent is then split on "#" into its labels;
    * anything else (blank lines, three or more fields) is skipped.

    Tokens that follow the last intent line are never emitted.

    Parameters
    ----------
    filePath : path of the UTF-8 encoded file to read.

    Return
    -------
    list : List[Tuple[str, List[str]]]
        One tuple per intent line: the space-joined sentence and the list
        of its labels.
    """
    examples = []
    pendingTokens = []
    with open(filePath, 'r', encoding="utf8") as fr:
        for rawLine in fr:
            fields = rawLine.split()
            if len(fields) == 2:
                pendingTokens.append(fields[0])
                continue
            if len(fields) != 1:
                continue

            # Intent line: finish the sentence collected so far.
            intent = fields[0]
            if "/" in intent:
                intent = intent.split("/")[1]
            # str.split returns a one-element list when "#" is absent, so
            # single-intent and multi-intent lines share this path.
            examples.append((' '.join(pendingTokens), intent.split('#')))
            pendingTokens = []
    return examples

def read_CLINC150_file(filePath):
    """Load a JSON file and return its parsed top-level object.

    The top-level JSON value must be an object (its keys are printed);
    the caller in this notebook treats every value as a list of examples.

    Parameters
    ----------
    filePath : path of the JSON file to read.

    Return
    -------
    dict
        The parsed JSON object.

    Side effects
    ------------
    Prints the keys of the loaded object to stdout.
    """
    # Pin UTF-8 rather than relying on the platform's locale encoding, so
    # non-ASCII utterances decode the same way on every machine.
    with open(filePath, 'r', encoding='utf-8') as f:
        dataDict = json.load(f)
    print('The keys found in this json are: ', dataDict.keys())
    return dataDict

def get_label_set(*lists):
    """Collect the distinct labels found in one or more example lists.

    Parameters
    ----------
    *lists : any number of iterables whose items unpack into exactly two
        values, ``(sentence, labels)``, where ``labels`` is an iterable of
        hashable labels (strings in this notebook).

    Return
    -------
    list
        Each distinct label once, in order of first appearance. The position
        of a label in this list can serve as its index.
    """
    # dict keys keep insertion order and give constant-time duplicate
    # detection, replacing a linear scan of the result list per label.
    firstSeen = dict.fromkeys(
        label
        for currentList in lists
        for _sentence, labels in currentList
        for label in labels
    )
    return list(firstSeen)

def turn_single_label_to_multilabels(*lists):
    """Wrap the label of every example in a one-element list, in place.

    Each example is expected to be a mutable sequence whose item at index 1
    is its label; that item is replaced by ``[label]``. Examples whose label
    is already a list are left untouched, so calling this again on the same
    data (e.g. re-running a cell) does not produce nested ``[[label]]``.

    Parameters
    ----------
    *lists : any number of iterables of examples to convert.

    Return
    -------
    None
        The examples are modified in place.
    """
    for examples in lists:
        for element in examples:
            # Already multi-label: wrapping again would nest the list.
            if not isinstance(element[1], list):
                element[1] = [element[1]]


In [5]:
# Load the raw dataset once; the printed keys below list the available splits.
dataDict = read_CLINC150_file('data_oos_plus.json')

# Build each split as a fresh list: the regular examples followed by the
# matching 'oos_*' examples.
trainList = [*dataDict['train'], *dataDict['oos_train']]
valList = [*dataDict['val'], *dataDict['oos_val']]
testList = [*dataDict['test'], *dataDict['oos_test']]

# Convert every example's single label into a list of labels (in place).
turn_single_label_to_multilabels(trainList, valList, testList)

The keys found in this json are:  dict_keys(['oos_val', 'val', 'train', 'oos_test', 'test', 'oos_train'])


In [10]:
# Write each prepared split to its own JSON file, indented by 4 spaces.
for outName, splitRows in (
    ('train.json', trainList),
    ('val.json', valList),
    ('test.json', testList),
):
    with open(outName, 'w') as f:
        json.dump(splitRows, f, indent=4)