In [1]:
import pandas as pd
import numpy as np
import os
import zipfile
import json
from collections import Counter
from transformers import BertTokenizer

# 数据处理

In [2]:
def read_zipped_json(filepath):
    """Load the JSON document stored in the first member of a zip archive.

    Args:
        filepath: Path to the ``.zip`` file.

    Returns:
        The object decoded by ``json.load`` from the first entry listed in
        the archive, or ``None`` when the archive has no entries.
    """
    with zipfile.ZipFile(filepath) as archive:
        # Only the first listed member is ever read; any others are ignored.
        first_member = next(iter(archive.namelist()), None)
        if first_member is None:
            return None
        with archive.open(first_member) as stream:
            return json.load(stream)


def _locate_token_span(tokenizer, utterance, tokens, value):
    """Find the token-level span that ``value`` occupies inside ``utterance``.

    Args:
        tokenizer: Object exposing ``tokenize(str) -> list`` (the same one
            used to produce ``tokens``).
        utterance: Raw utterance text.
        tokens: Result of ``tokenizer.tokenize(utterance)``.
        value: Slot value to look for.

    Returns:
        ``(start, end)`` token indices (end exclusive) with
        ``0 <= start < end <= len(tokens)``, or ``None`` when the value is
        empty, does not occur in the utterance, or maps to an empty span.
    """
    if not value or value not in utterance:
        return None
    char_idx = utterance.index(value)
    # Token offset of the value == number of tokens in the text preceding it.
    start = len(tokenizer.tokenize(utterance[:char_idx]))
    # The end is measured from the *token* offset (not the character offset)
    # and clamped so the tag list can never grow past the token list.
    end = min(start + len(tokenizer.tokenize(value)), len(tokens))
    if start >= end:
        return None
    return start, end


def preprocess():
    """Build joint intent / slot-tagging data from the raw zipped dialogues.

    For each split in ``('train', 'val', 'test')`` this reads
    ``./data/raw/{split}.json.zip``, walks every turn of every dialogue and
    emits one record per turn::

        [tokens, tags, intents, golden, context]

    * ``tokens``  - tokenizer output for the turn's ``content``.
    * ``tags``    - one label per token: ``'O'`` or ``'B+…'`` / ``'I+…'`` built
      from ``intent+domain+slot`` for ``Inform``/``Recommend`` acts whose
      value is found verbatim in the utterance (slots containing
      ``'酒店设施'`` are excluded from tagging).
    * ``intents`` - ``intent+domain+slot+value`` strings for every other act.
    * ``golden``  - all dialogue acts as ``[intent, domain, slot, value]``.
    * ``context`` - up to the three preceding utterances of the dialogue.

    Side effects:
        Writes ``{split}_data.json``, ``intent_vocab.json`` and
        ``tag_vocab.json`` (UTF-8) under ``./data/joint`` and prints summary
        statistics.
    """
    processed_data_dir = './data/joint'
    os.makedirs(processed_data_dir, exist_ok=True)
    all_intent = []
    all_tag = []
    context_size = 3
    tokenizer = BertTokenizer.from_pretrained("hfl/chinese-bert-wwm-ext")
    for key in ('train', 'val', 'test'):
        data = read_zipped_json(f'./data/raw/{key}.json.zip')
        processed_data = []

        for item in data.values():
            context = []
            for turn in item['messages']:
                utterance = turn["content"]
                tokens = tokenizer.tokenize(utterance)
                tags = ['O'] * len(tokens)
                intents = []
                golden = []
                for intent, domain, slot, value in turn['dialog_act']:
                    if intent in ('Inform', 'Recommend') and '酒店设施' not in slot:
                        span = _locate_token_span(
                            tokenizer, utterance, tokens, value)
                        if span is None:
                            # Value cannot be aligned to tokens: keep it only
                            # in the gold annotation, untagged.
                            golden.append([intent, domain, slot, value])
                            continue
                        start, end = span
                        tag = '+'.join((intent, domain, slot))
                        tags[start:end] = ['B+' + tag] + \
                            ['I+' + tag] * (end - start - 1)
                        # Store the value as reconstructed from word pieces so
                        # it matches what can be decoded from the tags.
                        token_v = ''.join(
                            tokens[start:end]).replace('##', '')
                        golden.append([intent, domain, slot, token_v])
                    else:
                        intents.append('+'.join([intent, domain, slot, value]))
                        golden.append([intent, domain, slot, value])
                processed_data.append(
                    [tokens, tags, intents, golden, context[-context_size:]])
                all_intent.extend(intents)
                all_tag.extend(tags)
                context.append(utterance)

        # De-duplicate after each split while preserving first-seen order.
        all_intent = list(dict.fromkeys(all_intent))
        all_tag = list(dict.fromkeys(all_tag))
        print("loaded {}, size {}".format(key, len(processed_data)))
        # Explicit UTF-8: the payload is non-ASCII and ensure_ascii=False
        # would otherwise depend on the platform's default encoding.
        with open(os.path.join(processed_data_dir, f'{key}_data.json'), 'w',
                  encoding='utf-8') as f:
            json.dump(processed_data, f, ensure_ascii=False, indent=2)
    print("sentence label num:", len(all_intent))
    print("tag num:", len(all_tag))
    print(all_intent)
    with open(os.path.join(processed_data_dir, 'intent_vocab.json'), 'w',
              encoding='utf-8') as f1:
        json.dump(all_intent, f1, ensure_ascii=False, indent=2)
    with open(os.path.join(processed_data_dir, 'tag_vocab.json'), 'w',
              encoding='utf-8') as f2:
        json.dump(all_tag, f2, ensure_ascii=False, indent=2)

In [8]:
# Run the full preprocessing pass: writes the per-split data files and the
# intent/tag vocabularies under ./data/joint and prints summary counts.
preprocess()

loaded train, size 84692
loaded val, size 8458
loaded test, size 8476
sentence label num: 158
tag num: 77
['General+greet+none+none', 'Request+景点+名称+', 'Request+景点+地址+', 'Request+景点+游玩时间+', 'Request+餐馆+营业时间+', 'Request+餐馆+评分+', 'Request+酒店+名称+', 'Request+酒店+周边景点+', 'General+thank+none+none', 'Inform+酒店+酒店设施-叫醒服务+是', 'Request+酒店+电话+', 'Request+景点+电话+', 'Request+餐馆+名称+', 'Request+餐馆+周边餐馆+', 'Request+餐馆+电话+', 'General+welcome+none+none', 'NoOffer+餐馆+none+none', 'Request+餐馆+地址+', 'Inform+酒店+酒店设施-无烟房+是', 'Request+酒店+地址+', 'Request+景点+周边景点+', 'Request+酒店+酒店类型+', 'NoOffer+景点+none+none', 'Request+景点+周边酒店+', 'Request+景点+周边餐馆+', 'Request+景点+门票+', 'Request+酒店+周边餐馆+', 'Request+出租+车型+', 'Request+出租+车牌+', 'Request+餐馆+周边景点+', 'Request+餐馆+周边酒店+', 'Request+地铁+出发地附近地铁站+', 'Request+地铁+目的地附近地铁站+', 'Request+景点+评分+', 'Select+餐馆+源领域+景点', 'Inform+酒店+酒店设施-商务中心+是', 'Inform+酒店+酒店设施-中式餐厅+是', 'Inform+酒店+酒店设施-接站服务+是', 'Request+酒店+价格+', 'General+bye+none+none', 'Request+酒店+酒店设施-叫醒服务+', 'Inform+酒店+酒店设施-叫醒服务+否', 'Info

# 