In [1]:
import pickle
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
import time
from torch.nn.utils.rnn import pad_packed_sequence, pack_padded_sequence
from collections import Counter
from tqdm import tqdm as tqdm
from itertools import zip_longest
from copy import deepcopy
from collections import Counter
from os.path import join
from codecs import open
from tqdm import tqdm_notebook as tqdm

In [2]:
def FormatTestFile(file_path,outdata_path):
    """Convert a raw test file into one-character-per-line ``<char> O`` format.

    The input is read as UTF-8 (a leading BOM is stripped) and split on the
    article separator ``\\n\\n--------------------\\n\\n``.  The piece after the
    final separator is discarded, so the file is expected to end with a
    separator.  For every remaining article the *second* line is taken as the
    text (presumably the first line is an article header -- verify against the
    data).  Each character of that text is written on its own line followed by
    the dummy tag ``O``; ASCII and ideographic spaces are written as ``_``.
    Articles are separated by a blank line.

    Args:
        file_path: path of the raw test file to read.
        outdata_path: path of the converted file to write (overwritten).

    Raises:
        IndexError: if an article chunk has fewer than two lines.
    """
    with open(file_path, 'r', encoding='utf8') as f:
        # Re-decoding with 'utf-8-sig' removes a leading byte-order mark.
        file_text = f.read().encode('utf-8').decode('utf-8-sig')
    articles = file_text.split('\n\n--------------------\n\n')[:-1]

    # Collect the pieces in a list and join once instead of growing a string
    # with repeated ``+=``.
    pieces = []
    for article in tqdm(articles):
        content = article.split('\n')[1]
        for ch in content:
            # " " is the ASCII space, "\u3000" the full-width (ideographic) space.
            if ch in (" ", "\u3000"):
                pieces.append("_ O\n")
            else:
                pieces.append(ch + " O\n")
        pieces.append("\n")

    with open(outdata_path, 'w', encoding='utf-8') as f:
        f.write("".join(pieces))

In [3]:
def build_corpus(split, make_vocab=True, data_dir="./Data"):
    """Read ``<data_dir>/<split>.char.bmes`` into parallel word/tag lists.

    Each non-blank line must hold exactly two whitespace-separated fields,
    ``<word> <tag>``.  A blank line closes the current sentence.  A sentence
    that is still open at end-of-file (file not terminated by a blank line)
    is kept as well rather than being dropped.

    Args:
        split: one of ``'train'``, ``'dev'``, ``'test'``.
        make_vocab: when true, also build id maps with ``build_map``.
        data_dir: directory containing the ``.char.bmes`` files.

    Returns:
        ``(word_lists, tag_lists, word2id, tag2id)`` if ``make_vocab`` is
        true, otherwise ``(word_lists, tag_lists)``.

    Raises:
        ValueError: if a non-blank line does not split into two fields.
    """
    assert split in ['train', 'dev', 'test']
    word_lists = []
    tag_lists = []
    with open(join(data_dir, split+".char.bmes"), 'r', encoding='utf-8') as f:
        word_list = []
        tag_list = []
        for line in tqdm(f):
            if line != '\r\n' and line != '\n':
                word, tag = line.strip('\r\n').split()
                word_list.append(word)
                tag_list.append(tag)
            else:
                word_lists.append(word_list)
                tag_lists.append(tag_list)
                word_list = []
                tag_list = []

    # Flush the last sentence when the file has no trailing blank line.
    if word_list:
        word_lists.append(word_list)
        tag_lists.append(tag_list)

    if make_vocab:
        word2id = build_map(word_lists)
        tag2id = build_map(tag_lists)
        return word_lists, tag_lists, word2id, tag2id
    else:
        return word_lists, tag_lists

In [4]:
def extend_maps(word2id, tag2id, for_crf=True):
    """Add the special tokens to both id maps (in place) and return them.

    ``<unk>`` and ``<pad>`` are always added; ``<start>`` and ``<end>`` are
    added as well when ``for_crf`` is true.  Each new token receives the next
    free id, ``len(map)``.  A token that is already present keeps its id, so
    calling this twice on the same maps (e.g. re-running a notebook cell) no
    longer produces two tokens sharing one id.

    Args:
        word2id: dict mapping word -> id; mutated in place.
        tag2id: dict mapping tag -> id; mutated in place.
        for_crf: whether to add the ``<start>``/``<end>`` tokens.

    Returns:
        The same ``(word2id, tag2id)`` objects.
    """
    special_tokens = ['<unk>', '<pad>']
    if for_crf:
        special_tokens += ['<start>', '<end>']

    for mapping in (word2id, tag2id):
        for token in special_tokens:
            if token not in mapping:
                mapping[token] = len(mapping)

    return word2id, tag2id

In [5]:
def prepocess_data_for_lstmcrf(word_lists, tag_lists, test=False):
    """Append the ``<end>`` marker to every sentence, in place.

    Every word list gets a trailing ``"<end>"``.  Tag lists get one too,
    unless ``test`` is true (test data needs no end token on the tag side).

    Args:
        word_lists: sequence of mutable word lists.
        tag_lists: sequence of mutable tag lists, same length as ``word_lists``.
        test: set to true for test data to leave the tag lists untouched.

    Returns:
        The same ``(word_lists, tag_lists)`` objects that were passed in.
    """
    assert len(word_lists) == len(tag_lists)
    add_tag_end = not test
    for idx, words in enumerate(word_lists):
        words.append("<end>")
        if add_tag_end:
            tag_lists[idx].append("<end>")

    return word_lists, tag_lists

In [6]:
def cal_lstm_crf_loss(crf_scores, targets, tag2id):
    """CRF loss: (log-sum-exp score of all paths - score of the gold path) / batch size.

    Args:
        crf_scores: 4-D score tensor indexed as
            ``[batch, step, previous_tag, current_tag]``; both tag dimensions
            must have size ``len(tag2id)`` (required by the ``view`` below).
        targets: 2-D integer tensor ``[batch_size, max_len]`` of gold tag ids,
            padded with ``tag2id['<pad>']``.
        tag2id: dict providing the ``'<pad>'``, ``'<start>'`` and ``'<end>'`` ids.

    Returns:
        A scalar tensor holding the batch-averaged loss.

    NOTE(review): the ``[:batch_size_t]`` prefix slicing below only selects the
    still-active sequences if the batch rows are ordered by decreasing length;
    the trainer in this file sorts with ``sort_by_lengths`` before batching.
    """

    pad_id = tag2id.get('<pad>')
    start_id = tag2id.get('<start>')
    end_id = tag2id.get('<end>')

    device = crf_scores.device
    batch_size, max_len = targets.size()
    target_size = len(tag2id)

    # The mask and the true lengths must be computed before `targets` is
    # re-encoded on the next line.
    mask = (targets != pad_id)
    lengths = mask.sum(dim=1)
    # `indexed` is defined outside this view.  Presumably it turns each gold
    # tag into a flat (previous_tag, current_tag) index into the
    # target_size*target_size matrix, with start_id as the predecessor of the
    # first tag -- TODO confirm, including whether it modifies `targets` in place.
    targets = indexed(targets, target_size, start_id)

    # Keep only the non-padding positions, flattened to 1-D.
    targets = targets.masked_select(mask) 

    # One row of target_size*target_size scores per non-padding position.
    flatten_scores = crf_scores.masked_select(
        mask.view(batch_size, max_len, 1, 1).expand_as(crf_scores)
    ).view(-1, target_size*target_size).contiguous()

    # Sum of the scores picked out by the gold indices (the gold path score).
    golden_scores = flatten_scores.gather(
        dim=1, index=targets.unsqueeze(1)).sum()

    # Forward recursion: scores_upto_t[b, j] is the log-sum-exp of the scores
    # of all partial paths of sequence b that end in tag j at the current step.
    scores_upto_t = torch.zeros(batch_size, target_size).to(device)
    for t in range(max_len):
        # Number of sequences that are longer than t.
        batch_size_t = (lengths > t).sum().item()
        if t == 0:
            # The first step can only be reached from the <start> tag.
            scores_upto_t[:batch_size_t] = crf_scores[:batch_size_t,
                                                      t, start_id, :]
        else:
            # Add the running score along the previous-tag axis, then reduce
            # over that axis.
            scores_upto_t[:batch_size_t] = torch.logsumexp(
                crf_scores[:batch_size_t, t, :, :] +
                scores_upto_t[:batch_size_t].unsqueeze(2),
                dim=1
            )
    # Rows stop being updated once their sequence is exhausted, so every row
    # holds the scores at its own last step; only paths ending in <end> count.
    all_path_scores = scores_upto_t[:, end_id].sum()
    loss = (all_path_scores - golden_scores) / batch_size
    return loss


In [7]:
def tensorized(batch, maps):
    """Turn a batch of token sequences into a padded id tensor.

    Tokens are looked up in ``maps``; unknown tokens map to ``maps['<unk>']``
    and short rows are padded with ``maps['<pad>']``.  The width is the
    length of the longest row, so the batch does not have to be sorted and an
    empty batch yields a ``(0, 0)`` tensor instead of raising.

    Args:
        batch: sequence of token sequences.
        maps: dict token -> id containing ``'<pad>'`` and ``'<unk>'``.

    Returns:
        ``(batch_tensor, lengths)`` where ``batch_tensor`` is a ``torch.long``
        tensor of shape ``[len(batch), max_len]`` and ``lengths`` is the list
        of original row lengths.
    """
    PAD = maps.get('<pad>')
    UNK = maps.get('<unk>')

    lengths = [len(l) for l in batch]
    max_len = max(lengths, default=0)
    batch_size = len(batch)

    batch_tensor = torch.full((batch_size, max_len), PAD, dtype=torch.long)
    for i, l in enumerate(batch):
        if lengths[i]:
            # One slice assignment per row instead of one write per element.
            batch_tensor[i, :lengths[i]] = torch.tensor(
                [maps.get(e, UNK) for e in l], dtype=torch.long)

    return batch_tensor, lengths

In [8]:
def sort_by_lengths(word_lists, tag_lists):
    """Sort sentence/tag pairs by sentence length, longest first.

    The sort is stable, so sentences of equal length keep their input order.

    Args:
        word_lists: sequence of word sequences.
        tag_lists: sequence of tag sequences, paired positionally with
            ``word_lists``.

    Returns:
        ``(word_lists, tag_lists, indices)``: two tuples in sorted order and
        the list ``indices`` such that sorted position ``k`` holds the pair
        that was originally at ``indices[k]``.  Empty input yields
        ``((), (), [])`` instead of raising on the unpack.
    """
    pairs = list(zip(word_lists, tag_lists))
    if not pairs:
        return (), (), []

    indices = sorted(range(len(pairs)),
                     key=lambda k: len(pairs[k][0]),
                     reverse=True)
    sorted_words, sorted_tags = zip(*(pairs[i] for i in indices))
    return sorted_words, sorted_tags, indices

In [9]:
def build_map(lists):
    """Assign consecutive ids (starting at 0) to items in first-seen order.

    Args:
        lists: iterable of iterables of hashable items.

    Returns:
        dict mapping each distinct item to its id.
    """
    # dict.fromkeys de-duplicates while preserving first-occurrence order.
    first_seen = dict.fromkeys(item for seq in lists for item in seq)
    return {item: idx for idx, item in enumerate(first_seen)}

In [10]:
def load_model(file_name):
    """Unpickle and return the object stored in ``file_name``.

    NOTE: unpickling can execute arbitrary code, so only load files that
    come from a trusted source.
    """
    with open(file_name, "rb") as model_file:
        return pickle.load(model_file)

In [11]:
class BILSTM_Model(object):
    """Training / evaluation wrapper around a BiLSTM or BiLSTM-CRF tagger.

    Hyper-parameters are read from the module-level ``LSTMConfig`` and
    ``TrainingConfig`` objects when the wrapper is constructed.
    """

    def __init__(self, vocab_size, out_size, crf=True):
        """Build the network, the optimizer and the bookkeeping state.

        Args:
            vocab_size: number of entries in the word vocabulary.
            out_size: number of output tags.
            crf: use ``BiLSTM_CRF`` with ``cal_lstm_crf_loss`` when true,
                plain ``BiLSTM`` with ``cal_loss`` otherwise.
        """
        self.device = torch.device(
            "cuda" if torch.cuda.is_available() else "cpu")

        self.emb_size = LSTMConfig.emb_size
        self.hidden_size = LSTMConfig.hidden_size

        self.crf = crf

        if not crf:
            self.model = BiLSTM(vocab_size, self.emb_size, self.hidden_size, out_size).to(self.device)
            self.cal_loss_func = cal_loss
        else:
            self.model = BiLSTM_CRF(vocab_size, self.emb_size,self.hidden_size, out_size).to(self.device)
            self.cal_loss_func = cal_lstm_crf_loss

        self.epoches = TrainingConfig.epoches
        self.print_step = TrainingConfig.print_step
        self.lr = TrainingConfig.lr
        self.batch_size = TrainingConfig.batch_size
        self.optimizer = optim.Adam(self.model.parameters(), lr=self.lr)
        self.step = 0
        self._best_val_loss = 1e18
        self.best_model = None
        # Item-assigning into a non-existent attribute raised AttributeError
        # on every construction.  Create the container first so that
        # ``model_temp[100]`` is None; nothing in this class reads it.
        self.model_temp = {100: None}

    def _batch_loss(self, batch_sents, batch_tags, word2id, tag2id):
        """Tensorize one batch, run the model and return the loss tensor."""
        tensorized_sents, lengths = tensorized(batch_sents, word2id)
        tensorized_sents = tensorized_sents.to(self.device)
        targets, lengths = tensorized(batch_tags, tag2id)
        targets = targets.to(self.device)

        scores = self.model(tensorized_sents, lengths)
        return self.cal_loss_func(scores, targets, tag2id).to(self.device)

    def train(self, word_lists, tag_lists,
              dev_word_lists, dev_tag_lists,
              word2id, tag2id):
        """Train for ``self.epoches`` epochs, validating after each one.

        Both data sets are sorted by decreasing sentence length before
        batching.  One line with the validation loss is printed per epoch;
        ``validate`` keeps a copy of the best model in ``self.best_model``.
        """
        word_lists, tag_lists, _ = sort_by_lengths(word_lists, tag_lists)
        dev_word_lists, dev_tag_lists, _ = sort_by_lengths(
            dev_word_lists, dev_tag_lists)

        B = self.batch_size
        for e in (range(1, self.epoches+1)):
            self.step = 0
            # The progress bar replaces per-step loss printing; the running
            # loss that used to be accumulated here was never reported.
            for ind in tqdm(range(0, len(word_lists), B)):
                batch_sents = word_lists[ind:ind+B]
                batch_tags = tag_lists[ind:ind+B]
                self.train_step(batch_sents, batch_tags, word2id, tag2id)

            val_loss = self.validate(dev_word_lists, dev_tag_lists, word2id, tag2id)
            print("Epoch {}/{}, Val Loss:{:.4f}".format(e, self.epoches, val_loss))

    def train_step(self, batch_sents, batch_tags, word2id, tag2id):
        """Run one optimisation step on a batch and return its loss as a float."""
        self.model.train()
        self.step += 1

        self.optimizer.zero_grad()
        loss = self._batch_loss(batch_sents, batch_tags, word2id, tag2id)
        loss.backward()
        self.optimizer.step()

        return loss.item()

    def validate(self, dev_word_lists, dev_tag_lists, word2id, tag2id):
        """Return the mean per-batch loss on the dev set.

        When the loss improves on the best value seen so far, a deep copy of
        the current model is stored in ``self.best_model``.  An empty dev set
        yields ``inf`` (and never updates the best model) instead of a
        ZeroDivisionError.
        """
        self.model.eval()
        with torch.no_grad():
            val_losses = 0.
            val_step = 0
            for ind in range(0, len(dev_word_lists), self.batch_size):
                val_step += 1

                batch_sents = dev_word_lists[ind:ind+self.batch_size]
                batch_tags = dev_tag_lists[ind:ind+self.batch_size]
                loss = self._batch_loss(batch_sents, batch_tags, word2id, tag2id)
                val_losses += loss.item()
            val_loss = val_losses / val_step if val_step else float("inf")

            if val_loss < self._best_val_loss:
                print("Keep the Best...")
                self.best_model = deepcopy(self.model)
                self._best_val_loss = val_loss

            return val_loss

    def test(self, word_lists, tag_lists, word2id, tag2id):
        """Predict tags for ``word_lists`` and return them in input order.

        The whole data set is decoded as a single batch.  The best validated
        model is used when available, otherwise the current model.

        Returns:
            ``(pred_tag_lists, tag_lists)``, both restored to the order in
            which the sentences were passed in.
        """
        word_lists, tag_lists, indices = sort_by_lengths(word_lists, tag_lists)
        tensorized_sents, lengths = tensorized(word_lists, word2id)
        tensorized_sents = tensorized_sents.to(self.device)

        # Fall back to the live model if no best model was ever recorded
        # (e.g. test() called without a preceding train()).
        model = self.best_model if self.best_model is not None else self.model
        model.eval()
        with torch.no_grad():
            batch_tagids = model.test(tensorized_sents, lengths, tag2id)

        pred_tag_lists = []
        id2tag = dict((id_, tag) for tag, id_ in tag2id.items())
        for i, ids in enumerate(batch_tagids):
            # With the CRF the last position of each sentence is the <end>
            # token, which has no predicted tag.
            n_tags = lengths[i] - 1 if self.crf else lengths[i]
            pred_tag_lists.append(
                [id2tag[ids[j].item()] for j in range(n_tags)])

        # Undo the length sort: `indices` becomes the inverse permutation.
        ind_maps = sorted(list(enumerate(indices)), key=lambda e: e[1])
        indices, _ = list(zip(*ind_maps))
        pred_tag_lists = [pred_tag_lists[i] for i in indices]
        tag_lists = [tag_lists[i] for i in indices]

        return pred_tag_lists, tag_lists


class BiLSTM_CRF(nn.Module):
    """BiLSTM emission scores combined with a learned tag-transition matrix."""

    def __init__(self, vocab_size, emb_size, hidden_size, out_size):
        """Create the BiLSTM encoder and the ``[out_size, out_size]`` transition matrix.

        Every transition score is initialised to ``1 / out_size``.
        """
        super(BiLSTM_CRF, self).__init__()
        self.bilstm = BiLSTM(vocab_size, emb_size, hidden_size, out_size)
        self.transition = nn.Parameter(
            torch.ones(out_size, out_size) * 1/out_size)

    def forward(self, sents_tensor, lengths):
        """Return CRF scores of shape ``[B, L, out_size, out_size]``.

        ``crf_scores[b, t, i, j]`` is the emission score of tag ``j`` at step
        ``t`` plus the transition score ``transition[i, j]``.
        """
        emission = self.bilstm(sents_tensor, lengths)
        batch_size, max_len, out_size = emission.size()
        crf_scores = emission.unsqueeze(
            2).expand(-1, -1, out_size, -1) + self.transition.unsqueeze(0)

        return crf_scores

    def test(self, test_sents_tensor, lengths, tag2id):
        """Viterbi-decode a batch and return the best tag ids.

        The ``[:batch_size_t]`` prefix slicing requires the batch rows to be
        ordered by decreasing length.  The tag at each sequence's last
        position is fixed to ``<end>`` and is not part of the output.

        Args:
            test_sents_tensor: padded word-id tensor ``[B, L]``.
            lengths: list of true sequence lengths.
            tag2id: dict providing the ``'<start>'``, ``'<end>'`` and
                ``'<pad>'`` ids.

        Returns:
            CPU ``torch.long`` tensor of shape ``[B, L-1]``; row ``b`` holds
            the decoded tags for positions ``0 .. lengths[b]-2`` followed by
            ``<pad>`` ids.

        Raises:
            RuntimeError: propagated from the back-pointer lookup when an
                index is invalid (this used to drop into ``pdb``).
        """
        start_id = tag2id['<start>']
        end_id = tag2id['<end>']
        pad = tag2id['<pad>']

        crf_scores = self.forward(test_sents_tensor, lengths)
        device = crf_scores.device
        B, L, T, _ = crf_scores.size()
        # viterbi[b, t, j]: best score of any path ending in tag j at step t.
        viterbi = torch.zeros(B, L, T, device=device)
        # backpointer[b, t, j]: the tag at step t-1 on that best path.
        # Entries at padded positions stay 0 and are never read.
        backpointer = torch.zeros(B, L, T, dtype=torch.long, device=device)
        lengths = torch.LongTensor(lengths).to(device)

        # Forward pass.
        for step in range(L):
            batch_size_t = (lengths > step).sum().item()
            if step == 0:
                # The first step can only be reached from <start>.
                viterbi[:batch_size_t, step,
                        :] = crf_scores[: batch_size_t, step, start_id, :]
                backpointer[: batch_size_t, step, :] = start_id
            else:
                max_scores, prev_tags = torch.max(
                    viterbi[:batch_size_t, step-1, :].unsqueeze(2) +
                    crf_scores[:batch_size_t, step, :, :],
                    dim=1
                )
                viterbi[:batch_size_t, step, :] = max_scores
                backpointer[:batch_size_t, step, :] = prev_tags

        # Backward pass.  Flatten to [B, L*T] so that (step, tag) is addressed
        # as step * T + tag; T is the actual stride of the flattened tensor.
        backpointer = backpointer.view(B, -1)
        tagids = []
        tags_t = None
        for step in range(L-1, 0, -1):
            batch_size_t = (lengths > step).sum().item()
            if tags_t is None:
                # Longest sequences: their last tag is <end>.
                offset = torch.full((batch_size_t,), end_id,
                                    dtype=torch.long, device=device)
            else:
                # Sequences whose last position is this step join the batch
                # here, starting from <end>.
                new_in_batch = torch.full(
                    (batch_size_t - len(tags_t),), end_id,
                    dtype=torch.long, device=device)
                offset = torch.cat([tags_t, new_in_batch], dim=0)

            index = offset + step * T
            tags_t = backpointer[:batch_size_t].gather(
                dim=1, index=index.unsqueeze(1)).squeeze(1)
            tagids.append(tags_t.tolist())

        # tagids is ordered last step first with one entry per active
        # sequence; reverse and transpose to one row per sequence.
        tagids = list(zip_longest(*reversed(tagids), fillvalue=pad))
        tagids = torch.tensor(tagids, dtype=torch.long)

        return tagids


In [12]:
class BiLSTM(nn.Module):
    """Sequence tagger: embedding -> bidirectional LSTM -> per-position linear scorer."""

    def __init__(self, vocab_size, emb_size, hidden_size, out_size):
        """Create the three layers of the tagger.

        Args:
            vocab_size: number of rows in the embedding table.
            emb_size: width of each embedding vector (LSTM input size).
            hidden_size: LSTM hidden width, per direction.
            out_size: number of scores produced for every position.
        """
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, emb_size)
        self.bilstm = nn.LSTM(
            input_size=emb_size,
            hidden_size=hidden_size,
            batch_first=True,
            bidirectional=True,
        )
        # Both directions are concatenated by the LSTM, hence 2 * hidden_size inputs.
        self.lin = nn.Linear(in_features=2 * hidden_size, out_features=out_size)

    def forward(self, sents_tensor, lengths):
        """Score every position of a padded batch.

        Args:
            sents_tensor: batch of token ids, indexed [B, L].
            lengths: true (unpadded) length of each sequence in the batch.
                pack_padded_sequence is called with its default settings, so
                the caller is expected to supply the batch in the order that
                function requires (NOTE(review): longest first -- confirm
                against the callers).

        Returns:
            Tensor of scores, [B, L, out_size].
        """
        embedded = self.embedding(sents_tensor)  # [B, L, emb_size]

        # Pack so the LSTM skips padded positions, then restore the padded layout.
        packed_in = pack_padded_sequence(embedded, lengths, batch_first=True)
        packed_out, _state = self.bilstm(packed_in)
        padded_out, _out_lengths = pad_packed_sequence(packed_out, batch_first=True)

        return self.lin(padded_out)  # [B, L, out_size]

    def test(self, sents_tensor, lengths, _):
        """Return, for each position, the index of the highest-scoring output.

        The third positional argument is accepted but never used.
        """
        logits = self.forward(sents_tensor, lengths)  # [B, L, out_size]
        # torch.max over a dim returns (values, indices); only the indices are needed.
        return torch.max(logits, dim=2)[1]

In [13]:
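# Optional one-off step: convert the raw test file into the
# one-character-per-line format that build_corpus reads.
# Uncomment to regenerate ./Data/test.char.bmes.
# inputdata_path = f"test.txt"       # file to convert
# outdata_path = f'./Data/test.char.bmes'   # converted output file

# FormatTestFile(inputdata_path,outdata_path)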

In [14]:
word2id = {'醫': 0, '師': 1, '：': 2, '啊': 3, '回': 4, '去': 5, '還': 6, '好': 7, '嗎': 8, '？': 9, '民': 10, '眾': 11, '欸': 12, '，': 13, '是': 14, '虛': 15, '的': 16, '但': 17, '。': 18, '真': 19, '險': 20, '坦': 21, '白': 22, '講': 23, '我': 24, '剛': 25, '時': 26, '候': 27, '晚': 28, '上': 29, '有': 30, '盜': 31, '汗': 32, '阿': 33, '只': 34, '前': 35, '天': 36, '很': 37, '多': 38, '就': 39, '算': 40, '沒': 41, '可': 42, '一': 43, '覺': 44, '到': 45, '明': 46, '這': 47, '樣': 48, '齁': 49, '給': 50, '你': 51, '看': 52, '電': 53, '腦': 54, '斷': 55, '層': 56, '嘿': 57, '那': 58, '個': 59, '病': 60, '毒': 61, '報': 62, '告': 63, '不': 64, '知': 65, '道': 66, '出': 67, '來': 68, '對': 69, '它': 70, '幫': 71, '驗': 72, '了': 73, '少': 74, '自': 75, '體': 76, '免': 77, '疫': 78, '呢': 79, '喔': 80, '相': 81, '信': 82, '之': 83, '都': 84, '過': 85, '哇': 86, '工': 87, '作': 88, '壓': 89, '力': 90, '大': 91, '得': 92, '潰': 93, '瘍': 94, 'n': 95, 'e': 96, 'g': 97, 'a': 98, 't': 99, 'i': 100, 'v': 101, '皰': 102, '疹': 103, '抗': 104, '也': 105, '嗯': 106, 'Q': 107, '熱': 108, '陰': 109, '性': 110, '然': 111, '後': 112, '第': 113, '次': 114, '檢': 115, '查': 116, '綜': 117, '合': 118, '結': 119, '果': 120, '所': 121, '謂': 122, '種': 123, '蚊': 124, '蟲': 125, '叮': 126, '咬': 127, '跟': 128, '動': 129, '物': 130, '、': 131, '跳': 132, '蚤': 133, '些': 134, '比': 135, '較': 136, '關': 137, '解': 138, '恙': 139, '需': 140, '再': 141, '採': 142, '係': 143, '們': 144, '要': 145, '追': 146, '蹤': 147, '起': 148, '送': 149, '疾': 150, '管': 151, '區': 152, '以': 153, '機': 154, '會': 155, '因': 156, '為': 157, '在': 158, '離': 159, '島': 160, '山': 161, '斑': 162, '傷': 163, '寒': 164, '外': 165, '啦': 166, '見': 167, 'E': 168, 'B': 169, 'V': 170, 'C': 171, 'M': 172, 'O': 173, 'K': 174, '什': 175, '麼': 176, '東': 177, '西': 178, '引': 179, '肝': 180, '功': 181, '能': 182, '異': 183, '常': 184, '巨': 185, '細': 186, '胞': 187, '良': 188, '己': 189, '讓': 190, '發': 191, '燒': 192, '陣': 193, '子': 194, '直': 195, '接': 196, '調': 197, '炎': 198, '指': 199, '數': 200, '高': 201, 'R': 202, 'P': 203, '8': 204, '5': 205, '1': 206, '0': 207, '2': 208, '睡': 
209, '喝': 210, '酒': 211, '應': 212, '酬': 213, '其': 214, '實': 215, '說': 216, '假': 217, '如': 218, '年': 219, '老': 220, '岸': 221, 'G': 222, 'o': 223, 'l': 224, '設': 225, '廠': 226, '客': 227, '戶': 228, '邊': 229, '做': 230, '跑': 231, '業': 232, '務': 233, 'A': 234, 'S': 235, '克': 236, '人': 237, '確': 238, '照': 239, '話': 240, '三': 241, '月': 242, '十': 243, '八': 244, '住': 245, '院': 246, '腸': 247, '點': 248, '壁': 249, '厚': 250, '正': 251, '該': 252, '像': 253, '蛋': 254, '殼': 255, '地': 256, '方': 257, '內': 258, '視': 259, '鏡': 260, '切': 261, '片': 262, '許': 263, '感': 264, '染': 265, '開': 266, '始': 267, '另': 268, '主': 269, '脈': 270, '嘛': 271, '條': 272, '旁': 273, '顆': 274, '淋': 275, '巴': 276, '怎': 277, '腫': 278, '理': 279, '論': 280, '等': 281, '於': 282, '哨': 283, '意': 284, '思': 285, '站': 286, '局': 287, '部': 288, '菌': 289, '或': 290, '周': 291, '遭': 292, '浪': 293, '誰': 294, '裡': 295, '面': 296, '兵': 297, '哥': 298, '球': 299, '奇': 300, '血': 301, '增': 302, '加': 303, '特': 304, '別': 305, '蠻': 306, '怪': 307, '通': 308, '衝': 309, '必': 310, '須': 311, '放': 312, '射': 313, '科': 314, '他': 315, '叫': 316, '反': 317, 'r': 318, 'c': 319, '消': 320, '下': 321, '定': 322, '退': 323, '惡': 324, '持': 325, '續': 326, '甚': 327, '至': 328, '變': 329, '而': 330, '且': 331, '規': 332, '則': 333, '圓': 334, '滾': 335, '長': 336, '當': 337, '最': 338, '準': 339, '深': 340, '膛': 341, '剖': 342, '肚': 343, '想': 344, '計': 345, '劃': 346, '降': 347, '段': 348, '間': 349, '兩': 350, '問': 351, '題': 352, '取': 353, '式': 354, '賈': 355, '伯': 356, '斯': 357, '門': 358, '診': 359, '禮': 360, '拜': 361, '抽': 362, '二': 363, '現': 364, '生': 365, '素': 366, '吃': 367, '預': 368, '防': 369, '綠': 370, '色': 371, '黃': 372, '先': 373, '停': 374, '橘': 375, '完': 376, '串': 377, '把': 378, '般': 379, '聽': 380, '付': 381, '難': 382, '搞': 383, '罕': 384, '掉': 385, '液': 386, '培': 387, '養': 388, '保': 389, '留': 390, '今': 391, '請': 392, '繼': 393, '班': 394, '太': 395, '累': 396, '畢': 397, '竟': 398, '居': 399, '美': 400, '國': 401, '買': 402, '房': 403, '紐': 404, '約': 405, '新': 406, '社': 407, '錯': 408, '圖': 
409, '書': 410, '館': 411, '總': 412, '婚': 413, '情': 414, '路': 415, '順': 416, '遂': 417, '身': 418, '顧': 419, '據': 420, '普': 421, '拿': 422, '疼': 423, '稍': 424, '微': 425, '備': 426, '著': 427, '藥': 428, '本': 429, '用': 430, '挑': 431, '簡': 432, '單': 433, '狀': 434, '況': 435, '七': 436, '號': 437, '四': 438, '九': 439, '呀': 440, '胃': 441, '快': 442, '吧': 443, '便': 444, '幽': 445, '螺': 446, '旋': 447, '桿': 448, '純': 449, '粹': 450, '諾': 451, '羅': 452, '又': 453, '久': 454, '概': 455, '五': 456, '分': 457, '吐': 458, '瀉': 459, '嚴': 460, '重': 461, '剩': 462, '猜': 463, '哪': 464, '拖': 465, '誤': 466, '打': 467, '撞': 468, 'T': 469, '差': 470, '幹': 471, '才': 472, '中': 473, '膈': 474, '腔': 475, '刀': 476, '進': 477, '心': 478, '包': 479, '膜': 480, '肌': 481, '質': 482, '敏': 483, '哈': 484, '謝': 485, '從': 486, '走': 487, '學': 488, '弟': 489, '百': 490, '塊': 491, '車': 492, '馬': 493, '費': 494, '嬤': 495, '她': 496, '家': 497, '屬': 498, '誒': 499, '行': 500, '午': 501, '昨': 502, '半': 503, '夜': 504, '3': 505, '.': 506, '尿': 507, '恩': 508, '哼': 509, '雷': 510, '氏': 511, '肺': 512, '名': 513, '字': 514, '已': 515, '針': 516, '效': 517, '注': 518, '日': 519, '辦': 520, '法': 521, '平': 522, '營': 523, '…': 524, '突': 525, '寫': 526, '轉': 527, '全': 528, '～': 529, '慶': 530, '鴻': 531, '祝': 532, '江': 533, '鳥': 534, '健': 535, '註': 536, '冊': 537, '肉': 538, '埋': 539, '附': 540, '頭': 541, '呵': 542, '同': 543, '張': 544, '處': 545, '食': 546, '鹽': 547, '水': 548, '六': 549, '核': 550, '亂': 551, '昏': 552, '恍': 553, '神': 554, '例': 555, '低': 556, '幾': 557, '輪': 558, '整': 559, '護': 560, '7': 561, '4': 562, '眼': 563, '印': 564, '象': 565, '清': 566, '楚': 567, '入': 568, '找': 569, '緊': 570, '歷': 571, '福': 572, '療': 573, '小': 574, '穩': 575, '近': 576, '口': 577, '辛': 578, '苦': 579, '絕': 580, '症': 581, '餒': 582, '被': 583, '配': 584, '申': 585, '份': 586, '支': 587, '皮': 588, '安': 589, '怕': 590, '念': 591, '教': 592, '泰': 593, '9': 594, '事': 595, '舒': 596, '服': 597, '底': 598, '副': 599, 'k': 600, '腰': 601, '痛': 602, '脫': 603, '帶': 604, '坐': 605, '膨': 606, '瘦': 607, '骨': 608, '翻': 
609, '麻': 610, '抖': 611, '萎': 612, '縮': 613, '衣': 614, '右': 615, '腿': 616, '側': 617, '隻': 618, '手': 619, '影': 620, '響': 621, '髖': 622, '黑': 623, '左': 624, '肩': 625, '稱': 626, '化': 627, '膿': 628, '止': 629, '眠': 630, '強': 631, '換': 632, 'Ｏ': 633, 'Ｋ': 634, '菸': 635, '檳': 636, '榔': 637, '原': 638, '脹': 639, '氣': 640, '鋅': 641, '補': 642, '充': 643, '劑': 644, '經': 645, '紅': 646, '類': 647, '似': 648, '愛': 649, '6': 650, '腳': 651, '何': 652, '決': 653, '復': 654, '慢': 655, '改': 656, '善': 657, '練': 658, '訓': 659, '量': 660, '撇': 661, '筷': 662, '排': 663, '士': 664, '林': 665, '裏': 666, '洗': 667, '腎': 668, '資': 669, '料': 670, '治': 671, '參': 672, '考': 673, '痠': 674, '背': 675, '躺': 676, '彎': 677, '敢': 678, '期': 679, '忙': 680, '記': 681, '敲': 682, '臟': 683, '扭': 684, '蛤': 685, '拉': 686, '網': 687, '激': 688, '烈': 689, '顯': 690, '疲': 691, '勞': 692, '畏': 693, '早': 694, '吹': 695, '風': 696, '·': 697, '哦': 698, '試': 699, '即': 700, '夠': 701, '者': 702, '冷': 703, '搬': 704, '超': 705, '流': 706, '失': 707, '活': 708, '運': 709, '更': 710, '音': 711, '波': 712, '石': 713, '悶': 714, '輸': 715, '塞': 716, '泡': 717, '造': 718, '成': 719, '鈣': 720, '組': 721, '織': 722, '吼': 723, 'D': 724, '控': 725, '制': 726, '懷': 727, '疑': 728, '胖': 729, '公': 730, '斤': 731, '目': 732, '曉': 733, '耶': 734, '壞': 735, '趕': 736, '提': 737, '醒': 738, '領': 739, '速': 740, '拍': 741, '卡': 742, '減': 743, '負': 744, '擔': 745, '無': 746, '步': 747, '喘': 748, '每': 749, '呼': 750, '吸': 751, '均': 752, 'Ｘ': 753, '光': 754, '維': 755, '謹': 756, '慎': 757, '冒': 758, '掃': 759, '描': 760, '判': 761, '亮': 762, '瘤': 763, '建': 764, '議': 765, '詳': 766, '表': 767, '示': 768, '臨': 769, '空': 770, '輕': 771, '捐': 772, '節': 773, 'Ｈ': 774, 'Ｔ': 775, 'Ｌ': 776, 'Ｖ': 777, '測': 778, '歲': 779, '倒': 780, '慮': 781, 'H': 782, '產': 783, '摸': 784, '胸': 785, '精': 786, '範': 787, '圍': 788, '讀': 789, '錢': 790, '您': 791, '煩': 792, '初': 793, '程': 794, '躲': 795, '溫': 796, '和': 797, '認': 798, '陽': 799, '曾': 800, '害': 801, '攔': 802, '束': 803, '千': 804, '癢': 805, '３': 806, '４': 807, '腹': 808, '板': 
809, '升': 810, '萬': 811, '唯': 812, '往': 813, '弱': 814, '抵': 815, '移': 816, '植': 817, '恢': 818, '衛': 819, '專': 820, '審': 821, '優': 822, '場': 823, 'X': 824, '痰': 825, '項': 826, '零': 827, '任': 828, '掛': 829, '急': 830, '毛': 831, '囊': 832, '斥': 833, '擦': 834, '終': 835, '派': 836, '乾': 837, '燥': 838, '沖': 839, '爽': 840, '油': 841, '越': 842, '陳': 843, '囉': 844, '待': 845, 'Ｂ': 846, 'Ｃ': 847, '基': 848, '並': 849, '非': 850, '侯': 851, '型': 852, '託': 853, '膽': 854, '根': 855, '除': 856, '件': 857, '趟': 858, '！': 859, '硬': 860, '癌': 861, '危': 862, '丘': 863, '形': 864, '痂': 865, '箭': 866, '龜': 867, '修': 868, '抱': 869, '歉': 870, '削': 871, '膝': 872, '蓋': 873, '掀': 874, '凸': 875, '堆': 876, '抹': 877, '破': 878, '紀': 879, '夢': 880, '膚': 881, '田': 882, '操': 883, 'N': 884, '陸': 885, '幸': 886, '政': 887, '府': 888, '團': 889, '隊': 890, '希': 891, '望': 892, '星': 893, '收': 894, '洛': 895, '亞': 896, '樓': 897, '簽': 898, '籤': 899, '受': 900, '率': 901, '呃': 902, '殺': 903, '死': 904, '頑': 905, '固': 906, '⋯': 907, '忘': 908, '鑑': 909, '頻': 910, '婦': 911, '介': 912, '紹': 913, '泌': 914, '兒': 915, '賀': 916, '爾': 917, '蒙': 918, '隱': 919, '私': 920, '糖': 921, '飲': 922, '甜': 923, '標': 924, '態': 925, '落': 926, '緣': 927, '族': 928, '偏': 929, '脂': 930, '肪': 931, '度': 932, '密': 933, '醇': 934, '煮': 935, '儲': 936, '存': 937, '觀': 938, '習': 939, '慣': 940, '盡': 941, '靠': 942, '潛': 943, '遇': 944, '睛': 945, '週': 946, '避': 947, '碌': 948, '飯': 949, '透': 950, '代': 951, '衡': 952, '碗': 953, '青': 954, '菜': 955, '魚': 956, '炒': 957, '蝦': 958, '芒': 959, '蘋': 960, '餐': 961, '胰': 962, '迫': 963, '飆': 964, '瞬': 965, '伏': 966, '符': 967, '礎': 968, '隨': 969, '錄': 970, '討': 971, '研': 972, '究': 973, '患': 974, '咳': 975, '嗽': 976, 'L': 977, '嬸': 978, '酸': 979, '甘': 980, '炸': 981, '沙': 982, '筋': 983, '閃': 984, '息': 985, '術': 986, '扁': 987, '醉': 988, '鬆': 989, '姿': 990, '勢': 991, '勇': 992, '元': 993, '屎': 994, '火': 995, '龍': 996, '軟': 997, '髮': 998, '葡': 999, '萄': 1000, '櫻': 1001, '桃': 1002, '鐵': 1003, '顏': 1004, '靈': 1005, '刺': 1006, '容': 1007, '器': 
1008, '裝': 1009, '櫃': 1010, '台': 1011, '爸': 1012, '膏': 1013, '灌': 1014, '雙': 1015, '樹': 1016, '仙': 1017, '草': 1018, '歐': 1019, '夏': 1020, '貝': 1021, '茶': 1022, '攝': 1023, '腺': 1024, '聖': 1025, '誕': 1026, '延': 1027, '喉': 1028, '嚨': 1029, '鼻': 1030, '擤': 1031, '涕': 1032, '濃': 1033, '淡': 1034, '跡': 1035, '罩': 1036, '孫': 1037, '傳': 1038, '足': 1039, '弓': 1040, '繁': 1041, '殖': 1042, '勒': 1043, '脖': 1044, '腋': 1045, '窩': 1046, '鼠': 1047, '蹊': 1048, '胯': 1049, '嘖': 1050, '濕': 1051, '選': 1052, 's': 1053, 'd': 1054, 'm': 1055, '乎': 1056, '滿': 1057, '勉': 1058, '授': 1059, '訊': 1060, '演': 1061, '秀': 1062, '司': 1063, '栽': 1064, '英': 1065, '文': 1066, '懂': 1067, '模': 1068, '擬': 1069, '向': 1070, '佛': 1071, '碩': 1072, '博': 1073, '材': 1074, '厲': 1075, '闆': 1076, '未': 1077, '系': 1078, '統': 1079, 'b': 1080, '譬': 1081, '推': 1082, 'ㄟ': 1083, 'W': 1084, '布': 1085, 'u': 1086, 'w': 1087, 'h': 1088, '畫': 1089, '鐘': 1090, '寬': 1091, '值': 1092, '海': 1093, '貧': 1094, '史': 1095, '痾': 1096, 'p': 1097, '積': 1098, '癒': 1099, 'y': 1100, '逐': 1101, '漸': 1102, '噬': 1103, '誇': 1104, '棉': 1105, '連': 1106, '廣': 1107, '各': 1108, '壯': 1109, '課': 1110, '擾': 1111, '劍': 1112, '橋': 1113, '某': 1114, '投': 1115, '複': 1116, '暑': 1117, '姑': 1118, '洲': 1119, '冰': 1120, '丈': 1121, '遮': 1122, '霧': 1123, '遠': 1124, '淚': 1125, '休': 1126, '粒': 1127, '拆': 1128, '床': 1129, '凌': 1130, '晨': 1131, '易': 1132, '致': 1133, '助': 1134, '褪': 1135, '利': 1136, '梅': 1137, '騎': 1138, '踏': 1139, '納': 1140, '筆': 1141, '貴': 1142, '環': 1143, '觸': 1144, '盤': 1145, '尼': 1146, '施': 1147, '黴': 1148, 'f': 1149, '交': 1150, '察': 1151, '端': 1152, '诶': 1153, '耐': 1154, '階': 1155, '瓜': 1156, '麵': 1157, '澱': 1158, '粉': 1159, '醣': 1160, '趨': 1161, '位': 1162, '置': 1163, '角': 1164, '探': 1165, '搭': 1166, '飛': 1167, '末': 1168, '噢': 1169, '銷': 1170, '品': 1171, '苗': 1172, '證': 1173, '暸': 1174, '叔': 1175, '將': 1176, '雲': 1177, '緩': 1178, '秋': 1179, '玩': 1180, '康': 1181, '技': 1182, '塗': 1183, '香': 1184, '搗': 1185, '摘': 1186, '女': 1187, '牛': 1188, '奶': 1189, 
'弄': 1190, '禿': 1191, '嘔': 1192, '雖': 1193, '拐': 1194, '杖': 1195, '撐': 1196, '扶': 1197, '妳': 1198, '姨': 1199, '吩': 1200, '咐': 1201, '藍': 1202, '膠': 1203, '載': 1204, '北': 1205, '市': 1206, '塭': 1207, '孩': 1208, '鄰': 1209, '騙': 1210, '王': 1211, '抓': 1212, '葉': 1213, '估': 1214, '徑': 1215, '尖': 1216, '此': 1217, '侵': 1218, '循': 1219, '擴': 1220, '散': 1221, '淨': 1222, '殘': 1223, '滲': 1224, '殊': 1225, '髒': 1226, '乘': 1227, '孔': 1228, '阻': 1229, '域': 1230, '辨': 1231, '里': 1232, '椅': 1233, '爲': 1234, '淺': 1235, '紙': 1236, '聲': 1237, '厭': 1238, '説': 1239, '麽': 1240, '呐': 1241, '着': 1242, '暈': 1243, '徹': 1244, '辣': 1245, '逆': 1246, '帕': 1247, '金': 1248, '森': 1249, '朋': 1250, '友': 1251, '男': 1252, '緒': 1253, '由': 1254, '暴': 1255, '傾': 1256, '警': 1257, '罵': 1258, '令': 1259, '憂': 1260, '鬱': 1261, '閉': 1262, '狹': 1263, '績': 1264, '桌': 1265, '寄': 1266, '朵': 1267, '花': 1268, '庭': 1269, '羞': 1270, '恥': 1271, '刻': 1272, '脆': 1273, '既': 1274, '牙': 1275, '插': 1276, '吵': 1277, '媳': 1278, '按': 1279, '摩': 1280, '李': 1281, '母': 1282, '爛': 1283, '沉': 1284, '捏': 1285, '吞': 1286, '隔': 1287, '肯': 1288, '責': 1289, '滴': 1290, '願': 1291, '線': 1292, '簿': 1293, '刷': 1294, '帳': 1295, '唉': 1296, '湄': 1297, '爆': 1298, '袋': 1299, '邏': 1300, '輯': 1301, '薄': 1302, '否': 1303, '訂': 1304, '泄': 1305, '暫': 1306, '抬': 1307, '鹹': 1308, '瓶': 1309, '礦': 1310, '泉': 1311, '６': 1312, '０': 1313, '毫': 1314, '扣': 1315, '湯': 1316, '潮': 1317, '灰': 1318, '甲': 1319, '蜂': 1320, '抑': 1321, '益': 1322, '親': 1323, '春': 1324, '痘': 1325, '洞': 1326, '癬': 1327, '故': 1328, '華': 1329, '凝': 1330, '困': 1331, '缺': 1332, '忠': 1333, '惱': 1334, '熟': 1335, '室': 1336, '混': 1337, '識': 1338, '析': 1339, '登': 1340, '吳': 1341, '樟': 1342, '芝': 1343, '彈': 1344, '集': 1345, '肛': 1346, '適': 1347, '糊': 1348, '攪': 1349, '命': 1350, '紛': 1351, '瀑': 1352, '逛': 1353, '圈': 1354, '爬': 1355, '溪': 1356, '木': 1357, '池': 1358, '夫': 1359, '邵': 1360, '承': 1361, '翰': 1362, '執': 1363, '級': 1364, '噴': 1365, '武': 1366, '漢': 1367, '媽': 1368, '限': 1369, '齊': 1370, '痔': 
1371, '瘡': 1372, '曬': 1373, '嘴': 1374, '垃': 1375, '圾': 1376, '碰': 1377, '戒': 1378, '聊': 1379, '姐': 1380, '商': 1381, '冬': 1382, '雨': 1383, '源': 1384, '疣': 1385, '戴': 1386, '套': 1387, '談': 1388, '溝': 1389, '併': 1390, '牽': 1391, '涉': 1392, '溶': 1393, '委': 1394, '婉': 1395, '勸': 1396, '賠': 1397, '償': 1398, '救': 1399, '濟': 1400, '署': 1401, '境': 1402, '穿': 1403, '掌': 1404, '典': 1405, '鳳': 1406, '梨': 1407, '飽': 1408, '祕': 1409, '絞': 1410, '土': 1411, '燙': 1412, '使': 1413, '障': 1414, '褲': 1415, '富': 1416, '楷': 1417, '柔': 1418, '倍': 1419, '額': 1420, '依': 1421, '罰': 1422, '脊': 1423, '屏': 1424, '鎮': 1425, '髓': 1426, '律': 1427, '途': 1428, '義': 1429, '敗': 1430, '扎': 1431, '互': 1432, '舉': 1433, '官': 1434, '潦': 1435, '疏': 1436, '導': 1437, '供': 1438, '撥': 1439, '述': 1440, '拷': 1441, '權': 1442, '章': 1443, '蝕': 1444, '肘': 1445, '訴': 1446, '縫': 1447, '膀': 1448, '棒': 1449, '員': 1450, '督': 1451, '促': 1452, '沾': 1453, '黏': 1454, '農': 1455, '曆': 1456, '弛': 1457, '箱': 1458, '笑': 1459, '窗': 1460, '肥': 1461, '亨': 1462, '佔': 1463, '案': 1464, '短': 1465, '陪': 1466, '鍋': 1467, '醬': 1468, '歹': 1469, '漱': 1470, '滅': 1471, '罐': 1472, '妹': 1473, '雜': 1474, '慾': 1475, '梯': 1476, '幅': 1477, '省': 1478, '佐': 1479, 'I': 1480, '搓': 1481, '磨': 1482, '摳': 1483, '痱': 1484, '共': 1485, '季': 1486, '宣': 1487, '檔': 1488, '偶': 1489, '纖': 1490, '碎': 1491, '堅': 1492, '忍': 1493, '蔡': 1494, '漿': 1495, '達': 1496, '酯': 1497, '滷': 1498, '輔': 1499, '浮': 1500, '鞏': 1501, '牌': 1502, '舊': 1503, '版': 1504, '擠': 1505, '榮': 1506, '糟': 1507, '鞋': 1508, '柺': 1509, '漏': 1510, '遺': 1511, '吉': 1512, '韋': 1513, '含': 1514, '鋁': 1515, '鎂': 1516, '牠': 1517, '嗜': 1518, '貪': 1519, '尤': 1520, '忽': 1521, '粥': 1522, '竹': 1523, '炭': 1524, '倦': 1525, '濁': 1526, '氧': 1527, '嚇': 1528, '衆': 1529, '莫': 1530, '燕': 1531, '麥': 1532, '暗': 1533, '靜': 1534, '曲': 1535, '姊': 1536, '勁': 1537, '猛': 1538, '沿': 1539, '擺': 1540, '齦': 1541, '輻': 1542, '育': 1543, '玻': 1544, '璃': 1545, '娃': 1546, '侏': 1547, '儒': 1548, '竄': 1549, '勤': 1550, '奮': 1551, '軌': 1552, 
'遍': 1553, '痊': 1554, '宗': 1555, '碼': 1556, '惠': 1557, '縱': 1558, '校': 1559, '廖': 1560, '戚': 1561, '哩': 1562, '懶': 1563, '幻': 1564, '嫁': 1565, '氫': 1566, '屁': 1567, '茫': 1568, '丸': 1569, '序': 1570, '架': 1571, '伴': 1572, '侶': 1573, '宿': 1574, '志': 1575, '暖': 1576, '逃': 1577, '炮': 1578, '焦': 1579, '擇': 1580, '求': 1581, '貌': 1582, '釐': 1583, '寧': 1584, '扮': 1585, '喜': 1586, '歡': 1587, '填': 1588, '卷': 1589, '頂': 1590, '礙': 1591, '訪': 1592, '惶': 1593, '恐': 1594, '杯': 1595, '彼': 1596, '宴': 1597, '詢': 1598, '咧': 1599, '釋': 1600, '桶': 1601, 'Y': 1602, '匿': 1603, '篩': 1604, '格': 1605, '鬧': 1606, '滋': 1607, '懼': 1608, '浩': 1609, '址': 1610, '帥': 1611, '首': 1612, '錶': 1613, '展': 1614, 'U': 1615, '遊': 1616, '戲': 1617, '瞌': 1618, '偷': 1619, '砲': 1620, '噁': 1621, '咖': 1622, '啡': 1623, '驚': 1624, '聯': 1625, '絡': 1626, '膩': 1627, '悉': 1628, '與': 1629, '拔': 1630, '迷': 1631, '措': 1632, '矛': 1633, '盾': 1634, '歸': 1635, '潢': 1636, '熬': 1637, '羨': 1638, '慕': 1639, '興': 1640, '評': 1641, 'Ｉ': 1642, '９': 1643, '耗': 1644, '臉': 1645, '具': 1646, '咦': 1647, '碑': 1648, '努': 1649, '塑': 1650, '槍': 1651, '疙': 1652, '瘩': 1653, '哎': 1654, '革': 1655, '賣': 1656, '盒': 1657, '珍': 1658, '呦': 1659, '際': 1660, '干': 1661, '兇': 1662, '稽': 1663, '縣': 1664, '藏': 1665, '覆': 1666, '愉': 1667, '悅': 1668, '樂': 1669, '拒': 1670, '極': 1671, '尋': 1672, '諮': 1673, '癮': 1674, '鬼': 1675, '趣': 1676, '刪': 1677, '協': 1678, '壤': 1679, '韓': 1680, '儘': 1681, '２': 1682, '１': 1683, 'Ｎ': 1684, 'Ｅ': 1685, '輩': 1686, '答': 1687, '捲': 1688, '價': 1689, '購': 1690, '矩': 1691, '露': 1692, '鍵': 1693, '溯': 1694, '秒': 1695, '蹭': 1696, '丟': 1697, '割': 1698, '聼': 1699, '葩': 1700, '閡': 1701, '瘙': 1702, '熊': 1703, '饋': 1704, '内': 1705, '款': 1706, '披': 1707, '瞭': 1708, '怖': 1709, '嘉': 1710, '距': 1711, '封': 1712, '誠': 1713, '呂': 1714, '嗨': 1715, '括': 1716, '尷': 1717, '尬': 1718, '杜': 1719, '蕾': 1720, '岡': 1721, '傻': 1722, '永': 1723, '攸': 1724, '=': 1725, '擋': 1726, '７': 1727, '５': 1728, '店': 1729, '職': 1730, '傍': 1731, '糾': 1732, '截': 1733, '盯': 
1734, '曼': 1735, '頓': 1736, '凍': 1737, '鏢': 1738, '唔': 1739, '妙': 1740, '阪': 1741, '并': 1742, '及': 1743, '涼': 1744, '慌': 1745, '盛': 1746, '准': 1747, '頗': 1748, '乖': 1749, '箝': 1750, '監': 1751, '豆': 1752, '貸': 1753, '債': 1754, '愧': 1755, '疚': 1756, '戀': 1757, '逼': 1758, '借': 1759, '棄': 1760, '挺': 1761, '貨': 1762, '勾': 1763, '憐': 1764, '賺': 1765, '攤': 1766, '檻': 1767, '煙': 1768, '拋': 1769, '握': 1770, '灣': 1771, '划': 1772, '嘗': 1773, '偽': 1774, '世': 1775, '界': 1776, '景': 1777, '旅': 1778, '默': 1779, '予': 1780, '郵': 1781, '朝': 1782, '替': 1783, '渣': 1784, '妖': 1785, '魔': 1786, '邀': 1787, '飄': 1788, '陌': 1789, '立': 1790, '拼': 1791, '創': 1792, '箋': 1793, '嘍': 1794, '慘': 1795, '澳': 1796, '座': 1797, '央': 1798, '淆': 1799, '頁': 1800, '威': 1801, '鋼': 1802, '鈕': 1803, '晦': 1804, '語': 1805, '拚': 1806, '喬': 1807, 'F': 1808, '糕': 1809, '．': 1810, '允': 1811, '摟': 1812, '禍': 1813, '跌': 1814, '疊': 1815, '譚': 1816, '京': 1817, '宇': 1818, '智': 1819, '列': 1820, '群': 1821, '廢': 1822, '禦': 1823, '唾': 1824, '雄': 1825, '蘭': 1826, '野': 1827, '柳': 1828, '弧': 1829, '映': 1830, '沫': 1831, '仍': 1832, '煉': 1833, '聚': 1834, '酐': 1835, '憶': 1836, '滑': 1837, '味': 1838, '獎': 1839, '夾': 1840, '鏈': 1841, '犯': 1842, '「': 1843, '」': 1844, '咯': 1845, '獲': 1846, '后': 1847, '臺': 1848, '厰': 1849, '村': 1850, '返': 1851, '潔': 1852, '癖': 1853, '欣': 1854, '賞': 1855, '潤': 1856, '佈': 1857, '億': 1858, '鼓': 1859, '傑': 1860, '徵': 1861, '挫': 1862, '枝': 1863, '紫': 1864, '堵': 1865, '薦': 1866, '寮': 1867, '彰': 1868, '戰': 1869, '藉': 1870, '吻': 1871, '玄': 1872, '尾': 1873, '賭': 1874, '挪': 1875, '曝': 1876, '曡': 1877, '嵗': 1878, '氛': 1879, '踢': 1880, '%': 1881, '﹑': 1882, '若': 1883, '橫': 1884, '鎖': 1885, '繫': 1886, '溼': 1887, '庫': 1888, '啓': 1889, '孝': 1890, '弊': 1891, '嘞': 1892, '涵': 1893, '享': 1894, '_': 1895, '耳': 1896, 'Ａ': 1897, '井': 1898, '壽': 1899, 'Ｒ': 1900, 'Ｐ': 1901, '豐': 1902, 'Ｙ': 1903, '嚏': 1904, '爺': 1905, '烘': 1906, '焙': 1907, '餅': 1908, '園': 1909, '樞': 1910, '磁': 1911, '振': 1912, '蹦': 1913, '眩': 1914, '疤': 1915, 
'Ｄ': 1916, '凹': 1917, '槽': 1918, '綿': 1919, '屋': 1920, '盲': 1921, '譜': 1922, '儀': 1923, '糞': 1924, '挖': 1925, '繳': 1926, '尚': 1927, '禁': 1928, '餃': 1929, '宜': 1930, '錠': 1931, '荷': 1932, '肢': 1933, '搥': 1934, '欠': 1935, '漲': 1936, 'Ｇ': 1937, 'Ｍ': 1938, '劇': 1939, '躍': 1940, '童': 1941, '幼': 1942, '稚': 1943, '渾': 1944, '噩': 1945, 'Ｓ': 1946, '鹼': 1947, '彩': 1948, '齒': 1949, '曹': 1950, '蠢': 1951, '欲': 1952, '川': 1953, '龔': 1954, '鮮': 1955, '托': 1956, '抿': 1957, '蜜': 1958, '螃': 1959, '蟹': 1960, '斟': 1961, '酌': 1962, '米': 1963, '倫': 1964, '狗': 1965, '舌': 1966, '狼': 1967, '勝': 1968, '娜': 1969, '索': 1970, '隆': 1971, '釘': 1972, '燃': 1973, '稅': 1974, '郎': 1975, '雞': 1976, '穀': 1977, '糧': 1978, '饅': 1979, '爭': 1980, '搶': 1981, '瓣': 1982, '腴': 1983, '貼': 1984, '踩': 1985, '摺': 1986, '蓮': 1987, '寶': 1988, '梢': 1989, '橢': 1990, '蒜': 1991, '椎': 1992, '揍': 1993, '笨': 1994, '娘': 1995, '杉': 1996, '磯': 1997, '莊': 1998, '豪': 1999, '鄉': 2000, '瞞': 2001, '浸': 2002, '廁': 2003, '宮': 2004, '頸': 2005, '構': 2006, '摔': 2007, '洪': 2008, '澤': 2009, '恭': 2010, '酵': 2011, '憊': 2012, '唄': 2013, '矽': 2014, '晶': 2015, '脾': 2016, '奈': 2017, '娛': 2018, '寡': 2019, '契': 2020, '夕': 2021, '裸': 2022, '嘶': 2023, '幣': 2024, '策': 2025, '衍': 2026, '雅': 2027, '瞧': 2028, '南': 2029, '姓': 2030, '乏': 2031, '游': 2032, '泳': 2033, '堤': 2034, '腕': 2035, '匆': 2036, '辰': 2037, '猴': 2038, '斜': 2039, '祖': 2040, '釣': 2041, '撿': 2042, '污': 2043, '言': 2044, '挨': 2045, '閒': 2046, '蕭': 2047, '-': 2048, '煞': 2049, '惰': 2050, '役': 2051, 'Ｕ': 2052, '麗': 2053, '邂': 2054, '逅': 2055, '悟': 2056, '稠': 2057, 'Ｆ': 2058, '狂': 2059, '廝': 2060, '守': 2061, '株': 2062, '檯': 2063, '勵': 2064, '恰': 2065, '獨': 2066, '峰': 2067, '廟': 2068, '融': 2069, '諱': 2070, '銀': 2071, '悔': 2072, '折': 2073, '喲': 2074, '蛛': 2075, '絲': 2076, '蒐': 2077, '瘋': 2078, '澄': 2079, '昇': 2080, '墊': 2081, '尺': 2082, '兼': 2083, '篇': 2084, '慧': 2085, '酷': 2086, 'j': 2087, '琉': 2088, '掰': 2089, '魂': 2090, '詞': 2091, '憑': 2092, '德': 2093, '瑞': 2094, '販': 2095, '售': 2096, '街': 
2097, '呆': 2098, '繞': 2099, '濾': 2100, '庚': 2101, '瘀': 2102, '矮': 2103, '跨': 2104, '撲': 2105, '仿': 2106, '酪': 2107, '乳': 2108, '窄': 2109, '虎': 2110, '餘': 2111, '疝': 2112, '磷': 2113, '堪': 2114, '卵': 2115, '孕': 2116, '盂': 2117, 'z': 2118, '唷': 2119, '腱': 2120, '蠕': 2121, '釜': 2122, '齡': 2123, '拾': 2124, '靶': 2125, '援': 2126, '搖': 2127, '祂': 2128, '惚': 2129, '婆': 2130, '咪': 2131, '駕': 2132, '災': 2133, '股': 2134, '杏': 2135, '輝': 2136, '悠': 2137, '揮': 2138, '盆': 2139, '皇': 2140, '魯': 2141, '餓': 2142, '耍': 2143, '趾': 2144, '剪': 2145, '巫': 2146, '癱': 2147, '裂': 2148, '趴': 2149, '憩': 2150, 'ｓ': 2151, 'ｕ': 2152, 'ｂ': 2153, 'ｔ': 2154, 'ａ': 2155, 'ｉ': 2156, 'ｏ': 2157, 'ｎ': 2158, '悸': 2159, '昂': 2160, '栓': 2161, '邁': 2162, '丁': 2163, '彙': 2164, '鋒': 2165, '伸': 2166, '甩': 2167, '螢': 2168, '幕': 2169, '澡': 2170, '竇': 2171, '巡': 2172, '搔': 2173, '仔': 2174, '鋪': 2175, '押': 2176, '船': 2177, '聞': 2178, '颱': 2179, '鳴': 2180, '咿': 2181, '繃': 2182, '墅': 2183, '噪': 2184, '錘': 2185, '播': 2186, '誘': 2187, '昧': 2188, '蝨': 2189, '覽': 2190, '孵': 2191, '芽': 2192, '秤': 2193, '斬': 2194, '攻': 2195, '擊': 2196, '撫': 2197, '製': 2198, '濫': 2199, '贊': 2200, '銜': 2201, '閨': 2202, '刮': 2203, '鬍': 2204, '叉': 2205, '恕': 2206, '亡': 2207, '矜': 2208, '諸': 2209, '貞': 2210, '坊': 2211, '燈': 2212, '句': 2213, '競': 2214, '旺': 2215, '批': 2216, '趁': 2217, '巧': 2218, '卻': 2219, '搜': 2220, '鋌': 2221, '稀': 2222, '唱': 2223, '歌': 2224, '宵': 2225, '搏': 2226, '噎': 2227, '欄': 2228, '塵': 2229, '嗅': 2230, '喪': 2231, '怠': 2232, '畜': 2233, '牧': 2234, '瘟': 2235, '肋': 2236, '粗': 2237, '橡': 2238, '俊': 2239, '盟': 2240, '濺': 2241, '鈉': 2242, '鉀': 2243, '獻': 2244, '柏': 2245, '霉': 2246, '敷': 2247, '蕁': 2248, '鉤': 2249, '萱': 2250, '淤': 2251, '嫌': 2252, '燜': 2253, '蔬': 2254, '姪': 2255, '添': 2256, '渴': 2257, '烏': 2258, '洱': 2259, '啞': 2260, '嚥': 2261, '嗝': 2262, '鑿': 2263, '迴': 2264, '蹋': 2265, '遲': 2266, '鈍': 2267, '朗': 2268, '槌': 2269, '廳': 2270, '廚': 2271, '汽': 2272, '喊': 2273, '芳': 2274, '珠': 2275, '僵': 2276, '楊': 2277, '喂': 2278, 
'編': 2279, '略': 2280, '猶': 2281, '豫': 2282, '逝': 2283, '籃': 2284, '撕': 2285, '渡': 2286, '棟': 2287, '衰': 2288, '踴': 2289, '讚': 2290, '/': 2291, '劉': 2292, '捷': 2293, '‧': 2294, '徘': 2295, '徊': 2296, '鍛': 2297, '鍊': 2298, '絨': 2299, '碘': 2300, '痕': 2301, '淒': 2302, '頰': 2303, '偉': 2304, '哲': 2305, '愈': 2306, '凡': 2307, '咽': 2308, '苔': 2309, '梗': 2310, '扯': 2311, '顴': 2312, '秘': 2313, '奎': 2314, '玉': 2315, 'ｋ': 2316, '噻': 2317, '徐': 2318, '揭': 2319, '萍': 2320, '灶': 2321, '蛀': 2322, '鄭': 2323, '捨': 2324, '佳': 2325, '拳': 2326, '胡': 2327, '賴': 2328, '剝': 2329, '怡': 2330, '痙': 2331, '攣': 2332, '臭': 2333, '蘆': 2334, '忌': 2335, '苓': 2336, '嗆': 2337, '碳': 2338, '垂': 2339, '鍾': 2340, '宏': 2341, '屜': 2342, '亭': 2343, '攙': 2344, '兄': 2345, '籬': 2346, '婷': 2347, '凱': 2348, '催': 2349, '冠': 2350, '招': 2351, '姆': 2352, '挂': 2353, '財': 2354, '牡': 2355, '蠣': 2356, '董': 2357, '脱': 2358, '碟': 2359, '伍': 2360, '鱔': 2361, '羹': 2362, '咻': 2363, '港': 2364, '胱': 2365, '斌': 2366, '垢': 2367, '熙': 2368, '芭': 2369, '艾': 2370, '萊': 2371, '躁': 2372, '筍': 2373, '豬': 2374, '糙': 2375, '咕': 2376, '嚕': 2377, '抄': 2378, '塔': 2379, '醃': 2380, '漬': 2381, '腐': 2382, '亢': 2383, '涂': 2384, '霖': 2385, '唇': 2386, '癲': 2387, '綁': 2388, 'ｆ': 2389, 'ｅ': 2390, 'ｒ': 2391, 'ｃ': 2392, 'ｍ': 2393, 'ｐ': 2394, 'ｌ': 2395, 'ｗ': 2396, 'ｈ': 2397, 'ｙ': 2398, '胺': 2399, '驅': 2400, '惜': 2401, '軍': 2402, '皂': 2403, '沐': 2404, '浴': 2405, '檸': 2406, '檬': 2407, '薑': 2408, '汁': 2409, '醋': 2410, '瓢': 2411, '疔': 2412, '唸': 2413, '嬰': 2414, '吊': 2415, '寵': 2416, '慈': 2417, '翹': 2418, '枕': 2419, '席': 2420, '吱': 2421, '胎': 2422, '疸': 2423, '癇': 2424, '歪': 2425, '糰': 2426, '揪': 2427, '毯': 2428, '戳': 2429, '汎': 2430, '傅': 2431, '唆': 2432, '恆': 2433, '妨': 2434, '棋': 2435, '贅': 2436, '薪': 2437, '沈': 2438, '漂': 2439, '紗': 2440, '寢': 2441, '賓': 2442, '嗦': 2443, '撮': 2444, '欽': 2445, '茄': 2446, '耀': 2447, '仁': 2448, '枚': 2449, '墾': 2450, '貢': 2451, '湖': 2452, '暢': 2453, '竭': 2454, '巾': 2455, '厠': 2456, '邦': 2457, '奧': 2458, '愫': 2459, '崩': 
2460, '臀': 2461, '騷': 2462, '貓': 2463, '妝': 2464, '蔻': 2465, '蕩': 2466, '哭': 2467, '凶': 2468, '撩': 2469, '晉': 2470, '裕': 2471, '蒸': 2472, '烤': 2473, '惑': 2474, '鹿': 2475, '謀': 2476, '呈': 2477, '搂': 2478, '屯': 2479, '泛': 2480, '孳': 2481, '騰': 2482, '谷': 2483, '潑': 2484, '洋': 2485, '胚': 2486, '擁': 2487}
# Tag vocabulary: 'O' is id 0, then every entity type contributes a
# consecutive 'B-<type>' / 'I-<type>' pair, in the order listed below.
# NOTE(review): presumably this has to match the mapping the saved model was
# trained with -- confirm before reordering or adding entity types.
tag2id = {
    label: idx
    for idx, label in enumerate(
        ['O'] + [
            prefix + entity_type
            for entity_type in (
                'time', 'location', 'med_exam', 'profession', 'name',
                'family', 'ID', 'clinical_event', 'education', 'money',
                'contact', 'organization', 'others',
            )
            for prefix in ('B-', 'I-')
        ]
    )
}

In [15]:
# ---------------------------------------------------------------------------
# Inference: tag the test set with the saved BiLSTM-CRF model and export the
# predicted entities as a tab-separated file.
# ---------------------------------------------------------------------------
BiLSTMCRF_MODEL_PATH = './SavePkl/Temp/1227-2100 (0.68)/bilstm_crf.pkl'
REMOVE_O = False
TSV_HEADER = "article_id\tstart_position\tend_position\tentity_text\tentity_type"
TSV_ROW = "{:}\t{:}\t{:}\t{:}\t{:}"


def _append_entity(rows, chap, start, end, pieces, tag):
    """Append one finished entity to ``rows`` if its span is contiguous.

    The span ``end - start`` must equal the length of the collected text;
    otherwise (e.g. a tag that is neither B-, I- nor O sat inside the span)
    the entity is silently dropped.
    """
    entity_text = "".join(pieces)
    if end - start == len(entity_text):
        rows.append((chap, start, end, entity_text, tag))


def extract_entities(word_lists, pred_tag_lists):
    """Turn per-token B-/I-/O predictions into entity rows.

    Args:
        word_lists: one token list per article. The last element of every
            list is skipped (NOTE(review): presumably the end marker appended
            by prepocess_data_for_lstmcrf -- confirm).
        pred_tag_lists: one predicted tag list per article, aligned with the
            tokens; tags look like 'O', 'B-<type>' or 'I-<type>'.

    Returns:
        List of ``(article_index, start, end, entity_text, entity_type)``
        tuples, in reading order; ``end`` is exclusive.

    The entity type is taken from the tag that opens the entity (a 'B-' tag,
    or an 'I-' tag that appears while no entity is open). Assembly state is
    local to each article: an entity still open at the last token is emitted
    with ``end`` equal to the article length, and nothing is carried over
    into the following article.
    """
    rows = []
    for chap in range(len(word_lists)):
        tokens = word_lists[chap][:-1]
        tags = pred_tag_lists[chap]
        start, tag, pieces = None, "", []  # start is None <=> no entity is open

        for pos in range(len(tokens)):
            label = tags[pos]
            marker = label[0]
            if marker == 'B':
                # A new entity begins: close the one in progress, if any.
                if start is not None:
                    _append_entity(rows, chap, start, pos, pieces, tag)
                start, tag, pieces = pos, label[2:], [str(tokens[pos])]
            elif marker == 'I':
                # An 'I' with no open entity starts one.
                if start is None:
                    start, tag, pieces = pos, label[2:], []
                pieces.append(str(tokens[pos]))
            elif marker == 'O':
                if start is not None:
                    _append_entity(rows, chap, start, pos, pieces, tag)
                    start, tag, pieces = None, "", []

        # Entity that runs up to the very end of the article.
        if start is not None:
            _append_entity(rows, chap, start, len(tokens), pieces, tag)
    return rows


print("Load File...")
# word2id / tag2id are hard-coded in an earlier cell, so only the test split
# has to be read here.
test_word_lists, test_tag_lists = build_corpus("test", make_vocab=False)

print("Testing...")
crf_word2id, crf_tag2id = extend_maps(word2id, tag2id, for_crf=True)
# NOTE(review): load_model is defined elsewhere; given the .pkl path it
# presumably unpickles the file, which can execute arbitrary code -- only
# load checkpoints from a trusted source.
bilstm_model = load_model(BiLSTMCRF_MODEL_PATH)
bilstm_model.model.bilstm.bilstm.flatten_parameters()  # remove warning
test_word_lists, test_tag_lists = prepocess_data_for_lstmcrf(test_word_lists, test_tag_lists, test=True)
lstmcrf_pred, target_tag_list = bilstm_model.test(test_word_lists, test_tag_lists, crf_word2id, crf_tag2id)

entity_rows = extract_entities(test_word_lists, lstmcrf_pred)
# Build the report once with join instead of growing a string row by row.
output = "\n".join([TSV_HEADER] + [TSV_ROW.format(*row) for row in entity_rows]) + "\n"
print(output, end="")

output_path = 'output.tsv'
with open(output_path, 'w', encoding='utf-8') as f:
    f.write(output)


Load File...


HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))


Testing...


  self.num_layers, self.dropout, self.training, self.bidirectional)


article_id	start_position	end_position	entity_text	entity_type
0	198	200	新樓	location
0	227	229	麻豆	location
0	237	239	麻豆	location
0	295	297	今年	time
1	31	33	今天	time
1	40	42	昨天	time
1	76	80	0.79	med_exam
1	94	96	88	med_exam
1	179	181	20	time
1	182	184	19	time
1	258	261	2.6	med_exam
1	360	363	黃醫師	name
1	366	369	6.6	med_exam
1	385	388	6.6	med_exam
1	394	397	8.3	med_exam
1	407	410	8.3	med_exam
1	414	421	四個月前8.3	time
1	463	466	6.2	med_exam
1	469	472	三個月	time
1	626	629	黃醫師	name
1	670	673	格里曼	location
1	721	723	里曼	location
1	742	744	中午	time
1	755	757	里曼	location
1	867	869	21	time
1	873	875	21	time
1	881	884	12天	time
1	890	893	12天	time
1	902	906	三個星期	time
2	170	174	1100	med_exam
2	245	249	1145	med_exam
2	275	278	500	money
2	285	288	500	money
2	390	393	三個月	time
2	402	405	三個月	time
2	681	683	57	med_exam
2	699	701	52	med_exam
2	792	795	兩個月	time
2	1090	1094	7.56	med_exam
2	1151	1155	450塊	money
2	1239	1241	35	med_exam
2	1246	1248	35	med_exam
2	1394	1398	450塊	money
3	6	8	昨天	time
3	11	13	一天	time
3	45	47

119	288	290	七天	time
119	725	727	晚上	time
120	6	10	這一個月	time
120	126	129	兩年多	time
120	137	140	兩年多	time
120	282	285	蔡醫師	name
120	1121	1125	陽明醫院	location
120	1736	1738	今天	time
120	1768	1780	www.prep.com	contact
120	1843	1847	第一個月	time
120	1881	1884	下個月	time
120	1901	1904	下個月	time
120	1935	1939	下一個月	time
120	1994	1997	這個月	time
120	2156	2160	下一個月	time
120	2179	2182	上個月	time
120	2197	2200	下個月	time
120	2437	2440	兩三天	time
120	2764	2768	這個禮拜	time
120	2871	2873	晚上	time
120	3030	3032	今天	time
120	3033	3036	禮拜五	time
120	3044	3048	下禮拜五	time
120	3053	3055	六日	time
120	3193	3195	今天	time
120	3196	3199	禮拜四	time
120	3404	3407	前一天	time
120	3610	3613	禮拜五	time
120	3634	3637	五六日	time
120	3685	3689	這五六日	time
120	3720	3723	五六日	time
120	3752	3754	今天	time
120	3787	3789	今天	time
120	3839	3841	八點	time
120	3859	3861	八點	time
120	3867	3869	八點	time
120	3883	3885	八點	time
120	3923	3925	兩天	time
120	3955	3959	24小時	time
121	4	7	這個月	time
121	997	1005	28937019	time
122	4	7	這個月	time
122	237	239	9月	time
122	271	274	這個月	time
122	3