In [171]:
from fastNLP import DataSet
import torch
from fastNLP.io import DataBundle
from fastNLP import Vocabulary

# Directory prefix of the BMES-tagged input files; get_data_bmes reads
# '<bmes_data_path><split>.txt' from it.
bmes_data_path='cws_bmes/'
# Output directory prefixes: each one is prepended to a corpus name to form
# the file path handed to torch.save for the matching split.
train_path='train_dataset/'
dev_path='dev_dataset/'
test_path='test_dataset/'

def _append_bmes_sentence(columns, raw_sentence, tags):
    """Append one buffered sentence (wrapper tokens stripped) to ``columns``."""
    inner_chars=raw_sentence[1:-1]
    inner_tags=tags[1:-1]
    columns['raw_chars'].append(''.join(inner_chars))
    columns['target'].append(inner_tags)
    # Derived from the stored tags so seq_len always matches len(target).
    columns['seq_len'].append(len(inner_tags))
    # The first token wraps the corpus name in one leading and one trailing
    # character; both are dropped to obtain the name.
    columns['corpus'].append('CWS-'+raw_sentence[0][1:-1])
    columns['chars'].append(inner_chars)


def get_data_bmes(dataset):
    """Load one BMES-tagged split and group its sentences by source corpus.

    Reads ``bmes_data_path + dataset + '.txt'``. Every non-blank line must
    hold ``token<TAB>tag``; blank (or whitespace-only) lines separate
    sentences. The first and last token of each sentence are treated as
    wrapper markers: the first one, minus its first and last character, names
    the corpus, and both markers are excluded from the stored characters and
    tags.

    Args:
        dataset: Split name used as the file stem, e.g. ``'train'``.

    Returns:
        dict mapping ``'CWS-<corpus>'`` to a fastNLP ``DataSet`` holding the
        fields ``raw_chars``, ``target``, ``seq_len``, ``corpus`` and
        ``chars``. Keys appear in order of first occurrence in the file.

    Raises:
        ValueError: if a non-blank line does not split into exactly two
            tab-separated fields.
    """
    path=bmes_data_path+dataset+'.txt'
    columns={'raw_chars':[],'target':[],'seq_len':[],'corpus':[],'chars':[]}
    raw_sentence=[]
    tags=[]
    with open(path,encoding='UTF-8') as file:
        for line in file:
            stripped=line.strip()
            if not stripped:
                # Sentence separator. Repeated or leading blank lines are
                # ignored instead of being parsed as a token line.
                if raw_sentence:
                    _append_bmes_sentence(columns,raw_sentence,tags)
                    raw_sentence=[]
                    tags=[]
                continue
            word,tag=stripped.split('\t')
            raw_sentence.append(word)
            tags.append(tag)
    # Keep the final sentence even when the file lacks a trailing blank line.
    if raw_sentence:
        _append_bmes_sentence(columns,raw_sentence,tags)

    full_dataset=DataSet(columns)
    all_datasets={}
    for instance in full_dataset:
        corpus=instance['corpus']
        if corpus not in all_datasets:
            all_datasets[corpus]=DataSet()
        all_datasets[corpus].append(instance)
    return all_datasets


# For each split: parse the BMES file, then write every per-corpus DataSet
# to the split's output directory, using the corpus key as the file name.
train_datasets=get_data_bmes('train')
for key,dataset in train_datasets.items():
    torch.save(dataset,train_path+key)

dev_datasets=get_data_bmes('dev')
for key,dataset in dev_datasets.items():
    torch.save(dataset,dev_path+key)

test_datasets=get_data_bmes('test')
for key,dataset in test_datasets.items():
    torch.save(dataset,test_path+key)

In [5]:
from fastNLP import DataSet
import torch
from fastNLP.io import DataBundle
from fastNLP import Vocabulary
from fastNLP.io import CTBLoader
import re

# Directory prefix of the BMES-tagged CWS files. NOTE(review): not referenced
# by the CTB code visible in this cell — presumably carried over from the CWS cell.
bmes_data_path='cws_bmes/'
# Output directory prefixes: each one is prepended to 'POS-<corpus>' to form
# the file path handed to torch.save for the matching split.
train_path='train_dataset/'
dev_path='dev_dataset/'
test_path='test_dataset/'

def normalize(ustring):
    """Convert full-width characters in ``ustring`` to half-width.

    The ideographic space (U+3000) becomes an ASCII space, and every code
    point in U+FF01..U+FF5E is shifted down by 0xFEE0 onto its ASCII
    counterpart. All other characters are returned unchanged.
    """
    def to_halfwidth(char):
        code = ord(char)
        if code == 0x3000:  # full-width space maps directly to ' '
            return chr(0x20)
        if 0xFF01 <= code <= 0xFF5E:  # full-width forms other than the space
            return chr(code - 0xFEE0)
        return char

    return ''.join(to_halfwidth(char) for char in ustring)


def process(instance):
    """Normalize every word of ``instance['raw_words']``.

    Each word is passed through ``normalize``, has all whitespace removed,
    then every number-like span (optional sign, digits, optional fractional
    part after ``.`` or ``·``, optional ``%``) is replaced by ``'0'`` and
    every run of ASCII letters, underscores or dots by ``'X'``. A word made
    only of whitespace therefore becomes the empty string.

    Args:
        instance: Mapping-like object exposing ``'raw_words'`` as an iterable
            of strings.

    Returns:
        list of the rewritten words, in the original order.
    """
    # Raw string literals: '\d', '\+', '\.' and '\s' are not valid str escape
    # sequences and trigger warnings on recent Python versions when written
    # in a plain literal. The pattern text itself is unchanged.
    number_re = re.compile(r'(-|\+)?\d+((\.|·)\d+)?%?', flags=re.U)
    english_re = re.compile(r'[A-Za-z_.]+')
    space_re = re.compile(r'\s+', flags=re.U)
    new_sent = []
    for word in instance['raw_words']:
        word = normalize(word)
        word = space_re.sub('', word)
        # Numbers are collapsed first so that the '.' inside a decimal is not
        # consumed by the letter/dot pattern.
        word = number_re.sub('0', word)
        word = english_re.sub('X', word)
        new_sent.append(word)
    return new_sent

def add_target(instance):
    """Expand word-level POS tags into one position-prefixed tag per character.

    For the i-th tag in ``instance['pos']`` and the i-th word in
    ``instance['raw_words']``: a one-character word yields ``'S-<pos>'``; a
    longer word yields ``'B-<pos>'``, then ``'M-<pos>'`` for each inner
    character, then ``'E-<pos>'``. Words of length zero contribute no tags, so
    the result has exactly one tag per character of the joined words.

    Args:
        instance: Mapping-like object exposing ``'pos'`` and ``'raw_words'``.

    Returns:
        list of tag strings.

    Raises:
        IndexError: if ``'pos'`` has more entries than ``'raw_words'``.
    """
    pos=instance['pos']
    raw_words=instance['raw_words']
    target=[]
    for i,tag in enumerate(pos):
        length=len(raw_words[i])
        if length==0:
            # An empty word has no characters to label; emitting B-/E- here
            # would leave the tag sequence longer than the character sequence.
            continue
        if length==1:
            target.append('S-'+tag)
        else:
            target.append('B-'+tag)
            target.extend(['M-'+tag]*(length-2))
            target.append('E-'+tag)
    return target

def add_raw_chars(instance):
    """Return the words of ``instance['raw_words']`` concatenated into one string."""
    return ''.join(instance['raw_words'])

def add_chars(instance):
    """Return ``instance['raw_chars']`` split into a list of single characters."""
    return [char for char in instance['raw_chars']]

def process_dataset(dataset):
    """Turn a word-level CTB dataset into a character-level tagging dataset.

    Works on the dataset that is passed in and returns that same object.
    Steps, in order:

    1. drop ``dep_head`` and ``dep_label``;
    2. rewrite ``raw_words`` with ``process``;
    3. derive per-character ``target`` tags from ``pos`` and the rewritten
       words, then drop ``pos``;
    4. build ``raw_chars`` (joined words) and ``chars`` (list of characters),
       then drop ``raw_words``;
    5. store ``len(raw_chars)`` as ``seq_len``.

    Afterwards every instance is checked: a mismatch between the number of
    tags and characters is printed with the prefix ``'error'``, and the
    longest character sequence is printed as ``max_len``.

    Args:
        dataset: fastNLP ``DataSet`` carrying the fields ``raw_words``,
            ``pos``, ``dep_head`` and ``dep_label``.

    Returns:
        The processed dataset.
    """
    dataset.delete_field('dep_head')
    dataset.delete_field('dep_label')
    # Order matters: add_target must see the rewritten words, because the
    # rewrite changes word lengths (numbers -> '0', letter runs -> 'X').
    dataset.apply(process,new_field_name='raw_words')
    dataset.apply(add_target,new_field_name='target')
    dataset.delete_field('pos')
    dataset.apply(add_raw_chars,new_field_name='raw_chars')
    dataset.apply(add_chars,new_field_name='chars')
    dataset.delete_field('raw_words')
    dataset.apply(lambda x:len(x['raw_chars']),new_field_name='seq_len')
    max_len=0
    for instance in dataset:
        n_chars=len(instance['chars'])
        n_tags=len(instance['target'])
        if n_tags!=n_chars:
            print('error',instance)
        max_len=max(max_len,n_chars)
        # NOTE(review): hard-coded probe that dumps any instance with exactly
        # 418 tags; looks like leftover debugging — confirm before removing.
        if n_tags==418:
            print(instance)
    # Report the longest sequence; previously computed but never used.
    print('max_len',max_len)
    return dataset

def load_ctb(path):
    """Load a CTB corpus with ``CTBLoader`` and preprocess every split.

    Each dataset in the loaded bundle is run through ``process_dataset`` and
    receives a constant ``corpus`` field equal to ``'POS-'`` followed by the
    first four characters of ``str(path)``.

    Args:
        path: Location handed to ``CTBLoader().load``.

    Returns:
        The loaded bundle with its datasets replaced by the processed ones.
    """
    bundle=CTBLoader().load(path)
    corpus_label='POS-'+str(path)[:4]
    # Snapshot the keys so the entries can be reassigned while looping.
    for split_name in list(bundle.datasets):
        processed=process_dataset(bundle.datasets[split_name])
        bundle.datasets[split_name]=processed
        processed.apply(lambda _instance:corpus_label,new_field_name='corpus')
    return bundle

# Preprocess each CTB release and store its three splits as 'POS-<release>'.
path_list=['ctb5','ctb7','ctb9']
for path in path_list:
    print('start process'+path)
    databundle=load_ctb(path)
    # Splits are written in the order train, test, dev.
    for split_name,out_dir in (('train',train_path),('test',test_path),('dev',dev_path)):
        torch.save(databundle.datasets[split_name],out_dir+'POS-'+path)

start processctb5
max_len 149
max_len 272
+---------------------+----------------------+---------------------+---------+
| target              | raw_chars            | chars               | seq_len |
+---------------------+----------------------+---------------------+---------+
| ['B-VV', 'M-VV',... | 普天同庆(景泰蓝瓶... | ['普', '天', '同... | 418     |
+---------------------+----------------------+---------------------+---------+
max_len 418
start processctb7
max_len 403
max_len 276
+---------------------+----------------------+---------------------+---------+
| target              | raw_chars            | chars               | seq_len |
+---------------------+----------------------+---------------------+---------+
| ['B-VV', 'M-VV',... | 普天同庆(景泰蓝瓶... | ['普', '天', '同... | 418     |
+---------------------+----------------------+---------------------+---------+
max_len 418
start processctb9
max_len 333
max_len 290
+---------------------+----------------------+---------------------+---------+
| t

In [41]:
from fastNLP import DataSet
import torch
from fastNLP.io import DataBundle
from fastNLP import Vocabulary
from fastNLP.io import MsraNERLoader
import re

# Directory prefix of the BMES-tagged CWS files. NOTE(review): not referenced
# by the MSRA code visible below — presumably carried over from the CWS cell.
bmes_data_path='cws_bmes/'
# Output directory prefixes for the saved datasets; only test_path is used by
# the MSRA code visible below.
train_path='train_dataset/'
dev_path='dev_dataset/'
test_path='test_dataset/'

In [39]:
# Load the MSRA NER corpus from the 'MSRA' path; the code below accesses the
# result through its `.datasets` mapping of split name -> dataset.
msra=MsraNERLoader().load('MSRA')

def normalize(ustring):
    """Convert full-width characters in ``ustring`` to half-width.

    The ideographic space (U+3000) becomes an ASCII space, and every code
    point in U+FF01..U+FF5E is shifted down by 0xFEE0 onto its ASCII
    counterpart. All other characters are returned unchanged.
    """
    def to_halfwidth(char):
        code = ord(char)
        if code == 0x3000:  # full-width space maps directly to ' '
            return chr(0x20)
        if 0xFF01 <= code <= 0xFF5E:  # full-width forms other than the space
            return chr(code - 0xFEE0)
        return char

    return ''.join(to_halfwidth(char) for char in ustring)


def process(instance):
    """Normalize every entry of ``instance['chars']``.

    Each entry is passed through ``normalize``; then every number-like span
    (optional sign, digits, optional fractional part after ``.`` or ``·``,
    optional ``%``) is replaced by ``'0'`` and every run of ASCII letters,
    underscores or dots by ``'X'``.

    Args:
        instance: Mapping-like object exposing ``'chars'`` as an iterable of
            strings.

    Returns:
        list of the rewritten entries, in the original order.
    """
    # Raw string literals: '\d', '\+' and '\.' are not valid str escape
    # sequences and trigger warnings on recent Python versions when written
    # in a plain literal. The pattern text itself is unchanged.
    number_re = re.compile(r'(-|\+)?\d+((\.|·)\d+)?%?', flags=re.U)
    english_re = re.compile(r'[A-Za-z_.]+')
    new_sent = []
    for word in instance['chars']:
        word = normalize(word)
        # Numbers are replaced before the letter/dot pattern is applied.
        word = number_re.sub('0', word)
        word = english_re.sub('X', word)
        new_sent.append(word)
    return new_sent

# Reshape every MSRA split to the field layout used by the other corpora:
# normalized 'chars', joined 'raw_chars', 'seq_len' and a constant 'corpus'.
for key,split_dataset in msra.datasets.items():
    split_dataset.rename_field('raw_chars','chars')
    split_dataset.apply(process,new_field_name='chars')
    split_dataset.apply(lambda ins:''.join(ins['chars']),new_field_name='raw_chars')
    split_dataset.apply(lambda ins:len(ins['chars']),new_field_name='seq_len')
    split_dataset.apply(lambda ins:'NER-msra',new_field_name='corpus')

# Only the test split is written to disk here.
torch.save(msra.datasets['test'],test_path+'NER-msra')


In [40]:
msra.datasets

{'test': +------------------+------------------+-------------------+---------+----------+
 | target           | chars            | raw_chars         | seq_len | corpus   |
 +------------------+------------------+-------------------+---------+----------+
 | ['B-NT', 'M-N... | ['中', '共', ... | 中共中央致中国... | 16      | NER-msra |
 | ['O', 'O', 'O... | ['各', '位', ... | 各位代表、各位... | 10      | NER-msra |
 | ['O', 'B-NT',... | ['在', '中', ... | 在中国致公党第... | 53      | NER-msra |
 | ['O', 'O', 'O... | ['致', '以', ... | 致以亲切的问候... | 8       | NER-msra |
 | ['O', 'O', 'O... | ['在', '过', ... | 在过去的五年中... | 69      | NER-msra |
 | ['O', 'O', 'O... | ['高', '举', ... | 高举爱国主义和... | 62      | NER-msra |
 | ['O', 'O', 'O... | ['紧', '紧', ... | 紧紧围绕国家的... | 106     | NER-msra |
 | ['B-NT', 'M-N... | ['致', '公', ... | 致公党中央领导... | 65      | NER-msra |
 | ['O', 'O', 'O... | ['广', '大', ... | 广大成员在做好... | 86      | NER-msra |
 | ['O', 'O', 'O... | ['结', '合', ... | 结合自身的特点... | 53      | NER-msra |
 | ['O', 'O', 'O

In [38]:
msra.datasets

{'test': +------------------------------------+-------------------------------------+
 | target                             | chars                               |
 +------------------------------------+-------------------------------------+
 | ['B-NT', 'M-NT', 'M-NT', 'E-NT'... | ['中', '共', '中', '央', '致', ...  |
 | ['O', 'O', 'O', 'O', 'O', 'O', ... | ['各', '位', '代', '表', '、', ...  |
 | ['O', 'B-NT', 'M-NT', 'M-NT', '... | ['在', '中', '国', '致', '公', ...  |
 | ['O', 'O', 'O', 'O', 'O', 'O', ... | ['致', '以', '亲', '切', '的', ...  |
 | ['O', 'O', 'O', 'O', 'O', 'O', ... | ['在', '过', '去', '的', '五', ...  |
 | ['O', 'O', 'O', 'O', 'O', 'O', ... | ['高', '举', '爱', '国', '主', ...  |
 | ['O', 'O', 'O', 'O', 'O', 'O', ... | ['紧', '紧', '围', '绕', '国', ...  |
 | ['B-NT', 'M-NT', 'M-NT', 'M-NT'... | ['致', '公', '党', '中', '央', ...  |
 | ['O', 'O', 'O', 'O', 'O', 'O', ... | ['广', '大', '成', '员', '在', ...  |
 | ['O', 'O', 'O', 'O', 'O', 'O', ... | ['结', '合', '自', '身', '的', ...  |
 | ['O', 'O', 'O', 'B-NT', '