In [1]:
import pycrfsuite
import numpy as np
from itertools import chain
from sklearn.metrics import classification_report,confusion_matrix
import sklearn
from sklearn.preprocessing import LabelBinarizer

## 加载训练数据

In [2]:
text = open('../outputs/source_BIO_2014_cropus.txt').read()
target = open('../outputs/target_BIO_2014_cropus.txt').read()

In [3]:
sentences = []
sent = text.split('\n')
for s in sent:
    sentences.append(s.split(" "))
"""
[[sent0],[sent1],[sent2]]
"""

'\n[[sent0],[sent1],[sent2]]\n'

In [4]:
targets = []
target = target.split('\n')
for t in target:
    targets.append(t.split(" "))

In [5]:
assert len(targets[0])==len(sentences[0]), 'not equal'

In [6]:
print('sentences:',sentences[0])

sentences: ['人', '民', '网', '1', '月', '1', '日', '讯', '据', '《', '纽', '约', '时', '报', '》', '报', '道', ',', '美', '国', '华', '尔', '街', '股', '市', '在', '2', '0', '1', '3', '年', '的', '最', '后', '一', '天', '继', '续', '上', '涨', ',', '和', '全', '球', '股', '市', '一', '样', ',', '都', '以', '最', '高', '纪', '录', '或', '接', '近', '最', '高', '纪', '录', '结', '束', '本', '年', '的', '交', '易', '。']


In [7]:
print('targets:',targets[0])

targets: ['O', 'O', 'O', 'B_T', 'I_T', 'I_T', 'I_T', 'O', 'O', 'O', 'B_LOC', 'I_LOC', 'O', 'O', 'O', 'O', 'O', 'O', 'B_LOC', 'I_LOC', 'I_LOC', 'I_LOC', 'I_LOC', 'O', 'O', 'O', 'B_T', 'I_T', 'I_T', 'I_T', 'I_T', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']


##  基于字符的CRF

In [8]:
# 句子加入分割符号
# <BOS>：一句话的开头
# <EOS>: 一句话的结尾
def sent_sep(array):
    new_array = []
    for sent in array:
        sent.insert(0,'<BOS>')
        sent.append('<EOS>')
        new_array.append(sent)
    return new_array
new_sentences = sent_sep(sentences)

In [9]:
def seg_by_window(sent,window=3):
    """采用滑动窗口截取句子，默认窗口大小为3，方便后面特征提取"""
    n = len(sent)
    flag = 0
    new_sent = []
    while flag < n-window+1:
        new_sent.append(sent[flag:flag+window])
        flag += 1
    return new_sent

sentences = []
for sent in new_sentences:
    item = seg_by_window(sent)
    sentences.append(item)
sentences[0]

[['<BOS>', '人', '民'],
 ['人', '民', '网'],
 ['民', '网', '1'],
 ['网', '1', '月'],
 ['1', '月', '1'],
 ['月', '1', '日'],
 ['1', '日', '讯'],
 ['日', '讯', '据'],
 ['讯', '据', '《'],
 ['据', '《', '纽'],
 ['《', '纽', '约'],
 ['纽', '约', '时'],
 ['约', '时', '报'],
 ['时', '报', '》'],
 ['报', '》', '报'],
 ['》', '报', '道'],
 ['报', '道', ','],
 ['道', ',', '美'],
 [',', '美', '国'],
 ['美', '国', '华'],
 ['国', '华', '尔'],
 ['华', '尔', '街'],
 ['尔', '街', '股'],
 ['街', '股', '市'],
 ['股', '市', '在'],
 ['市', '在', '2'],
 ['在', '2', '0'],
 ['2', '0', '1'],
 ['0', '1', '3'],
 ['1', '3', '年'],
 ['3', '年', '的'],
 ['年', '的', '最'],
 ['的', '最', '后'],
 ['最', '后', '一'],
 ['后', '一', '天'],
 ['一', '天', '继'],
 ['天', '继', '续'],
 ['继', '续', '上'],
 ['续', '上', '涨'],
 ['上', '涨', ','],
 ['涨', ',', '和'],
 [',', '和', '全'],
 ['和', '全', '球'],
 ['全', '球', '股'],
 ['球', '股', '市'],
 ['股', '市', '一'],
 ['市', '一', '样'],
 ['一', '样', ','],
 ['样', ',', '都'],
 [',', '都', '以'],
 ['都', '以', '最'],
 ['以', '最', '高'],
 ['最', '高', '纪'],
 ['高', '纪', '录'],
 ['纪', '录', '或'],
 ['录',

In [11]:
def feature_dict(item):
    """构造特征模板"""
    feat = {
        'w-1':item[0],
        'w':item[1],
        'w+1':item[2],
        'w-1:w':item[0]+item[1],
        'w:w+1':item[1]+item[2],
        'bias':1
    }
    return feat
    
def extract_feature(sentences):
    """提取特征"""
    features = []
    for sent in sentences:
        feature_of_sent = []
        for item in sent:
            feat = feature_dict(item)
            feature_of_sent.append(feat)
        features.append(feature_of_sent)
    return features

In [None]:
features = extract_feature(sentences)

In [289]:
features[0]

[{'bias': 1,
  'w': '人',
  'w+1': '民',
  'w-1': '<BOS>',
  'w-1:w': '<BOS>人',
  'w:w+1': '人民'},
 {'bias': 1, 'w': '民', 'w+1': '网', 'w-1': '人', 'w-1:w': '人民', 'w:w+1': '民网'},
 {'bias': 1, 'w': '网', 'w+1': '1', 'w-1': '民', 'w-1:w': '民网', 'w:w+1': '网1'},
 {'bias': 1, 'w': '1', 'w+1': '月', 'w-1': '网', 'w-1:w': '网1', 'w:w+1': '1月'},
 {'bias': 1, 'w': '月', 'w+1': '1', 'w-1': '1', 'w-1:w': '1月', 'w:w+1': '月1'},
 {'bias': 1, 'w': '1', 'w+1': '日', 'w-1': '月', 'w-1:w': '月1', 'w:w+1': '1日'},
 {'bias': 1, 'w': '日', 'w+1': '讯', 'w-1': '1', 'w-1:w': '1日', 'w:w+1': '日讯'},
 {'bias': 1, 'w': '讯', 'w+1': '据', 'w-1': '日', 'w-1:w': '日讯', 'w:w+1': '讯据'},
 {'bias': 1, 'w': '据', 'w+1': '《', 'w-1': '讯', 'w-1:w': '讯据', 'w:w+1': '据《'},
 {'bias': 1, 'w': '《', 'w+1': '纽', 'w-1': '据', 'w-1:w': '据《', 'w:w+1': '《纽'},
 {'bias': 1, 'w': '纽', 'w+1': '约', 'w-1': '《', 'w-1:w': '《纽', 'w:w+1': '纽约'},
 {'bias': 1, 'w': '约', 'w+1': '时', 'w-1': '纽', 'w-1:w': '纽约', 'w:w+1': '约时'},
 {'bias': 1, 'w': '时', 'w+1': '报', 'w-1': '约',

In [290]:
train_len = int(len(features)*0.8)
train_len

229016

In [291]:
X_train = features[:train_len]
y_train = new_targets[:train_len]
X_test = features[train_len:]
y_test = new_targets[train_len:]

In [292]:
%%time
trainer = pycrfsuite.Trainer(verbose=False)
for xseq,yseq in zip(X_train,y_train):
    trainer.append(xseq,yseq)

CPU times: user 4min 43s, sys: 1.8 s, total: 4min 44s
Wall time: 4min 44s


In [293]:
# 参数设置
trainer.set_params({'c1':1.0,'c2':1e-3,'max_iterations':100,'feature.possible_transitions':True})

In [295]:
%%time
trainer.train('../outputs/ner_2014_char_based.pycrfsuite')

CPU times: user 56min 50s, sys: 5.12 s, total: 56min 55s
Wall time: 56min 50s


In [296]:
!ls -lh ./ner_2018_char_based.pycrfsuite

-rw-rw-r-- 1 daizelin daizelin 470K 9月  20 21:22 ./ner_2018_char_based.pycrfsuite


In [297]:
def bio_classification_report(y_true, y_pred):
    lb = LabelBinarizer()
    y_true_combined = lb.fit_transform(list(chain.from_iterable(y_true)))
    y_pred_combined = lb.transform(list(chain.from_iterable(y_pred)))
    
    tagset = set(lb.classes_) - {'O'}
    tagset = sorted(tagset,key = lambda tag:tag.split('-',1)[::-1])
    class_indices = {cls:idx for idx,cls in enumerate(lb.classes_)}
    
    return classification_report(
        y_true_combined,
        y_pred_combined,
        labels=[class_indices[cls] for cls in tagset],
        target_names = tagset,
    )

In [298]:
%%time
tagger = pycrfsuite.Tagger()
tagger.open('../outputs/ner_2014_char_based.pycrfsuite')
y_pred = [tagger.tag(xseq) for xseq in X_test]

CPU times: user 1min 4s, sys: 540 ms, total: 1min 4s
Wall time: 1min 5s


In [299]:
print(bio_classification_report(y_test,y_pred))

             precision    recall  f1-score   support

                  0.00      0.00      0.00         2
      B_LOC       0.97      0.97      0.97     51825
      B_ORG       0.98      0.97      0.98      3687
      B_PER       0.96      0.92      0.94     46640
        B_T       0.98      0.98      0.98     43415
      I_LOC       0.96      0.95      0.96     80188
      I_ORG       0.99      0.96      0.97      8266
      I_PER       0.96      0.91      0.93     90070
        I_T       0.98      0.99      0.98    115917

avg / total       0.99      0.99      0.99   4758408



  'precision', 'predicted', average, warn_for)


In [4]:
def predict(s):
    s = list(s)
    s.insert(0,'<BOS>')
    s.append('<EOS>')
    sent = seg_by_window(s)
#     print(sent)
    features = extract_feature([sent])
#     print(features)
    tagger = pycrfsuite.Tagger()
    tagger.open('ner_2014_char_based.pycrfsuite')
    y_pred = [tagger.tag(features[0])]
#     print(y_pred)
    return y_pred[0]

In [None]:

for c, t in zip(input,y_pred[0]):
    res.append(c+'/'+t)
print(' '.join(res))

In [342]:
sent = '新华社北京9月11日电第二十二届国际检察官联合会年会暨会员代表大会11日上午在北京开幕。国家主席江泽民发来贺信， 对会议召开表示祝贺。'

In [6]:
def run_predcit(sent):
    y = predict(sent)
    res = []
    for c, t in zip(list(sent.strip()),y):
        res.append(c+'/'+t)
    print(' '.join(res))

In [7]:
run_predcit(sent)

下/O 沙/O 世/O 贸/O 江/B_PER 滨/I_PER 花/O 园/O 骏/O 景/O 湾/O 5/O 幢/O 与/O 6/O 幢/O 之/O 间/O


In [350]:
sent2 = '1949年，她还曾到“华大”向戏剧系同志学习，也能和解放区的文艺工作者打成一片。'

In [351]:
run_predcit(sent2)

1/B_T 9/I_T 4/I_T 9/I_T 年/I_T ，/O 她/O 还/O 曾/O 到/O “/O 华/O 大/O ”/O 向/O 戏/O 剧/O 系/O 同/O 志/O 学/O 习/O ，/O 也/O 能/O 和/O 解/B_LOC 放/I_LOC 区/I_LOC 的/O 文/O 艺/O 工/O 作/O 者/O 打/O 成/O 一/O 片/O 。/O
