In [202]:
import operator
import re
from collections import Counter
from itertools import chain

import numpy as np
import pandas as pd

import CRFPP
# NOTE(review): later cells use `pycrfsuite`, `LabelBinarizer` and
# `classification_report`, none of which is imported in this cell - confirm
# where they come from before relying on Restart & Run All.

Step1:获取文本数据

In [203]:
## Read the CoNLL corpus
def read_CoNLL(filename):
    """Parse a space-separated CoNLL file into a token table and per-document token lists.

    Every input line yields exactly one row / one token:
      * a blank line (sentence separator) is stored as the marker '_space'
        in all four columns;
      * any other line is split on single spaces and its first four fields
        are stored as word, POS, chunking and NE.

    A line equal to '-DOCSTART- -X- O O' starts a new document (unless it is
    the very first line of the file, in which case it opens the first one).

    Parameters
    ----------
    filename : str
        Path of the file to read. It is decoded as 'gbk', as in the
        original notebook.

    Returns
    -------
    (df, docs)
        df   : DataFrame with columns ['word', 'POS', 'chunking', 'NE'],
               one row per input line.
        docs : list of documents, each a list of tokens (first field of the
               line, or '_space'); concatenating docs gives df['word'].

    Raises
    ------
    IndexError
        If a non-blank line has fewer than four space-separated fields.
    """
    # The context manager closes the file even if reading/decoding fails.
    with open(filename, encoding='gbk') as f:
        lines = f.readlines()

    docs = []
    doc = []
    word = []
    POS = []
    chunking = []
    NE = []

    for i, line in enumerate(lines):
        line = line.replace('\n', '')
        if line == '':
            fields = ['_space', '_space', '_space', '_space']
        else:
            if line == '-DOCSTART- -X- O O' and i != 0:
                # Close the previous document before opening the next one.
                docs.append(doc)
                doc = []
            fields = line.split(' ')

        doc.append(fields[0])
        word.append(fields[0])
        POS.append(fields[1])
        chunking.append(fields[2])
        NE.append(fields[3])

    # The last (possibly empty) document is always emitted.
    docs.append(doc)

    # Build the frame column-wise; this avoids materialising a fixed-width
    # NumPy string matrix of the whole corpus first.
    df = pd.DataFrame({'word': word, 'POS': POS, 'chunking': chunking, 'NE': NE},
                      columns=['word', 'POS', 'chunking', 'NE'])
    return df, docs


In [204]:
# Load the two corpus files; both paths are relative to the working directory.
# (presumably the CoNLL-2003 English train / development splits - confirm)
df_train,docs_train = read_CoNLL('eng.train')
df_test,docs_test = read_CoNLL('eng.testa')

In [205]:
## Common-word list (words that occur in at least five documents)
def create_vocab(data):
    """Count how often each item occurs.

    Parameters
    ----------
    data : pandas.DataFrame or iterable
        * DataFrame: the values of its 'word' column are counted.
        * otherwise: the items themselves are counted. Items that are lists
          (e.g. [w1, w2] bigrams) are counted under the equivalent tuple,
          because lists cannot be dictionary keys.

    Returns
    -------
    dict
        Maps each distinct item to its number of occurrences, in order of
        first appearance.
    """
    # A single Counter pass replaces one full `.count()` scan per distinct key.
    if isinstance(data, pd.DataFrame):
        return dict(Counter(data['word'].tolist()))
    return dict(Counter(tuple(item) if isinstance(item, list) else item
                        for item in data))
        

In [206]:
vocab = create_vocab(df_train)

# A word is "common" when it occurs in at least this many training documents.
MIN_DOC_FREQUENCY = 5

# Turn every document into a set once: testing `word in <list>` for every
# vocabulary word would rescan the full token list of every document.
doc_token_sets = [set(doc_tokens) for doc_tokens in docs_train]

# Same order as the vocabulary keys; kept as a list for downstream cells.
common_vocab = [
    vocab_word for vocab_word in vocab
    if sum(vocab_word in tokens for tokens in doc_token_sets) >= MIN_DOC_FREQUENCY
]
        

Step2:抽取基本特征相关的词列表

In [207]:
## Unigram correlation model: (N+ - N-) / (N+ * N-)^0.5
def unigram_correlation(df,label):
    """Rank the words that immediately precede tokens tagged `label`.

    For every row i >= 1 whose tag (4th column) equals `label`, the word
    (1st column) of row i-1 is collected. Each distinct collected word gets

        score = (2*count - total) / sqrt(count * (total - count) + 1)

    where `total` is the number of collected words; the `+ 1` keeps the
    denominator non-zero.

    Parameters
    ----------
    df : pandas.DataFrame
        Token table; column 0 holds the word and column 3 the NE tag
        (assumes the default 0..n-1 row index).
    label : str
        NE tag to look for, e.g. 'I-LOC'.

    Returns
    -------
    list of [word, score]
        At most 20 entries, highest score first; ties keep first-seen order.
    """
    # Pull the two columns out once by position instead of doing a row
    # lookup plus integer-key Series indexing for every token.
    words = df.iloc[:, 0].tolist()
    tags = df.iloc[:, 3].tolist()

    word_list = [words[i-1] for i in range(1, len(tags)) if tags[i] == label]
    total = len(word_list)
    correlation_list = [[key, (val*2-total)/(val*(total-val)+1)**0.5]
                        for key, val in Counter(word_list).items()]
    return sorted(correlation_list,key=operator.itemgetter(1),reverse=True)[:20]
        

In [208]:
# Build the preceding-word list for every NE tag seen in training, skipping
# 'O' and the '_space' sentence-separator marker.
uni_co_train = {}
for label in set(df_train['NE']):
    if (label != 'O' and label != '_space'):
        uni_co_train[label] = unigram_correlation(df_train,label)


In [209]:
# Inspect the ranked preceding words for the I-LOC tag.
uni_co_train['I-LOC']

[['_space', -1.5737791036443216],
 ['(', -1.6706290926962695],
 ['in', -3.3276962395907033],
 ['of', -4.296491617348425],
 [',', -4.471963064195109],
 ['the', -4.627458172020232],
 ['to', -6.503940668982656],
 ['and', -6.807502430723574],
 ['-', -8.310606149789805],
 ['at', -8.348537942952419],
 ['AT', -8.845770951417105],
 ['from', -8.983816697866123],
 ['New', -9.128204403591983],
 ['NEW', -10.51263779132837],
 ['South', -10.587346661068436],
 ['In', -11.070377493393803],
 ['--', -11.43033000237937],
 ['The', -11.525763194434314],
 ['United', -11.62355454266102],
 ['Czech', -12.383283438574125]]

In [210]:
## Bigram correlation model
def bigram_correlation(df,label):
    """Rank the word pairs that immediately precede tokens tagged `label`.

    For every row i >= 2 whose tag (4th column) equals `label`, the pair
    (word[i-2], word[i-1]) is collected. Each distinct pair gets

        score = (2*count - total) / sqrt(count * (total - count) + 1)

    where `total` is the number of collected pairs.

    Parameters
    ----------
    df : pandas.DataFrame
        Token table; column 0 holds the word and column 3 the NE tag
        (assumes the default 0..n-1 row index).
    label : str
        NE tag to look for, e.g. 'I-LOC'.

    Returns
    -------
    list of [(word, word), score]
        At most 20 entries, highest score first; ties keep first-seen order.
    """
    # Positional column extraction replaces per-row `.loc` lookups.
    words = df.iloc[:, 0].tolist()
    tags = df.iloc[:, 3].tolist()

    pair_list = [(words[i-2], words[i-1]) for i in range(2, len(tags)) if tags[i] == label]
    total = len(pair_list)
    correlation_list = [[key, (val*2-total)/(val*(total-val)+1)**0.5]
                        for key, val in Counter(pair_list).items()]
    return sorted(correlation_list,key=operator.itemgetter(1),reverse=True)[:20]

In [211]:
# Build the preceding-bigram list for every NE tag seen in training, skipping
# 'O' and the '_space' sentence-separator marker.
bi_co_train = {}
for label in set(df_train['NE']):
    if (label != 'O' and label != '_space'):
        bi_co_train[label] = bigram_correlation(df_train,label)

In [212]:
# Inspect the ranked preceding bigrams for the I-LOC tag.
bi_co_train['I-LOC']

[[('.', '_space'), -2.260715479119852],
 [('-DOCSTART-', '_space'), -6.85099052540913],
 [('in', 'the'), -10.985425548656982],
 [('_space', 'In'), -11.157266542466408],
 [('_space', 'NEW'), -11.43033000237937],
 [('_space', '--'), -11.525763194434314],
 [('_space', '('), -12.88827288788054],
 [('of', 'the'), -12.88827288788054],
 [('the', 'United'), -13.309228000905287],
 [('_space', 'At'), -13.61333938829869],
 [('town', 'of'), -14.864377128504293],
 [('_space', 'The'), -15.28876350919457],
 [('(', 'South'), -15.514804047460249],
 [('(', 'Czech'), -15.514804047460249],
 [('1996-08-28', '_space'), -16.52883936259703],
 [('at', 'the'), -16.814472022175316],
 [('CRICKET', '-'), -16.814472022175316],
 [('city', 'of'), -17.432463832402867],
 [('_space', '-'), -18.12295328982313],
 [(')', '_space'), -18.12295328982313]]

In [213]:
## Word-suffix list
def NE_suffix(df,label):
    """Rank the (up to) three-character endings of words tagged `label`.

    For every row whose tag (4th column) equals `label`, the last three
    characters of the word (1st column) are collected; shorter words
    contribute the whole word. Each distinct suffix gets

        score = (2*count - total) / sqrt(count * (total - count) + 1)

    where `total` is the number of collected suffixes.

    Parameters
    ----------
    df : pandas.DataFrame
        Token table; column 0 holds the word and column 3 the NE tag
        (assumes the default 0..n-1 row index).
    label : str
        NE tag to look for, e.g. 'I-LOC'.

    Returns
    -------
    list of [suffix, score]
        At most 50 entries, highest score first; ties keep first-seen order.
    """
    words = df.iloc[:, 0].tolist()
    tags = df.iloc[:, 3].tolist()

    # word[-3:] equals word[-3:-1] + word[-1] for every non-empty word, and
    # additionally tolerates an empty token instead of raising IndexError.
    suffix_list = [word[-3:] for word, tag in zip(words, tags) if tag == label]
    total = len(suffix_list)
    correlation_list = [[key, (val*2-total)/(val*(total-val)+1)**0.5]
                        for key, val in Counter(suffix_list).items()]
    return sorted(correlation_list,key=operator.itemgetter(1),reverse=True)[:50]

In [214]:
# Build the word-ending list for every NE tag seen in training, skipping
# 'O' and the '_space' sentence-separator marker.
suffix_train = {}
for label in set(df_train['NE']):
    if (label != 'O' and label != '_space'):
        suffix_train[label] = NE_suffix(df_train,label)

In [215]:
# Inspect the ranked word endings for the I-LOC tag.
suffix_train['I-LOC']

[['and', -4.522322603158466],
 ['.S.', -4.884082377482271],
 ['ain', -5.529671646651143],
 ['any', -7.306528827371882],
 ['nia', -7.525192507858331],
 ['lia', -7.641615476822893],
 ['ina', -7.826130242363489],
 ['sia', -7.826130242363489],
 ['nce', -7.858117203586373],
 ['ria', -7.956334249890998],
 ['ica', -8.199677789140186],
 ['tan', -8.586928723402199],
 ['aly', -9.031217387333442],
 ['New', -9.177828854319243],
 ['DON', -9.228233860749807],
 ['ong', -9.228233860749807],
 ['pan', -9.604783237103666],
 ['den', -9.84079779868143],
 ['uth', -9.902455748422403],
 ['dia', -10.29734931201734],
 ['ted', -10.43943508230555],
 ['ium', -10.43943508230555],
 ['NEW', -10.51263779132837],
 ['raq', -10.902339336003102],
 ['nds', -10.902339336003102],
 ['ORK', -10.985425548656982],
 ['est', -11.070377493393803],
 ['nya', -11.070377493393803],
 ['don', -11.157266542466408],
 ['ton', -11.157266542466408],
 ['lic', -11.246167953573321],
 ['ael', -11.337161146201318],
 ['tes', -11.826615110746435],
 

In [216]:
## 命名实体后缀列表：
def NE_word_suffix(df,label):
    NE_suffix_list = []
    new_label = label.split('-')
    for i in range(len(df)):        
        if (df.loc[i][3] != 'O' and df.loc[i][3] != '_space'):
            if (i != len(df)-1):
                if (df.loc[i][3].split('-')[1] == new_label[1] and df.loc[i+1][3] != label):
                    NE_suffix_list.append(df.loc[i][0])
            else:
                if (df.loc[i][3].split('-')[1] == new_label[1]):
                    NE_suffix_list.append(df.loc[i][0])
    
    correlation_list = [[key,(val*2-len(NE_suffix_list))/(val*(len(NE_suffix_list)-val)+1)**0.5] for key,val in zip(list(create_vocab(NE_suffix_list).keys()),list(create_vocab(NE_suffix_list).values()))]
    return sorted(correlation_list,key=operator.itemgetter(1),reverse=True)[:50]

In [217]:
# Build the entity-final word list for every tag seen in training except 'O',
# the '_space' marker and the four B-* tags.
NE_word_suffix_train = {}
for label in set(df_train['NE']):
    if (label != 'O' and label != '_space' and label != 'B-LOC' and label != 'B-ORG' and label != 'B-PER' and label != 'B-MISC'):
        NE_word_suffix_train[label] = NE_word_suffix(df_train,label)

In [218]:
# Inspect the ranked entity-final words for the I-PER tag.
NE_word_suffix_train['I-PER']

[['Clinton', -8.245115032387872],
 ['Arafat', -11.72315051429123],
 ['Yeltsin', -11.72315051429123],
 ['Dole', -12.12461140788624],
 ['Lebed', -12.415663569902774],
 ['Ahmed', -12.56906015743073],
 ['Akram', -13.243235775286516],
 ['Dutroux', -13.429000507587448],
 ['Younis', -15.255073960997647],
 ['Croft', -15.255073960997647],
 ['Mullally', -15.83820195656401],
 ['Netanyahu', -16.15556295630695],
 ['Khan', -16.15556295630695],
 ['Martin', -16.15556295630695],
 ['Sohail', -17.233725429434745],
 ['Johnson', -17.233725429434745],
 ['Teresa', -17.643317521253234],
 ['Lewis', -18.083156657703075],
 ['Medvedev', -18.083156657703075],
 ['Meri', -18.55716988289701],
 ['Rubin', -19.070037542225247],
 ['Christie', -19.070037542225247],
 ['Anwar', -19.62738989775],
 ['Cork', -19.62738989775],
 ['Philippoussis', -19.62738989775],
 ['Stich', -19.62738989775],
 ['Chang', -20.236070341914424],
 ['Sampras', -20.236070341914424],
 ['Williams', -20.236070341914424],
 ['Graf', -20.236070341914424],
 [

In [219]:
## Function words (lowercase words that occur inside named entities)
def function_word(df,label):
    """Rank the lowercase words that carry the tag `label`.

    Every row whose tag (4th column) equals `label` and whose word
    (1st column) satisfies `str.islower()` contributes its word. Each
    distinct collected word gets

        score = (2*count - total) / sqrt(count * (total - count) + 1)

    where `total` is the number of collected words.

    Parameters
    ----------
    df : pandas.DataFrame
        Token table; column 0 holds the word and column 3 the NE tag
        (assumes the default 0..n-1 row index).
    label : str
        NE tag to look for, e.g. 'I-ORG'.

    Returns
    -------
    list of [word, score]
        At most 20 entries, highest score first; ties keep first-seen order.
    """
    # Positional column extraction replaces per-row `.loc` lookups.
    words = df.iloc[:, 0].tolist()
    tags = df.iloc[:, 3].tolist()

    function_word_list = [word for word, tag in zip(words, tags)
                          if tag == label and word.islower()]
    total = len(function_word_list)
    correlation_list = [[key, (val*2-total)/(val*(total-val)+1)**0.5]
                        for key, val in Counter(function_word_list).items()]
    return sorted(correlation_list,key=operator.itemgetter(1),reverse=True)[:20]
    


In [220]:
# Build the lowercase in-entity word list for every tag seen in training
# except 'O', the '_space' marker and the four B-* tags.
function_word_train = {}
for label in set(df_train['NE']):
    if (label != 'O' and label != '_space' and label != 'B-LOC' and label != 'B-ORG' and label != 'B-PER' and label != 'B-MISC'):
        function_word_train[label] = function_word(df_train,label)

Step3：局部特征选取

In [221]:
## POS tag
# Two-way mapping between POS tags and integer ids.
# Id 0 ('unk') and id 1 ('punc') are reserved; every training POS tag for
# which str.isupper() is true gets the next free id.
tag2index = {}
index2tag = {}
n = 2
tag2index['punc'] = 1
index2tag[1] = 'punc'
tag2index['unk'] = 0
index2tag[0] = 'unk'
# Iterate in sorted order: plain set iteration order of strings can differ
# between interpreter runs, which would make the ids irreproducible.
for item in sorted(set(df_train['POS'])):
    if item.isupper():
        tag2index[item] = n
        index2tag[n] = item
        n += 1
    
        

In [222]:
def POS_tagger(df):
    """Translate the 'POS' column of `df` into integer ids via `tag2index`.

    A tag present in `tag2index` maps to its id. Any other tag maps to the
    'punc' id when `str.isupper()` is false for it, and to the 'unk' id
    otherwise.

    Returns a list with one id per row of `df`.
    """
    tag_list = []
    for tag in df['POS']:
        # Direct dict membership; no need to rebuild a list of keys per token.
        if tag in tag2index:
            tag_list.append(tag2index[tag])
        elif not tag.isupper():
            tag_list.append(tag2index['punc'])
        else:
            tag_list.append(tag2index['unk'])
    return tag_list

In [223]:
# POS ids for the training tokens.
POS_tag_train = POS_tagger(df_train)

In [224]:
# POS ids for the evaluation tokens (uses the mapping built from training tags).
POS_tag_test = POS_tagger(df_test)

In [225]:
## Sentence-initial word, capitalisation and document zone (four features:
## initial capital or not, sentence-initial or not, all caps or not, and
## document zone (header: 1, headline: 2, info: 3, text: 4))
def first_capital(df):
    """Return 1 for each word in df['word'] whose first character is uppercase, else 0.

    An empty token yields 0 instead of raising IndexError.
    """
    # word[:1] is '' for an empty token, and ''.isupper() is False.
    return [1 if word[:1].isupper() else 0 for word in df['word']]

In [226]:
def first_word(df):
    """Flag sentence-initial tokens.

    Row 0 is always flagged. Any later row is flagged (1) when the word in
    the preceding row is '_space', '.', '!' or '?', and 0 otherwise.
    Returns a list with one flag per row of `df`.
    """
    words = df['word'].tolist()
    sentence_enders = ('_space', '.', '!', '?')
    tag_list = []
    for position in range(len(df)):
        if position == 0:
            tag_list.append(1)
            continue
        starts_sentence = words[position - 1] in sentence_enders
        tag_list.append(1 if starts_sentence else 0)
    return tag_list

In [227]:
def all_capitals(df):
    """Return 1 for each word in df['word'] for which str.isupper() is true, else 0."""
    return [int(token.isupper()) for token in df['word']]
        

In [228]:
def document_locations(df,docs):
    """Assign every token a document-zone id (1-4) from the first blank lines.

    Within each document, the positions of the first three '_space' tokens
    act as zone boundaries b1 < b2 < b3:

        position <= b1        -> 1
        b1 < position <= b2   -> 2
        b2 < position <= b3   -> 3
        otherwise             -> 4

    If a document has fewer than three '_space' tokens, only the existing
    boundaries are used and every later position gets 4; a document with no
    '_space' at all is tagged 4 throughout (previously this case never
    terminated).

    Parameters
    ----------
    df : pandas.DataFrame
        Token table with a 'word' column, expected to contain the documents
        of `docs` concatenated in order.
    docs : list of list of str
        Tokens of each document.

    Returns
    -------
    list of int, or None
        One zone id per row of `df`. If the number of produced ids does not
        match len(df), 'error' is printed and None is returned.
    """
    words = df['word']
    tag_list = []
    num_words = 0
    for doc in docs:
        # One pass over the document; keep at most the first three separators.
        boundaries = [pos for pos, token in enumerate(doc) if token == '_space'][:3]

        # Number of rows of df that actually fall inside this document.
        n_tokens = len(words[num_words:num_words+len(doc)])
        for cw in range(n_tokens):
            zone = 4
            for zone_id, boundary in enumerate(boundaries, start=1):
                if cw <= boundary:
                    zone = zone_id
                    break
            tag_list.append(zone)
        num_words += len(doc)

    if (len(tag_list) == len(df)):
        return tag_list
    else:
        print('error')
        return

In [229]:
# Capitalisation, sentence-start and document-zone features for the training tokens.
first_cap_train = first_capital(df_train)
first_word_train = first_word(df_train)
all_capitals_train = all_capitals(df_train)
doc_loc_train = document_locations(df_train,docs_train)

In [230]:
# The same four features for the evaluation tokens.
first_cap_test = first_capital(df_test)
first_word_test = first_word(df_test)
all_capitals_test = all_capitals(df_test)
doc_loc_test = document_locations(df_test,docs_test)

In [231]:
## Symbol information (mainly . , / % $ -) and digit information
def symbal_information(df):
    """Return 1 for each word in df['word'] that contains any of . , / % $ - and 0 otherwise."""
    symbols = ('.', ',', '/', '%', '$', '-')
    flags = []
    for token in df['word']:
        has_symbol = any(symbol in token for symbol in symbols)
        flags.append(1 if has_symbol else 0)
    return flags

In [232]:
def number_information(df):
    """Return 1 for each word in df['word'] that contains at least one digit, else 0."""
    # Raw string: '\d' inside a normal string literal is an invalid escape
    # sequence and triggers a warning on current Python versions.
    digit = re.compile(r'\d')
    return [1 if digit.search(word) is not None else 0 for word in df['word']]

In [233]:
# Symbol and digit flags for the training tokens.
symbal_infor_train = symbal_information(df_train)
number_infor_train = number_information(df_train)

In [234]:
# Symbol and digit flags for the evaluation tokens.
symbal_infor_test = symbal_information(df_test)
number_infor_test = number_information(df_test)

In [235]:
## Quote and bracket information
def filled_information(df):
    """Count, per token, how many kinds of bracket/quote are currently open.

    Five independent indicators are tracked while scanning df['word']:
      * '(' / ')', '[' / ']', '{' / '}' : set to 1 by the opener and back to
        0 by the closer (a flag, not a nesting depth);
      * "'" and '"' : each occurrence toggles the indicator.
    The value for a token is the sum of the five indicators after the token
    itself has been processed. State is carried across the whole column and
    is never reset.

    Returns a list of ints, one per row of `df`.
    """
    opener_slot = {'(': 0, '[': 1, '{': 2}
    closer_slot = {')': 0, ']': 1, '}': 2}
    bracket_open = [0, 0, 0]
    single_quotes_seen = 0
    double_quotes_seen = 0

    tag_list = []
    for token in df['word']:
        if token in opener_slot:
            bracket_open[opener_slot[token]] = 1
        elif token in closer_slot:
            bracket_open[closer_slot[token]] = 0
        elif token == "'":
            single_quotes_seen += 1
        elif token == '"':
            double_quotes_seen += 1
        open_kinds = sum(bracket_open) + single_quotes_seen % 2 + double_quotes_seen % 2
        tag_list.append(open_kinds)
    return tag_list
        
    
    

In [236]:
# Open bracket/quote counts for the training tokens.
fil_infor_train = filled_information(df_train)

In [237]:
# Open bracket/quote counts for the evaluation tokens.
fil_infor_test = filled_information(df_test)

In [238]:
## Common-word list
def common_word(df):
    """Flag words that are NOT in the global `common_vocab`.

    Returns a list with one entry per word of df['word']: 1 when the word is
    absent from `common_vocab`, 0 when it is present.
    """
    # Build the lookup set once; testing membership against the list would
    # scan the whole vocabulary for every token.
    known_words = set(common_vocab)
    return [0 if word in known_words else 1 for word in df['word']]

In [239]:
# Not-in-common-vocabulary flags for the training tokens.
common_word_train =  common_word(df_train)

In [240]:
# Not-in-common-vocabulary flags for the evaluation tokens.
common_word_test =  common_word(df_test)

In [241]:
## Bigram feature extraction
def bi_feature(df,label):
    """Flag tokens of entity type `label` that follow a listed bigram.

    Row i (i >= 2) gets 1 when all of the following hold, else 0:
      * df['NE'][i] is neither 'O' nor '_space',
      * the part of that tag after the first '-' equals `label`,
      * (word[i-2], word[i-1]) is one of the bigrams in
        bi_co_train['I-' + label].
    The first two entries of the result are always 0.

    NOTE(review): the flag depends on df['NE'], i.e. on the gold tag of the
    token being described. The notebook also calls this on df_test, so the
    gold answer is part of the evaluation features; scores obtained this way
    are not a valid held-out estimate. Behaviour is left unchanged here.

    Parameters
    ----------
    df : pandas.DataFrame
        Token table with 'word' and 'NE' columns (assumes the default
        0..n-1 row index).
    label : str
        Entity type without prefix: 'LOC', 'PER', 'ORG' or 'MISC'.

    Returns
    -------
    list of int
    """
    # Hoisted out of the loop: the original rebuilt this list for every row.
    trigger_bigrams = {bigram for bigram, _ in bi_co_train['I-'+label]}
    words = df['word'].tolist()
    tags = df['NE'].tolist()

    tag_list = [0, 0]
    for i in range(2, len(words)):
        tag = tags[i]
        if (tag != 'O' and tag != '_space'
                and tag.split('-')[1] == label
                and (words[i-2], words[i-1]) in trigger_bigrams):
            tag_list.append(1)
        else:
            tag_list.append(0)
    return tag_list
        

In [242]:
# Preceding-bigram flags per entity type, training tokens.
bi_LOC_train = bi_feature(df_train,'LOC')
bi_PER_train = bi_feature(df_train,'PER')
bi_ORG_train = bi_feature(df_train,'ORG')
bi_MISC_train = bi_feature(df_train,'MISC')

In [243]:
# Preceding-bigram flags per entity type, evaluation tokens.
# NOTE(review): bi_feature reads df['NE'], so these flags use the gold tags of df_test.
bi_LOC_test = bi_feature(df_test,'LOC')
bi_PER_test = bi_feature(df_test,'PER')
bi_ORG_test = bi_feature(df_test,'ORG')
bi_MISC_test = bi_feature(df_test,'MISC')

In [244]:
## Word-suffix feature
def suffix_feature(df,label):
    """Flag tokens of entity type `label` whose ending is in the suffix list.

    Row i gets 1 when all of the following hold, else 0:
      * df['NE'][i] is neither 'O' nor '_space',
      * the part of that tag after the first '-' equals `label`,
      * the last three characters of the word (the whole word if shorter)
        appear in suffix_train['I-' + label].

    NOTE(review): the flag depends on df['NE'], i.e. on the gold tag of the
    token being described. The notebook also calls this on df_test, so the
    gold answer is part of the evaluation features. Behaviour is left
    unchanged here.

    Parameters
    ----------
    df : pandas.DataFrame
        Token table with 'word' and 'NE' columns.
    label : str
        Entity type without prefix: 'LOC', 'PER', 'ORG' or 'MISC'.

    Returns
    -------
    list of int, one per row of `df`.
    """
    # Hoisted out of the loop: the original rebuilt this list for every row.
    listed_suffixes = {suffix for suffix, _ in suffix_train['I-'+label]}

    tag_list = []
    for word, tag in zip(df['word'].tolist(), df['NE'].tolist()):
        # word[-3:] matches word[-3:-1] + word[-1] for non-empty words and
        # does not raise on an empty token.
        if (tag != 'O' and tag != '_space'
                and tag.split('-')[1] == label
                and word[-3:] in listed_suffixes):
            tag_list.append(1)
        else:
            tag_list.append(0)
    return tag_list

In [245]:
# Word-ending flags per entity type, training tokens.
suffix_LOC_train = suffix_feature(df_train,'LOC')
suffix_PER_train = suffix_feature(df_train,'PER')
suffix_ORG_train = suffix_feature(df_train,'ORG')
suffix_MISC_train = suffix_feature(df_train,'MISC')

In [246]:
# Word-ending flags per entity type, evaluation tokens.
# NOTE(review): suffix_feature reads df['NE'], so these flags use the gold tags of df_test.
suffix_LOC_test = suffix_feature(df_test,'LOC')
suffix_PER_test = suffix_feature(df_test,'PER')
suffix_ORG_test = suffix_feature(df_test,'ORG')
suffix_MISC_test = suffix_feature(df_test,'MISC')

In [247]:
## Class-suffix feature (unigram feature)
def c_suffix_feature(df,label):
    """Flag capitalised tokens that are followed by a capitalised token of type `label`.

    Row i (for every row except the last) gets 1 when all of the following
    hold, else 0:
      * df['NE'][i+1] is neither 'O' nor '_space',
      * the part of that tag after the first '-' equals `label`,
      * word[i+1] starts with an uppercase character,
      * word[i] starts with an uppercase character.
    A final 0 is appended for the last row (so an empty `df` yields [0]).

    NOTE(review): the flag depends on df['NE'], i.e. on gold tags. The
    notebook also calls this on df_test, so gold information is part of the
    evaluation features. Behaviour is left unchanged here.

    Parameters
    ----------
    df : pandas.DataFrame
        Token table with 'word' and 'NE' columns (assumes the default
        0..n-1 row index).
    label : str
        Entity type without prefix: 'LOC', 'PER', 'ORG' or 'MISC'.

    Returns
    -------
    list of int
    """
    words = df['word'].tolist()
    tags = df['NE'].tolist()

    tag_list = []
    for i in range(len(words)-1):
        next_tag = tags[i+1]
        # word[:1] instead of word[0]: an empty token counts as "not
        # capitalised" rather than raising IndexError.
        if (next_tag != 'O' and next_tag != '_space'
                and next_tag.split('-')[1] == label
                and words[i+1][:1].isupper()
                and words[i][:1].isupper()):
            tag_list.append(1)
        else:
            tag_list.append(0)
    tag_list.append(0)
    return tag_list

In [248]:
# Class-suffix flags per entity type, training tokens.
csuffix_LOC_train = c_suffix_feature(df_train,'LOC')
csuffix_PER_train = c_suffix_feature(df_train,'PER')
csuffix_ORG_train = c_suffix_feature(df_train,'ORG')
csuffix_MISC_train = c_suffix_feature(df_train,'MISC')

In [249]:
# Class-suffix flags per entity type, evaluation tokens.
# NOTE(review): c_suffix_feature reads df['NE'], so these flags use the gold tags of df_test.
csuffix_LOC_test = c_suffix_feature(df_test,'LOC')
csuffix_PER_test = c_suffix_feature(df_test,'PER')
csuffix_ORG_test = c_suffix_feature(df_test,'ORG')
csuffix_MISC_test = c_suffix_feature(df_test,'MISC')

In [250]:
## Function-word feature
def functional_word_train(df,label):
    """Flag words that appear in the function-word list of `label`.

    Parameters
    ----------
    df : pandas.DataFrame
        Token table with a 'word' column.
    label : str
        Key into the global `function_word_train` dict, e.g. 'I-LOC'.

    Returns
    -------
    list of int
        1 for each word found in function_word_train[label], else 0.
    """
    # Hoisted out of the loop: the original rebuilt this list for every token.
    listed_words = {word for word, _ in function_word_train[label]}
    return [1 if word in listed_words else 0 for word in df['word']]

In [251]:
# Function-word flags per entity tag, training tokens.
functional_LOC_train = functional_word_train(df_train,'I-LOC')
functional_PER_train = functional_word_train(df_train,'I-PER')
functional_ORG_train = functional_word_train(df_train,'I-ORG')
functional_MISC_train = functional_word_train(df_train,'I-MISC')

In [252]:
# Function-word flags per entity tag, evaluation tokens (lists come from training data).
functional_LOC_test = functional_word_train(df_test,'I-LOC')
functional_PER_test = functional_word_train(df_test,'I-PER')
functional_ORG_test = functional_word_train(df_test,'I-ORG')
functional_MISC_test = functional_word_train(df_test,'I-MISC')

Step4：全局特征选取

In [253]:
## Global unigram feature
def global_uni_feature(docs,label):
    """Flag tokens whose other occurrences in the same document follow a listed word.

    For every token, look at all OTHER positions in the same document that
    hold the same token. The flag is 1 when, for at least one of those
    positions, the token immediately before it is one of the words in
    uni_co_train[label]; otherwise 0.

    An occurrence at position 0 has no preceding token and is ignored.
    (The original indexed position -1 in that case, which silently wrapped
    around to the last token of the document.)

    Parameters
    ----------
    docs : list of list of str
        Tokens of each document.
    label : str
        Key into the global `uni_co_train` dict, e.g. 'I-LOC'.

    Returns
    -------
    list of int
        One flag per token, documents concatenated in order.
    """
    trigger_words = {word for word, _ in uni_co_train[label]}

    tag_list = []
    for doc in docs:
        # hit[k]: the token before position k is a trigger word.
        hits = [k >= 1 and doc[k-1] in trigger_words for k in range(len(doc))]
        # Number of "hit" occurrences per token; one pass instead of
        # rescanning the whole document for every token.
        hits_per_token = Counter(token for token, hit in zip(doc, hits) if hit)
        for token, hit in zip(doc, hits):
            hits_elsewhere = hits_per_token[token] - (1 if hit else 0)
            tag_list.append(1 if hits_elsewhere > 0 else 0)
    return tag_list

In [254]:
# Document-level unigram-context flags per entity tag, training documents.
guni_LOC_train = global_uni_feature(docs_train,'I-LOC')
guni_PER_train = global_uni_feature(docs_train,'I-PER')
guni_ORG_train = global_uni_feature(docs_train,'I-ORG')
guni_MISC_train = global_uni_feature(docs_train,'I-MISC')

In [255]:
# Document-level unigram-context flags per entity tag, evaluation documents.
guni_LOC_test = global_uni_feature(docs_test,'I-LOC')
guni_PER_test = global_uni_feature(docs_test,'I-PER')
guni_ORG_test = global_uni_feature(docs_test,'I-ORG')
guni_MISC_test = global_uni_feature(docs_test,'I-MISC')

In [256]:
## Global bigram feature
def global_bi_feature(docs,label):
    """Flag tokens whose other occurrences in the same document follow a listed bigram.

    For every token, look at all OTHER positions in the same document that
    hold the same token. The flag is 1 when, for at least one of those
    positions, the two tokens immediately before it form one of the bigrams
    in bi_co_train[label]; otherwise 0.

    Occurrences at positions 0 and 1 do not have two preceding tokens and
    are ignored. (The original used negative indices there, which silently
    wrapped around to the end of the document.)

    Parameters
    ----------
    docs : list of list of str
        Tokens of each document.
    label : str
        Key into the global `bi_co_train` dict, e.g. 'I-LOC'.

    Returns
    -------
    list of int
        One flag per token, documents concatenated in order.
    """
    trigger_bigrams = {bigram for bigram, _ in bi_co_train[label]}

    tag_list = []
    for doc in docs:
        # hit[k]: the two tokens before position k form a trigger bigram.
        hits = [k >= 2 and (doc[k-2], doc[k-1]) in trigger_bigrams
                for k in range(len(doc))]
        # Number of "hit" occurrences per token; one pass instead of
        # rescanning the whole document for every token.
        hits_per_token = Counter(token for token, hit in zip(doc, hits) if hit)
        for token, hit in zip(doc, hits):
            hits_elsewhere = hits_per_token[token] - (1 if hit else 0)
            tag_list.append(1 if hits_elsewhere > 0 else 0)
    return tag_list

In [257]:
# Document-level bigram-context flags per entity tag, training documents.
gbi_LOC_train = global_bi_feature(docs_train,'I-LOC')
gbi_PER_train = global_bi_feature(docs_train,'I-PER')
gbi_ORG_train = global_bi_feature(docs_train,'I-ORG')
gbi_MISC_train = global_bi_feature(docs_train,'I-MISC')

In [258]:
# Document-level bigram-context flags per entity tag, evaluation documents.
gbi_LOC_test = global_bi_feature(docs_test,'I-LOC')
gbi_PER_test = global_bi_feature(docs_test,'I-PER')
gbi_ORG_test = global_bi_feature(docs_test,'I-ORG')
gbi_MISC_test = global_bi_feature(docs_test,'I-MISC')

In [259]:
## Global word-suffix feature
def global_csuffix_feature(docs,label,csuffix):
    """Flag tokens for which another occurrence in the same document has csuffix == 1.

    Parameters
    ----------
    docs : list of list of str
        Tokens of each document.
    label : str
        Not used by the computation; kept for call compatibility.
    csuffix : sequence of int
        One value per token, aligned with the concatenation of `docs`.

    Returns
    -------
    list of int
        For each token, 1 when at least one OTHER position in the same
        document holds the same token and has csuffix == 1; else 0.
    """
    tag_list = []
    n = 0  # index, within `csuffix`, of the first token of the current document
    for doc in docs:
        flags = [bool(csuffix[n+k] == 1) for k in range(len(doc))]
        # Number of flagged occurrences per token; one pass instead of
        # rescanning the whole document for every token.
        flagged_per_token = Counter(token for token, flag in zip(doc, flags) if flag)
        for token, flag in zip(doc, flags):
            flagged_elsewhere = flagged_per_token[token] - (1 if flag else 0)
            tag_list.append(1 if flagged_elsewhere > 0 else 0)
        n += len(doc)
    return tag_list

In [260]:
# Document-level class-suffix flags per entity tag, training documents.
gcsuf_LOC_train = global_csuffix_feature(docs_train,'I-LOC',csuffix_LOC_train)
gcsuf_PER_train = global_csuffix_feature(docs_train,'I-PER',csuffix_PER_train)
gcsuf_ORG_train = global_csuffix_feature(docs_train,'I-ORG',csuffix_ORG_train)
gcsuf_MISC_train = global_csuffix_feature(docs_train,'I-MISC',csuffix_MISC_train)

In [261]:
# Document-level class-suffix flags per entity tag, evaluation documents.
gcsuf_LOC_test = global_csuffix_feature(docs_test,'I-LOC',csuffix_LOC_test)
gcsuf_PER_test = global_csuffix_feature(docs_test,'I-PER',csuffix_PER_test)
gcsuf_ORG_test = global_csuffix_feature(docs_test,'I-ORG',csuffix_ORG_test)
gcsuf_MISC_test = global_csuffix_feature(docs_test,'I-MISC',csuffix_MISC_test)

Step5：训练CRF模型

In [262]:
## Labels
# Integer id of every tag, in the fixed order listed here.
NE_list = {
    tag: tag_id
    for tag_id, tag in enumerate([
        'O', '_space',
        'B-LOC', 'I-LOC',
        'B-PER', 'I-PER',
        'B-ORG', 'I-ORG',
        'B-MISC', 'I-MISC',
    ])
}
def output():
    """Return the NE tags of df_train and df_test as integer ids.

    Each tag is looked up in the global `NE_list`; a tag missing from it
    raises KeyError. Returns the pair (train_ids, test_ids).
    """
    NE_train = [NE_list[tag] for tag in df_train['NE']]
    NE_test = [NE_list[tag_t] for tag_t in df_test['NE']]
    return NE_train, NE_test

In [263]:
# Integer-encoded tags. NOTE(review): y_train / y_test are reassigned from
# get_input(...) further down, so these values are overwritten before training.
y_train, y_test = output()

In [264]:
# Assemble the training table: column 0 is the word, columns 1-37 are the
# 37 feature lists in the order given below, column 38 is the gold NE tag.
# Stacking string and integer lists in a single np.array makes NumPy store
# every value as a string, so the feature values in this frame are strings.
train_input = pd.DataFrame(data = np.transpose(np.array([list(df_train['word']),
POS_tag_train,
symbal_infor_train,
number_infor_train,
first_cap_train,
first_word_train,
all_capitals_train,
doc_loc_train,
fil_infor_train,
common_word_train,
bi_LOC_train,
bi_PER_train, 
bi_ORG_train, 
bi_MISC_train,
suffix_LOC_train,
suffix_PER_train,
suffix_ORG_train, 
suffix_MISC_train,
csuffix_LOC_train,
csuffix_PER_train,
csuffix_ORG_train,
csuffix_MISC_train,
functional_LOC_train,
functional_PER_train,
functional_ORG_train,
functional_MISC_train,
guni_LOC_train,
guni_PER_train,
guni_ORG_train,
guni_MISC_train,
gbi_LOC_train,
gbi_PER_train,
gbi_ORG_train,
gbi_MISC_train,
gcsuf_LOC_train,
gcsuf_PER_train,
gcsuf_ORG_train,
gcsuf_MISC_train,
list(df_train['NE'])])))

# Assemble the evaluation table with the same column layout as the training
# table: column 0 is the word, columns 1-37 the features, column 38 the gold
# NE tag. All values end up as strings because of the mixed-type np.array.
test_input = pd.DataFrame(data = np.transpose(np.array([list(df_test['word']),
POS_tag_test,
symbal_infor_test,
number_infor_test,
first_cap_test,
first_word_test,
all_capitals_test,
doc_loc_test,
fil_infor_test,
common_word_test,
bi_LOC_test,
bi_PER_test, 
bi_ORG_test, 
bi_MISC_test,
suffix_LOC_test,
suffix_PER_test,
suffix_ORG_test,
suffix_MISC_test,
csuffix_LOC_test,
csuffix_PER_test,
csuffix_ORG_test,
csuffix_MISC_test,
functional_LOC_test,
functional_PER_test,
functional_ORG_test,
functional_MISC_test,
guni_LOC_test,
guni_PER_test,
guni_ORG_test,
guni_MISC_test,
gbi_LOC_test,
gbi_PER_test,
gbi_ORG_test,
gbi_MISC_test,
gcsuf_LOC_test,
gcsuf_PER_test,
gcsuf_ORG_test,
gcsuf_MISC_test,
list(df_test['NE'])])))

In [265]:
# Column 0: words; columns 1-37: feature values; column 38: gold NE tag.
train_words = list(np.array(train_input.iloc[:,0]))

train_features_o = list(np.array(train_input.iloc[:,1:38]))

# For every token and every feature j emit three "name=value" strings: the
# value on the previous row ('-1:fj'), on the row itself ('fj') and on the
# next row ('+1:fj'). A missing neighbour (before the first / after the last
# row) is encoded as 0. The window runs over the whole table, so neighbours
# can belong to an adjacent sentence.
train_features = []
last_row = len(train_features_o) - 1
for i in range(len(train_features_o)):
    train_f = []
    for j in range(len(train_features_o[i])):
        # Deciding each neighbour separately also covers a one-row table,
        # where the first row is the last row as well.
        previous_value = train_features_o[i-1][j] if i > 0 else 0
        next_value = train_features_o[i+1][j] if i < last_row else 0
        train_f.append(u'-1:f'+str(j)+u'='+str(previous_value))
        train_f.append(u'f'+str(j)+u'='+str(train_features_o[i][j]))
        train_f.append(u'+1:f'+str(j)+u'='+str(next_value))
    train_features.append(train_f)

train_labels = list(np.array(train_input.iloc[:,38]))

In [266]:
# Column 0: words; columns 1-37: feature values; column 38: gold NE tag.
test_words = list(np.array(test_input.iloc[:,0]))

test_features_o = list(np.array(test_input.iloc[:,1:38]))

# For every token and every feature j emit three "name=value" strings: the
# value on the previous row ('-1:fj'), on the row itself ('fj') and on the
# next row ('+1:fj'). A missing neighbour (before the first / after the last
# row) is encoded as 0. The window runs over the whole table, so neighbours
# can belong to an adjacent sentence.
test_features = []
last_row = len(test_features_o) - 1
for i in range(len(test_features_o)):
    test_f = []
    for j in range(len(test_features_o[i])):
        # Deciding each neighbour separately also covers a one-row table,
        # where the first row is the last row as well.
        previous_value = test_features_o[i-1][j] if i > 0 else 0
        next_value = test_features_o[i+1][j] if i < last_row else 0
        test_f.append(u'-1:f'+str(j)+u'='+str(previous_value))
        test_f.append(u'f'+str(j)+u'='+str(test_features_o[i][j]))
        test_f.append(u'+1:f'+str(j)+u'='+str(next_value))
    test_features.append(test_f)

test_labels = list(np.array(test_input.iloc[:,38]))

In [267]:
def get_input(words,features,labels):
    """Split the flat token stream into per-sentence sequences.

    Each token becomes the list [word] + list(features[i]); its label is
    labels[i]. A '_space' token closes the current sentence and is kept as
    that sentence's last item. Whatever has been collected after the final
    '_space' is always appended as one more sentence, even when it is empty.

    Returns (x, y): x is a list of sentences (each a list of item lists),
    y the matching list of label lists.
    """
    x, y = [], []
    sentence_items, sentence_labels = [], []
    for idx, token in enumerate(words):
        sentence_items.append([token] + list(features[idx]))
        sentence_labels.append(labels[idx])
        if token == '_space':
            # Sentence boundary: flush and start a new sentence.
            x.append(sentence_items)
            y.append(sentence_labels)
            sentence_items, sentence_labels = [], []
    # Trailing remainder (possibly empty).
    x.append(sentence_items)
    y.append(sentence_labels)

    return x, y

In [268]:
# Per-sentence training sequences; this replaces the earlier y_train.
x_train, y_train = get_input(train_words,train_features,train_labels)

In [269]:
# Per-sentence evaluation sequences; this replaces the earlier y_test.
x_test, y_test = get_input(test_words,test_features,test_labels)

In [270]:
## Train the CRF model
# NOTE(review): `pycrfsuite` is not imported in the notebook's import cell
# (only CRFPP is) - confirm it is imported before this cell runs.
trainer = pycrfsuite.Trainer(verbose=False)

In [271]:
# Feed the trainer one sentence (item sequence + label sequence) at a time.
for xseq, yseq in zip(x_train, y_train):
    trainer.append(xseq, yseq)

In [272]:
# Training hyper-parameters.
trainer.set_params({
    'c1': 1.0,   # coefficient for L1 penalty
    'c2': 1e-3,  # coefficient for L2 penalty
    'max_iterations': 50,  # stop earlier

    # include transitions that are possible, but not observed
    'feature.possible_transitions': True
})

In [273]:
# List the parameter names the trainer exposes.
trainer.params()

['feature.minfreq',
 'feature.possible_states',
 'feature.possible_transitions',
 'c1',
 'c2',
 'max_iterations',
 'num_memories',
 'epsilon',
 'period',
 'delta',
 'linesearch',
 'max_linesearch']

In [274]:
%%time
# Train and write the model to 'CRF.crfsuite' (path relative to the working directory).
trainer.train('CRF.crfsuite')

CPU times: user 53.8 s, sys: 234 ms, total: 54.1 s
Wall time: 54.1 s


In [275]:
# Show the statistics logged for the final training iteration.
trainer.logparser.last_iteration

{'active_features': 3359,
 'error_norm': 1502.594098,
 'feature_norm': 78.716283,
 'linesearch_step': 1.0,
 'linesearch_trials': 1,
 'loss': 8484.519646,
 'num': 50,
 'scores': {},
 'time': 0.814}

Step6:测试CRF模型

In [276]:
# Load the model file written by the training cell.
tagger = pycrfsuite.Tagger()
tagger.open('CRF.crfsuite')

<contextlib.closing at 0x1a38c92da0>

In [277]:
# Spot check: compare predicted and gold tags for one evaluation sentence.
example_sent = x_test[15]
print("Predicted:", ' '.join(tagger.tag(example_sent)))
print("Correct:  ", ' '.join(y_test[15]))

Predicted: O O O O O O O O O I-PER I-PER O I-MISC O O I-PER I-PER O O O O O _space
Correct:   O O O O O O O O O I-PER I-PER O I-MISC O O I-PER I-PER O O O O O _space


CRFsuite的使用方法:需要对每个句子进行输入

In [278]:
def bio_classification_report(y_true, y_pred):
    """
    Token-level classification report for lists of tag sequences.

    Both inputs are lists of per-sentence tag lists. They are flattened and
    binarized with a LabelBinarizer fitted on the true tags; the report then
    covers every fitted class except 'O' and '_space', ordered by entity
    type first and tag prefix second.

    Note that it requires scikit-learn 0.15+ (or a version from github master)
    to calculate averages properly!
    """
    binarizer = LabelBinarizer()
    y_true_combined = binarizer.fit_transform(list(chain.from_iterable(y_true)))
    y_pred_combined = binarizer.transform(list(chain.from_iterable(y_pred)))

    # Column index of every class in the binarized matrices.
    class_indices = {}
    for column, class_name in enumerate(binarizer.classes_):
        class_indices[class_name] = column

    def _type_then_prefix(tag):
        # 'I-LOC' -> ['LOC', 'I'], so tags group by entity type.
        return tag.split('-', 1)[::-1]

    reported_tags = sorted(set(binarizer.classes_) - {'O'} - {'_space'},
                           key=_type_then_prefix)
    reported_columns = [class_indices[tag] for tag in reported_tags]

    return classification_report(
        y_true_combined,
        y_pred_combined,
        labels = reported_columns,
        target_names = reported_tags,
    )

In [279]:
%%time
# Predict a tag sequence for every evaluation sentence.
y_pred = [tagger.tag(xseq) for xseq in x_test]

CPU times: user 2.04 s, sys: 6.76 ms, total: 2.05 s
Wall time: 2.04 s


In [280]:
# Token-level report on the evaluation data.
# NOTE(review): several evaluation features are computed from df_test['NE']
# (bi_feature, suffix_feature, c_suffix_feature), so these scores are not a
# clean held-out estimate.
print(bio_classification_report(y_test, y_pred))

             precision    recall  f1-score   support

      I-LOC       0.94      0.94      0.94      2094
     B-MISC       0.00      0.00      0.00         4
     I-MISC       0.98      0.90      0.94      1264
      I-ORG       0.96      0.92      0.94      2092
      I-PER       0.97      0.97      0.97      3149

avg / total       0.96      0.94      0.95      8603

