In [1]:
import CRFPP
import pandas as pd
import numpy as np
import operator
import re

Step1:获取文本数据

In [380]:
## 读入CoNLL Corpus
def read_CoNLL(filename):
    """Parse a CoNLL-style corpus file into a token table and per-document token lists.

    Every input line produces one row. A non-blank line is split on single
    spaces and its first four fields become word / POS / chunk / NE. A blank
    line becomes a row whose four fields are all the sentinel ``'_space'``.
    A line equal to ``'-DOCSTART- -X- O O'`` starts a new document (its own
    row is kept, with word ``'-DOCSTART-'``).

    Parameters
    ----------
    filename : str
        Path of the corpus file. The file is decoded as GBK.

    Returns
    -------
    df : pandas.DataFrame
        One row per input line, columns ``['word', 'POS', 'chunking', 'NE']``.
    docs : list of list of str
        The word column split per document, in file order. The concatenation
        of all documents has exactly one entry per row of ``df``.

    Raises
    ------
    IndexError
        If a non-blank line has fewer than four space-separated fields.
    """
    doc_start_line = '-DOCSTART- -X- O O'
    blank = '_space'

    # The context manager closes the handle even when reading/decoding fails.
    with open(filename, encoding='gbk') as f:
        lines = [raw.replace('\n', '') for raw in f]

    docs = []
    doc = []
    rows = []
    for i, line in enumerate(lines):
        # A document marker closes the previous document, except on the very
        # first line where there is nothing to close yet.
        if line == doc_start_line and i != 0:
            docs.append(doc)
            doc = []

        if line != '':
            labels = line.split(' ')
            rows.append((labels[0], labels[1], labels[2], labels[3]))
            doc.append(labels[0])
        else:
            rows.append((blank, blank, blank, blank))
            doc.append(blank)
    # The last document is never followed by a marker, so flush it here.
    docs.append(doc)

    df = pd.DataFrame(rows, columns=['word', 'POS', 'chunking', 'NE'])
    return df, docs


In [398]:
# Load both corpus files; read_CoNLL returns (token DataFrame, per-document token lists).
df_train,docs_train = read_CoNLL('eng.train')
df_test,docs_test = read_CoNLL('eng.testa')

In [399]:
# Keep only the first 30 documents of the training split and the first 5 of the test split.
docs_train = docs_train[:30]
docs_test = docs_test[:5]

In [400]:
# Number of tokens (every entry of every retained document) per split.
# sum(len(...)) replaces the nested counting loops; the totals are identical.
n_train = sum(len(doc_tokens) for doc_tokens in docs_train)
n_test = sum(len(doc_tokens) for doc_tokens in docs_test)

In [401]:
# Trim the token tables to their first n_train / n_test rows so they cover
# the same tokens as the truncated document lists.
df_train = df_train[:n_train]
df_test = df_test[:n_test]

In [404]:
## 常用词列表（至少在五个文档中出现过的词汇）
def create_vocab(data):
    """Count how often each item occurs.

    Parameters
    ----------
    data : pandas.DataFrame or iterable
        A DataFrame is counted over its ``'word'`` column. Any other input is
        iterated directly. Items that are lists (for example ``[w1, w2]``
        bigrams) are converted to tuples so they can serve as dictionary keys.

    Returns
    -------
    dict
        Maps each distinct item (lists as tuples) to its occurrence count,
        in order of first occurrence.
    """
    if isinstance(data, pd.DataFrame):
        items = data['word'].tolist()
    else:
        items = data

    # Single pass: the previous implementation called list.count() once per
    # distinct key, which is quadratic in the corpus size.
    vocab = {}
    for item in items:
        key = tuple(item) if isinstance(item, list) else item
        vocab[key] = vocab.get(key, 0) + 1
    return vocab
        

In [405]:
vocab = create_vocab(df_train)

# A word counts as "common" when it appears in at least this many training documents.
MIN_DOC_FREQ = 5

# Build each document's word set once so every membership test is O(1)
# instead of a scan over the document's token list.
doc_word_sets = [set(doc_tokens) for doc_tokens in docs_train]

common_vocab = [
    candidate for candidate in vocab
    if sum(candidate in words for words in doc_word_sets) >= MIN_DOC_FREQ
]
        

Step2:抽取基本特征相关的词列表

In [408]:
## 一元相关性模型：(N+ - N-) / (N+ * N- + 1)^0.5（分母加 1 以避免除零）
def unigram_correlation(df,label):
    """Rank the words that immediately precede a token tagged ``label``.

    With N the number of collected preceding words and v the count of one
    particular word, its score is ``(2*v - N) / sqrt(v*(N - v) + 1)``.

    Parameters
    ----------
    df : pandas.DataFrame
        Token table; column 0 is read as the word and column 3 as the NE tag.
    label : str
        NE tag whose left context is collected.

    Returns
    -------
    list of [word, score]
        At most 20 entries, sorted by descending score.
    """
    # Read the two columns once by position. The former df.loc[i][3] built a
    # Series per row and relied on integer keys acting as positions on a
    # string-labelled Series, which pandas has deprecated.
    words = df.iloc[:, 0].tolist()
    tags = df.iloc[:, 3].tolist()

    word_list = [words[i - 1] for i in range(1, len(tags)) if tags[i] == label]
    total = len(word_list)
    counts = create_vocab(word_list)
    correlation_list = [
        [key, (val * 2 - total) / (val * (total - val) + 1) ** 0.5]
        for key, val in counts.items()
    ]
    return sorted(correlation_list, key=operator.itemgetter(1), reverse=True)[:20]
        

In [409]:
# Preceding-word rankings, one entry per entity tag found in the training data.
uni_co_train = {}
for label in set(df_train['NE']):
    # 'O' and the '_space' separator are not entity tags; skip them.
    if label in ('O', '_space'):
        continue
    uni_co_train[label] = unigram_correlation(df_train, label)


In [410]:
# Display the ranked preceding words stored for the 'I-LOC' tag.
uni_co_train['I-LOC']

[['_space', -1.5343265552504246],
 ['in', -2.931521304691667],
 ['the', -3.1032797684331617],
 ['to', -4.09428268714651],
 ['(', -4.09428268714651],
 ['and', -4.962694350081032],
 ['with', -4.962694350081032],
 ['from', -5.262019756089627],
 ['West', -5.613835722137896],
 ['of', -6.557109247067381],
 ['/', -7.223568439694461],
 ['-', -7.223568439694461],
 ['that', -8.121212121212121],
 ['on', -8.121212121212121],
 ['northern', -9.428808992889305],
 ['between', -9.428808992889305],
 ['Abu', -9.428808992889305],
 ['by', -11.608677113915903],
 ['says', -11.608677113915903],
 ['accused', -11.608677113915903]]

In [411]:
## 二元相关性模型：
def bigram_correlation(df,label):
    """Rank the word pairs that immediately precede a token tagged ``label``.

    With N the number of collected pairs and v the count of one particular
    pair, its score is ``(2*v - N) / sqrt(v*(N - v) + 1)``.

    Parameters
    ----------
    df : pandas.DataFrame
        Token table; column 0 is read as the word and column 3 as the NE tag.
    label : str
        NE tag whose two-word left context is collected.

    Returns
    -------
    list of [pair, score]
        At most 20 entries, sorted by descending score. The pair is whatever
        key ``create_vocab`` returns for a two-element list.
    """
    # Positional column reads replace the per-row df.loc[i][k] lookups, which
    # depended on deprecated integer-as-position Series indexing.
    words = df.iloc[:, 0].tolist()
    tags = df.iloc[:, 3].tolist()

    word_list = [
        [words[i - 2], words[i - 1]]
        for i in range(2, len(tags))
        if tags[i] == label
    ]
    total = len(word_list)
    counts = create_vocab(word_list)
    correlation_list = [
        [key, (val * 2 - total) / (val * (total - val) + 1) ** 0.5]
        for key, val in counts.items()
    ]
    return sorted(correlation_list, key=operator.itemgetter(1), reverse=True)[:20]

In [412]:
# Preceding-bigram rankings, one entry per entity tag found in the training data.
bi_co_train = {}
for label in set(df_train['NE']):
    # 'O' and the '_space' separator are not entity tags; skip them.
    if label in ('O', '_space'):
        continue
    bi_co_train[label] = bigram_correlation(df_train, label)

In [413]:
# Display the ranked preceding bigrams stored for the 'I-LOC' tag.
bi_co_train['I-LOC']

[[('.', '_space'), -2.242843314993112],
 [('the', 'West'), -6.036161376351597],
 [('1996-08-22', '_space'), -6.557109247067381],
 [('to', 'the'), -6.557109247067381],
 [('-DOCSTART-', '_space'), -7.223568439694461],
 [('in', 'the'), -7.223568439694461],
 [('talks', 'with'), -9.428808992889305],
 [('"', '_space'), -9.428808992889305],
 [('DIGEST', '-'), -9.428808992889305],
 [('reports', 'from'), -11.608677113915903],
 [('Britain', 'and'), -11.608677113915903],
 [('sheep', 'from'), -11.608677113915903],
 [('visit', 'to'), -11.608677113915903],
 [('negotiator', 'with'), -11.608677113915903],
 [('SALE', 'LIMITS'), -11.608677113915903],
 [('US', '/'), -11.608677113915903],
 [('UK', '/'), -11.608677113915903],
 [('the', 'United'), -11.608677113915903],
 [('support', 'to'), -11.608677113915903],
 [('in', 'northern'), -11.608677113915903]]

In [414]:
## 词后缀列表：
def NE_suffix(df,label):
    """Rank the trailing (up to) three characters of tokens tagged ``label``.

    With N the number of collected suffixes and v the count of one particular
    suffix, its score is ``(2*v - N) / sqrt(v*(N - v) + 1)``.

    Parameters
    ----------
    df : pandas.DataFrame
        Token table; column 0 is read as the word and column 3 as the NE tag.
    label : str
        NE tag whose tokens are collected.

    Returns
    -------
    list of [suffix, score]
        At most 50 entries, sorted by descending score.
    """
    words = df.iloc[:, 0].tolist()
    tags = df.iloc[:, 3].tolist()

    # word[-3:] equals the former word[-3:-1] + word[-1] for every non-empty
    # word and, unlike it, does not raise on an empty string.
    suffix_list = [word[-3:] for word, tag in zip(words, tags) if tag == label]
    total = len(suffix_list)
    counts = create_vocab(suffix_list)
    correlation_list = [
        [key, (val * 2 - total) / (val * (total - val) + 1) ** 0.5]
        for key, val in counts.items()
    ]
    return sorted(correlation_list, key=operator.itemgetter(1), reverse=True)[:50]

In [415]:
# Character-suffix rankings, one entry per entity tag found in the training data.
suffix_train = {}
for label in set(df_train['NE']):
    # 'O' and the '_space' separator are not entity tags; skip them.
    if label in ('O', '_space'):
        continue
    suffix_train[label] = NE_suffix(df_train, label)

In [416]:
# Display the ranked character suffixes stored for the 'I-LOC' tag.
suffix_train['I-LOC']

[['raq', -3.7826047922906163],
 ['ael', -3.9310970484186476],
 ['ria', -4.962694350081032],
 ['ain', -5.262019756089627],
 ['wan', -5.613835722137896],
 ['dad', -5.613835722137896],
 ['est', -5.613835722137896],
 ['ank', -5.613835722137896],
 ['any', -6.036161376351597],
 ['ina', -6.557109247067381],
 ['tan', -6.557109247067381],
 ['cus', -7.223568439694461],
 ['bya', -7.223568439694461],
 ['ran', -7.223568439694461],
 ['aza', -7.223568439694461],
 ['UAE', -7.223568439694461],
 ['nce', -8.121212121212121],
 ['DON', -8.121212121212121],
 ['cow', -8.121212121212121],
 ['har', -8.121212121212121],
 ['non', -8.121212121212121],
 ['pan', -8.121212121212121],
 ['pei', -9.428808992889305],
 ['ait', -9.428808992889305],
 ['LEM', -9.428808992889305],
 ['ton', -9.428808992889305],
 ['lan', -9.428808992889305],
 ['and', -9.428808992889305],
 ['DAD', -9.428808992889305],
 ['key', -9.428808992889305],
 ['lah', -9.428808992889305],
 ['Abu', -9.428808992889305],
 ['abi', -9.428808992889305],
 ['sia',

In [417]:
## 命名实体后缀列表：
def NE_word_suffix(df,label):
    """Rank the words that close an entity of the type named in ``label``.

    A token is collected when its tag is neither ``'O'`` nor ``'_space'``,
    the part of its tag after the first ``'-'`` equals the part of ``label``
    after the first ``'-'``, and either it is the last row or the next row's
    tag differs from ``label``.

    With N the number of collected words and v the count of one particular
    word, its score is ``(2*v - N) / sqrt(v*(N - v) + 1)``.

    Parameters
    ----------
    df : pandas.DataFrame
        Token table; column 0 is read as the word and column 3 as the NE tag.
    label : str
        Tag of the form ``PREFIX-TYPE`` (for example ``'I-PER'``).

    Returns
    -------
    list of [word, score]
        At most 50 entries, sorted by descending score.
    """
    # Positional column reads replace the per-row df.loc[i][k] lookups, which
    # depended on deprecated integer-as-position Series indexing.
    words = df.iloc[:, 0].tolist()
    tags = df.iloc[:, 3].tolist()

    new_label = label.split('-')
    last_row = len(tags) - 1
    NE_suffix_list = []
    for i, tag in enumerate(tags):
        if tag == 'O' or tag == '_space':
            continue
        if tag.split('-')[1] != new_label[1]:
            continue
        # The entity ends here when there is no next row or the next tag is
        # not a continuation carrying exactly `label`.
        if i == last_row or tags[i + 1] != label:
            NE_suffix_list.append(words[i])

    total = len(NE_suffix_list)
    counts = create_vocab(NE_suffix_list)
    correlation_list = [
        [key, (val * 2 - total) / (val * (total - val) + 1) ** 0.5]
        for key, val in counts.items()
    ]
    return sorted(correlation_list, key=operator.itemgetter(1), reverse=True)[:50]

In [418]:
# Entity-final word rankings; only tags other than 'O', '_space' and the four B-* tags get an entry.
NE_word_suffix_train = {}
for label in set(df_train['NE']):
    if label in ('O', '_space', 'B-LOC', 'B-ORG', 'B-PER', 'B-MISC'):
        continue
    NE_word_suffix_train[label] = NE_word_suffix(df_train, label)

In [419]:
# Display the ranked entity-final words stored for the 'I-PER' tag.
NE_word_suffix_train['I-PER']

[['Arafat', -3.202647210128267],
 ['Peres', -4.212676001067371],
 ['Fischler', -4.602654792369785],
 ['Zhirinovsky', -4.602654792369785],
 ['Hendrix', -5.0988796285533615],
 ['Netanyahu', -5.0988796285533615],
 ['Simitis', -5.763633318189896],
 ['Skandalidis', -5.763633318189896],
 ['Levy', -6.726681467091999],
 ['Saddam', -6.726681467091999],
 ['Hussein', -6.726681467091999],
 ['Zwingmann', -8.322966839023469],
 ['Jones', -8.322966839023469],
 ['Shubei', -8.322966839023469],
 ['Tang', -8.322966839023469],
 ['Kontogiannis', -8.322966839023469],
 ['Rabinovich', -8.322966839023469],
 ['Awdankiewicz', -8.322966839023469],
 ['Rdainah', -8.322966839023469],
 ['Rabbani', -8.322966839023469],
 ['Hariri', -8.322966839023469],
 ['Blackburn', -11.875503619084816],
 ['Pas', -11.875503619084816],
 ['Palacio', -11.875503619084816],
 ['Etchingham', -11.875503619084816],
 ['Chan', -11.875503619084816],
 ['Guofang', -11.875503619084816],
 ['Lien', -11.875503619084816],
 ['Dominion', -11.87550361908481

In [420]:
## 功能词汇（命名实体中出现的小写词汇）
def function_word(df,label):
    """Rank the lower-case tokens that carry the tag ``label``.

    A token is collected when its tag equals ``label`` and ``str.islower()``
    is true for it. With N the number of collected tokens and v the count of
    one particular token, its score is ``(2*v - N) / sqrt(v*(N - v) + 1)``.

    Parameters
    ----------
    df : pandas.DataFrame
        Token table; column 0 is read as the word and column 3 as the NE tag.
    label : str
        NE tag whose lower-case tokens are collected.

    Returns
    -------
    list of [word, score]
        At most 20 entries, sorted by descending score.
    """
    # Positional column reads replace the per-row df.loc[i][k] lookups, which
    # depended on deprecated integer-as-position Series indexing.
    words = df.iloc[:, 0].tolist()
    tags = df.iloc[:, 3].tolist()

    function_word_list = [
        word for word, tag in zip(words, tags)
        if tag == label and word.islower()
    ]
    total = len(function_word_list)
    counts = create_vocab(function_word_list)
    correlation_list = [
        [key, (val * 2 - total) / (val * (total - val) + 1) ** 0.5]
        for key, val in counts.items()
    ]
    return sorted(correlation_list, key=operator.itemgetter(1), reverse=True)[:20]
    


In [421]:
# Lower-case in-entity word rankings; only tags other than 'O', '_space' and the four B-* tags get an entry.
function_word_train = {}
for label in set(df_train['NE']):
    if label in ('O', '_space', 'B-LOC', 'B-ORG', 'B-PER', 'B-MISC'):
        continue
    function_word_train[label] = function_word(df_train, label)

Step3：局部特征选取

In [422]:
## POS tag
# Bidirectional POS-tag <-> integer-id maps.
# Ids 0 and 1 are reserved: 'unk' for unseen upper-case tags, 'punc' for
# every tag for which str.isupper() is false.
tag2index = {}
index2tag = {}
n = 2
tag2index['punc'] = 1
index2tag[1] = 'punc'
tag2index['unk'] = 0
index2tag[0] = 'unk'
# sorted() makes the id assignment reproducible: bare set iteration order
# for strings changes from one interpreter run to the next.
for item in sorted(set(df_train['POS'])):
    if item.isupper():
        tag2index[item] = n
        index2tag[n] = item
        n += 1
    
        

In [423]:
def POS_tagger(df):
    """Encode the ``'POS'`` column as integer ids using the module-level ``tag2index``.

    A tag present in ``tag2index`` maps to its id. Any other tag maps to
    ``tag2index['punc']`` when ``str.isupper()`` is false for it, and to
    ``tag2index['unk']`` otherwise.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a ``'POS'`` column of strings.

    Returns
    -------
    list of int
        One id per row, in row order.
    """
    tag_list = []
    for tag in df['POS']:
        # Direct dict membership; the key list is no longer rebuilt per token.
        if tag in tag2index:
            tag_list.append(tag2index[tag])
        elif not tag.isupper():
            tag_list.append(tag2index['punc'])
        else:
            tag_list.append(tag2index['unk'])
    return tag_list

In [424]:
# Integer POS ids for every training token.
POS_tag_train = POS_tagger(df_train)

In [425]:
# Integer POS ids for every test token (uses the ids built from the training tags).
POS_tag_test = POS_tagger(df_test)

In [426]:
## 句首词，大小写，文档的位置（四个参数：首字母大写与否、是不是句首词、全大写与否、文档的位置（头文件:1，标题:2，信息:3，文本:4））
def first_capital(df):
    """Flag tokens whose first character is upper-case.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a ``'word'`` column of strings.

    Returns
    -------
    list of int
        1 where the token's first character is upper-case, else 0. An empty
        token yields 0.
    """
    # word[:1] is '' for an empty token (and ''.isupper() is False), whereas
    # word[0] would raise IndexError.
    return [1 if word[:1].isupper() else 0 for word in df['word']]

In [427]:
def first_word(df):
    """Flag tokens that start a sentence.

    A token is flagged when it is the first row, or when the previous row's
    word is ``'_space'``, ``'.'``, ``'!'`` or ``'?'``.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a ``'word'`` column.

    Returns
    -------
    list of int
        1 for sentence-initial tokens, else 0; one entry per row.
    """
    # Work on a plain list so "previous row" is positional. The former
    # df['word'][i-1] was a label lookup and failed on any non-default index.
    words = df['word'].tolist()
    boundaries = ('_space', '.', '!', '?')
    return [
        1 if i == 0 or words[i - 1] in boundaries else 0
        for i in range(len(words))
    ]

In [428]:
def all_capitals(df):
    """Return, per row of ``df['word']``, 1 when ``str.isupper()`` holds for the token and 0 otherwise."""
    return [int(token.isupper()) for token in df['word']]
        

In [429]:
def document_locations(df,docs):
    """Tag every token with the section of its document that it falls in.

    The first three ``'_space'`` tokens of a document act as section
    boundaries. Positions up to and including the first boundary get 1, up
    to and including the second get 2, up to and including the third get 3,
    and everything after gets 4.

    When a document has fewer than three ``'_space'`` tokens, the missing
    boundaries repeat the last one found, so the sections that cannot be
    delimited are empty. A document with no ``'_space'`` token at all is
    tagged 4 throughout; previously this case never terminated.

    Parameters
    ----------
    df : pandas.DataFrame
        Token table whose rows are the concatenation of ``docs``. Only its
        length is used.
    docs : list of list of str
        Per-document token lists.

    Returns
    -------
    list of int or None
        One tag in {1, 2, 3, 4} per row of ``df``. If the number of produced
        tags differs from ``len(df)``, ``'error'`` is printed and ``None`` is
        returned.
    """
    tag_list = []
    total_rows = len(df)
    num_words = 0
    for doc in docs:
        split_points = [pos for pos, token in enumerate(doc) if token == '_space'][:3]
        # Pad to exactly three boundaries. -1 lies before every position, so
        # a document without any blank line lands entirely in section 4.
        while len(split_points) < 3:
            split_points.append(split_points[-1] if split_points else -1)
        first, second, third = split_points

        # Never produce tags for rows that df does not have.
        rows_here = max(0, min(len(doc), total_rows - num_words))
        for cw in range(rows_here):
            if cw <= first:
                tag_list.append(1)
            elif cw <= second:
                tag_list.append(2)
            elif cw <= third:
                tag_list.append(3)
            else:
                tag_list.append(4)
        num_words += len(doc)

    if len(tag_list) == len(df):
        return tag_list
    print('error')
    return None

In [430]:
# Capitalisation, sentence-start and document-section features for the training tokens.
first_cap_train = first_capital(df_train)
first_word_train = first_word(df_train)
all_capitals_train = all_capitals(df_train)
doc_loc_train = document_locations(df_train,docs_train)

In [431]:
# Capitalisation, sentence-start and document-section features for the test tokens.
first_cap_test = first_capital(df_test)
first_word_test = first_word(df_test)
all_capitals_test = all_capitals(df_test)
doc_loc_test = document_locations(df_test,docs_test)

In [432]:
## 语言符号信息 （主要包括. , / % $ -） 以及数字信息
def symbal_information(df):
    """Return, per row of ``df['word']``, 1 when the token contains any of ``. , / % $ -`` and 0 otherwise."""
    symbols = ('.', ',', '/', '%', '$', '-')
    flags = []
    for token in df['word']:
        has_symbol = any(symbol in token for symbol in symbols)
        flags.append(1 if has_symbol else 0)
    return flags

In [433]:
def number_information(df):
    """Flag tokens that contain at least one digit.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a ``'word'`` column of strings.

    Returns
    -------
    list of int
        1 where the token matches the regular expression ``\\d`` anywhere,
        else 0.
    """
    # Raw string: '\d' in a normal literal is an invalid escape sequence that
    # newer Python versions warn about. The pattern itself is unchanged.
    has_digit = re.compile(r'\d').search
    return [1 if has_digit(word) is not None else 0 for word in df['word']]

In [434]:
# Punctuation-symbol and digit flags for the training tokens.
symbal_infor_train = symbal_information(df_train)
number_infor_train = number_information(df_train)

In [435]:
# Punctuation-symbol and digit flags for the test tokens.
symbal_infor_test = symbal_information(df_test)
number_infor_test = number_information(df_test)

In [436]:
## 引号，括号信息
def filled_information(df):
    """Count, per token, how many bracket/quote spans are open at that token.

    Five states are tracked while scanning ``df['word']`` in order: round,
    square and curly brackets (set to 1 by the opening token, reset to 0 by
    the closing token) and single and double quotes (toggled by every ``'``
    or ``"`` token). The state is updated before the token's value is
    recorded, and it is never reset between sentences or documents.

    Returns
    -------
    list of int
        Sum of the five states (0 to 5) for each row.
    """
    opening = {'(': 'round', '[': 'square', '{': 'curly'}
    closing = {')': 'round', ']': 'square', '}': 'curly'}
    toggling = {"'": 'single', '"': 'double'}
    state = {'round': 0, 'square': 0, 'curly': 0, 'single': 0, 'double': 0}

    tag_list = []
    for token in df['word']:
        if token in opening:
            state[opening[token]] = 1
        elif token in closing:
            state[closing[token]] = 0
        elif token in toggling:
            kind = toggling[token]
            state[kind] = 1 - state[kind]
        tag_list.append(sum(state.values()))
    return tag_list
        
    
    

In [437]:
# Open bracket/quote counts for the training tokens.
fil_infor_train = filled_information(df_train)

In [438]:
# Open bracket/quote counts for the test tokens.
fil_infor_test = filled_information(df_test)

In [439]:
## 常用词列表
def common_word(df):
    """Flag tokens that are NOT in the module-level ``common_vocab``.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a ``'word'`` column.

    Returns
    -------
    list of int
        1 where the token is absent from ``common_vocab``, 0 where it is
        present.
    """
    # NOTE(review): despite the function name, 1 marks the uncommon words.
    # The polarity is kept as-is; confirm it is intended.
    # Convert once to a set: the list scan per token was O(len(common_vocab)).
    known = set(common_vocab)
    return [0 if word in known else 1 for word in df['word']]

In [440]:
# Not-in-common-vocabulary flags for the training tokens.
common_word_train =  common_word(df_train)

In [441]:
# Not-in-common-vocabulary flags for the test tokens.
common_word_test =  common_word(df_test)

In [442]:
## 二元特征提取
def bi_feature(df,label):
    """Flag tokens preceded by a bigram listed for entity type ``label``.

    The bigrams are the first elements of the entries in the module-level
    ``bi_co_train['I-' + label]``.

    The feature depends only on the two preceding words. The previous
    version also required the row's own ``'NE'`` tag to be of type
    ``label``; that made the feature a function of the prediction target,
    which leaks the gold labels when applied to held-out data.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a ``'word'`` column.
    label : str
        Entity type without prefix, e.g. ``'LOC'``.

    Returns
    -------
    list of int
        Exactly one entry per row: 1 where the two preceding words form a
        listed bigram, else 0. The first two rows are always 0.
    """
    words = df['word'].tolist()
    # Build the lookup once instead of rebuilding a list for every row.
    listed = {pair for pair, _ in bi_co_train['I-' + label]}
    return [
        1 if i >= 2 and (words[i - 2], words[i - 1]) in listed else 0
        for i in range(len(words))
    ]
        

In [443]:
# Preceding-bigram feature for the training tokens, one column per entity type.
bi_LOC_train, bi_PER_train, bi_ORG_train, bi_MISC_train = (
    bi_feature(df_train, entity_type)
    for entity_type in ('LOC', 'PER', 'ORG', 'MISC')
)

In [444]:
# Preceding-bigram feature for the test tokens, one column per entity type.
bi_LOC_test, bi_PER_test, bi_ORG_test, bi_MISC_test = (
    bi_feature(df_test, entity_type)
    for entity_type in ('LOC', 'PER', 'ORG', 'MISC')
)

In [445]:
## 词后缀特征
def suffix_feature(df,label):
    """Flag tokens whose last (up to) three characters are a listed suffix.

    The suffixes are the first elements of the entries in the module-level
    ``suffix_train['I-' + label]``.

    The feature depends only on the token text. The previous version also
    required the row's own ``'NE'`` tag to be of type ``label``; that made
    the feature a function of the prediction target, which leaks the gold
    labels when applied to held-out data.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a ``'word'`` column of strings.
    label : str
        Entity type without prefix, e.g. ``'LOC'``.

    Returns
    -------
    list of int
        One entry per row: 1 where ``word[-3:]`` is a listed suffix, else 0.
    """
    # Build the lookup once instead of rebuilding a list for every row.
    listed = {suffix for suffix, _ in suffix_train['I-' + label]}
    # word[-3:] equals the former word[-3:-1] + word[-1] for non-empty words
    # and does not raise on an empty one.
    return [1 if word[-3:] in listed else 0 for word in df['word']]

In [446]:
# Character-suffix feature for the training tokens, one column per entity type.
suffix_LOC_train, suffix_PER_train, suffix_ORG_train, suffix_MISC_train = (
    suffix_feature(df_train, entity_type)
    for entity_type in ('LOC', 'PER', 'ORG', 'MISC')
)

In [447]:
# Character-suffix feature for the test tokens, one column per entity type.
suffix_LOC_test, suffix_PER_test, suffix_ORG_test, suffix_MISC_test = (
    suffix_feature(df_test, entity_type)
    for entity_type in ('LOC', 'PER', 'ORG', 'MISC')
)

In [448]:
## 类别后缀特征(一元特征)
def c_suffix_feature(df,label):
    """Flag a token when it and the following token both start upper-case and
    the following token is tagged as an entity of type ``label``.

    Row ``i`` gets 1 when the tag of row ``i+1`` is neither ``'O'`` nor
    ``'_space'``, the part of that tag after the first ``'-'`` equals
    ``label``, and the first characters of words ``i+1`` and ``i`` are both
    upper-case. A trailing 0 is appended for the last row.

    NOTE(review): the value depends on the gold ``'NE'`` column of ``df``,
    so on held-out data this feature is computed from the labels being
    predicted. Confirm whether that is intended.

    Returns
    -------
    list of int
        ``len(df)`` entries for a non-empty frame.
    """
    ne_tags = df['NE']
    words = df['word']
    flags = []
    for pos in range(len(df) - 1):
        next_tag = ne_tags[pos + 1]
        hit = (
            next_tag != 'O'
            and next_tag != '_space'
            and next_tag.split('-')[1] == label
            and words[pos + 1][0].isupper()
            and words[pos][0].isupper()
        )
        flags.append(1 if hit else 0)
    flags.append(0)
    return flags

In [449]:
# Capitalised-pair feature for the training tokens, one column per entity type.
csuffix_LOC_train, csuffix_PER_train, csuffix_ORG_train, csuffix_MISC_train = (
    c_suffix_feature(df_train, entity_type)
    for entity_type in ('LOC', 'PER', 'ORG', 'MISC')
)

In [450]:
# Capitalised-pair feature for the test tokens, one column per entity type.
csuffix_LOC_test, csuffix_PER_test, csuffix_ORG_test, csuffix_MISC_test = (
    c_suffix_feature(df_test, entity_type)
    for entity_type in ('LOC', 'PER', 'ORG', 'MISC')
)

In [451]:
## 功能词汇特征
def functional_word_train(df,label):
    """Flag tokens that appear in the function-word list of ``label``.

    The list consists of the first elements of the entries in the
    module-level ``function_word_train[label]``.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a ``'word'`` column.
    label : str
        Key into ``function_word_train``, e.g. ``'I-LOC'``.

    Returns
    -------
    list of int
        1 where the token is in the list, else 0; one entry per row.
    """
    # Build the lookup once instead of rebuilding a list for every token.
    listed = {word for word, _ in function_word_train[label]}
    return [1 if word in listed else 0 for word in df['word']]

In [452]:
# Function-word feature for the training tokens, one column per entity tag.
functional_LOC_train, functional_PER_train, functional_ORG_train, functional_MISC_train = (
    functional_word_train(df_train, entity_tag)
    for entity_tag in ('I-LOC', 'I-PER', 'I-ORG', 'I-MISC')
)

In [453]:
# Function-word feature for the test tokens, one column per entity tag.
functional_LOC_test, functional_PER_test, functional_ORG_test, functional_MISC_test = (
    functional_word_train(df_test, entity_tag)
    for entity_tag in ('I-LOC', 'I-PER', 'I-ORG', 'I-MISC')
)

Step4：全局特征选取

In [454]:
## 全局一元特征 
def global_uni_feature(docs,label):
    """Flag tokens for which ANOTHER occurrence in the same document is
    directly preceded by a listed word.

    The listed words are the first elements of the entries in the
    module-level ``uni_co_train[label]``.

    Parameters
    ----------
    docs : list of list of str
        Per-document token lists.
    label : str
        Key into ``uni_co_train``, e.g. ``'I-LOC'``.

    Returns
    -------
    list of int
        One flag per token, documents concatenated in order.
    """
    listed = {word for word, _ in uni_co_train[label]}
    tag_list = []
    for doc in docs:
        # preceded[p] == 1 when the word right before position p is listed.
        # Position 0 has no predecessor; the earlier doc[p-1] lookup wrapped
        # around to the document's last token there.
        preceded = [
            1 if pos > 0 and doc[pos - 1] in listed else 0
            for pos in range(len(doc))
        ]
        # Number of "preceded" occurrences per distinct token.
        hits = {}
        for token, flag in zip(doc, preceded):
            hits[token] = hits.get(token, 0) + flag
        # Subtract the token's own flag so only OTHER occurrences count.
        for token, flag in zip(doc, preceded):
            tag_list.append(1 if hits[token] - flag > 0 else 0)
    return tag_list

In [455]:
# Document-level preceding-word feature for the training tokens, one column per entity tag.
guni_LOC_train, guni_PER_train, guni_ORG_train, guni_MISC_train = (
    global_uni_feature(docs_train, entity_tag)
    for entity_tag in ('I-LOC', 'I-PER', 'I-ORG', 'I-MISC')
)

In [456]:
# Document-level preceding-word feature for the test tokens, one column per entity tag.
guni_LOC_test, guni_PER_test, guni_ORG_test, guni_MISC_test = (
    global_uni_feature(docs_test, entity_tag)
    for entity_tag in ('I-LOC', 'I-PER', 'I-ORG', 'I-MISC')
)

In [457]:
## 全局二元特征 
def global_bi_feature(docs,label):
    """Flag tokens for which ANOTHER occurrence in the same document is
    directly preceded by a listed bigram.

    The listed bigrams are the first elements of the entries in the
    module-level ``bi_co_train[label]``.

    Parameters
    ----------
    docs : list of list of str
        Per-document token lists.
    label : str
        Key into ``bi_co_train``, e.g. ``'I-LOC'``.

    Returns
    -------
    list of int
        One flag per token, documents concatenated in order.
    """
    listed = {pair for pair, _ in bi_co_train[label]}
    tag_list = []
    for doc in docs:
        # preceded[p] == 1 when the two words before position p form a listed
        # bigram. Positions 0 and 1 lack a full left context; the earlier
        # doc[p-2] / doc[p-1] lookups wrapped around to the document's end.
        preceded = [
            1 if pos >= 2 and (doc[pos - 2], doc[pos - 1]) in listed else 0
            for pos in range(len(doc))
        ]
        # Number of "preceded" occurrences per distinct token.
        hits = {}
        for token, flag in zip(doc, preceded):
            hits[token] = hits.get(token, 0) + flag
        # Subtract the token's own flag so only OTHER occurrences count.
        for token, flag in zip(doc, preceded):
            tag_list.append(1 if hits[token] - flag > 0 else 0)
    return tag_list

In [458]:
# Document-level preceding-bigram feature for the training tokens, one column per entity tag.
gbi_LOC_train, gbi_PER_train, gbi_ORG_train, gbi_MISC_train = (
    global_bi_feature(docs_train, entity_tag)
    for entity_tag in ('I-LOC', 'I-PER', 'I-ORG', 'I-MISC')
)

In [459]:
# Document-level preceding-bigram feature for the test tokens, one column per entity tag.
gbi_LOC_test, gbi_PER_test, gbi_ORG_test, gbi_MISC_test = (
    global_bi_feature(docs_test, entity_tag)
    for entity_tag in ('I-LOC', 'I-PER', 'I-ORG', 'I-MISC')
)

In [460]:
## 全局词后缀特征 
def global_csuffix_feature(docs,label,csuffix):
    """Flag tokens for which ANOTHER occurrence in the same document has its
    ``csuffix`` flag set.

    Parameters
    ----------
    docs : list of list of str
        Per-document token lists.
    label : str
        Not used by the computation; kept for a signature parallel to the
        other global feature functions.
    csuffix : sequence of int
        Per-token flags aligned with the concatenation of ``docs``.

    Returns
    -------
    list of int
        One flag per token, documents concatenated in order.

    Raises
    ------
    IndexError
        If ``csuffix`` does not cover the position of a repeated token.
    """
    tag_list = []
    offset = 0  # index of the current document's first token within csuffix
    for doc in docs:
        # Group positions by token once, instead of rescanning the whole
        # document for every token.
        positions = {}
        for pos, token in enumerate(doc):
            positions.setdefault(token, []).append(pos)

        doc_tags = [0] * len(doc)
        for occurrences in positions.values():
            if len(occurrences) < 2:
                continue  # no other occurrence to look at
            flagged = {pos for pos in occurrences if csuffix[offset + pos] == 1}
            for pos in occurrences:
                # Exclude the token's own flag: only other occurrences count.
                if len(flagged) - (1 if pos in flagged else 0) > 0:
                    doc_tags[pos] = 1
        tag_list.extend(doc_tags)
        offset += len(doc)
    return tag_list

In [461]:
# Document-level version of the capitalised-pair feature for the training tokens.
gcsuf_LOC_train, gcsuf_PER_train, gcsuf_ORG_train, gcsuf_MISC_train = (
    global_csuffix_feature(docs_train, entity_tag, local_flags)
    for entity_tag, local_flags in (
        ('I-LOC', csuffix_LOC_train),
        ('I-PER', csuffix_PER_train),
        ('I-ORG', csuffix_ORG_train),
        ('I-MISC', csuffix_MISC_train),
    )
)

In [462]:
# Document-level version of the capitalised-pair feature for the test tokens.
gcsuf_LOC_test, gcsuf_PER_test, gcsuf_ORG_test, gcsuf_MISC_test = (
    global_csuffix_feature(docs_test, entity_tag, local_flags)
    for entity_tag, local_flags in (
        ('I-LOC', csuffix_LOC_test),
        ('I-PER', csuffix_PER_test),
        ('I-ORG', csuffix_ORG_test),
        ('I-MISC', csuffix_MISC_test),
    )
)

Step5：训练CRF模型

In [463]:
# Feature matrices as lists of per-token columns. Both lists must keep the
# same column order so the train and test features line up.
train_input = [
    # token-level features
    POS_tag_train, symbal_infor_train, number_infor_train,
    first_cap_train, first_word_train, all_capitals_train,
    doc_loc_train, fil_infor_train, common_word_train,
    # local per-entity features
    bi_LOC_train, bi_PER_train, bi_ORG_train, bi_MISC_train,
    suffix_LOC_train, suffix_PER_train, suffix_ORG_train, suffix_MISC_train,
    csuffix_LOC_train, csuffix_PER_train, csuffix_ORG_train, csuffix_MISC_train,
    functional_LOC_train, functional_PER_train, functional_ORG_train, functional_MISC_train,
    # document-level (global) per-entity features
    guni_LOC_train, guni_PER_train, guni_ORG_train, guni_MISC_train,
    gbi_LOC_train, gbi_PER_train, gbi_ORG_train, gbi_MISC_train,
    gcsuf_LOC_train, gcsuf_PER_train, gcsuf_ORG_train, gcsuf_MISC_train,
]

test_input = [
    # token-level features
    POS_tag_test, symbal_infor_test, number_infor_test,
    first_cap_test, first_word_test, all_capitals_test,
    doc_loc_test, fil_infor_test, common_word_test,
    # local per-entity features
    bi_LOC_test, bi_PER_test, bi_ORG_test, bi_MISC_test,
    suffix_LOC_test, suffix_PER_test, suffix_ORG_test, suffix_MISC_test,
    csuffix_LOC_test, csuffix_PER_test, csuffix_ORG_test, csuffix_MISC_test,
    functional_LOC_test, functional_PER_test, functional_ORG_test, functional_MISC_test,
    # document-level (global) per-entity features
    guni_LOC_test, guni_PER_test, guni_ORG_test, guni_MISC_test,
    gbi_LOC_test, gbi_PER_test, gbi_ORG_test, gbi_MISC_test,
    gcsuf_LOC_test, gcsuf_PER_test, gcsuf_ORG_test, gcsuf_MISC_test,
]

In [464]:
## 标签
# Integer id of every NE tag, including the '_space' separator pseudo-tag.
NE_list = {
    'O': 0,
    '_space': 1,
    'B-LOC': 2,
    'I-LOC': 3,
    'B-PER': 4,
    'I-PER': 5,
    'B-ORG': 6,
    'I-ORG': 7,
    'B-MISC': 8,
    'I-MISC': 9,
}
def output():
    """Encode the ``'NE'`` columns of the module-level ``df_train`` and
    ``df_test`` as integer ids via ``NE_list``.

    Returns
    -------
    (list of int, list of int)
        Training ids and test ids, in row order. A tag missing from
        ``NE_list`` raises ``KeyError``.
    """
    NE_train = [NE_list[tag] for tag in df_train['NE']]
    NE_test = [NE_list[tag_t] for tag_t in df_test['NE']]
    return NE_train, NE_test

In [465]:
# Integer-encoded NE targets for the training and test tokens.
y_train, y_test = output()

In [None]:
## 训练CRF模型
