In [212]:
import tensorflow as tf
from gensim.models import word2vec, KeyedVectors
import codecs
import os
import numpy as np
import sys
import random
# from tensorflow.keras.preprocessing.text import Tokenizer
# from tensorflow.keras.preprocessing.sequence import pad_sequences
# from tensorflow.keras.utils import to_categorical 
# from tensorflow.keras.models import Sequential
# from tensorflow.keras.layers import Embedding, LSTM, Dense, Activation
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.utils import to_categorical 
from keras.models import Sequential
from keras.layers import Embedding, LSTM, Dense, Activation
from keras.models import load_model
from keras_contrib.layers.crf import CRF

In [213]:
# 1. Data preprocessing: paths and hyper-parameters shared by the cells below.
# Raw strings keep the Windows backslashes literal; the plain-string form only
# worked because "\P" and "\B" are not recognised escapes (a deprecated usage).
# BASE_DIR = r'F:\PythonProject\BiLSTM-CRF-4-NER'  # my DELL
BASE_DIR = r'E:\PycharmProjects\BiLSTM-CRF-4-NER'  # LAB
TRAIN_TEXT_DIR = BASE_DIR + '/data/train'  # opened as a single file by make_list
TEST_TEXT_DIR = BASE_DIR + '/data/test'  # opened as a single file by make_test_data
MAX_SEQUENCE_LENGTH = 100  # maximum number of characters kept per text
MAX_NB_WORDS = 20000  # maximum number of unique characters across all texts
VALIDATION_SPLIT = 0.2  # fraction of the samples held out for validation
batch_size = 32  # NOTE(review): model.fit below passes batch_size=128 literally, not this value
labels_index = {'B-nr':1, 'B-ns':2, 'B-nt':3, 'I-nr':4, 'I-ns':5, 'I-nt':6, 'O':0}  # tag name -> id

In [214]:
# 1. Load the pretrained character-level word2vec model (character -> vector).
char_model = word2vec.Word2Vec.load("char_model.model")
# Dimensionality of each character vector; reused as the embedding width below.
word_vector_size = char_model.wv.vector_size
# Number of characters known to the pretrained model.
vocab_size = len(char_model.wv.vocab) # 4767

In [215]:
vocab_size

4767

In [216]:
char_model.wv.get_keras_embedding

<bound method Word2VecKeyedVectors.get_keras_embedding of <gensim.models.keyedvectors.Word2VecKeyedVectors object at 0x00000190CD219A20>>

In [217]:
# 2. 预备文本和标签
def make_list(file_path):
    """Read a tagged corpus into parallel per-sentence lists.

    The file holds one ``<char>\\t<label>`` pair per line; a blank line ends
    the current sentence.

    :param str file_path: path of the UTF-8 corpus file.
    :return: ``(texts, labels)`` where ``texts[k]`` is the list of characters
        of sentence ``k`` and ``labels[k]`` is the list of integer label ids
        (looked up in ``labels_index``) of the same length.
    """
    texts = []
    labels = []
    sentence = []
    sentence_labels = []
    # Unknown labels fall back to the integer id of 'O' (the original fell
    # back to the *string* "0", mixing types inside the label lists).
    default_id = labels_index.get('O', 0)
    # `with` guarantees the handle is closed even if a line is malformed.
    with open(file_path, encoding='utf8') as f:
        for line in f:
            if line != "\n":
                values = line.split("\t")
                char = values[0]
                label = values[1].replace("\n", "")
                sentence.append(char)
                sentence_labels.append(labels_index.get(label, default_id))
            else:
                texts.append(sentence)
                labels.append(sentence_labels)
                sentence = []
                sentence_labels = []
    # Flush the last sentence when the file does not end with a blank line;
    # previously that sentence was silently dropped.
    if sentence:
        texts.append(sentence)
        labels.append(sentence_labels)
    return texts, labels

In [218]:
# Read the training corpus: per-sentence character lists and the matching label-id lists.
texts, labels = make_list(TRAIN_TEXT_DIR)

In [219]:
len(labels[0]),len(texts[0])

(50, 50)

In [220]:
# 2. Build the character table: a character -> integer id dictionary.
# The vocabulary cap is the smaller of the pretrained vocabulary size and MAX_NB_WORDS.
tokenizer = Tokenizer(num_words = min(vocab_size,MAX_NB_WORDS))
# Learn the character -> id mapping from the training sentences.
tokenizer.fit_on_texts(texts)
# Replace every character of every training sentence by its id.
sequences = tokenizer.texts_to_sequences(texts)
word_index = tokenizer.word_index # 4652 distinct characters in the recorded run

In [221]:
# len(word_index) # 4652

In [222]:
# len(sequences[0]),len(sequences[1])

In [223]:
labels[1]

[0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0]

In [224]:
# Pad/truncate every id sequence and label sequence to MAX_SEQUENCE_LENGTH.
# Padding uses the value 0, which is also the id of the 'O' tag in labels_index,
# so padded positions are labelled 'O'.
data = pad_sequences(sequences, maxlen=MAX_SEQUENCE_LENGTH)
labels = pad_sequences(labels, maxlen=MAX_SEQUENCE_LENGTH)
# One-hot encode the label ids. The width is pinned to the size of the tag set;
# without num_classes it is inferred from the largest id present, which would
# yield a tensor narrower than the model output if a tag never occurs in the data.
labels = to_categorical(np.asarray(labels), num_classes=len(labels_index))
print('Shape of data tensor:', data.shape)
print('Shape of label tensor:', labels.shape)

Shape of data tensor: (46364, 100)
Shape of label tensor: (46364, 100, 7)


In [225]:
len(labels[0]),len(data[0])

(100, 100)

In [226]:
# Split the data into a training set and a validation set.
# The shuffle uses its own seeded generator so that the train/validation split
# is identical after every kernel restart; the unseeded global shuffle produced
# a different split (and different metrics) on every run.
SPLIT_SEED = 42
indices = np.arange(data.shape[0])
np.random.RandomState(SPLIT_SEED).shuffle(indices)
# Apply the same permutation to inputs and labels to keep them aligned.
data = data[indices]
labels = labels[indices]
# Number of trailing samples reserved for validation.
nb_validation_samples = int(VALIDATION_SPLIT * data.shape[0])

In [227]:
# Training set: everything except the last nb_validation_samples rows.
# A positive stop index is used because data[:-0] would be an empty slice
# when nb_validation_samples is 0.
x_train = data[:data.shape[0] - nb_validation_samples]
y_train = labels[:labels.shape[0] - nb_validation_samples]

In [228]:
x_train.shape

(37092, 100)

In [229]:
y_train.shape

(37092, 100, 7)

In [230]:
# Validation set: the last nb_validation_samples rows.
# A positive start index is used because data[-0:] would be the whole array
# (not an empty one) when nb_validation_samples is 0.
x_val = data[data.shape[0] - nb_validation_samples:]
y_val = labels[labels.shape[0] - nb_validation_samples:]

In [231]:
len(x_val),len(y_val)

(9272, 9272)

In [232]:
# Build embedding_matrix: an id -> vector table whose row i holds the pretrained
# vector of the character with tokenizer id i.
nb_words = min(MAX_NB_WORDS, len(word_index))
# One extra row so that ids up to nb_words fit; id 0 (the padding value) keeps
# an all-zero row.
embedding_matrix = np.zeros((nb_words+1, word_vector_size))
for word, i in word_index.items():
    if i > MAX_NB_WORDS:
        continue
    try:
        embedding_matrix[i] = char_model.wv.get_vector(word)
    except KeyError:
        # Character unknown to the pretrained model: leave its row all-zero.
        # (The original assigned the fallback to a misspelled variable, so the
        # row received the previous character's vector instead.)
        continue
print(embedding_matrix.shape)

(4653, 100)


In [233]:
# Model definition: frozen pretrained embedding -> LSTM -> Dense -> Dense(softmax) -> CRF.
# NOTE(review): the LSTM is unidirectional although the saved weights file is
# named 'BiLSTM_CRF.h5', and the CRF is fed softmax outputs — confirm both are intended.
embedding_layer = Embedding(
    input_dim=nb_words+1,
    output_dim=word_vector_size,
    weights=[embedding_matrix],   # initialise from the pretrained vectors
    input_length=MAX_SEQUENCE_LENGTH,
    trainable=False,              # keep the pretrained vectors fixed
)
# Kept as a named layer because compile() needs its loss and accuracy functions.
crf_layer = CRF(len(labels_index))
model = Sequential([
    embedding_layer,
    LSTM(100, dropout=0.2, return_sequences=True),  # one output per time step
    Dense(100, activation='sigmoid'),
    Dense(len(labels_index), activation='softmax'),
    crf_layer,
])

In [234]:
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_3 (Embedding)      (None, 100, 100)          465300    
_________________________________________________________________
lstm_3 (LSTM)                (None, 100, 100)          80400     
_________________________________________________________________
dense_5 (Dense)              (None, 100, 100)          10100     
_________________________________________________________________
dense_6 (Dense)              (None, 100, 7)            707       
_________________________________________________________________
crf_7 (CRF)                  (None, 100, 7)            119       
Total params: 556,626
Trainable params: 91,326
Non-trainable params: 465,300
_________________________________________________________________


In [235]:
# 6. Training
# 1. Compile with the CRF layer's own loss and accuracy.
#    (An earlier setup — categorical_crossentropy + adam + 'accuracy' — is no longer used.)
model.compile(
    optimizer='rmsprop',
    loss=crf_layer.loss_function,
    metrics=[crf_layer.accuracy],
)
# 2. Fit, evaluating on the held-out validation split after every epoch.
model.fit(
    x=x_train,
    y=y_train,
    batch_size=128,
    epochs=100,
    validation_data=(x_val, y_val),
)

Train on 37092 samples, validate on 9272 samples
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100


Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78/100
Epoch 79/100
Epoch 80/100
Epoch 81/100
Epoch 82/100
Epoch 83/100
Epoch 84/100
Epoch 85/100
Epoch 86/100
Epoch 87/100
Epoch 88/100
Epoch 89/100
Epoch 90/100
Epoch 91/100
Epoch 92/100
Epoch 93/100
Epoch 94/100
Epoch 95/100
Epoch 96/100
Epoch 97/100
Epoch 98/100
Epoch 99/100
Epoch 100/100


<keras.callbacks.History at 0x190cee33c18>

In [236]:
# Persist the trained weights with the keras-contrib helper.
# NOTE(review): this import sits mid-notebook; consider moving it to the import cell.
from keras_contrib.utils import save_load_utils
# NOTE(review): the file name says "BiLSTM" but the model above uses a plain LSTM layer.
filename = 'BiLSTM_CRF.h5'
save_load_utils.save_all_weights(model,filename)

In [246]:
len(x_val), len(x_train)

(9272, 37092)

In [244]:
model.evaluate(x_val, y_val)



[-0.22058069649392723, 0.9883930103546387]

In [None]:
# Predict tags for the validation inputs (this cell has no recorded execution count).
y = model.predict(x_val)

In [130]:
def make_test_data(input_file):
    """Read an untagged text file into per-line character lists.

    :param str input_file: path of the UTF-8 text file, one sentence per line.
    :return: a list with one entry per line, each entry being the list of that
        line's characters. Line terminators are not included, so ``len`` of an
        entry is the real sentence length (the original kept the trailing
        newline as an extra token). A blank line yields an empty list.
    """
    texts = []
    # `with` closes the handle; the original never closed it.
    with codecs.open(input_file, 'r', 'utf-8') as input_data:
        for line in input_data:
            # codecs.open does not translate newlines, so strip both CR and LF.
            texts.append(list(line.rstrip('\r\n')))
    return texts

In [131]:
# Read the test file into per-line character lists.
test_texts = make_test_data(TEST_TEXT_DIR)

In [133]:
# Convert the test characters to ids with the tokenizer fitted on the training texts.
test_sequences = tokenizer.texts_to_sequences(test_texts)

In [148]:
test_sequences[1]

array([  0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
         0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
         0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
         0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
         0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
         0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
         0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0, 172,
       167, 127, 126,   5, 172, 167,  72, 403, 135])

In [137]:
# Pad/truncate every test sequence to MAX_SEQUENCE_LENGTH (same call as used for the training data).
test_sequences = pad_sequences(test_sequences, maxlen=MAX_SEQUENCE_LENGTH)

In [138]:
len(test_sequences[0])

100

In [139]:
# Run the trained model on the padded test sequences.
test_predict = model.predict(test_sequences)

In [149]:
len(test_predict[32])

100

In [150]:
test_predict[32]

array([[1., 0., 0., 0., 0., 0., 0.],
       [1., 0., 0., 0., 0., 0., 0.],
       [1., 0., 0., 0., 0., 0., 0.],
       [1., 0., 0., 0., 0., 0., 0.],
       [1., 0., 0., 0., 0., 0., 0.],
       [1., 0., 0., 0., 0., 0., 0.],
       [1., 0., 0., 0., 0., 0., 0.],
       [1., 0., 0., 0., 0., 0., 0.],
       [1., 0., 0., 0., 0., 0., 0.],
       [1., 0., 0., 0., 0., 0., 0.],
       [1., 0., 0., 0., 0., 0., 0.],
       [1., 0., 0., 0., 0., 0., 0.],
       [1., 0., 0., 0., 0., 0., 0.],
       [1., 0., 0., 0., 0., 0., 0.],
       [1., 0., 0., 0., 0., 0., 0.],
       [1., 0., 0., 0., 0., 0., 0.],
       [1., 0., 0., 0., 0., 0., 0.],
       [1., 0., 0., 0., 0., 0., 0.],
       [1., 0., 0., 0., 0., 0., 0.],
       [1., 0., 0., 0., 0., 0., 0.],
       [1., 0., 0., 0., 0., 0., 0.],
       [1., 0., 0., 0., 0., 0., 0.],
       [1., 0., 0., 0., 0., 0., 0.],
       [1., 0., 0., 0., 0., 0., 0.],
       [1., 0., 0., 0., 0., 0., 0.],
       [1., 0., 0., 0., 0., 0., 0.],
       [1., 0., 0., 0., 0., 0., 0.],
 

In [258]:
# Single demo sentence, split into characters and wrapped as a one-sentence batch.
# NOTE(review): this rebinds `texts`, which earlier held the training sentences.
line = "本报波士顿１１月１日电记者陈特安、李云飞报道：江泽民主席一行今天上午乘专机从纽约抵达波士顿访问。"
words = list(line)
texts = [words]

In [259]:
texts

[['本',
  '报',
  '波',
  '士',
  '顿',
  '１',
  '１',
  '月',
  '１',
  '日',
  '电',
  '记',
  '者',
  '陈',
  '特',
  '安',
  '、',
  '李',
  '云',
  '飞',
  '报',
  '道',
  '：',
  '江',
  '泽',
  '民',
  '主',
  '席',
  '一',
  '行',
  '今',
  '天',
  '上',
  '午',
  '乘',
  '专',
  '机',
  '从',
  '纽',
  '约',
  '抵',
  '达',
  '波',
  '士',
  '顿',
  '访',
  '问',
  '。']]

In [249]:
# Convert the demo sentence's characters to ids with the fitted tokenizer.
test_sequences = tokenizer.texts_to_sequences(texts)

In [250]:
test_sequences

[[87,
  109,
  786,
  486,
  1194,
  14,
  14,
  91,
  14,
  33,
  136,
  197,
  89,
  789,
  193,
  251,
  5,
  518,
  492,
  729,
  109,
  199,
  135,
  465,
  813,
  57,
  60,
  504,
  4,
  39,
  130,
  98,
  22,
  731,
  1270,
  304,
  113,
  120,
  1234,
  476,
  1225,
  259,
  786,
  486,
  1194,
  400,
  121,
  3]]

In [251]:
# Pad the demo sequence to MAX_SEQUENCE_LENGTH; the recorded output shows the zeros placed in front.
test_sequences = pad_sequences(test_sequences, maxlen=MAX_SEQUENCE_LENGTH)

In [252]:
test_sequences

array([[   0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,   87,  109,  786,
         486, 1194,   14,   14,   91,   14,   33,  136,  197,   89,  789,
         193,  251,    5,  518,  492,  729,  109,  199,  135,  465,  813,
          57,   60,  504,    4,   39,  130,   98,   22,  731, 1270,  304,
         113,  120, 1234,  476, 1225,  259,  786,  486, 1194,  400,  121,
           3]])

In [253]:
# Predict tags for the padded demo sentence.
# NOTE(review): a later cell repeats this exact call and stores it as test_predict.
y_test = model.predict(test_sequences)

In [254]:
y_test

array([[[1., 0., 0., 0., 0., 0., 0.],
        [1., 0., 0., 0., 0., 0., 0.],
        [1., 0., 0., 0., 0., 0., 0.],
        [1., 0., 0., 0., 0., 0., 0.],
        [1., 0., 0., 0., 0., 0., 0.],
        [1., 0., 0., 0., 0., 0., 0.],
        [1., 0., 0., 0., 0., 0., 0.],
        [1., 0., 0., 0., 0., 0., 0.],
        [1., 0., 0., 0., 0., 0., 0.],
        [1., 0., 0., 0., 0., 0., 0.],
        [1., 0., 0., 0., 0., 0., 0.],
        [1., 0., 0., 0., 0., 0., 0.],
        [1., 0., 0., 0., 0., 0., 0.],
        [1., 0., 0., 0., 0., 0., 0.],
        [1., 0., 0., 0., 0., 0., 0.],
        [1., 0., 0., 0., 0., 0., 0.],
        [1., 0., 0., 0., 0., 0., 0.],
        [1., 0., 0., 0., 0., 0., 0.],
        [1., 0., 0., 0., 0., 0., 0.],
        [1., 0., 0., 0., 0., 0., 0.],
        [1., 0., 0., 0., 0., 0., 0.],
        [1., 0., 0., 0., 0., 0., 0.],
        [1., 0., 0., 0., 0., 0., 0.],
        [1., 0., 0., 0., 0., 0., 0.],
        [1., 0., 0., 0., 0., 0., 0.],
        [1., 0., 0., 0., 0., 0., 0.],
        [1.,

In [255]:
labels_index

{'B-nr': 1, 'B-ns': 2, 'B-nt': 3, 'I-nr': 4, 'I-ns': 5, 'I-nt': 6, 'O': 0}

In [187]:
# NOTE(review): scratch cell. It rebinds y_val — assigned earlier as the validation
# label slice — to a plain 7-element list, so any later cell that expects the
# validation labels would receive this list instead. The TypeError recorded under
# this cell does not look like it can come from a plain assignment — TODO confirm
# what was actually run here and consider deleting the cell.
y_val=[1, 0, 0, 0, 0, 0, 0]

TypeError: can't multiply sequence by non-int of type 'list'

In [264]:
# Predict tags for the padded demo sentence; the result is decoded by the cells below.
test_predict = model.predict(test_sequences)

In [265]:
test_predict

array([[[1., 0., 0., 0., 0., 0., 0.],
        [1., 0., 0., 0., 0., 0., 0.],
        [1., 0., 0., 0., 0., 0., 0.],
        [1., 0., 0., 0., 0., 0., 0.],
        [1., 0., 0., 0., 0., 0., 0.],
        [1., 0., 0., 0., 0., 0., 0.],
        [1., 0., 0., 0., 0., 0., 0.],
        [1., 0., 0., 0., 0., 0., 0.],
        [1., 0., 0., 0., 0., 0., 0.],
        [1., 0., 0., 0., 0., 0., 0.],
        [1., 0., 0., 0., 0., 0., 0.],
        [1., 0., 0., 0., 0., 0., 0.],
        [1., 0., 0., 0., 0., 0., 0.],
        [1., 0., 0., 0., 0., 0., 0.],
        [1., 0., 0., 0., 0., 0., 0.],
        [1., 0., 0., 0., 0., 0., 0.],
        [1., 0., 0., 0., 0., 0., 0.],
        [1., 0., 0., 0., 0., 0., 0.],
        [1., 0., 0., 0., 0., 0., 0.],
        [1., 0., 0., 0., 0., 0., 0.],
        [1., 0., 0., 0., 0., 0., 0.],
        [1., 0., 0., 0., 0., 0., 0.],
        [1., 0., 0., 0., 0., 0., 0.],
        [1., 0., 0., 0., 0., 0., 0.],
        [1., 0., 0., 0., 0., 0., 0.],
        [1., 0., 0., 0., 0., 0., 0.],
        [1.,

In [321]:
# For every position pick the index of the largest entry along the last axis
# (the 7-wide tag axis), turning the per-tag rows into integer tag ids.
test_label_ids = np.argmax(test_predict,axis=2)
test_label_ids

array([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 5, 5, 0, 0, 0, 0, 0, 0, 0, 0, 1,
        4, 4, 0, 1, 4, 0, 0, 0, 0, 1, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 2, 0, 0, 0, 3, 6, 6, 6, 6, 0]], dtype=int64)

In [362]:
# Invert labels_index: tag id -> tag name, used to decode predicted ids.
index_labels = {index: label for label, index in labels_index.items()}

In [353]:
range(len(texts))

range(0, 1)

In [370]:
# Unpadded length of every sentence, needed to cut the padding off the predictions.
lengths = [len(sentence) for sentence in texts]
lengths, test_label_ids

([48],
 array([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 5, 5, 0, 0, 0, 0, 0, 0, 0, 0, 1,
         4, 4, 0, 1, 4, 0, 0, 0, 0, 1, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 2, 0, 0, 0, 3, 6, 6, 6, 6, 0]], dtype=int64))

In [401]:
# %load postprocess.py


def ids_to_tags(ids, id2tag, lengths=None):
    """Translate sequences of tag ids into sequences of tag names.

    :param ids: iterable of id sequences (lists or array rows), one per sentence.
    :param dict id2tag: mapping tag id -> tag name; ids missing from it map to None.
    :param lengths: optional iterable with the true length of each sentence.
        When given, only the LAST ``length`` ids of each row are translated,
        i.e. leading positions are treated as padding and discarded.
    :return: list of lists of tag names, one inner list per sentence.
    """
    if lengths is None:
        return [[id2tag.get(i) for i in row] for row in ids]
    tags = []
    for id_, length in zip(ids, lengths):
        # id_[-0:] is the whole row, not an empty one, so an empty sentence
        # must be handled explicitly.
        kept = id_[-length:] if length > 0 else []
        tags.append([id2tag.get(i) for i in kept])
    return tags


def split_by_tags(sentences, tags, tag_format='BIO'):
    """Extract entity spans from tagged sentences.

    :param list sentences: sentences [['今', '日', '查', '房'], ['患', '者'], ....] or ['今日查房', '患者....', ...]
    :param list tags: one tag sequence per sentence, e.g. [['B-nr', 'I-nr', 'O'], ...]
    :param str tag_format: 'BIO' or 'BMESO'
    :return: one list per sentence; each element is a dict
        ``{'word', 'start', 'end', 'tag'}`` where ``start``/``end`` delimit the
        half-open span ``[start, end)`` in the sentence and ``tag`` is the
        entity type taken from the span's first tag (text after the 2-char prefix).
    :raises ValueError: if ``tag_format`` is not supported.
    """
    if tag_format not in ['BIO', 'BMESO']:
        raise ValueError('unsupported tag format')
    result = []
    for sentence, tag in zip(sentences, tags):
        if isinstance(sentence, list):
            sentence = ''.join(sentence)
        if tag_format == 'BIO':
            one_result = _split_bio(sentence, tag)
        else:
            one_result = _split_bmeso(sentence, tag)
        result.append(one_result)
    return result


def _entity(sentence, tag, start, end):
    """Build the result record for the span [start, end)."""
    return {'word': sentence[start:end], 'start': start, 'end': end, 'tag': tag[start][2:]}


def _split_bio(sentence, tag):
    """Collect entity spans from one BIO-tagged sentence.

    An entity is closed by the next 'B', by any non-'I' tag, or by the end of
    the sequence. This also covers the cases the previous implementation lost:
    an entity running from index 0 to the end, and an entity directly followed
    by another 'B'. An 'I' with no open entity starts a new one instead of
    producing a span anchored at a stale start index.
    """
    entities = []
    start = None  # index where the currently open entity began, None if none is open
    for i, t in enumerate(tag):
        prefix = t[0] if t else 'O'  # missing/empty tags count as outside
        if prefix == 'B':
            if start is not None:
                entities.append(_entity(sentence, tag, start, i))
            start = i
        elif prefix == 'I':
            if start is None:
                start = i
        else:
            if start is not None:
                entities.append(_entity(sentence, tag, start, i))
                start = None
    if start is not None:
        entities.append(_entity(sentence, tag, start, len(tag)))
    return entities


def _split_bmeso(sentence, tag):
    """Collect entity spans from one BMESO-tagged sentence ('B'...'E' spans and single 'S' tags)."""
    entities = []
    start = 0
    for i, t in enumerate(tag):
        if t[0] == 'B':
            start = i
        elif t[0] == 'E':
            entities.append(_entity(sentence, tag, start, i + 1))
        elif t[0] == 'S':
            start = i
            entities.append(_entity(sentence, tag, start, i + 1))
    return entities


In [402]:
# Decode the predicted ids into tag names, keeping only the last len(sentence)
# positions of each row (the leading positions are padding).
# Uses the computed `lengths` rather than a hard-coded [48], so the cell stays
# correct when the input sentence changes.
test_tags = ids_to_tags(test_label_ids, index_labels, lengths=lengths)

In [365]:
# Show the id -> tag mapping (the variable is named index_labels; `index_label`
# is not defined anywhere in the notebook and fails on a fresh kernel).
index_labels

{1: 'B-nr', 2: 'B-ns', 3: 'B-nt', 4: 'I-nr', 5: 'I-ns', 6: 'I-nt', 0: 'O'}

In [403]:
test_tags

[['O',
  'O',
  'B-ns',
  'I-ns',
  'I-ns',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'B-nr',
  'I-nr',
  'I-nr',
  'O',
  'B-nr',
  'I-nr',
  'O',
  'O',
  'O',
  'O',
  'B-nr',
  'I-nr',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'B-ns',
  'O',
  'O',
  'O',
  'B-nt',
  'I-nt',
  'I-nt',
  'I-nt',
  'I-nt',
  'O']]

In [404]:
# Group the decoded tags into entity spans. split_by_tags is defined in the
# "%load postprocess.py" cell of this notebook; no `postprocess` module is
# imported, so it is called directly.
split_by_tags(texts, test_tags, tag_format='BIO')

[[{'word': '波士顿', 'start': 2, 'end': 5, 'tag': 'ns'},
  {'word': '陈特安', 'start': 13, 'end': 16, 'tag': 'nr'},
  {'word': '李云', 'start': 17, 'end': 19, 'tag': 'nr'},
  {'word': '江泽', 'start': 23, 'end': 25, 'tag': 'nr'},
  {'word': '纽', 'start': 38, 'end': 39, 'tag': 'ns'},
  {'word': '波士顿访问', 'start': 42, 'end': 47, 'tag': 'nt'}]]