In [83]:
import tensorflow as tf
from gensim.models import word2vec, KeyedVectors
import codecs
import os
import numpy as np
import sys
import random
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical 
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Activation

In [84]:
# I. Data preprocessing: paths and hyper-parameters shared by the cells below.
# Raw string so the backslashes of the Windows path are taken literally instead
# of being parsed as (invalid) escape sequences such as "\P" and "\B".
# NOTE(review): this is an absolute, machine-specific path; adjust per machine.
BASE_DIR = r'F:\PythonProject\BiLSTM-CRF-4-NER'
TRAIN_TEXT_DIR = BASE_DIR + '/data/train'
TEST_TEXT_DIR = BASE_DIR + '/data/test'
MAX_SEQUENCE_LENGTH = 100  # maximum number of characters kept per text
MAX_NB_WORDS = 20000  # maximum number of unique characters across all texts
VALIDATION_SPLIT = 0.2  # fraction of the samples held out for validation
# NOTE(review): the training cell passes batch_size=128 explicitly, so this
# value is not read by any of the visible cells.
batch_size = 32

In [85]:
# 1. Load the pretrained embedding model (a character -> vector dictionary)
#    from the file "char_model.model" in the current working directory.
char_model = word2vec.Word2Vec.load("char_model.model")
char_vectors = char_model.wv
vocab_size = len(char_vectors.vocab)  # 4767 in the recorded run
word_vector_size = char_vectors.vector_size

In [86]:
# Sanity check: display the number of entries in the pretrained vocabulary.
vocab_size

4767

In [87]:
# Evaluating the attribute without calling it only displays the bound method;
# no embedding layer is created by this cell.
char_model.wv.get_keras_embedding

<bound method Word2VecKeyedVectors.get_keras_embedding of <gensim.models.keyedvectors.Word2VecKeyedVectors object at 0x000002AB50573198>>

In [128]:
# 2. Prepare the texts and labels
# Tag name -> integer id. Defined at module level so that other cells can look
# up the tag set (for example its size) as well as make_list itself.
labels_index = {'B-nr':1, 'B-ns':2, 'B-nt':3, 'I-nr':4, 'I-ns':5, 'I-nt':6, 'O':0}


def make_list(file_path):
    """Parse a tab-separated, one-character-per-line tagging file.

    Each non-empty line must look like ``<char>\\t<tag>``; an empty line ends
    the current sentence.

    Args:
        file_path: Path of the UTF-8 encoded file to read.

    Returns:
        A ``(texts, labels)`` tuple. ``texts`` is a list of sentences, each a
        list of characters; ``labels`` is a parallel list of lists holding the
        integer id of every character's tag. Tags missing from
        ``labels_index`` are mapped to 0, the id of 'O'.

    Raises:
        ValueError: If a non-empty line contains no tab separator.
    """
    texts = []
    labels = []
    sentence = []
    sentence_labels = []
    # The context manager closes the file even if parsing fails half-way.
    with open(file_path, encoding='utf8') as f:
        for line_no, raw_line in enumerate(f, start=1):
            line = raw_line.rstrip("\r\n")
            if not line:
                # Sentence boundary. As before, every empty line closes a
                # sentence, so consecutive empty lines yield empty sentences.
                texts.append(sentence)
                labels.append(sentence_labels)
                sentence = []
                sentence_labels = []
                continue
            values = line.split("\t")
            if len(values) < 2:
                raise ValueError(
                    "%s, line %d: expected '<char>\\t<tag>', got %r"
                    % (file_path, line_no, raw_line)
                )
            sentence.append(values[0])
            # Integer fallback keeps every label the same type as the real ids.
            sentence_labels.append(labels_index.get(values[1], 0))
    # Keep the final sentence when the file does not end with an empty line.
    if sentence:
        texts.append(sentence)
        labels.append(sentence_labels)
    return texts, labels

In [145]:
# Parse the training data into per-sentence character lists and label-id lists.
# make_list passes its argument straight to open(), so TRAIN_TEXT_DIR has to
# name a file despite the _DIR suffix.
texts, labels = make_list(TRAIN_TEXT_DIR)

In [146]:
# Sanity check: the first sentence should have one label per character.
len(labels[0]),len(texts[0])

(50, 50)

In [147]:
# 2. Build the character table (character -> id dictionary) and encode every
#    sentence as a list of character ids.
max_tokens = min(vocab_size, MAX_NB_WORDS)
tokenizer = Tokenizer(num_words=max_tokens)
tokenizer.fit_on_texts(texts)
word_index = tokenizer.word_index  # 4652 entries in the recorded run
sequences = tokenizer.texts_to_sequences(texts)

In [148]:
# len(word_index) # 4652

In [149]:
# len(sequences[0]),len(sequences[1])

In [150]:
# Inspect the label ids of the second sentence (the whole list is displayed).
labels[1]

[0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0]

In [151]:
# Pad/truncate every sentence and its label list to MAX_SEQUENCE_LENGTH using
# the pad_sequences defaults, then one-hot encode the label ids.
# Padded positions get id 0, which is also the id of the 'O' tag.
# NOTE: this cell rebinds `labels` from per-sentence id lists to a one-hot
# tensor, so it must be run exactly once after make_list.
NUM_CLASSES = 7  # tag ids 0..6 produced by make_list
data = pad_sequences(sequences, maxlen=MAX_SEQUENCE_LENGTH)
labels = pad_sequences(labels, maxlen=MAX_SEQUENCE_LENGTH)
# Pin the class count: by default to_categorical infers it from the largest id
# present, which would shrink the tensor if a tag never occurs in the data.
labels = to_categorical(labels, num_classes=NUM_CLASSES)
print('Shape of data tensor:', data.shape)
print('Shape of label tensor:', labels.shape)

Shape of data tensor: (46364, 100)
Shape of label tensor: (46364, 100, 7)


In [152]:
# Sanity check: after padding, the first sample and its labels have equal length.
len(labels[0]),len(data[0])

(100, 100)

In [154]:
# Shuffle the samples and compute the size of the validation split.
# A dedicated, seeded generator makes the train/validation partition
# reproducible across kernel restarts without touching NumPy's global RNG.
SPLIT_SEED = 42
indices = np.arange(data.shape[0])
np.random.RandomState(SPLIT_SEED).shuffle(indices)
# The same permutation is applied to inputs and labels to keep them aligned.
data = data[indices]
labels = labels[indices]
nb_validation_samples = int(VALIDATION_SPLIT * data.shape[0])

In [155]:
# Training split: every row except the last nb_validation_samples.
# An explicit end index is used because data[:-0] would be empty if the
# validation count ever rounds down to zero.
n_train_samples = data.shape[0] - nb_validation_samples
x_train = data[:n_train_samples]
y_train = labels[:n_train_samples]

In [156]:
# Sanity check: shape of the training inputs.
x_train.shape

(37092, 100)

In [157]:
# Sanity check: shape of the one-hot training labels.
y_train.shape

(37092, 100, 7)

In [158]:
# Validation split: the last nb_validation_samples rows.
# An explicit start index is used because data[-0:] would return the whole
# array if the validation count ever rounds down to zero.
val_start = data.shape[0] - nb_validation_samples
x_val = data[val_start:]
y_val = labels[val_start:]

In [169]:
# Build embedding_matrix: row i holds the pretrained vector of the character
# whose tokenizer id is i. Row 0 stays all-zero because no character in
# word_index is written to it.
nb_words = min(MAX_NB_WORDS, len(word_index))
embedding_matrix = np.zeros((nb_words + 1, word_vector_size))
for word, i in word_index.items():
    if i > nb_words:
        # Id falls outside the matrix; skip it.
        continue
    try:
        embedding_matrix[i] = char_model.wv.get_vector(word)
    except KeyError:
        # Character missing from the pretrained model: keep its row all-zero.
        # (Previously a misspelled variable made such characters reuse the
        # vector of the preceding character.)
        pass
print(embedding_matrix.shape)

(4653, 100)


In [174]:
# Build the LSTM tagger: frozen pretrained embeddings -> LSTM -> per-timestep
# dense layers ending in a softmax over the tag classes.
# The output size is taken from the one-hot training targets so that it always
# matches the class dimension of the label tensor.
num_classes = y_train.shape[-1]
embedding_layer = Embedding(nb_words+1,
                            word_vector_size,
                            weights=[embedding_matrix],
                            input_length=MAX_SEQUENCE_LENGTH,
                            trainable=False  # keep the pretrained vectors fixed
                            )
model = Sequential()
model.add(embedding_layer)
# return_sequences=True yields one output per character, as tagging requires.
model.add(LSTM(100, dropout=0.2, return_sequences=True))
model.add(Dense(100, activation='sigmoid'))
model.add(Dense(num_classes, activation='softmax'))

In [175]:
# Print the layer-by-layer architecture and parameter counts.
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 100, 100)          465300    
_________________________________________________________________
lstm_1 (LSTM)                (None, 100, 100)          80400     
_________________________________________________________________
dense (Dense)                (None, 100, 100)          10100     
_________________________________________________________________
dense_1 (Dense)              (None, 100, 7)            707       
Total params: 556,507
Trainable params: 91,207
Non-trainable params: 465,300
_________________________________________________________________


In [176]:
# VI. Train the LSTM
# 1. Compile with per-class cross-entropy, the Adam optimizer and accuracy.
model.compile(
    optimizer='adam',
    loss='categorical_crossentropy',
    metrics=['accuracy'],
)
# 2. Fit for 5 epochs, validating on the held-out split after each epoch.
model.fit(
    x_train,
    y_train,
    validation_data=(x_val, y_val),
    epochs=5,
    batch_size=128,
)

Train on 37092 samples, validate on 9272 samples
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


AttributeError: 'Sequential' object has no attribute 'evalueate'

In [178]:
# Evaluate on the held-out validation split (x_val / y_val); the model was
# compiled with one metric, so evaluate returns the loss and the accuracy.
val_metrics = model.evaluate(x_val, y_val, batch_size=256)
score, acc = val_metrics
print('Test score:', score)
print('Test accuary:', acc)

Test score: 0.06852868658983059
Test accuary: 0.978349863402899
