In [2]:
from keras.models import Model,Sequential
from keras.layers import Input,Dense,Embedding,Flatten,recurrent

In [3]:
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
import numpy as np

In [8]:
path = '../../../data/' # directory containing the GloVe word-vector files
def load_word2vec(word2vec_dim, data_dir=None):
    """Load pre-trained GloVe vectors from a ``glove.6B.<dim>d.txt`` text file.

    Each non-blank line is split on whitespace: the first token is the word and
    the remaining tokens are parsed as its float32 vector components.

    Args:
        word2vec_dim: Vector dimensionality; only used to build the file name
            (e.g. ``50`` selects ``glove.6B.50d.txt``).
        data_dir: Directory that holds the GloVe file. Defaults to the
            module-level ``path`` so existing callers keep working.

    Returns:
        dict mapping each word to a 1-D ``float32`` NumPy array.

    Raises:
        OSError: if the GloVe file cannot be opened.
        ValueError: if a vector component cannot be parsed as a float.
    """
    import os  # local import keeps this cell self-contained

    if data_dir is None:
        data_dir = path
    filepath = os.path.join(data_dir, 'glove.6B.%sd.txt' % word2vec_dim)

    word2vec_dict = dict()
    # The context manager closes the file even if a line fails to parse.
    with open(filepath, encoding='utf-8') as f:
        for line in f:
            values = line.split()
            if not values:
                # Blank line (e.g. trailing newline at end of file): nothing to parse.
                continue
            word2vec_dict[values[0]] = np.array(values[1:], dtype='float32')
    print('Loaded %s word vectors.' % len(word2vec_dict))
    return word2vec_dict

def get_embedding_matrix(docs, word2vec_dim=50):
    """Encode documents as padded integer sequences and build their GloVe embedding matrix.

    Args:
        docs: List of text documents. The tokenizer is fitted on these same
            documents, so the vocabulary covers exactly the words they contain.
        word2vec_dim: Dimensionality of the GloVe vectors to load (default 50,
            the value previously hard-coded here).

    Returns:
        Tuple ``(padded_docs, embedding_matrix)``:
        ``padded_docs`` is the integer-encoded documents, zero-padded at the end
        to the length of the longest document; ``embedding_matrix`` has shape
        ``(vocab_size, word2vec_dim)`` where row ``i`` is the GloVe vector of
        the word with tokenizer index ``i``. Words missing from GloVe keep an
        all-zero row.

    Raises:
        ValueError: if ``docs`` contains no documents.
    """
    tokenizer = Tokenizer()
    tokenizer.fit_on_texts(docs)
    encoded_docs = tokenizer.texts_to_sequences(docs)
    if not encoded_docs:
        # max() below would fail with an opaque message on an empty corpus.
        raise ValueError('docs must contain at least one document')

    # Length of the longest sentence; every sequence is padded up to it.
    max_length = max(len(seq) for seq in encoded_docs)
    padded_docs = pad_sequences(encoded_docs, maxlen=max_length, padding='post')

    # +1 because tokenizer word indices start at 1; row 0 stays zero and
    # corresponds to the padding value.
    vocab_size = len(tokenizer.word_index) + 1
    word2vec_dict = load_word2vec(word2vec_dim)

    embedding_matrix = np.zeros((vocab_size, word2vec_dim))

    for word, i in tokenizer.word_index.items():
        embedding_vector = word2vec_dict.get(word)
        if embedding_vector is not None:
            embedding_matrix[i] = embedding_vector

    return padded_docs, embedding_matrix

In [9]:
def create_train_model(padded_docs, labels, embedding_matrix, epochs=20):
    """Train a Flatten + Dense binary classifier on top of frozen pre-trained embeddings.

    Args:
        padded_docs: 2-D integer array of shape ``(num_docs, sequence_length)``.
        labels: Binary class labels, one per document (list or array).
        embedding_matrix: 2-D array of shape ``(vocab_size, embedding_dim)``
            used as the fixed weights of the embedding layer.
        epochs: Number of training epochs (default 20).

    Returns:
        The fitted Keras ``Sequential`` model.

    Note:
        The printed accuracy is measured on the training data itself, so it
        says nothing about generalization.
    """
    _, inputLength = padded_docs.shape
    print(inputLength)
    inputDim, outputDim = embedding_matrix.shape
    print(inputDim, outputDim)

    # A plain Python list of labels is not accepted by every Keras version;
    # an ndarray is accepted by all of them.
    labels = np.asarray(labels)

    # define model
    model = Sequential()
    model.add(Embedding(input_dim=inputDim,
                        input_length=inputLength,
                        output_dim=outputDim,
                        weights=[embedding_matrix],
                        trainable=False))  # keep the pre-trained vectors frozen
    model.add(Flatten())
    model.add(Dense(1, activation='sigmoid'))
    # compile the model
    model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['acc'])
    # summarize the model; summary() prints by itself and returns None, so
    # wrapping it in print() would emit a stray "None" line.
    model.summary()
    # fit the model
    model.fit(padded_docs, labels, epochs=epochs, verbose=0)
    # evaluate the model (on the training data)
    _, accuracy = model.evaluate(padded_docs, labels, verbose=0)
    print('Accuracy: %f' % (accuracy*100))
    return model

In [10]:
def train_model(padded_docs, labels, embedding_matrix, epochs=20):
    """Train an LSTM binary classifier on top of frozen pre-trained embeddings.

    Args:
        padded_docs: 2-D integer array of shape ``(num_docs, sequence_length)``.
        labels: Binary class labels, one per document (list or array).
        embedding_matrix: 2-D array of shape ``(vocab_size, embedding_dim)``
            used as the fixed weights of the embedding layer.
        epochs: Number of training epochs (default 20).

    Returns:
        The fitted Keras functional ``Model``.

    Note:
        The printed accuracy is measured on the training data itself, so it
        says nothing about generalization.
    """
    _, inputLength = padded_docs.shape
    print(inputLength)
    inputDim, outputDim = embedding_matrix.shape
    print(inputDim, outputDim)

    # A plain Python list of labels is not accepted by every Keras version;
    # an ndarray is accepted by all of them.
    labels = np.asarray(labels)

    # define model
    inputs = Input(shape=(inputLength,))
    # mask_zero=True marks index 0 as padding for the downstream LSTM.
    x = Embedding(inputDim, outputDim, weights=[embedding_matrix],
                  trainable=False, mask_zero=True)(inputs)
    # NOTE(review): `recurrent` is a legacy module path; newer Keras releases
    # appear to expose this layer as keras.layers.LSTM - confirm against the
    # installed version.
    x = recurrent.LSTM(outputDim)(x)
    preds = Dense(1, activation='sigmoid')(x)

    model = Model(inputs, preds)
    # compile the model
    model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['acc'])
    # summarize the model; summary() prints by itself and returns None, so
    # wrapping it in print() would emit a stray "None" line.
    model.summary()
    # fit the model
    model.fit(padded_docs, labels, epochs=epochs, verbose=0)
    # evaluate the model (on the training data)
    _, accuracy = model.evaluate(padded_docs, labels, verbose=0)
    print('Accuracy: %f' % (accuracy*100))
    return model

In [87]:
# Toy sentiment corpus: each entry pairs a short review with its class label
# (1 for the praising phrases, 0 for the critical ones).
labelled_docs = [
    ('Well done!', 1),
    ('Good work', 1),
    ('Great effort', 1),
    ('nice work', 1),
    ('Excellent!', 1),
    ('Weak', 0),
    ('Poor effort!', 0),
    ('not good', 0),
    ('poor work', 0),
    ('Could have done better.', 0),
]
# define documents
docs = [text for text, _label in labelled_docs]
# define class labels
labels = [label for _text, label in labelled_docs]

In [123]:
# Encode the toy corpus and build the matching GloVe embedding matrix.
padded_docs, embedding_matrix = get_embedding_matrix(docs)


Loaded 400000 word vectors.


In [124]:
# Inspect the integer-encoded documents returned by get_embedding_matrix
# (zero-padded at the end to the longest document's length).
padded_docs

array([[ 6,  2,  0,  0],
       [ 3,  1,  0,  0],
       [ 7,  4,  0,  0],
       [ 8,  1,  0,  0],
       [ 9,  0,  0,  0],
       [10,  0,  0,  0],
       [ 5,  4,  0,  0],
       [11,  3,  0,  0],
       [ 5,  1,  0,  0],
       [12, 13,  2, 14]])

In [119]:
# Train and evaluate the Flatten + Dense classifier on the toy corpus.
create_train_model(padded_docs, labels, embedding_matrix)

4
15 50
Model: "sequential_14"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_19 (Embedding)     (None, 4, 50)             750       
_________________________________________________________________
flatten_7 (Flatten)          (None, 200)               0         
_________________________________________________________________
dense_14 (Dense)             (None, 1)                 201       
Total params: 951
Trainable params: 201
Non-trainable params: 750
_________________________________________________________________
None
Accuracy: 100.000000


<keras.engine.sequential.Sequential at 0x1f08aeafdd8>

In [132]:
# Train and evaluate the LSTM classifier on the toy corpus.
train_model(padded_docs, labels, embedding_matrix)

4
15 50
Model: "model_7"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_10 (InputLayer)        (None, 4)                 0         
_________________________________________________________________
embedding_21 (Embedding)     (None, 4, 50)             750       
_________________________________________________________________
lstm_9 (LSTM)                (None, 50)                20200     
_________________________________________________________________
dense_16 (Dense)             (None, 1)                 51        
Total params: 21,001
Trainable params: 20,251
Non-trainable params: 750
_________________________________________________________________
None
Accuracy: 100.000000


<keras.engine.training.Model at 0x1f08fc63518>

In [130]:
# Repeat the tokenization step at top level to show the raw (unpadded)
# integer sequences produced for each document.
tokenizer = Tokenizer()
tokenizer.fit_on_texts(docs)
encoded_docs = tokenizer.texts_to_sequences(docs)
print(encoded_docs)

[[6, 2], [3, 1], [7, 4], [8, 1], [9], [10], [5, 4], [11, 3], [5, 1], [12, 13, 2, 14]]
