In [39]:
import os
import re
import sys
import json
import pickle
import logging
import itertools
import numpy as np
import pandas as pd
import gensim as gs
import jieba
from pprint import pprint
from collections import Counter
from tensorflow.contrib import learn
import gensim

# Lower the root logger threshold to INFO so the logging.info() calls in this notebook are emitted.
logging.getLogger().setLevel(logging.INFO)

# Matches a single digit. The capturing group makes re.split() keep each digit as its own piece
# (used by clean_str to split digits out of tokens).
pattern = re.compile(r'(\d)')

def clean_str(s):
	"""Normalise a raw mixed Chinese/English string and split it into tokens.

	Steps, in order:
	  1. Replace full-width / Chinese punctuation with ASCII counterparts.
	  2. Replace every character outside the whitelist (CJK U+4E00-U+9FA5,
	     ASCII letters and digits, and ``- . / @ [ : ( ) , ! ? ' ` ``) with a space.
	  3. Separate English contractions and punctuation from neighbouring text.
	  4. Collapse whitespace runs, strip, lower-case, and segment with
	     ``jieba.lcut(..., HMM=False)``.
	  5. Split every digit out of each token using the module-level ``pattern``.

	Args:
		s: the raw text.

	Returns:
		A list of non-empty string tokens.

	NOTE(review): ``;``, ``]``, ``$``, ``^`` and ``"`` produced by step 1 are not in
	the step-2 whitelist, so they are blanked out again. Confirm whether that is intended.
	NOTE(review): the bracket / question-mark replacement templates contain a backslash,
	and ``re.sub`` leaves unknown non-letter escapes untouched, so e.g. ``(`` becomes
	`` \\( `` in the text handed to jieba. This is preserved as-is to keep the
	vocabulary unchanged. Confirm whether the backslash is wanted.
	"""
	# Applied sequentially, in this order.
	punctuation_map = (
		('？', '?'),
		('。', ' . '),
		('，', ','),
		('；', ' ; '),
		('：', ':'),
		('【', '['),
		('】', ']'),
		('￥', '$'),
		('……', '^'),
		('、', ','),
		('‘', "'"),
		('’', "'"),
		('“', '"'),
		('”', '"'),
		('（', '('),
		('）', ')'),
	)
	for full_width, ascii_form in punctuation_map:
		s = s.replace(full_width, ascii_form)

	# (regex, replacement) pairs, applied in order. Replacements are raw strings so the
	# backslashes no longer rely on Python tolerating invalid escape sequences.
	substitutions = (
		(r"[^\u4e00-\u9fa5\-\.\/\@\[A-Za-z0-9:(),!?\'\`]", " "),
		(r" : ", ":"),
		(r"\'s", " 's"),
		(r"\'ve", " 've"),
		(r"n\'t", " n't"),
		(r"\'re", " 're"),
		(r"\'d", " 'd"),
		(r"\'ll", " 'll"),
		(r",", " , "),
		(r"!", " ! "),
		(r"\(", r" \( "),
		(r"\)", r" \) "),
		(r"\[", r" \[ "),
		(r"\]", r" \] "),
		(r"\?", r" \? "),
		(r"\s{2,}", " "),
	)
	for regex, replacement in substitutions:
		s = re.sub(regex, replacement, s)

	words = jieba.lcut(s.strip().lower(), HMM=False)

	tokens = []
	for word in words:
		# `pattern` has a capturing group, so digits come back as separate pieces;
		# drop the empty strings re.split() emits around them.
		tokens.extend(piece for piece in re.split(pattern, word) if piece != '')
	return tokens

def pad_sentences(sentences,padding_word='<PAD/>',forced_sequence_length=None):
	"""Bring every token list to a common length by padding or truncating.

	Args:
		sentences: a list of token lists.
		padding_word: token appended to sentences that are too short.
		forced_sequence_length: target length to use; when None the length of
			the longest sentence in ``sentences`` is used instead.

	Returns:
		A new list of token lists, each of the target length. Sentences longer
		than the target are cut to their first ``sequence_length`` tokens (and
		an INFO message is logged for each of them).
	"""
	if forced_sequence_length is not None:
		logging.critical('this is prediction ,readinig the trained sequence length')
		sequence_length = forced_sequence_length
	else:
		sequence_length = max(map(len, sentences))
	logging.critical('the maximun length is {}'.format(sequence_length))

	padded_sentences = []
	for sentence in sentences:
		shortfall = sequence_length - len(sentence)
		if shortfall >= 0:
			# Short enough: fill the tail with the padding token.
			padded_sentences.append(sentence + [padding_word] * shortfall)
			continue
		# Too long: keep only the leading tokens.
		clipped = sentence[:sequence_length]
		logging.info('"%s" has to be cut off because it is longer than max_len '%(' '.join(clipped)))
		padded_sentences.append(clipped)
	return padded_sentences

def load_embeddings(vocabulary,word2vec_path=None):
	"""Build a word -> embedding-vector mapping for every word in ``vocabulary``.

	Args:
		vocabulary: iterable of words (iterating a dict yields its keys).
		word2vec_path: optional path to a saved gensim Word2Vec model. When None,
			every word receives a random vector.

	Returns:
		dict mapping each word to a 1-D numpy array. Words found in the loaded
		model get their trained vector; all others get a vector drawn uniformly
		from [-0.25, 0.25). Random vectors have the loaded model's dimensionality,
		or 256 when no model is loaded.
	"""
	# Bind the name unconditionally: the original `del word2vec` raised
	# UnboundLocalError whenever word2vec_path was None.
	word2vec = None
	embedding_dim = 256
	if word2vec_path is not None:
		word2vec = gensim.models.Word2Vec.load(word2vec_path)
		# Keep random fallbacks the same width as the trained vectors.
		embedding_dim = word2vec.wv.vector_size

	word_embeddings = {}
	for word in vocabulary:
		# `word in wv` works on both gensim 3.x and 4.x; `.wv.vocab` was removed in 4.0.
		if word2vec is not None and word in word2vec.wv:
			word_embeddings[word] = word2vec.wv[word]
		else:
			word_embeddings[word] = np.random.uniform(-0.25, 0.25, embedding_dim)
	return word_embeddings

def batch_iter(data, batch_size, num_epochs, shuffle=True):
	"""Yield mini-batches of ``data`` for ``num_epochs`` passes.

	Args:
		data: sequence of samples; converted to a numpy array.
		batch_size: maximum number of samples per batch.
		num_epochs: number of full passes over ``data``.
		shuffle: when True, the sample order is re-permuted at the start of
			every epoch using numpy's global random state.

	Yields:
		numpy arrays holding up to ``batch_size`` samples. The last batch of an
		epoch may be smaller; no empty batch is ever yielded.
	"""
	data = np.array(data)
	data_size = len(data)
	# Ceiling division. The previous `int(data_size / batch_size) + 1` produced an
	# extra, empty batch whenever data_size was an exact multiple of batch_size.
	num_batches_per_epoch = (data_size + batch_size - 1) // batch_size

	for epoch in range(num_epochs):
		if shuffle:
			shuffle_indices = np.random.permutation(np.arange(data_size))
			shuffled_data = data[shuffle_indices]
		else:
			shuffled_data = data

		for batch_num in range(num_batches_per_epoch):
			start_index = batch_num * batch_size
			end_index = min(start_index + batch_size, data_size)
			yield shuffled_data[start_index:end_index]


def bulid_vocab(sentences):
	"""Create token <-> index lookups ordered by descending token frequency.

	Args:
		sentences: iterable of token sequences.

	Returns:
		(vocabulary, vocabulary_inv) where ``vocabulary_inv`` is the list of
		distinct tokens, most frequent first, and ``vocabulary`` maps each
		token to its position in that list.
	"""
	frequencies = Counter(itertools.chain.from_iterable(sentences))
	# Build the index list ordered by word frequency.
	vocabulary_inv = [token for token, _count in frequencies.most_common()]
	vocabulary = dict(zip(vocabulary_inv, range(len(vocabulary_inv))))
	return vocabulary, vocabulary_inv

def load_data(filename,cnum=100):
    """Read a labelled text CSV and turn it into index / one-hot arrays.

    Args:
        filename: path to a CSV file containing 'Category' and 'Text' columns.
        cnum: only the first ``cnum`` rows of the file are used.

    Returns:
        (x, y, vocabulary, vocabulary_inv, df, labels):
        x -- array of token indices, one padded row per sample;
        y -- array of one-hot label rows;
        vocabulary / vocabulary_inv -- the lookups produced by bulid_vocab
        (built after padding, so the padding token is part of the vocabulary);
        df -- the filtered, row-shuffled DataFrame;
        labels -- sorted list of distinct category values.
    """
    selected = ['Category', 'Text']
    label_col, text_col = selected

    df = pd.read_csv(filename)[:cnum]

    # Drop the columns that are not needed.
    unwanted = list(set(df.columns) - set(selected))
    df = df.drop(unwanted, axis=1)
    # Drop rows with a missing label or text.
    df = df.dropna(axis=0, how='any', subset=selected)
    # Shuffle the row order.
    df = df.reindex(np.random.permutation(df.index))

    # Class labels, each mapped to one row of an identity matrix (its one-hot code).
    labels = sorted(set(df[label_col].tolist()))
    one_hot = np.eye(len(labels), dtype=int)
    label_dict = dict(zip(labels, one_hot))

    x_raw = [clean_str(text) for text in df[text_col].tolist()]
    y_raw = [label_dict[label] for label in df[label_col].tolist()]

    x_raw = pad_sentences(x_raw)
    vocabulary, vocabulary_inv = bulid_vocab(x_raw)

    x = np.array([[vocabulary[token] for token in sentence] for sentence in x_raw])
    y = np.array(y_raw)

    return x, y, vocabulary, vocabulary_inv, df, labels

In [40]:
from sklearn.model_selection import train_test_split

def get_data():
    """Load the corpus, build the embedding matrix and split the samples.

    Reads '../data/train.csv' (first 8000 rows) via load_data and the
    'word2vec_path' entry of '../training_config.json'.

    Returns:
        (x_train, y_train, x_val, y_val, embedding_mat) where embedding_mat is
        a float32 array with one row per vocabulary index.

    Note:
        20% of the samples are split off as a test set whose sizes are logged,
        but that split is not returned. The splits are not seeded.
    """
    input_file = '../data/train.csv'
    x_, y_, vocabulary, vocabulary_inv, df, labels = load_data(input_file,cnum=8000)

    training_config = '../training_config.json'
    # Context manager so the config file handle is closed deterministically.
    with open(training_config, encoding='utf-8') as config_file:
        params = json.load(config_file)

    # Assign an embedding vector to every word in the vocabulary.
    word_embeddings = load_embeddings(vocabulary, params['word2vec_path'])
    # Build the embedding matrix: row i holds the vector of vocabulary_inv[i].
    embedding_mat = np.array([word_embeddings[word] for word in vocabulary_inv], dtype=np.float32)

    # Split the raw data into training and test data.
    x, x_test, y, y_test = train_test_split(x_, y_, test_size=0.2)

    # Split the training data into training and validation data.
    x_train, x_val, y_train, y_val = train_test_split(x, y, test_size=0.2)

    logging.info('x_train:{},x_val:{},x_test:{}'.format(len(x_train), len(x_val), len(x_test)))
    logging.info('y_train:{},y_val:{},y_test:{}'.format(len(y_train), len(y_val), len(y_test)))

    return x_train,y_train,x_val,y_val,embedding_mat

In [41]:
# Build the train/validation arrays and the embedding matrix consumed by the model cells below.
x_train,y_train,x_val,y_val,embedding_mat = get_data()

CRITICAL:root:the maximun length is 45
INFO:gensim.utils:loading Word2Vec object from D:/我要回珠海/实战项目/MyDataSets/word2vec_from_weixin/word2vec/word2vec_wx
INFO:gensim.utils:loading wv recursively from D:/我要回珠海/实战项目/MyDataSets/word2vec_from_weixin/word2vec/word2vec_wx.wv.* with mmap=None
INFO:gensim.utils:loading syn0 from D:/我要回珠海/实战项目/MyDataSets/word2vec_from_weixin/word2vec/word2vec_wx.wv.syn0.npy with mmap=None
INFO:gensim.utils:loading syn1neg from D:/我要回珠海/实战项目/MyDataSets/word2vec_from_weixin/word2vec/word2vec_wx.syn1neg.npy with mmap=None
INFO:gensim.models.word2vec:Model saved using code from earlier Gensim Version. Re-loading old model in a compatible way.
INFO:gensim.models.deprecated.old_saveload:loading Word2Vec object from D:/我要回珠海/实战项目/MyDataSets/word2vec_from_weixin/word2vec/word2vec_wx
INFO:gensim.models.deprecated.old_saveload:loading wv recursively from D:/我要回珠海/实战项目/MyDataSets/word2vec_from_weixin/word2vec/word2vec_wx.wv.* with mmap=None
INFO:gensim.utils:loading syn0 f

In [72]:
from keras.layers import Input, Dense, LSTM, Embedding, recurrent, Bidirectional, Flatten, Dropout
from keras.models import Model
from keras.initializers import Constant
import pickle
import sys

sys.path.append('..')
from keras_contrib.layers import CRF

# Training hyper-parameters read as globals by the train_cnn_model definitions.
# NOTE(review): BATCH_SIZE is rebound to 200 in the BiLSTM training cell, so the value
# seen by later cells depends on which cells have been run.
Epoch = 1
BATCH_SIZE = 100
# NOTE(review): BiRNN_UNITS is not referenced by any model visible here; the LSTM layers hard-code 128 units.
BiRNN_UNITS = 100


def get_model(x_train, y_train, embedding_mat):
    """Build and compile a bidirectional-LSTM text classifier.

    Architecture: frozen Embedding initialised from ``embedding_mat`` ->
    Bidirectional(LSTM(128)) -> Dropout(0.3) -> Dense softmax.

    Args:
        x_train: 2-D array (samples, sequence length); only its shape is used.
        y_train: 2-D array (samples, number of classes); only its shape is used.
        embedding_mat: 2-D array (vocabulary size, embedding dimension) used as
            the constant initialiser of the non-trainable embedding layer.

    Returns:
        The compiled (untrained) Keras Model. The layer summary is printed as a
        side effect.
    """
    _, inputLength = x_train.shape
    inputDim, outputDim = embedding_mat.shape
    _, class_num = y_train.shape

    # define model
    inputs = Input(shape=(inputLength,))
    x = Embedding(inputDim, outputDim,embeddings_initializer=Constant(embedding_mat), trainable=False)(inputs)  # mask_zero=True
    x = Bidirectional(LSTM(128))(x)
    x = Dropout(0.3)(x)
    preds = Dense(class_num, activation='softmax')(x)
    model = Model(inputs,preds)
    model.compile('adam', loss='categorical_crossentropy', metrics=['acc'])
    # summary() prints the table itself and returns None; wrapping it in print()
    # emitted a stray "None" line after the table.
    model.summary()
    return model

In [77]:
# (vocabulary size, embedding dimension) of the matrix returned by get_data().
embedding_mat.shape

(7677, 256)

In [73]:
# NOTE(review): despite the variable name, get_model() builds a BiLSTM + softmax classifier;
# the imported CRF layer is not used anywhere in it.
crf_model = get_model(x_train,y_train,embedding_mat)

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_6 (InputLayer)         (None, 45)                0         
_________________________________________________________________
embedding_6 (Embedding)      (None, 45, 256)           1965312   
_________________________________________________________________
bidirectional_6 (Bidirection (None, 256)               394240    
_________________________________________________________________
dropout_6 (Dropout)          (None, 256)               0         
_________________________________________________________________
dense_6 (Dense)              (None, 34)                8738      
Total params: 2,368,290
Trainable params: 402,978
Non-trainable params: 1,965,312
_________________________________________________________________
None


In [74]:
EPOCHS = 3
# NOTE: this rebinds the BATCH_SIZE global defined with the other hyper-parameters,
# so cells run afterwards (evaluate, train_cnn_model) see 200 instead of 100.
BATCH_SIZE=200
# validation_data is passed as a tuple, as documented for Model.fit and as the
# train_cnn_model definitions in this notebook already do.
crf_model.fit(x_train,y_train,batch_size=BATCH_SIZE,epochs=EPOCHS, validation_data=(x_val,y_val))
#crf_model.save('../ckpt/crf.h5')
#crf_model.save_weights('../ckpt/crf_weights.h5')

Train on 5120 samples, validate on 1280 samples
Epoch 1/3
Epoch 2/3
Epoch 3/3


<keras.callbacks.History at 0x2099de3ac18>

In [75]:
# 1. Save all of the model's parameters: pickle the list returned by get_weights().
modelWeights = crf_model.get_weights()

with open('../ckpt/modelWeights.pkl', 'wb') as outp:
    pickle.dump(modelWeights, outp)

In [76]:
def init_Model(inputLength,inputDim,outputDim,class_num):
    """Build and compile the BiLSTM classifier from explicit dimensions.

    Same layer stack as get_model (frozen Embedding -> Bidirectional(LSTM(128))
    -> Dropout(0.3) -> Dense softmax), but the embedding layer keeps its default
    initialisation, so the model is only useful once weights are loaded into it.

    Args:
        inputLength: number of token indices per input sequence.
        inputDim: vocabulary size (rows of the embedding table).
        outputDim: embedding dimension (columns of the embedding table).
        class_num: number of output classes.

    Returns:
        The compiled Keras Model. The layer summary is printed as a side effect.
    """
    # define model
    inputs = Input(shape=(inputLength,))
    x = Embedding(inputDim, outputDim,trainable=False)(inputs)  # mask_zero=True
    x = Bidirectional(LSTM(128))(x)
    x = Dropout(0.3)(x)
    preds = Dense(class_num, activation='softmax')(x)
    model = Model(inputs,preds)
    model.compile('adam', loss='categorical_crossentropy', metrics=['acc'])
    # summary() prints the table itself and returns None; wrapping it in print()
    # emitted a stray "None" line after the table.
    model.summary()
    return model

In [78]:
# Arguments are (sequence length, vocabulary size, embedding dim, class count).
# NOTE(review): the literals presumably mirror the shapes of the trained model and must be
# kept in sync by hand if the data or vocabulary changes — TODO derive or persist them.
fx_model = init_Model(45,7677,256,34)

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_7 (InputLayer)         (None, 45)                0         
_________________________________________________________________
embedding_7 (Embedding)      (None, 45, 256)           1965312   
_________________________________________________________________
bidirectional_7 (Bidirection (None, 256)               394240    
_________________________________________________________________
dropout_7 (Dropout)          (None, 256)               0         
_________________________________________________________________
dense_7 (Dense)              (None, 34)                8738      
Total params: 2,368,290
Trainable params: 402,978
Non-trainable params: 1,965,312
_________________________________________________________________
None


In [82]:
# Read the pickled model parameters back and apply them to the rebuilt model.
# NOTE: pickle.load can execute arbitrary code; only load weight files you created yourself.
with open('../ckpt/modelWeights.pkl', 'rb') as inp:
    modelWeights = pickle.load(inp)


fx_model.set_weights(modelWeights)

In [84]:
# Evaluate the restored model on the validation split; the model was compiled with
# categorical cross-entropy loss and the 'acc' metric, so the result is [loss, acc].
fx_model.evaluate(x_val,y_val,batch_size=BATCH_SIZE)



[1.1649398617446423, 0.6664062514901161]

In [36]:
from keras.models import Model
from keras.layers import Input,Dense, Dropout,Flatten,Conv1D,MaxPooling1D

def train_cnn_model(x_train, y_train,x_val, y_val, embedding_mat):
    """Build, train and save a 1-D convolutional text classifier.

    Layer stack: frozen Embedding (weights taken from ``embedding_mat``) ->
    [Conv1D(32, 5) -> MaxPooling1D] -> [Conv1D(64, 5) -> MaxPooling1D] ->
    Dropout(0.2) -> Flatten -> Dense(128, relu) -> Dense softmax.

    Args:
        x_train, x_val: 2-D arrays of token indices (samples, sequence length).
        y_train, y_val: 2-D one-hot label arrays (samples, classes).
        embedding_mat: 2-D array (vocabulary size, embedding dimension).

    Returns:
        The trained Keras Model.

    Side effects:
        Prints the derived dimensions and the model summary, trains using the
        module-level ``Epoch`` and ``BATCH_SIZE`` globals, and writes the model
        to '../ckpt/cnn.h5'.
    """
    _, inputLength = x_train.shape
    print('inputLength=%s'%inputLength)
    inputDim, outputDim = embedding_mat.shape
    print('inputDim=%s,outputDim=%s'%(inputDim,outputDim))
    _, class_num = y_train.shape
    print('class_num=%s'%class_num)

    # Declare the layers in order, then thread the input tensor through them.
    layer_stack = [
        Embedding(inputDim,outputDim,weights=[embedding_mat], trainable=False),
        Conv1D(32,5,activation='relu'),
        MaxPooling1D(),
        Conv1D(64,5,activation='relu'),
        MaxPooling1D(),
        Dropout(0.2),
        Flatten(),
        Dense(128, activation='relu'),
        Dense(class_num, activation='softmax'),
    ]
    inputs = Input(shape=(inputLength,))
    outputs = inputs
    for layer in layer_stack:
        outputs = layer(outputs)

    model = Model(inputs,outputs)
    # Cross-entropy is used as the loss function.
    model.compile(loss='categorical_crossentropy',
                  optimizer='RMSprop',
                  metrics=['accuracy'])
    model.summary()
    model.fit(x_train,y_train,epochs=Epoch,batch_size=BATCH_SIZE,validation_data=(x_val,y_val))
    model.save('../ckpt/cnn.h5')
    return model

In [37]:
# Train the CNN classifier on the prepared splits (also saves it to disk inside the function).
cn_model = train_cnn_model(x_train,y_train,x_val, y_val,embedding_mat)

inputLength=45
inputDim=7677,outputDim=256
class_num=34
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_7 (InputLayer)         (None, 45)                0         
_________________________________________________________________
embedding_11 (Embedding)     (None, 45, 256)           1965312   
_________________________________________________________________
conv1d_5 (Conv1D)            (None, 41, 32)            40992     
_________________________________________________________________
max_pooling1d_3 (MaxPooling1 (None, 20, 32)            0         
_________________________________________________________________
conv1d_6 (Conv1D)            (None, 16, 64)            10304     
_________________________________________________________________
max_pooling1d_4 (MaxPooling1 (None, 8, 64)             0         
_________________________________________________________________
dropout_4 (Dropout) 

In [131]:
from keras.models import Model
from keras.layers import Input,Dense, Dropout,Flatten,Conv2D,MaxPooling2D

def train_cnn_model(x_train, y_train,x_val, y_val, embedding_mat):
    """Build, train and save a 2-D convolutional text classifier.

    Each sample is turned into a single-channel "image" of shape
    (sequence length, embedding dimension, 1) by looking its token indices up
    in ``embedding_mat``. Layer stack: [Conv2D(32, 5x5) -> MaxPooling2D] ->
    [Conv2D(64, 5x5) -> MaxPooling2D] -> Dropout(0.2) -> Flatten ->
    Dense(128, relu) -> Dense softmax.

    Args:
        x_train, x_val: 2-D integer arrays of token indices (samples, sequence length).
        y_train, y_val: 2-D one-hot label arrays (samples, classes).
        embedding_mat: 2-D array (vocabulary size, embedding dimension).

    Returns:
        The trained Keras Model.

    Side effects:
        Prints the model summary, trains using the module-level ``Epoch`` and
        ``BATCH_SIZE`` globals, and writes the model to '../ckpt/cnn.h5'.

    NOTE(review): this definition has the same name and the same save path as the
    Conv1D variant defined earlier in the notebook, so it shadows that function and
    overwrites its saved model. Consider distinct names and paths.
    """
    x_train = _embed_as_images(x_train, embedding_mat)
    x_val = _embed_as_images(x_val, embedding_mat)

    _, seq_len, embed_dim, _ = x_train.shape
    _, class_num = y_train.shape

    inputs = Input(shape=(seq_len, embed_dim, 1))   # sequence length, embedding dim, channels

    x = Conv2D(32,(5,5),activation='relu')(inputs)
    x = MaxPooling2D()(x)

    x = Conv2D(64,(5,5),activation='relu')(x)
    x = MaxPooling2D()(x)

    x = Dropout(0.2)(x)
    x = Flatten()(x)
    x = Dense(128, activation='relu')(x)

    outputs = Dense(class_num, activation='softmax')(x)

    model = Model(inputs,outputs)
    # The head is a softmax over mutually exclusive one-hot classes, so use categorical
    # cross-entropy like the other models here. With 'binary_crossentropy' the loss is
    # computed per output unit and the 'accuracy' metric can resolve to binary accuracy,
    # which looks far better than the real classification accuracy.
    model.compile(loss='categorical_crossentropy',
                  optimizer='RMSprop',
                  metrics=['accuracy'])
    model.summary()
    model.fit(x_train,y_train,epochs=Epoch,batch_size=BATCH_SIZE,validation_data=(x_val,y_val))
    model.save('../ckpt/cnn.h5')
    return model


def _embed_as_images(token_ids, embedding_mat):
    """Map (samples, seq_len) token indices to a float32 (samples, seq_len, dim, 1) array."""
    # One vectorised fancy-index lookup replaces the nested Python list comprehension.
    embedded = np.asarray(embedding_mat, dtype=np.float32)[np.asarray(token_ids)]
    return embedded[..., np.newaxis]