In [None]:
import codecs
from tqdm import tqdm
import pandas as pd
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
import numpy as np
import gensim
import re
import jieba

from keras.layers import GlobalMaxPooling1D, Conv1D, MaxPooling1D, Flatten, Bidirectional, SpatialDropout1D
from keras.preprocessing import sequence, text
from keras.callbacks import EarlyStopping
import keras
from keras.models import Sequential
from keras.layers.recurrent import LSTM, GRU
from keras.layers.core import Dense, Activation, Dropout
from keras.layers.embeddings import Embedding
from keras.layers.normalization import BatchNormalization
from keras.utils import np_utils

## 准备数据

In [None]:
import os 
if not os.path.exists('output'):
    os.makedirs('output')
if not os.path.exists('output/data_clean_split.txt'):
  !wget -P ./output https://raw.githubusercontent.com/foochane/text-classification/master/output/data_clean_split.txt

# Read the data file. Each line is split on tabs: the first field is used as
# the label and the second field (whitespace-stripped) as the document text.
# NOTE: `text` rebinds the name that the import cell bound to the
# `keras.preprocessing.text` module; later cells reach that module through
# its full path instead.
with codecs.open('output/data_clean_split.txt', 'r', encoding='utf-8') as corpus_file:
    document_split = corpus_file.readlines()

fields_per_line = [line.split('\t') for line in document_split]
labels = [fields[0] for fields in fields_per_line]
text = [fields[1].strip() for fields in fields_per_line]

# Map the string labels onto integer class ids.
label_encoder = LabelEncoder()
label_encoder.fit(labels)
y = label_encoder.transform(labels)

# Stratified 90% / 10% train / validation split of the raw documents.
# The fixed random_state makes the split repeatable.
x_train, x_valid, y_train, y_valid = train_test_split(
    text,
    y,
    test_size=0.1,
    shuffle=True,
    stratify=y,
    random_state=42,
)

# Break every document into its whitespace-separated words.
text_s2w = list(map(str.split, text))

# Train 100-dimensional Word2Vec embeddings on all documents (training and
# validation alike); words seen fewer than 5 times are ignored.
# NOTE: `size` is the gensim 3.x keyword for the vector dimension.
# NOTE: the name `model` is rebound to Keras models by later cells.
model = gensim.models.Word2Vec(
    sentences=text_s2w,
    size=100,
    window=8,
    min_count=5,
    workers=6,
)

# Convert a sentence into a single normalised vector.
def sent2vec(s, embeddings=None, dim=100):
    """Turn a whitespace-tokenised sentence into one L2-normalised vector.

    The vectors of all in-vocabulary words are summed and the sum is scaled
    to unit Euclidean length.

    Parameters
    ----------
    s : str
        Sentence whose words are separated by whitespace.
    embeddings : mapping, optional
        Anything supporting ``embeddings[word]`` that raises ``KeyError`` for
        unknown words (a dict, or gensim ``KeyedVectors``). Defaults to the
        ``wv`` attribute of the notebook-level Word2Vec ``model``.
    dim : int, optional
        Length of the zero vector returned when no usable vector exists.
        Should equal the embedding dimension (100 for the model trained in
        this notebook).

    Returns
    -------
    numpy.ndarray
        The unit-length sentence vector, or ``np.zeros(dim)`` when the
        sentence has no in-vocabulary word or the summed vector has zero
        norm.
    """
    if embeddings is None:
        # Looked up at call time. If `model` has been rebound to something
        # that is not a Word2Vec model this fails loudly, instead of silently
        # producing all-zero vectors as the former bare `except` did.
        embeddings = model.wv

    vectors = []
    for word in s.split():
        try:
            vectors.append(embeddings[word])
        except KeyError:
            # Out-of-vocabulary word (e.g. rarer than min_count): skip it.
            continue

    if not vectors:
        return np.zeros(dim)

    v = np.sum(vectors, axis=0)  # element-wise sum over the word vectors
    norm = np.sqrt((v ** 2).sum())
    if norm == 0:
        # Word vectors cancelled out; avoid dividing by zero (NaN output).
        return np.zeros(dim)

    return v / norm

# Vectorise every document (training and validation alike) with sent2vec and
# stack the results into one numpy array, one row per document.
text_s2v = np.array([sent2vec(s) for s in tqdm(text)])

# Build the word -> vector lookup from the trained Word2Vec vocabulary.
embeddings_index = {
    word: vector
    for word, vector in zip(model.wv.index2word, model.wv.vectors)
}
print('Found %s word vectors.' % len(embeddings_index))


# Split the sentence vectors into training and validation parts.
# test_size, stratify and random_state are the same as in the earlier split
# of the raw documents; y_train / y_valid are reassigned here.
x_train_w2v, x_valid_w2v, y_train, y_valid = train_test_split(
    text_s2v,
    y,
    test_size=0.1,
    shuffle=True,
    stratify=y,
    random_state=42,
)

In [None]:
# Standardise the features before feeding them to a neural network.
# The scaler is fitted on the training vectors only and then applied to both
# the training and the validation vectors.
scl = preprocessing.StandardScaler()
scl.fit(x_train_w2v)
x_train_w2v_scl = scl.transform(x_train_w2v)
x_valid_w2v_scl = scl.transform(x_valid_w2v)

# One-hot encode the integer class ids.
y_train_enc, y_valid_enc = map(np_utils.to_categorical, (y_train, y_valid))


In [None]:
# Display the one-hot encoded label of the first training sample.
y_train_enc[0]

In [None]:
# Display the shape of the (unscaled) validation sentence-vector array.
x_valid_w2v.shape

## 全连接网络

In [None]:
# Fully connected classifier with three Dense layers on the scaled sentence
# vectors: two hidden blocks (Dense/ReLU -> Dropout -> BatchNormalization)
# followed by a softmax output layer.
#
# The input width and the number of classes are read from the arrays that
# are fed to fit(), so the network always matches the data instead of
# depending on hand-maintained constants (previously 100 and 19).
input_dim = x_train_w2v_scl.shape[1]
num_classes = y_train_enc.shape[1]

model = Sequential()

model.add(Dense(300, input_dim=input_dim, activation='relu'))
model.add(Dropout(0.2))
model.add(BatchNormalization())

model.add(Dense(300, activation='relu'))
model.add(Dropout(0.3))
model.add(BatchNormalization())

# One output unit per class, turned into probabilities by softmax.
model.add(Dense(num_classes))
model.add(Activation('softmax'))

# Compile the model; the targets are one-hot encoded, hence
# categorical cross-entropy.
model.compile(loss='categorical_crossentropy', optimizer='adam')

# Train for 5 epochs, reporting the validation loss after each epoch.
model.fit(x_train_w2v_scl,
          y=y_train_enc,
          batch_size=64,
          epochs=5,
          verbose=1,
          validation_data=(x_valid_w2v_scl, y_valid_enc))

## 使用LSTM

In [None]:
# Keras tokenizer with no cap on the vocabulary size.
# NOTE(review): by default the Keras Tokenizer lowercases text and strips
# punctuation, while the Word2Vec model was trained on the raw
# whitespace-split tokens; tokens altered by the tokenizer may not find their
# embedding later. Confirm whether this matters for this corpus.
max_len = 70
token = keras.preprocessing.text.Tokenizer(num_words=None)

# The vocabulary is built from the training and the validation documents.
token.fit_on_texts([*x_train, *x_valid])
word_index = token.word_index

# Documents -> sequences of integer word ids.
x_train_seq = token.texts_to_sequences(x_train)
x_valid_seq = token.texts_to_sequences(x_valid)

# Zero-pad (or truncate) every sequence to exactly max_len ids.
x_train_pad = sequence.pad_sequences(x_train_seq, maxlen=max_len)
x_valid_pad = sequence.pad_sequences(x_valid_seq, maxlen=max_len)

In [None]:
# Build the embedding matrix for the tokenizer vocabulary: row `idx` holds
# the Word2Vec vector of the word whose tokenizer id is `idx`. Row 0 (no word
# is assigned id 0) and the rows of words without a Word2Vec vector stay
# all-zero.
embedding_matrix = np.zeros((len(word_index) + 1, 100))
for token_word, idx in tqdm(word_index.items()):
    if token_word in embeddings_index:
        embedding_matrix[idx] = embeddings_index[token_word]

In [None]:
# LSTM classifier on top of the previously trained Word2Vec embeddings:
# frozen Embedding -> SpatialDropout1D -> one LSTM layer -> two Dense/ReLU
# blocks with dropout -> softmax output.
#
# The embedding shape and the number of classes are read from the data
# (embedding_matrix and the one-hot labels) instead of being hard-coded, so
# the network cannot drift out of sync with them.
vocab_size, embedding_dim = embedding_matrix.shape
num_classes = y_train_enc.shape[1]

model = Sequential()
# Embedding weights are initialised from Word2Vec and not updated in training.
model.add(Embedding(vocab_size,
                     embedding_dim,
                     weights=[embedding_matrix],
                     input_length=max_len,
                     trainable=False))
model.add(SpatialDropout1D(0.3))
model.add(LSTM(100, dropout=0.3, recurrent_dropout=0.3))

# NOTE(review): a Dropout rate of 0.8 drops 80% of the units during
# training - confirm this is intended.
model.add(Dense(1024, activation='relu'))
model.add(Dropout(0.8))

model.add(Dense(1024, activation='relu'))
model.add(Dropout(0.8))

# One output unit per class, turned into probabilities by softmax.
model.add(Dense(num_classes))
model.add(Activation('softmax'))
model.compile(loss='categorical_crossentropy', optimizer='adam')

In [None]:
# Train with early stopping: training ends once the validation loss has not
# improved for 3 consecutive epochs, so the 100-epoch budget is only an
# upper bound.
earlystop = EarlyStopping(monitor='val_loss', patience=3, min_delta=0, mode='auto', verbose=0)
model.fit(
    x=x_train_pad,
    y=y_train_enc,
    validation_data=(x_valid_pad, y_valid_enc),
    epochs=100,
    batch_size=512,
    callbacks=[earlystop],
    verbose=1,
)

## 使用双向长短时记忆(Bi-Directional LSTM)

In [None]:
# Bidirectional LSTM classifier on top of the previously trained Word2Vec
# embeddings: frozen Embedding -> SpatialDropout1D -> one Bidirectional LSTM
# layer -> two Dense/ReLU blocks with dropout -> softmax output.
#
# The embedding shape and the number of classes are read from the data
# (embedding_matrix and the one-hot labels) instead of being hard-coded, so
# the network cannot drift out of sync with them.
vocab_size, embedding_dim = embedding_matrix.shape
num_classes = y_train_enc.shape[1]

model = Sequential()
# Embedding weights are initialised from Word2Vec and not updated in training.
model.add(Embedding(vocab_size,
                     embedding_dim,
                     weights=[embedding_matrix],
                     input_length=max_len,
                     trainable=False))
model.add(SpatialDropout1D(0.3))
model.add(Bidirectional(LSTM(100, dropout=0.3, recurrent_dropout=0.3)))

# NOTE(review): a Dropout rate of 0.8 drops 80% of the units during
# training - confirm this is intended.
model.add(Dense(1024, activation='relu'))
model.add(Dropout(0.8))

model.add(Dense(1024, activation='relu'))
model.add(Dropout(0.8))

# One output unit per class, turned into probabilities by softmax.
model.add(Dense(num_classes))
model.add(Activation('softmax'))
model.compile(loss='categorical_crossentropy', optimizer='adam')

In [None]:
# Train with early stopping: training ends once the validation loss has not
# improved for 3 consecutive epochs, so the 100-epoch budget is only an
# upper bound.
earlystop = EarlyStopping(monitor='val_loss', patience=3, min_delta=0, mode='auto', verbose=0)
model.fit(
    x=x_train_pad,
    y=y_train_enc,
    validation_data=(x_valid_pad, y_valid_enc),
    epochs=100,
    batch_size=512,
    callbacks=[earlystop],
    verbose=1,
)

## 使用GRU

In [None]:
# Two-layer GRU classifier on top of the previously trained Word2Vec
# embeddings: frozen Embedding -> SpatialDropout1D -> two stacked GRU layers
# -> two Dense/ReLU blocks with dropout -> softmax output.
#
# The embedding shape and the number of classes are read from the data
# (embedding_matrix and the one-hot labels) instead of being hard-coded, so
# the network cannot drift out of sync with them.
vocab_size, embedding_dim = embedding_matrix.shape
num_classes = y_train_enc.shape[1]

model = Sequential()
# Embedding weights are initialised from Word2Vec and not updated in training.
model.add(Embedding(vocab_size,
                     embedding_dim,
                     weights=[embedding_matrix],
                     input_length=max_len,
                     trainable=False))
model.add(SpatialDropout1D(0.3))
# The first GRU returns the full sequence so the second GRU can consume it.
model.add(GRU(100, dropout=0.3, recurrent_dropout=0.3, return_sequences=True))
model.add(GRU(100, dropout=0.3, recurrent_dropout=0.3))

# NOTE(review): a Dropout rate of 0.8 drops 80% of the units during
# training - confirm this is intended.
model.add(Dense(1024, activation='relu'))
model.add(Dropout(0.8))

model.add(Dense(1024, activation='relu'))
model.add(Dropout(0.8))

# One output unit per class, turned into probabilities by softmax.
model.add(Dense(num_classes))
model.add(Activation('softmax'))
model.compile(loss='categorical_crossentropy', optimizer='adam')


In [None]:
# Train with early stopping: training ends once the validation loss has not
# improved for 3 consecutive epochs, so the 100-epoch budget is only an
# upper bound.
earlystop = EarlyStopping(monitor='val_loss', patience=3, min_delta=0, mode='auto', verbose=0)
model.fit(
    x=x_train_pad,
    y=y_train_enc,
    validation_data=(x_valid_pad, y_valid_enc),
    epochs=100,
    batch_size=512,
    callbacks=[earlystop],
    verbose=1,
)