In [21]:
import gc
import os
import nltk
import tqdm
import numpy as np
import pandas as pd
nltk.download("punkt")

[nltk_data] Downloading package punkt to /home/chenpeng/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [22]:
from keras.callbacks import Callback, EarlyStopping, ModelCheckpoint
from keras.engine import Layer
from keras.layers import Activation, Add, Bidirectional, Conv1D, Dense, Dropout, Embedding, Flatten
from keras.layers import concatenate, GRU, Input, K, LSTM, MaxPooling1D
from keras.layers import GlobalAveragePooling1D,  GlobalMaxPooling1D, SpatialDropout1D
from keras.models import Model
from keras.optimizers import Adam
from keras.preprocessing import text, sequence
from sklearn.metrics import accuracy_score, roc_auc_score, log_loss
from sklearn.model_selection import train_test_split
from keras import initializers, regularizers, constraints, optimizers, layers, callbacks

In [23]:
# Capsule-network hyper-parameters (read as module-level globals by get_model).

gru_len = 128  # units per direction of the bidirectional GRU
Routings = 5  # number of routing iterations passed to the Capsule layer
Num_capsule = 10  # number of output capsules
Dim_capsule = 16  # dimensionality of each output capsule
dropout_p = 0.3  # GRU input/recurrent dropout and the dropout after the capsule layer
rate_drop_dense = 0.3  # SpatialDropout1D rate applied to the embedded sequence

# Training / data settings. The trailing numbers are presumably the full-scale
# values that were reduced for this small-data run — TODO confirm.
batch_size = 128 # 256
recurrent_units = 16 # 64  (passed to get_model, which does not use it)
dropout_rate = 0.3  # passed to get_model, which does not use it
dense_size = 8 # 32  (passed to get_model, which does not use it)
sentences_length = 10 # 300  (every sentence is truncated/padded to this many tokens)
fold_count = 2 # 10  (number of cross-validation folds)

In [24]:
# Root of the project checkout. The default is the original absolute location, so
# existing runs behave exactly as before; set the CAPSULE_PROJECT_DIR environment
# variable to run the notebook from any other machine or directory.
PROJECT_DIR = os.environ.get("CAPSULE_PROJECT_DIR", "/home/chenpeng/project/Capsule_for_toxic")

train_file_path = os.path.join(PROJECT_DIR, "data", "train_small.csv")
test_file_path = os.path.join(PROJECT_DIR, "data", "test_small.csv")
# Text embedding file: each line holds a token (word or character) followed by its
# vector. Author's note: 300-dimensional vectors, 100 tokens in total in this small file.
embedding_path = os.path.join(PROJECT_DIR, "embedding", "embeddings_small.vec")


In [25]:
# Special tokens: UNKNOWN_WORD stands in for words without an embedding, END_WORD is
# the token whose id is used to pad short sentences, NAN_WORD replaces missing text.
UNKNOWN_WORD = "_UNK_"
END_WORD = "_END_"
NAN_WORD = "_NAN_"
# Target column(s); kept as a list so the label array below is 2-D (one column per class).
CLASSES = ["project_is_approved"]

print("加载数据")
train_data = pd.read_csv(train_file_path)
# print(train_data.head())
test_data = pd.read_csv(test_file_path)
# print(test_data.head())

# Raw text column as an array of strings, one sentence per row; missing values become NAN_WORD.
list_sentences_train = train_data["application_text"].fillna(NAN_WORD).values
# print(list_sentences_train[:3])  # shows the first three rows (one sentence per row)
list_sentences_test = test_data["application_text"].fillna(NAN_WORD).values

y_train = train_data[CLASSES].values  # 0/1 labels as an array with one column per entry of CLASSES
# print(y_train)

加载数据


In [26]:
def tokenize_sentences(sentences, words_dict):
    """Tokenize sentences and encode every token as an integer id.

    Each sentence is split with ``nltk.tokenize.word_tokenize`` and its tokens are
    lower-cased. A token that is not yet in ``words_dict`` is assigned the next
    free id (the current size of the dictionary).

    Args:
        sentences: iterable of raw sentence strings.
        words_dict: mapping ``{word: id}``; it is extended in place.

    Returns:
        A pair ``(tokenized_sentences, words_dict)`` where ``tokenized_sentences``
        is a list holding one list of word ids per input sentence and
        ``words_dict`` is the same (now extended) mapping that was passed in.
    """
    encoded_sentences = []
    for text in sentences:
        lowered_tokens = (token.lower() for token in nltk.tokenize.word_tokenize(text))
        ids = []
        for token in lowered_tokens:
            # len(words_dict) is evaluated before the lookup, so an unseen token
            # receives the dictionary size as it was prior to insertion.
            ids.append(words_dict.setdefault(token, len(words_dict)))
        encoded_sentences.append(ids)
    return encoded_sentences, words_dict

In [27]:
# Encode the training sentences, building the vocabulary from an empty dictionary.
print("Tokenizing sentences in train set...")
tokenized_sentences_train, words_dict = tokenize_sentences(list_sentences_train, {})

Tokenizing sentences in train set...


In [28]:
# Encode the test sentences with the same vocabulary; words seen only in the test
# set are appended to words_dict.
print("Tokenizing sentences in test set...")
tokenized_sentences_test, words_dict = tokenize_sentences(list_sentences_test, words_dict)

Tokenizing sentences in test set...


In [29]:
# Add the out-of-vocabulary placeholder to the vocabulary. setdefault keeps the
# id stable when this cell is run again; plain assignment would move the
# placeholder to a new id on every re-run and leave a gap in the id sequence.
words_dict.setdefault(UNKNOWN_WORD, len(words_dict))

In [30]:
def read_embedding_list(file_path):
    """Read a text embedding file into a matrix and a word index.

    The first line of the file is treated as a header and skipped. Every other
    line is expected to be ``<word> <v1> <v2> ...`` separated by whitespace.
    Blank lines, lines whose vector cannot be parsed as floats, and repeated
    occurrences of a word already read are skipped, so that
    ``embedding_word_dict[word]`` is always the row of that word in the
    returned matrix.

    Args:
        file_path: path to the embedding file.

    Returns:
        A pair ``(embedding_list, embedding_word_dict)``: a float32 numpy array
        with one row per kept word, and a dict ``{word: row index}``.
    """
    embedding_word_dict = {}
    embedding_list = []

    # The context manager closes the file even if reading raises.
    with open(file_path) as embedding_file:
        for index, line in enumerate(embedding_file):
            if index == 0:  # header line
                continue
            values = line.split()
            if not values:  # blank line: nothing to parse
                continue
            word = values[0]
            if word in embedding_word_dict:
                # Keep the first vector; adding a second row for the same word
                # would shift the row index of every word that follows.
                continue
            try:
                coefs = np.asarray(values[1:], dtype='float32')
            except ValueError:  # malformed vector component
                continue
            embedding_word_dict[word] = len(embedding_list)
            embedding_list.append(coefs)

    return np.array(embedding_list), embedding_word_dict

In [31]:
# Parse the embedding file into a matrix of vectors plus a {word: row index} dictionary.
print("Loading embeddings...")
embedding_list, embedding_word_dict = read_embedding_list(embedding_path)

Loading embeddings...


In [32]:
# embedding_list was already loaded from embedding_path by the preceding cell, so
# the file is not parsed a second time here; only the vector width is derived.
embedding_size = len(embedding_list[0])  # embedding dimensionality (300 for this file)

In [33]:
def clear_embedding_list(embedding_list, embedding_word_dict, words_dict):
    """Restrict the embeddings to the words that occur in the corpus vocabulary.

    Words of ``words_dict`` that have no entry in ``embedding_word_dict`` are
    dropped; the remaining words are re-indexed densely in the iteration order
    of ``words_dict``.

    Args:
        embedding_list: sequence of embedding vectors, indexed by the ids in
            ``embedding_word_dict``.
        embedding_word_dict: mapping ``{word: row index}`` for ``embedding_list``.
        words_dict: corpus vocabulary ``{word: id}``; only its keys are used.

    Returns:
        A pair ``(cleared_embedding_list, cleared_embedding_word_dict)``: a list
        of the kept vectors and a dict ``{word: position in that list}``.
    """
    kept_words = [token for token in words_dict if token in embedding_word_dict]
    kept_vectors = [embedding_list[embedding_word_dict[token]] for token in kept_words]
    kept_index = {token: position for position, token in enumerate(kept_words)}
    return kept_vectors, kept_index

In [34]:
print("Preparing data...")
# Keep only the embedding rows of words that occur in the corpus vocabulary.
# This rebinds embedding_list / embedding_word_dict to the filtered versions, so the
# cell should be re-run only together with the cell that loads the embeddings.
embedding_list, embedding_word_dict = clear_embedding_list(embedding_list, embedding_word_dict, words_dict)

# Order matters: convert_tokens_to_ids addresses UNKNOWN_WORD as the second-to-last
# id and END_WORD as the last id of embedding_word_dict.
embedding_word_dict[UNKNOWN_WORD] = len(embedding_word_dict)  # register UNKNOWN_WORD in the embedding vocabulary
embedding_list.append([0.] * embedding_size)  # UNKNOWN_WORD is embedded as an all-zero vector
embedding_word_dict[END_WORD] = len(embedding_word_dict)  # register END_WORD in the embedding vocabulary
embedding_list.append([-1.] * embedding_size)  # END_WORD is embedded as a vector of -1

embedding_matrix = np.array(embedding_list)  # stack the rows into the embedding matrix
print(np.shape(embedding_matrix))  # (number of kept words + 2, embedding_size)

Preparing data...
(79, 300)


In [35]:
# Inverse of words_dict: maps each vocabulary id back to its word.
id_to_word = {index: word for word, index in words_dict.items()}

In [36]:
def convert_tokens_to_ids(tokenized_sentences, words_list, embedding_word_dict, sentences_length):
    """Map vocabulary ids to embedding ids and force a fixed sentence length.

    Every vocabulary id is translated back to its word through ``words_list``
    and then to that word's id in ``embedding_word_dict``. Words without an
    embedding receive the second-to-last embedding id; sentences shorter than
    ``sentences_length`` are right-padded with the last embedding id and longer
    ones are truncated.

    Args:
        tokenized_sentences: list of sentences, each a list of vocabulary ids.
        words_list: lookup from vocabulary id to word (indexable by id).
        embedding_word_dict: mapping ``{word: embedding id}``.
        sentences_length: number of ids every output sentence must contain.

    Returns:
        A list with one list of embedding ids per input sentence.
    """
    vocabulary_size = len(embedding_word_dict)
    unknown_id = vocabulary_size - 2
    padding_id = vocabulary_size - 1

    converted = []
    for sentence in tokenized_sentences:
        ids = [embedding_word_dict.get(words_list[word_index], unknown_id)
               for word_index in sentence]
        missing = sentences_length - len(ids)
        if missing > 0:
            ids += [padding_id] * missing
        else:
            ids = ids[:sentences_length]
        converted.append(ids)
    return converted

In [37]:
# Translate vocabulary ids into embedding-matrix row ids and pad/truncate every
# sentence to sentences_length, for both the training and the test set.
train_list_of_token_ids = convert_tokens_to_ids(
    tokenized_sentences_train,
    id_to_word,
    embedding_word_dict,
    sentences_length)

test_list_of_token_ids = convert_tokens_to_ids(
    tokenized_sentences_test,
    id_to_word,
    embedding_word_dict,
    sentences_length)

# All sentences now have the same length, so these become 2-D arrays
# (number of sentences, sentences_length).
X_train = np.array(train_list_of_token_ids)
X_test = np.array(test_list_of_token_ids)

In [38]:
def squash(x, axis=-1):
    """Scale ``x`` to (approximately) unit length along ``axis``.

    The tensor is divided by ``sqrt(sum(x**2) + epsilon)``, where the sum runs
    over ``axis`` (kept as a size-1 dimension for broadcasting) and ``epsilon``
    is the backend fuzz factor that avoids division by zero.
    """
    squared_norm = K.sum(K.square(x), axis, keepdims=True)
    return x / K.sqrt(squared_norm + K.epsilon())

Capsule layer

In [39]:
class Capsule(Layer):
    """Capsule layer with iterative routing between input and output capsules.

    The layer reads its input as ``[batch, input_num_capsule, input_dim_capsule]``
    and produces ``[batch, num_capsule, dim_capsule]``.

    Args:
        num_capsule: number of output capsules.
        dim_capsule: dimensionality of each output capsule.
        routings: number of routing iterations.
        kernel_size: stored on the instance; not used by the code in this class.
        share_weights: if True a single projection is shared by all input
            capsules, otherwise each input capsule gets its own projection.
        activation: ``'default'`` selects ``squash``; any other value is wrapped
            in a Keras ``Activation`` layer.
        **kwargs: forwarded to the base ``Layer`` constructor.
    """

    def __init__(self, num_capsule, dim_capsule, routings=3, kernel_size=(9, 1), share_weights=True,
                 activation='default', **kwargs):
        super(Capsule, self).__init__(**kwargs)
        self.num_capsule = num_capsule
        self.dim_capsule = dim_capsule
        self.routings = routings
        self.kernel_size = kernel_size
        self.share_weights = share_weights
        if activation == 'default':
            self.activation = squash
        else:
            self.activation = Activation(activation)

    def build(self, input_shape):
        """Create the projection kernel ``self.W`` from the input shape."""
        super(Capsule, self).build(input_shape)
        input_dim_capsule = input_shape[-1]
        if self.share_weights:
            # Leading dimension of 1: one kernel applied to every input capsule.
            self.W = self.add_weight(name='capsule_kernel',
                                     shape=(1, input_dim_capsule,
                                            self.num_capsule * self.dim_capsule),
                                     # shape=self.kernel_size,
                                     initializer='glorot_uniform',
                                     trainable=True)
        else:
            # One kernel slice per input capsule.
            input_num_capsule = input_shape[-2]
            self.W = self.add_weight(name='capsule_kernel',
                                     shape=(input_num_capsule,
                                            input_dim_capsule,
                                            self.num_capsule * self.dim_capsule),
                                     initializer='glorot_uniform',
                                     trainable=True)

    def call(self, u_vecs):
        """Project the input capsules and route them to the output capsules.

        Returns the activated output capsules computed in the last routing
        iteration.
        """
        # Project every input capsule to num_capsule * dim_capsule values.
        if self.share_weights:
            u_hat_vecs = K.conv1d(u_vecs, self.W)
        else:
            u_hat_vecs = K.local_conv1d(u_vecs, self.W, [1], [1])

        batch_size = K.shape(u_vecs)[0]
        input_num_capsule = K.shape(u_vecs)[1]
        # Split the projected values into one dim_capsule vector per output capsule.
        u_hat_vecs = K.reshape(u_hat_vecs, (batch_size, input_num_capsule,
                                            self.num_capsule, self.dim_capsule))
        u_hat_vecs = K.permute_dimensions(u_hat_vecs, (0, 2, 1, 3))
        # final u_hat_vecs.shape = [None, num_capsule, input_num_capsule, dim_capsule]

        # Routing logits start at zero.
        b = K.zeros_like(u_hat_vecs[:, :, :, 0])  # shape = [None, num_capsule, input_num_capsule]
        for i in range(self.routings):
            # Move num_capsule to the last axis so the softmax normalises over the
            # output capsules, then restore the original axis order of c and b.
            b = K.permute_dimensions(b, (0, 2, 1))  # shape = [None, input_num_capsule, num_capsule]
            c = K.softmax(b)
            c = K.permute_dimensions(c, (0, 2, 1))
            b = K.permute_dimensions(b, (0, 2, 1))
            # Combine the projected inputs with the coupling coefficients c and activate.
            outputs = self.activation(K.batch_dot(c, u_hat_vecs, [2, 2]))
            if i < self.routings - 1:
                # New logits from the product of the outputs with the projected inputs;
                # skipped on the last iteration because b is not used afterwards.
                b = K.batch_dot(outputs, u_hat_vecs, [2, 3])

        return outputs

    def compute_output_shape(self, input_shape):
        """Return the static output shape ``(None, num_capsule, dim_capsule)``."""
        return (None, self.num_capsule, self.dim_capsule)

""获取模型""

In [40]:
def get_model(embedding_matrix, sequence_length, dropout_rate, recurrent_units, dense_size):
    """Build, compile and print the capsule-based binary text classifier.

    Architecture: frozen pre-trained embedding -> spatial dropout ->
    bidirectional GRU (returning sequences) -> Capsule -> flatten -> dropout ->
    single sigmoid unit. The model is compiled with binary cross-entropy, the
    ``'adam'`` optimizer and the accuracy metric, and its summary is printed.

    Args:
        embedding_matrix: 2-D array ``(vocabulary size, embedding size)`` used as
            the fixed weights of the embedding layer.
        sequence_length: number of token ids per input sentence.
        dropout_rate: accepted but not used by this function.
        recurrent_units: accepted but not used by this function.
        dense_size: accepted but not used by this function.

    Layer sizes and dropout rates are taken from the module-level settings
    ``gru_len``, ``dropout_p``, ``rate_drop_dense``, ``Num_capsule``,
    ``Dim_capsule`` and ``Routings``.

    Returns:
        The compiled Keras ``Model``.
    """
    token_ids = Input(shape=(sequence_length,))

    embedding = Embedding(embedding_matrix.shape[0], embedding_matrix.shape[1],
                          weights=[embedding_matrix], trainable=False)
    embedded = embedding(token_ids)
    embedded = SpatialDropout1D(rate_drop_dense)(embedded)

    recurrent = GRU(gru_len, activation='relu', dropout=dropout_p,
                    recurrent_dropout=dropout_p, return_sequences=True)
    encoded = Bidirectional(recurrent)(embedded)

    capsule_layer = Capsule(num_capsule=Num_capsule, dim_capsule=Dim_capsule,
                            routings=Routings, share_weights=True)
    features = capsule_layer(encoded)
    features = Flatten()(features)
    features = Dropout(dropout_p)(features)

    prediction = Dense(1, activation='sigmoid')(features)

    model = Model(inputs=token_ids, outputs=prediction)
    model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
    model.summary()
    return model

In [41]:
# Build one model only to inspect the architecture: get_model prints the summary.
# The returned model is not assigned to a name; as the cell's last expression it
# is merely displayed.
get_model(embedding_matrix,
    sentences_length,
    dropout_rate,
    recurrent_units,
    dense_size)

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         (None, 10)                0         
_________________________________________________________________
embedding_1 (Embedding)      (None, 10, 300)           23700     
_________________________________________________________________
spatial_dropout1d_1 (Spatial (None, 10, 300)           0         
_________________________________________________________________
bidirectional_1 (Bidirection (None, 10, 256)           329472    
_________________________________________________________________
capsule_1 (Capsule)          (None, 10, 16)            40960     
_________________________________________________________________
flatten_1 (Flatten)          (None, 160)               0         
_________________________________________________________________
dropout_1 (Dropout)          (None, 160)               0         
__________

<keras.engine.training.Model at 0x7fe0d00fd550>

In [42]:
# Zero-argument factory: every call builds a new model with the settings above.
# train_folds calls it once per fold so that each fold starts from a fresh model.
get_model_func = lambda: get_model(
    embedding_matrix,
    sentences_length,
    dropout_rate,
    recurrent_units,
    dense_size)

In [43]:
# Debug aid: prints only the repr of the factory function; no model is built here.
print(get_model_func)

<function <lambda> at 0x7fe0d8bbf950>


In [44]:
def _train_model(model, batch_size, train_x, train_y, val_x, val_y):
    """Train ``model`` one epoch at a time with early stopping on validation log loss.

    After every epoch the model predicts ``val_x`` and the log loss is averaged
    over the label columns. The weights of the best epoch are remembered;
    training stops once ``patience`` (5) consecutive epochs have failed to
    improve on the best loss, and the best weights are restored.

    Args:
        model: compiled Keras model providing ``fit``, ``predict``,
            ``get_weights`` and ``set_weights``.
        batch_size: batch size used for both fitting and validation prediction.
        train_x, train_y: training inputs and 2-D 0/1 label array
            ``(samples, labels)``.
        val_x, val_y: validation inputs and 2-D 0/1 label array.

    Returns:
        The same ``model`` object, carrying the weights of its best epoch.
    """
    num_labels = train_y.shape[1]
    patience = 5
    best_loss = -1  # sentinel for "no epoch evaluated yet"; a log loss is never negative
    best_weights = None
    best_epoch = 0

    current_epoch = 0

    while True:
        model.fit(train_x, train_y, batch_size=batch_size, epochs=1)
        y_pred = model.predict(val_x, batch_size=batch_size)

        total_loss = 0
        for j in range(num_labels):
            # Pass the label set explicitly: without it sklearn infers the labels
            # from y_true and raises ValueError when a (small) validation fold
            # happens to contain a single class only.
            total_loss += log_loss(val_y[:, j], y_pred[:, j], labels=[0, 1])

        total_loss /= num_labels

        print("Epoch {0} loss {1} best_loss {2}".format(current_epoch, total_loss, best_loss))

        current_epoch += 1
        if total_loss < best_loss or best_loss == -1:
            best_loss = total_loss
            best_weights = model.get_weights()
            best_epoch = current_epoch
        elif current_epoch - best_epoch == patience:
            break

    model.set_weights(best_weights)
    return model

In [45]:
def train_folds(X, y, X_test, fold_count, batch_size, get_model_func):
    """Train one model per cross-validation fold and save its predictions.

    The data is cut into ``fold_count`` contiguous folds. For each fold a fresh
    model from ``get_model_func`` is trained on the remaining data with
    ``_train_model`` and validated on the fold. The trained model's predictions
    for all of ``X`` and for ``X_test`` are written to
    ``predictions/train_predicts<fold>.npy`` and
    ``predictions/test_predicts<fold>.npy``.

    Args:
        X: training inputs, sliceable and with ``len``.
        y: training labels aligned with ``X``.
        X_test: test inputs.
        fold_count: number of folds.
        batch_size: batch size handed to ``_train_model``.
        get_model_func: zero-argument callable returning a new compiled model.

    Returns:
        List with the trained model of every fold, in fold order.
    """
    print("="*75)
    fold_size = len(X) // fold_count
    models = []
    result_path = "predictions"
    if not os.path.exists(result_path):
        os.makedirs(result_path)

    for fold_id in range(fold_count):
        fold_start = fold_size * fold_id
        fold_end = fold_start + fold_size

        # The last fold absorbs the remainder when len(X) is not a multiple of
        # fold_count. (The check must compare against fold_count; comparing
        # against fold_size left the trailing samples out of every validation set.)
        if fold_id == fold_count - 1:
            fold_end = len(X)

        train_x = np.concatenate([X[:fold_start], X[fold_end:]])
        train_y = np.concatenate([y[:fold_start], y[fold_end:]])

        val_x = np.array(X[fold_start:fold_end])
        val_y = np.array(y[fold_start:fold_end])

        model = _train_model(get_model_func(), batch_size, train_x, train_y, val_x, val_y)
        train_predicts_path = os.path.join(result_path, "train_predicts{0}.npy".format(fold_id))
        test_predicts_path = os.path.join(result_path, "test_predicts{0}.npy".format(fold_id))
        train_predicts = model.predict(X, batch_size=512, verbose=1)
        test_predicts = model.predict(X_test, batch_size=512, verbose=1)
        np.save(train_predicts_path, train_predicts)
        np.save(test_predicts_path, test_predicts)

        # Collect the trained model so the returned list is actually populated.
        models.append(model)

    return models

In [47]:
# Train the models: runs the fold-wise training on the prepared id matrices.
# train_folds writes the per-fold train/test predictions as .npy files into a
# "predictions" directory; its return value is kept in `models`.

models = train_folds(X_train, y_train, X_test, fold_count, batch_size, get_model_func)

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_4 (InputLayer)         (None, 10)                0         
_________________________________________________________________
embedding_4 (Embedding)      (None, 10, 300)           23700     
_________________________________________________________________
spatial_dropout1d_4 (Spatial (None, 10, 300)           0         
_________________________________________________________________
bidirectional_4 (Bidirection (None, 10, 256)           329472    
_________________________________________________________________
capsule_4 (Capsule)          (None, 10, 16)            40960     
_________________________________________________________________
flatten_4 (Flatten)          (None, 160)               0         
_________________________________________________________________
dropout_4 (Dropout)          (None, 160)               0         
__________