In [1]:
import warnings
warnings.filterwarnings('ignore')
import numpy as np
import pandas as pd
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Model
from keras.layers import Input, TimeDistributed, Dense, Concatenate, Dropout, BatchNormalization,GRU,LSTM,Conv1D,MaxPool1D,Flatten,Lambda
from keras.layers.wrappers import Bidirectional
from keras.layers.embeddings import Embedding
from keras.regularizers import l2
from keras.callbacks import Callback, ModelCheckpoint
from keras.utils.data_utils import get_file
from keras import backend as K
from sklearn.model_selection import train_test_split
from keras.optimizers import SGD

Using TensorFlow backend.


In [2]:
train = pd.read_csv('./data/train.csv')
test = pd.read_csv('./data/test.csv')
que = pd.read_csv('./data/question.csv')

In [3]:
word_dict = {}
char_dict = {}

In [4]:
with open('./data/word_embed.txt') as f:
    for line in f.readlines():
        s = line.strip('\n').split(' ')
        word_dict[s[0]] = [float(v) for v in s[1:]]

In [5]:
with open('./data/char_embed.txt') as f:
    for line in f.readlines():
        s = line.strip('\n').split(' ')
        char_dict[s[0]] = [float(v) for v in s[1:]]

In [6]:
train = pd.merge(train,que[['qid','words']],left_on='q1',right_on='qid',how='left')
train = pd.merge(train,que[['qid','words']],left_on='q2',right_on='qid',how='left')
train.drop(['qid_x','qid_y'],axis=1,inplace=True)
train.columns = ['label','q1','q2','word1','word2']

In [7]:
test = pd.merge(test,que[['qid','words']],left_on='q1',right_on='qid',how='left')
test = pd.merge(test,que[['qid','words']],left_on='q2',right_on='qid',how='left')
test.drop(['qid_x','qid_y'],axis=1,inplace=True)
test.columns = ['q1','q2','word1','word2']

In [8]:
MAX_NB_WORDS = 10000
tokenizer = Tokenizer(num_words=MAX_NB_WORDS)
tokenizer.fit_on_texts(que['words'])
word_index = tokenizer.word_index

In [9]:
q1_train = tokenizer.texts_to_sequences(train['word1'])
q2_train = tokenizer.texts_to_sequences(train['word2'])

In [10]:
q1_test = tokenizer.texts_to_sequences(test['word1'])
q2_test = tokenizer.texts_to_sequences(test['word2'])

In [11]:
#构建embedding层
EMBEDDING_DIM = 300
word_embedding_matrix = np.zeros((MAX_NB_WORDS + 1, EMBEDDING_DIM))
for word, i in word_index.items():
    if i > MAX_NB_WORDS:
        continue
    embedding_vector = word_dict.get(str(word).upper())
    if embedding_vector is not None:
        word_embedding_matrix[i] = embedding_vector

In [12]:
#构建输入张量
MAX_SEQUENCE_LENGTH = 25
q1_data_tr = pad_sequences(q1_train,maxlen=MAX_SEQUENCE_LENGTH)
q2_data_tr = pad_sequences(q2_train,maxlen=MAX_SEQUENCE_LENGTH)
q1_data_te = pad_sequences(q1_test,maxlen=MAX_SEQUENCE_LENGTH)
q2_data_te = pad_sequences(q2_test,maxlen=MAX_SEQUENCE_LENGTH)

In [13]:
q_concat = np.stack([q1_data_tr,q2_data_tr],axis=1)

In [14]:
re = []
from sklearn.model_selection import StratifiedKFold
for tr,va in StratifiedKFold(n_splits=10).split(q_concat,train['label'].values):   
    Q1_train = q_concat[tr][:,0]
    Q2_train = q_concat[tr][:,1]
    Q1_test = q_concat[va][:,0]
    Q2_test = q_concat[va][:,1]
    #构建embedding层，q1 和 q2共享此embedding层
    embedding_layer = Embedding(MAX_NB_WORDS+1,
            EMBEDDING_DIM,
            weights=[word_embedding_matrix],
            input_length=MAX_SEQUENCE_LENGTH,
            trainable=False)
    #词嵌入
    sequence_1_input = Input(shape=(MAX_SEQUENCE_LENGTH,), dtype='int32')
    embedded_sequences_1 = embedding_layer(sequence_1_input)
    sequence_2_input = Input(shape=(MAX_SEQUENCE_LENGTH,), dtype='int32')
    embedded_sequences_2 = embedding_layer(sequence_2_input)
    #gru
    gru = GRU(128, return_sequences=True)
    q1 = gru(embedded_sequences_1)
    q2 = gru(embedded_sequences_2)
    q1 = Lambda(lambda x:K.reshape(x,(-1,25,128)))(q1)
    q2 = Lambda(lambda x:K.reshape(x,(-1,25,128)))(q2)
    #用类似TextCNN的思路构建不同卷积核的特征，两个句子共用同样的卷积层
    kernel_size = [2,3,4,5]
    conv_concat = []
    for kernel in kernel_size:
        conv = Conv1D(32,kernel_size=kernel,activation='relu',padding='same')
        q1_conv = conv(q1)
        q1_maxp = MaxPool1D(pool_size=25)(q1_conv)
#         q1_meanp = MeanPool1D(pool_size=25)(q1_conv)
        q2_conv = conv(q2)
        q2_maxp = MaxPool1D(pool_size=25)(q2_conv)
#         q1_meanp = MeanPool1D(pool_size=25)(q1_conv)
        conv_concat.append(Concatenate()([q1_maxp,q2_maxp]))
    conv = Concatenate()(conv_concat)
    merged = Dropout(0.2)(Flatten()(conv))
    merged = BatchNormalization()(merged)
    merged = Dense(256, activation='relu')(merged)
    merged = Dropout(0.2)(merged)
    merged = BatchNormalization()(merged)
    preds = Dense(1, activation='sigmoid')(merged)
    model = Model(inputs=[sequence_1_input, sequence_2_input],outputs=preds)
    model.compile(loss='binary_crossentropy',
            optimizer='nadam',
            metrics=['acc'])
    hist = model.fit([Q1_train, Q2_train], train['label'].values[tr],validation_data=([Q1_test, Q2_test], train['label'].values[va]),epochs=10, batch_size=1024, shuffle=True)
    pred = model.predict([q1_data_te,q2_data_te],batch_size=1024)
    avg = [v[0] for v in pred]
    re.append(avg)

Instructions for updating:
`NHWC` for data_format is deprecated, use `NWC` instead
Train on 228946 samples, validate on 25440 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Train on 228946 samples, validate on 25440 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Train on 228947 samples, validate on 25439 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Train on 228947 samples, validate on 25439 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Train on 228948 samples, validate on 25438 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Train on 228948 samples, validate on 25438 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10


Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Train on 228948 samples, validate on 25438 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Train on 228948 samples, validate on 25438 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Train on 228948 samples, validate on 25438 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Train on 228948 samples, validate on 25438 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [15]:
def make_submission(predict_prob):
    with open('submission.csv', 'w') as file:
        file.write(str('y_pre') + '\n')
        for line in predict_prob:
            file.write(str(line) + '\n')
    file.close()

In [17]:
make_submission(np.mean(re))

TypeError: 'numpy.float32' object is not iterable

In [23]:
np.array(re).mean(axis=0).shap

array([0.29814544, 0.28900117, 0.2894758 , 0.28705674, 0.3016859 ,
       0.27341452, 0.29639283, 0.2876494 , 0.2884639 , 0.29671726],
      dtype=float32)