In [26]:
import os
import time
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from tqdm import tqdm
import math
from sklearn.model_selection import train_test_split
from sklearn import metrics

from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.layers import Dense, Input, LSTM, Embedding, Dropout, Activation, CuDNNGRU, Conv1D
from keras.layers import Bidirectional, GlobalMaxPool1D
from keras.models import Model
import tensorflow as tf
from keras import initializers, regularizers, constraints, optimizers, layers

In [7]:
# # copy、移动文件
# from shutil import copyfile
# import shutil
# WORK_PATH = '/home/kesci/work/tfrecords_2'
# TFRECORDS_PATH = os.path.join(WORK_PATH, 'train')
# list_ = [filenames for dirpath, dirnames, filenames in os.walk(TFRECORDS_PATH)]
# num = 20
# for i in list_[0]:
#     # print(i)
#     if(num == 0):
#         break
#     num -= 1
#     copyfile(WORK_PATH+'/train/'+i, WORK_PATH+'/train_2/'+i)
# os.remove('/home/kesci/work/tfrecords_2/train/train.tfrecords-0092-of-1000')

In [8]:
from keras import backend as K
from sklearn.metrics import roc_auc_score
# AUC for a binary classifier
def auc(y_true, y_pred):
    ptas = tf.stack([binary_PTA(y_true,y_pred,k) for k in np.linspace(0, 1, 1000)],axis=0)
    pfas = tf.stack([binary_PFA(y_true,y_pred,k) for k in np.linspace(0, 1, 1000)],axis=0)
    pfas = tf.concat([tf.ones((1,)) ,pfas],axis=0)
    binSizes = -(pfas[1:]-pfas[:-1])
    s = ptas*binSizes
    return K.sum(s, axis=0)
#-----------------------------------------------------------------------------------------------------------------------------------------------------
# PFA, prob false alert for binary classifier
def binary_PFA(y_true, y_pred, threshold=K.variable(value=0.5)):
    y_pred = K.cast(y_pred >= threshold, 'float32')
    # N = total number of negative labels
    N = K.sum(1 - y_true)
    # FP = total number of false alerts, alerts from the negative class labels
    FP = K.sum(y_pred - y_pred * y_true)
    return FP/N
#-----------------------------------------------------------------------------------------------------------------------------------------------------
# P_TA prob true alerts for binary classifier
def binary_PTA(y_true, y_pred, threshold=K.variable(value=0.5)):
    y_pred = K.cast(y_pred >= threshold, 'float32')
    # P = total number of positive labels
    P = K.sum(y_true)
    # TP = total number of correct alerts, alerts from the positive class labels
    TP = K.sum(y_pred * y_true)
    return TP/P

In [9]:
embed_size = 300 # how big is each word vector
max_features = 85000 # how many unique words to use (i.e num rows in embedding vector)
maxlen = 230 # max number of words in a question to use
maxlen_1 = 115

In [27]:
inp = Input(shape=(maxlen,))
x = Embedding(max_features, embed_size)(inp)
x = Bidirectional(CuDNNGRU(64, return_sequences=True))(x)
x = GlobalMaxPool1D()(x)
x = Dense(16, activation="relu")(x)
x = Dropout(0.1)(x)
x = Dense(1, activation="sigmoid")(x)
model = Model(inputs=inp, outputs=x)
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=[auc])

print(model.summary())

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_2 (InputLayer)         (None, 230)               0         
_________________________________________________________________
embedding_2 (Embedding)      (None, 230, 300)          25500000  
_________________________________________________________________
bidirectional_2 (Bidirection (None, 230, 128)          140544    
_________________________________________________________________
global_max_pooling1d_2 (Glob (None, 128)               0         
_________________________________________________________________
dense_3 (Dense)              (None, 16)                2064      
_________________________________________________________________
dropout_2 (Dropout)          (None, 16)                0         
_________________________________________________________________
dense_4 (Dense)              (None, 1)                 17        
Total para

In [11]:
# import shutil
# TFRECORDS_PATH = os.path.join(WORK_PATH, 'tfrecords_2')
# tfrecords_list = [os.path.join(dirpath, filename) for dirpath, dirnames, filenames in os.walk(TFRECORDS_PATH) \
#                                                         for filename in filenames]
# l = len(tfrecords_list)
# num = 0
# for i in tfrecords_list:
#     num += 1
#     if num < l:
#         shutil.move(i, '/home/kesci/work/tfrecords_2/train/')
#     else:
#         shutil.move(i, '/home/kesci/work/tfrecords_2/val/')

In [29]:
DATA_PATN = './input'
WORK_PATH = './work/tfrecords_2'
TFRECORDS_PATH = os.path.join(WORK_PATH, 'train')
TFRECORDS_VALPATH = os.path.join(WORK_PATH, 'val')
# TFRECORDS_PATH = os.path.join(WORK_PATH, 'tfrecords_2')

NUM_SHARDS = 20   # 总共写入多少文件
INSTANCES_PER_SHARD = 1000    # 每个文件写入多少数据

def _bytes_feature(value):
    return tf.train.Feature(bytes_list = tf.train.BytesList(value=[value]))

def _int64_feature(value):
    return tf.train.Feature(int64_list = tf.train.Int64List(value=[value]))

def feature_2(x):
    p = list([int(i) for i in x.split(" ")])
    t = set(p)
    p.sort()
    if(len(p) == len(t)):
        return tf.train.Feature(int64_list = tf.train.Int64List(value=[len(p)])),tf.train.Feature(int64_list = tf.train.Int64List(value=[0]))
    dic = {}.fromkeys(t,0)
    for i in p:
        dic[i] += 1
    sorted(dic.items(),key = lambda x:x[1],reverse = True)
    for i in dic:
        return tf.train.Feature(int64_list = tf.train.Int64List(value=[len(p)])), tf.train.Feature(int64_list = tf.train.Int64List(value=[i]))

In [45]:
# 交 集 和 差 集 以 及 交 集 的 长 度，差 集 的 长 度
import re
def trans_list(x):
    t = x.strip().replace(',',' ')
    x = re.sub(r'\s+', ' ', t.strip())
    if(len(x) == 0):
        return []
    p = list([int(i.strip()) for i in x.split(' ')])
    return p
    
def differ(x, y):
    retA = trans_list(x)
    retB = trans_list(y)
    # 求差集，在retA中但不在retB中
    retC = list(set(retA).difference(set(retB)))
    retD = list(set(retA).difference(set(retC)))
    return retC,len(retC),retD,len(retD)
    
def parser(record):
    features = tf.parse_single_example(
        record,
        features = {
            'query_id': tf.FixedLenFeature([], tf.int64),
            'query_title_id': tf.FixedLenFeature([], tf.int64),
            'query_len': tf.FixedLenFeature([], tf.int64),
            'title_len': tf.FixedLenFeature([], tf.int64),
            'fre_query': tf.FixedLenFeature([], tf.int64),
            'fre_title': tf.FixedLenFeature([], tf.int64),
            'differQ_T': tf.FixedLenFeature([], tf.string),
            'differQ_T_len': tf.FixedLenFeature([], tf.int64),
            'unionT_Q': tf.FixedLenFeature([], tf.string),
            'unionT_Q_len': tf.FixedLenFeature([], tf.int64),
            'label': tf.FixedLenFeature([], tf.int64)
      })
    return features['query_id'], features['query_title_id'], features['query_len'], features['title_len'], features['fre_query'], features['fre_title'], features['differQ_T'], features['differQ_T_len'], features['unionT_Q'], features['unionT_Q_len'], features['label']
    
def get_example_object():
    pass

In [58]:
def read_data_from_train_files(path, batch_size):
    while True:
        tfrecords_list = [os.path.join(dirpath, filename) for dirpath, dirnames, filenames in os.walk(path) \
                                                        for filename in filenames]
        input_files = tf.placeholder(tf.string)
        dataset = tf.data.TFRecordDataset(input_files)
        dataset = dataset.map(parser)
        iterator = dataset.make_initializable_iterator()
        query_id, query_title_id, query_len, title_len, fre_query, fre_title, DifferQ_T, DifferQ_T_len, UnionT_Q, UnionT_Q_len, label = iterator.get_next()
        X = []
        Y = []
        
        index = 0
        instances_num = 0
        tokenizer = Tokenizer(num_words = max_features)
        cnt = 0
        differ_qt = []
        union_tq = []
        with tf.Session() as sess:
            sess.run(iterator.initializer, feed_dict={input_files: tfrecords_list})
            while True:
                try:
                    # Query_Id, Query_Title_Id, Query_len, Title_len, Query_fre, Title_fre, differQ_T, differQ_T_len, unionT_Q, unionT_Q_len, y = sess.run([query_id, query_title_id, query_len, title_len, fre_query, fre_title, DifferQ_T, DifferQ_T_len, UnionT_Q, UnionT_Q_len, label])
                    _, _, Query_len, Title_len, _, _, differQ_T, _, unionT_Q, _, y = sess.run([query_id, query_title_id, query_len, title_len, fre_query, fre_title, DifferQ_T, DifferQ_T_len, UnionT_Q, UnionT_Q_len, label])
                    
                    p1 = str(differQ_T.decode())
                    p2 = str(unionT_Q.decode())
                    if(len(p1) != 0):
                        p1 = p1[1:-1]
                        
                    if(len(p2) != 0):
                        p2 = p2[1:-1]
                    # print(p1,p2,type(p1),type(p2))
                    differ_qt.append(p1)
                    union_tq.append(p2)
                    
                    # print(p1,'-'*10,p2)
                    # temp = []
                    # temp.extend([Query_Id, Query_Title_Id, Query_len, Title_len, Query_fre, Title_fre])
                    
                    # temp.extend([i for i in differ_q_t[0]])
                    # temp.extend([i for i in union_t_q[0]])
                    # X.append(temp)
                    Y.append(y)
                    
                    #----------here-----------------
                    cnt += 1
                    if cnt == batch_size:
#                         print(cnt,'------------train\n')
                        cnt = 0
                        tokenizer.fit_on_texts(differ_qt)
                        differ_list_q = tokenizer.texts_to_sequences(differ_qt)
                        differ_q_t = pad_sequences(np.array(differ_list_q), maxlen=maxlen_1)
                        
                        tokenizer.fit_on_texts(union_tq)
                        union_list_q = tokenizer.texts_to_sequences(union_tq)
                        union_q_t = pad_sequences(np.array(union_list_q), maxlen=maxlen_1)
                        
                        p = list(tokenizer.word_index.values())
                        # print(p[len(p)-1],'------------train')
                        del p
                        
                        for i in range(batch_size):
                            temp_p = list(differ_q_t[i])
                            temp_p.extend(list(union_q_t[i]))
                            X.append(temp_p)
                        yield (np.array(X), np.array(Y))
                        X = []
                        Y = []
                        differ_qt = []
                        union_tq = []
                except tf.errors.OutOfRangeError:
                    break
#             print(cnt,'------------train\n')
            cnt = 0
            tokenizer.fit_on_texts(differ_qt)
            differ_list_q = tokenizer.texts_to_sequences(differ_qt)
            differ_q_t = pad_sequences(np.array(differ_list_q), maxlen=maxlen_1)
            
            tokenizer.fit_on_texts(union_tq)
            union_list_q = tokenizer.texts_to_sequences(union_tq)
            union_q_t = pad_sequences(np.array(union_list_q), maxlen=maxlen_1)
            
            p = list(tokenizer.word_index.values())
            # print(p[len(p)-1],'------------train')
            del p
            
            for i in range(len(differ_q_t)):
                temp_p = list(differ_q_t[i])
                temp_p.extend(list(union_q_t[i]))
                X.append(temp_p)
            yield (np.array(X), np.array(Y))

In [59]:
def read_data_from_val_files(path, batch_size):
    while True:
        tfrecords_list = [os.path.join(dirpath, filename) for dirpath, dirnames, filenames in os.walk(path) \
                                                            for filename in filenames]
        input_files = tf.placeholder(tf.string)
        dataset = tf.data.TFRecordDataset(input_files)
        dataset = dataset.map(parser)
        iterator = dataset.make_initializable_iterator()
        query_id, query_title_id, query_len, title_len, fre_query, fre_title, DifferQ_T, DifferQ_T_len, UnionT_Q, UnionT_Q_len, label = iterator.get_next()
        X = []
        Y = []

        index = 0
        instances_num = 0
        tokenizer = Tokenizer(num_words=max_features)
        cnt = 0
        differ_qt = []
        union_tq = []
        with tf.Session() as sess:
            sess.run(iterator.initializer, feed_dict={input_files: tfrecords_list})
            while True:
                try:
                    # Query_Id, Query_Title_Id, Query_len, Title_len, Query_fre, Title_fre, differQ_T, differQ_T_len, unionT_Q, unionT_Q_len, y = sess.run([query_id, query_title_id, query_len, title_len, fre_query, fre_title, DifferQ_T, DifferQ_T_len, UnionT_Q, UnionT_Q_len, label])
                    _, _, Query_len, Title_len, _, _, differQ_T, _, unionT_Q, _, y = sess.run([query_id, query_title_id, query_len, title_len, fre_query, fre_title, DifferQ_T, DifferQ_T_len, UnionT_Q, UnionT_Q_len, label])

                    p1 = str(differQ_T.decode())
                    p2 = str(unionT_Q.decode())
                    if(len(p1) != 0):
                        p1 = p1[1:-1]

                    if(len(p2) != 0):
                        p2 = p2[1:-1]
                    # print(p1,p2,type(p1),type(p2))
                    differ_qt.append(p1)
                    union_tq.append(p2)

                    # print(p1,'-'*10,p2)
                    # temp = []
                    # temp.extend([Query_Id, Query_Title_Id, Query_len, Title_len, Query_fre, Title_fre])

                    # temp.extend([i for i in differ_q_t[0]])
                    # temp.extend([i for i in union_t_q[0]])
                    # X.append(temp)
                    Y.append(y)
                    cnt += 1
                    if cnt == batch_size:
#                         print(cnt,'------------val\n')
                        cnt = 0
                        tokenizer.fit_on_texts(differ_qt)
                        differ_list_q = tokenizer.texts_to_sequences(differ_qt)
                        differ_q_t = pad_sequences(np.array(differ_list_q), maxlen=maxlen_1)
                        # print(differ_qt, differ_q_t)
                        tokenizer.fit_on_texts(union_tq)
                        union_list_q = tokenizer.texts_to_sequences(union_tq)
                        union_q_t = pad_sequences(np.array(union_list_q), maxlen=maxlen_1)

                        p = list(tokenizer.word_index.values())
                        # print(p[len(p)-1],'------------val')
                        del p

                        for i in range(batch_size):
                            temp_p = list(differ_q_t[i])
                            temp_p.extend(list(union_q_t[i]))
                            X.append(temp_p)
#                         return np.array(X), np.array(Y)
                        yield np.array(X), np.array(Y)
                        X = []
                        Y = []
                        differ_qt = []
                        union_tq = []

                except tf.errors.OutOfRangeError:
                    break
            cnt = 0
            tokenizer.fit_on_texts(differ_qt)
            differ_list_q = tokenizer.texts_to_sequences(differ_qt)
            differ_q_t = pad_sequences(np.array(differ_list_q), maxlen=maxlen_1)
            
            tokenizer.fit_on_texts(union_tq)
            union_list_q = tokenizer.texts_to_sequences(union_tq)
            union_q_t = pad_sequences(np.array(union_list_q), maxlen=maxlen_1)
            
            p = list(tokenizer.word_index.values())
            # print(p[len(p)-1],'------------train')
            del p
            
            for i in range(len(differ_q_t)):
                temp_p = list(differ_q_t[i])
                temp_p.extend(list(union_q_t[i]))
                X.append(temp_p)
            yield (np.array(X), np.array(Y))

In [60]:
batch_size_ = 512
valGen = read_data_from_val_files(TFRECORDS_VALPATH, batch_size_//2)
# x_test, y_test = read_data_from_val_files(TFRECORDS_VALPATH, 100000)

Exception ignored in: <generator object read_data_from_val_files at 0x7ff0c00650f8>
Traceback (most recent call last):
  File "<ipython-input-53-ab52c9718097>", line 90, in read_data_from_val_files
  File "/home/wjw/anaconda3/lib/python3.6/site-packages/tensorflow/python/client/session.py", line 1587, in __exit__
    self._default_graph_context_manager.__exit__(exec_type, exec_value, exec_tb)
  File "/home/wjw/anaconda3/lib/python3.6/contextlib.py", line 99, in __exit__
    self.gen.throw(type, value, traceback)
  File "/home/wjw/anaconda3/lib/python3.6/site-packages/tensorflow/python/framework/ops.py", line 5233, in get_controller
    context.context().context_switches.pop()
  File "/home/wjw/anaconda3/lib/python3.6/site-packages/tensorflow/python/eager/context.py", line 141, in pop
    self.stack.pop()
IndexError: pop from empty list


In [61]:
def read_data_from_train_num_files(path):
    tfrecords_list = [filenames for dirpath, dirnames, filenames in os.walk(path)]
    # print(tfrecords_list)
    total = 0
    for fn in tfrecords_list[0]:
        # print(fn,'-----')
        cnt = 0
        for record in tf.python_io.tf_record_iterator(TFRECORDS_PATH+'/'+fn):
            cnt += 1 
        total += cnt
#         print(cnt)
    return total
NUM_TOTAL = read_data_from_train_num_files(TFRECORDS_PATH)
print(NUM_TOTAL)

18000


In [62]:
batch_size_ = 512
initial_epoch = 0
trainGen = read_data_from_train_files(TFRECORDS_PATH, batch_size_)
# trainGen_ = trainGen
# num = 0
# print(NUM_TOTAL//batch_size_)
# while(num <= NUM_TOTAL//batch_size_):
#     next(trainGen_)
#     num += 1
# print(num)
# trainGen = read_data_from_files(TFRECORDS_PATH, 100)

Exception ignored in: <generator object read_data_from_train_files at 0x7fef636dd468>
Traceback (most recent call last):
  File "<ipython-input-52-776ca226ac47>", line 91, in read_data_from_train_files
  File "/home/wjw/anaconda3/lib/python3.6/site-packages/tensorflow/python/client/session.py", line 1587, in __exit__
    self._default_graph_context_manager.__exit__(exec_type, exec_value, exec_tb)
  File "/home/wjw/anaconda3/lib/python3.6/contextlib.py", line 99, in __exit__
    self.gen.throw(type, value, traceback)
  File "/home/wjw/anaconda3/lib/python3.6/site-packages/tensorflow/python/framework/ops.py", line 5233, in get_controller
    context.context().context_switches.pop()
  File "/home/wjw/anaconda3/lib/python3.6/site-packages/tensorflow/python/eager/context.py", line 141, in pop
    self.stack.pop()
IndexError: pop from empty list


In [63]:
model.fit_generator(trainGen, initial_epoch=initial_epoch,steps_per_epoch = math.ceil(NUM_TOTAL/batch_size_), epochs = 2, validation_steps = math.ceil(2000/batch_size_/2),validation_data = valGen, verbose=True)
# model.fit(train_X, train_y, batch_size = 512, epochs = 20, verbose=True)

Epoch 1/2
Epoch 2/2


<keras.callbacks.History at 0x7ff0ac486668>

In [21]:
model.save('model.h5') # HDF5文件, pip install h5py
del model

In [None]:
from keras.models import load_model
model = load_model('model.h5', custom_objects={'auc':auc})

In [None]:
test_data = 
result_data = model.predict(tets_data,batch_size=512, verbose=1)