In [None]:
# encoding=utf-8
import json
import os
import re
from datetime import datetime

import jieba
import numpy as np
import pandas as pd

from keras import backend as K
from keras import initializers
from keras import metrics
from keras.engine.topology import Layer, InputSpec
from keras.layers import Embedding
from keras.layers import Dense, Input, Flatten, concatenate
from keras.layers import Conv1D, MaxPooling1D, Embedding, Dropout, LSTM, GRU, Bidirectional, TimeDistributed
from keras.models import Model
from keras.preprocessing.sequence import pad_sequences
from keras.preprocessing.text import Tokenizer, text_to_word_sequence
from keras.utils.np_utils import to_categorical

In [None]:
# Tensor limits used when the events are encoded as integer ids.
MAX_TEXTS = 20          # posts kept per event
MAX_SENTS = 20          # sentences kept per post
MAX_SENT_LENGTH = 30    # word ids kept per sentence
MAX_NB_WORDS = 20000    # word ids at or above this value are skipped

EMBEDDING_DIM = 300
VALIDATION_SPLIT = 0.2  # fraction of events held out for validation

real = pd.read_csv('./data//real_data.csv')
rumor = pd.read_csv('./data//rumor_data.csv')

# Shift the category ids of the real events past the largest rumor id so the
# two files cannot share a category once they are concatenated.
real['category'] += rumor['category'].max() + 1
rumor['label'] = 1
real['label'] = 0

# ignore_index gives a fresh 0..n-1 index directly (rumor rows first, then real).
data_train = pd.concat([rumor, real], ignore_index=True)

# Row position where the real rows start in data_train.
split_idx = len(rumor)
data_train.head()

## 预处理

In [None]:
def clean_text(text):
    """Return `text` with leading and trailing whitespace removed.

    Missing values (None, or the float NaN that pandas yields for an empty
    CSV cell) become an empty string instead of raising AttributeError; any
    other non-string value is converted with str() first.

    Filtering out every character that is not CJK / alphanumeric was tried
    and is intentionally disabled, so punctuation is preserved.
    """
    if text is None:
        return ''
    if isinstance(text, float) and text != text:  # NaN is the only float unequal to itself
        return ''
    if not isinstance(text, str):
        text = str(text)
    return text.strip()

def sent2words(sent):
    """Segment `sent` with jieba and return the tokens that are not pure whitespace."""
    tokens = []
    for token in jieba.cut(sent):
        if token.strip():
            tokens.append(token)
    return tokens

def text2words(text):
    """Split `text` into sentences and segment each sentence into words.

    Sentences end at any of 。 ！ ! . ？ ? and keep their terminator. Text that
    follows the last terminator (or a text without any terminator) is kept as
    a final sentence instead of being dropped.

    Returns a list of sentences, each a list of word tokens from sent2words.
    """
    # Because the pattern has one capture group, re.split returns
    # [body, terminator, body, terminator, ..., tail]: always an odd length.
    parts = re.split(r'(。|！|\!|\.|？|\?)', text)

    new_sents = []
    for body, terminator in zip(parts[0::2], parts[1::2]):
        new_sents.append(sent2words((body + terminator).strip()))

    # The tail has no terminator; keep it only when it carries content.
    tail = parts[-1].strip()
    if tail:
        new_sents.append(sent2words(tail))
    return new_sents

In [None]:
# Group the tokenised posts by event: events[str(category)] is the list of
# posts of that category, in row order; each post is a list of sentences.
events = {}

for raw_text, category in zip(data_train.text, data_train.category):
    post_sentences = text2words(clean_text(raw_text))
    events.setdefault(str(category), []).append(post_sentences)

In [None]:
# Corpus-wide word frequencies.
word_dict = {}
for event in events.values():
    for text in event:
        for sent in text:
            for word in sent:
                word_dict[word] = word_dict.get(word, 0) + 1

# Word ids by descending frequency (ties keep first-seen order).
# Ids start at 1: id 0 is the padding value of the zero-initialised data
# tensor and is masked by the embedding layer (mask_zero=True), so no real
# word may use it. The embedding matrix has len(word_index) + 1 rows, which
# is exactly what 1-based ids need.
ranked_words = sorted(word_dict.items(), key=lambda x: x[1], reverse=True)
word_index = {word: rank for rank, (word, _) in enumerate(ranked_words, start=1)}

In [None]:
# Encode every event as a (posts, sentences, words) grid of word ids.
# Unfilled cells stay 0. Words whose id is >= MAX_NB_WORDS are skipped
# without leaving a gap, so each sentence holds its first MAX_SENT_LENGTH
# in-vocabulary words.
data = np.zeros((len(events), MAX_TEXTS, MAX_SENTS, MAX_SENT_LENGTH), dtype='int32')

for event_pos, event_posts in enumerate(events.values()):
    for post_pos, post_sentences in enumerate(event_posts[:MAX_TEXTS]):
        for sent_pos, sentence in enumerate(post_sentences[:MAX_SENTS]):
            kept_ids = [word_index[token] for token in sentence
                        if word_index[token] < MAX_NB_WORDS]
            kept_ids = kept_ids[:MAX_SENT_LENGTH]
            if kept_ids:
                data[event_pos, post_pos, sent_pos, :len(kept_ids)] = kept_ids

## 生成训练、测试集

In [None]:
def _summary_stats(values):
    """Return [max, min, mean, population variance] of a non-empty sequence of numbers."""
    arr = np.array(values)
    return [arr.max(), arr.min(), arr.mean(), arr.var()]


def get_side_info(data, keywords=()):
    """Build one row of hand-crafted statistics per category of `data`.

    Parameters
    ----------
    data : pandas.DataFrame
        Needs the columns 'category', 'title', 'text' and 'time'
        ('time' formatted as '%Y-%m-%d').
    keywords : iterable of str, optional
        For each keyword, the max/min/mean/var of its occurrence count per
        text is appended to the row. Empty by default, which reproduces the
        31 base features.

    Returns
    -------
    numpy.ndarray
        Rows follow the order of `data.groupby('category')` (sorted category
        values). Columns, in order:
        - max/min/mean/var of: title length, number of pieces produced by
          splitting the text on sentence terminators (terminators are
          counted as pieces too), text length, count of digit runs, count of
          question marks, count of exclamation marks (6 x 4 values);
        - number of texts;
        - time span in days between oldest and newest text, and that span
          divided by the number of texts;
        - number of texts posted less than 30 / 180 / 365 / 1095 days before
          the newest text;
        - 4 values per entry of `keywords`.
    """
    sentence_terminators = r'(。|！|\!|\.|？|\?)'

    append_data = []
    for cate, df in data.groupby('category'):
        titles = list(df['title'])
        texts = list(df['text'])

        category_atte = []
        category_atte.extend(_summary_stats([len(title) for title in titles]))
        category_atte.extend(_summary_stats([len(re.split(sentence_terminators, text)) for text in texts]))
        category_atte.extend(_summary_stats([len(text) for text in texts]))
        category_atte.extend(_summary_stats([len(re.findall(r'\d+', text)) for text in texts]))
        category_atte.extend(_summary_stats([text.count('?') + text.count('？') for text in texts]))
        category_atte.extend(_summary_stats([text.count('!') + text.count('！') for text in texts]))

        category_atte.append(len(texts))

        times = [datetime.strptime(time, '%Y-%m-%d') for time in df['time']]
        # Computed once per category instead of once per element.
        newest, oldest = max(times), min(times)
        time_span = (newest - oldest).days
        category_atte.extend([time_span, time_span / len(times)])

        # How many texts fall inside each window that ends at the newest text.
        ages_in_days = [(newest - t).days for t in times]
        for window in (30, 180, 365, 365 * 3):
            category_atte.append(sum(1 for age in ages_in_days if age < window))

        for keyword in keywords:
            category_atte.extend(_summary_stats([text.count(keyword) for text in texts]))

        append_data.append(category_atte)
    return np.array(append_data)
# Hand-crafted per-category statistics computed over the combined rumor + real frame.
side_data = get_side_info(data=data_train)

In [None]:
# One row per event, in order of first appearance in data_train. That is the
# order in which `events` (and therefore `data`) was filled, so the labels are
# derived from the data instead of hard-coding the number of rumor/real events.
event_first_rows = data_train.drop_duplicates(subset='category')
event_categories = event_first_rows['category'].values
labels = to_categorical(event_first_rows['label'].values, num_classes=2)

# get_side_info emits its rows in sorted-category order (groupby default);
# re-index them so that row i describes the same event as data[i].
side_rows = np.searchsorted(np.sort(event_categories), event_categories)
side_data_aligned = side_data[side_rows]

assert data.shape[0] == len(labels) == side_data_aligned.shape[0], \
    'data, side_data and labels must describe the same events'

print('Total %s unique tokens.' % len(word_index))
print('Shape of data tensor:', data.shape)
print('Shape of side data tensor:', side_data.shape)
print('Shape of label tensor:', labels.shape)

SIDE_INFO_DIM = side_data.shape[1]

# Seeded, local generator: the split is reproducible and re-running this cell
# does not re-shuffle already shuffled arrays (data / side_data are never
# overwritten here).
SPLIT_SEED = 42
indices = np.arange(data.shape[0])
np.random.RandomState(SPLIT_SEED).shuffle(indices)

nb_validation_samples = int(VALIDATION_SPLIT * data.shape[0])
# Slice at an explicit position: `[:-0]` would yield an empty training set
# when the validation share rounds down to zero.
split_at = data.shape[0] - nb_validation_samples
train_idx, val_idx = indices[:split_at], indices[split_at:]

x_train = data[train_idx]
x_side_train = side_data_aligned[train_idx]
y_train = labels[train_idx]
x_val = data[val_idx]
x_side_val = side_data_aligned[val_idx]
y_val = labels[val_idx]

print('Number of positive and negative reviews in traing and validation set')
print(y_train.sum(axis=0))
print(y_val.sum(axis=0))

## 中文embedding

In [None]:
# Load the pre-trained word vectors: one "<word> <v1> ... <vN>" row per line.
embeddings_index = {}
# The context manager closes the file even if a row fails to parse.
with open('./data//sgns.weibo.word', encoding='utf-8') as f:
    for line in f:
        values = line.split()
        # Keep only rows with exactly EMBEDDING_DIM coefficients. This skips a
        # leading "<vocab size> <dim>" header line if the file has one, and any
        # row whose token contains whitespace, instead of storing a vector of
        # the wrong length.
        if len(values) != EMBEDDING_DIM + 1:
            continue
        embeddings_index[values[0]] = np.asarray(values[1:], dtype='float32')

In [None]:
print('Total %s word vectors.' % len(embeddings_index))

# One row per word id plus one extra row. Every row starts with uniform random
# values in [0, 1); rows whose word has a pre-trained vector are overwritten,
# so words missing from the pre-trained file keep their random row (they are
# not zeroed).
vocab_rows = len(word_index) + 1
embedding_matrix = np.random.random((vocab_rows, EMBEDDING_DIM))
for token, row in word_index.items():
    pretrained = embeddings_index.get(token)
    if pretrained is None:
        continue
    embedding_matrix[row] = pretrained

print("embedding_matrix: " + str(embedding_matrix.shape))

# Trainable embedding initialised from the matrix above; id 0 is masked.
embedding_layer = Embedding(
    input_dim=vocab_rows,
    output_dim=EMBEDDING_DIM,
    weights=[embedding_matrix],
    input_length=MAX_SENT_LENGTH,
    trainable=True,
    mask_zero=True)

## attention层

In [None]:
class AttLayer(Layer):
    """Attention pooling over the time axis of a 3-D tensor.

    For an input x of shape (batch, steps, h_dim) the layer computes
        u_t = tanh(x_t W + b)
        a_t = exp(u_t . u) / sum_s exp(u_s . u)   (masked steps get weight 0)
        out = sum_t a_t * x_t
    and returns a tensor of shape (batch, h_dim).

    Parameters
    ----------
    attention_dim : int
        Width of the hidden projection (columns of W, length of b and u).
    **kwargs
        Forwarded to the base Layer constructor (e.g. `name`).
    """

    def __init__(self, attention_dim, **kwargs):
        # Call the base constructor first so that attributes assigned below
        # cannot be overwritten by it.
        super(AttLayer, self).__init__(**kwargs)
        self.init = initializers.get('normal')
        self.supports_masking = True
        self.attention_dim = attention_dim

    def build(self, input_shape):
        """Create W (h_dim, attention_dim), b (attention_dim,) and u (attention_dim, 1)."""
        assert len(input_shape) == 3
        # add_weight registers the variables as trainable weights of the layer;
        # assigning to `trainable_weights` directly is rejected by Keras
        # releases where that attribute is a read-only property.
        self.W = self.add_weight(name='W',
                                 shape=(input_shape[-1], self.attention_dim),
                                 initializer=self.init,
                                 trainable=True)
        self.b = self.add_weight(name='b',
                                 shape=(self.attention_dim,),
                                 initializer=self.init,
                                 trainable=True)
        self.u = self.add_weight(name='u',
                                 shape=(self.attention_dim, 1),
                                 initializer=self.init,
                                 trainable=True)
        super(AttLayer, self).build(input_shape)

    def compute_mask(self, inputs, mask=None):
        """The time axis is summed away, so no mask is passed downstream."""
        return None

    def call(self, x, mask=None):
        """Return the attention-weighted sum of `x` over axis 1."""
        # x:   (batch, steps, h_dim)
        # uit: (batch, steps, attention_dim)
        uit = K.tanh(K.bias_add(K.dot(x, self.W), self.b))
        # Project onto u and drop the trailing axis -> (batch, steps)
        ait = K.dot(uit, self.u)
        ait = K.squeeze(ait, -1)

        ait = K.exp(ait)

        if mask is not None:
            # Zero the scores of masked steps; casting to floatx avoids a
            # float64 upcast on the Theano backend.
            ait *= K.cast(mask, K.floatx())
        # Normalise to weights; epsilon guards against an all-masked row.
        ait /= K.cast(K.sum(ait, axis=1, keepdims=True) + K.epsilon(), K.floatx())
        ait = K.expand_dims(ait)                 # (batch, steps, 1)
        weighted_input = x * ait                 # (batch, steps, h_dim)
        output = K.sum(weighted_input, axis=1)   # (batch, h_dim)

        return output

    def compute_output_shape(self, input_shape):
        """(batch, steps, h_dim) -> (batch, h_dim)."""
        return (input_shape[0], input_shape[-1])

    def get_config(self):
        """Expose `attention_dim` so a model using this layer can be saved and reloaded."""
        config = {'attention_dim': self.attention_dim}
        base_config = super(AttLayer, self).get_config()
        return dict(list(base_config.items()) + list(config.items()))


## model

In [None]:
# Hierarchical attention network: words -> sentences -> posts -> event.
GRU_UNITS = 100        # per direction; each Bidirectional output is 2 * GRU_UNITS wide
ATTENTION_DIM = 100    # hidden width inside every AttLayer
USE_SIDE_INFO = False  # True: also feed the per-event statistics (SIDE_INFO_DIM features)

# Word level: one sentence of word ids -> one sentence vector.
sentence_input = Input(shape=(MAX_SENT_LENGTH,), dtype='int32')
embedded_sequences = embedding_layer(sentence_input)
l_lstm_word = Bidirectional(GRU(GRU_UNITS, return_sequences=True))(embedded_sequences)  # (batch, MAX_SENT_LENGTH, 2 * GRU_UNITS)
l_att_word = AttLayer(ATTENTION_DIM)(l_lstm_word)                                       # (batch, 2 * GRU_UNITS)
sent_model = Model(sentence_input, l_att_word)

# Sentence level: one post (MAX_SENTS sentences) -> one post vector.
review_input = Input(shape=(MAX_SENTS, MAX_SENT_LENGTH), dtype='int32')
review_encoder = TimeDistributed(sent_model)(review_input)
l_lstm_sent = Bidirectional(GRU(GRU_UNITS, return_sequences=True))(review_encoder)
l_att_sent = AttLayer(ATTENTION_DIM)(l_lstm_sent)
review_model = Model(review_input, l_att_sent)

# Post level: one event (MAX_TEXTS posts) -> one event vector.
event_input = Input(shape=(MAX_TEXTS, MAX_SENTS, MAX_SENT_LENGTH), dtype='int32')
event_encoder = TimeDistributed(review_model)(event_input)
l_lstm_event = Bidirectional(GRU(GRU_UNITS, return_sequences=True))(event_encoder)
l_att_event = AttLayer(ATTENTION_DIM)(l_lstm_event)

if USE_SIDE_INFO:
    # Two-input variant: the event vector is concatenated with a dense
    # projection of the hand-crafted statistics before classification.
    side_input = Input(shape=(SIDE_INFO_DIM,), dtype='float32')
    side_vec = Dense(50, activation='tanh')(side_input)
    concated_vec = concatenate([l_att_event, side_vec])
    concated_vec = Dense(20, activation='tanh')(concated_vec)
    preds = Dense(2, activation='softmax')(concated_vec)
    model = Model([event_input, side_input], preds)
else:
    preds = Dense(2, activation='softmax')(l_att_event)
    model = Model(event_input, preds)

model.compile(loss='binary_crossentropy', optimizer='Adam',
              metrics=['acc', metrics.Precision(), metrics.Recall()])

In [None]:
# summary() prints the table itself and returns None, so wrapping it in
# print() only adds a stray "None" line to the output.
model.summary()

In [None]:
# Pass the side features only when the compiled model actually declares a
# second input; handing two arrays to a single-input model raises an error.
if len(model.inputs) > 1:
    train_inputs = [x_train, x_side_train]
    val_inputs = [x_val, x_side_val]
else:
    train_inputs = x_train
    val_inputs = x_val

model.fit(train_inputs, y_train, validation_data=(val_inputs, y_val),
          epochs=50, batch_size=10)