In [0]:
import pandas as pd
import numpy as np
import tensorflow as tf
import pickle
import matplotlib.pyplot as plt

import pickle
from gensim.models import word2vec
import random

from sklearn.preprocessing import StandardScaler
%matplotlib inline

### Step 0. Loading dataset

#### Step 0.1 load article cutted and article df and define y

In [0]:
with open("../article_cutted", "rb") as file:
    docs = pickle.load(file)

In [0]:
df = pd.read_csv('../data/article_preprocessed.csv')
diff_threshold = 20
df = df[abs(df['push']-df['boo']) > diff_threshold].copy()
df['type'] = np.clip(df['push']-df['boo'], 0, 1)

#### Step 0.2 create word id mapping and word vector

In [0]:
w2v = word2vec.Word2Vec.load('../word2vec_model/CBOW')

In [0]:
word2id = {k:i for i, k in enumerate(w2v.wv.vocab.keys())}
id2word = {i:k for k, i in word2id.items()}

In [0]:
words_len = len(word2id)

In [0]:
embedding = np.zeros((words_len+1, 256))
for k, v in word2id.items():
    embedding[v] = w2v.wv[k]

#### Step 0.3 sentence to seq transform

In [0]:
input_length = 80
docs_id = []
for doc in docs:
    text = doc[:input_length]
    ids = [words_len+1]*input_length
    ids[:len(text)] = [word2id[w] if w in word2id else words_len+1 for w in text]

    docs_id.append(ids)


In [0]:
print(docs[0])
print(docs_id[0])

['韓瑜', '協志', '前妻', '正', '女演員', '周子', '瑜', 'TWICE', '團裡裡面', '台灣', '人', '正', '兩個', '要當', '鄉民', '老婆', '選', '五樓', '真', '勇氣']
[0, 1, 2, 3, 4, 5, 6, 7, 100035, 8, 9, 3, 10, 11, 12, 13, 14, 15, 16, 17, 100035, 100035, 100035, 100035, 100035, 100035, 100035, 100035, 100035, 100035, 100035, 100035, 100035, 100035, 100035, 100035, 100035, 100035, 100035, 100035, 100035, 100035, 100035, 100035, 100035, 100035, 100035, 100035, 100035, 100035, 100035, 100035, 100035, 100035, 100035, 100035, 100035, 100035, 100035, 100035, 100035, 100035, 100035, 100035, 100035, 100035, 100035, 100035, 100035, 100035, 100035, 100035, 100035, 100035, 100035, 100035, 100035, 100035, 100035, 100035]


### Step 1. Data preprocessing

#### Step 1.1 Creating Training and Testing sets and creating generator

In [0]:
from sklearn.model_selection import train_test_split

In [0]:
train, test = train_test_split(df, test_size=0.2, shuffle=True, stratify=df['type'])

In [0]:
def train_data_generator(df, bz, docs_id):
    # bz: batch size 
    
    dfs = [sub_df for key,sub_df in df.groupby('type')]
    df_n = len(dfs)
    
    docs_id = np.array(docs_id)
    while True:
        selected = pd.concat([sub_df.sample(int(bz/2)) for sub_df in dfs], axis=0)
        selected = selected.sample(frac=1)
        x = docs_id[selected['idx']]
        y = selected.as_matrix(columns=['type'])
                    
        yield x, y
        
def test_data_generator(df, docs_id):
    docs_id = np.array(docs_id)
    x = docs_id[df['idx']]
    y = df.as_matrix(columns=['type'])

    return x, y

In [0]:
X_test, Y_test = test_data_generator(test, docs_id) 

### Let's create the RNN

In [0]:
epochs = 100
batch_size = 32
update_per_epochs = 100

learning_rate=0.001
hidden_layer_size=64
number_of_layers=1
dropout=True
dropout_rate=0.8
number_of_classes=1
gradient_clip_margin=4
wv=embedding

In [0]:
def LSTM_cell(hidden_layer_size, batch_size, number_of_layers, dropout=True, dropout_rate=0.8):
    def get_LSTM(hidden_layer_size, dropout, dropout_rate):
        layer = tf.contrib.rnn.BasicLSTMCell(hidden_layer_size)

        if dropout:
            layer = tf.contrib.rnn.DropoutWrapper(layer, output_keep_prob=dropout_rate)
            
        return layer
    
    cell = tf.contrib.rnn.MultiRNNCell([get_LSTM(hidden_layer_size, dropout, dropout_rate) for _ in range(number_of_layers)])

    init_state = cell.zero_state(batch_size, tf.float32)

    return cell, init_state

In [0]:
def output_layer(lstm_output, out_size):
    x = lstm_output[:, -1, :]
    output = tf.layers.dense(inputs= x, units= out_size, activation = tf.nn.sigmoid)
    return output

In [0]:
def opt_loss(logits, targets, learning_rate, grad_clip_margin):
    
    loss = tf.reduce_sum(tf.pow(logits - targets, 2))/batch_size
    
    #Cliping the gradient loss
    optimizer = tf.train.AdamOptimizer(learning_rate)
    gradients = optimizer.compute_gradients(loss)

    capped_gradients = [(tf.clip_by_value(grad, (-1)*grad_clip_margin, grad_clip_margin), var) for grad, var in gradients if grad is not None]
    
    train_optimizer = optimizer.apply_gradients(capped_gradients)

    
    return loss, train_optimizer

In [0]:
main_graph = tf.Graph()
sess = tf.Session(graph=main_graph)

with main_graph.as_default():
    
    ##defining placeholders##
    with tf.name_scope('input'):
        inputs = tf.placeholder(tf.int32, [None, input_length], name='input_data')
        targets = tf.placeholder(tf.float32, [None, 1], name='targets')
        bz = tf.placeholder(tf.int32, [], name='batch_size')
        
    ## embedding lookup table
    with tf.variable_scope('embedding'):    
        em_W = tf.Variable(wv.astype(np.float32), trainable=True)  #wv.shape = (100035, 256)
        x = tf.nn.embedding_lookup(em_W, inputs)    #x.shape = (?, 80, 256)
        
    ##LSTM layer##
    with tf.variable_scope("LSTM_layer"):
        cell, init_state = LSTM_cell(hidden_layer_size, tf.shape(inputs)[0], number_of_layers, dropout, dropout_rate) 
        outputs, states = tf.nn.dynamic_rnn(cell, x, initial_state=init_state)
    
    ##Output layer##   
    with tf.variable_scope('output_layer'):
        logits = output_layer(outputs, number_of_classes)
    
    ##loss and optimization##
    with tf.name_scope('loss_and_opt'):
        loss, opt = opt_loss(logits, targets, learning_rate, gradient_clip_margin)
    
    init = tf.global_variables_initializer()
    

### Time to train the network

In [0]:
sess.run(init)

In [0]:
from sklearn.metrics import roc_auc_score
train_generate = train_data_generator(train, batch_size, docs_id)

train_loss = []
train_auc = []
test_loss = []
test_auc = []
for i in range(epochs):
    traind_scores = []
    epoch_loss = []
    for j in range(update_per_epochs):
        X_batch, y_batch = next(train_generate) 
        
        o, c, _ = sess.run([logits, loss, opt], feed_dict={
            inputs:X_batch, 
            targets:y_batch,
            bz:np.array(batch_size)
        })
        
        epoch_loss.append(c)
        traind_scores.append(roc_auc_score(y_batch, o))
    
    to, tc = sess.run([logits, loss], feed_dict={
        inputs:X_test, 
        targets:Y_test,
        bz:np.array(len(X_test))
    })
    
    train_loss.append(np.mean(epoch_loss))
    train_auc.append(np.mean(traind_scores))
    test_loss.append(tc)
    test_auc.append(roc_auc_score(Y_test, to))
    
    if (i % 5) == 0:
        print('Epoch {}/{}'.format(i, epochs), ' Train loss: {}'.format(np.mean(epoch_loss)), 
              ' Train auc: {}'.format(np.mean(traind_scores)), 
             ' Test loss: {}'.format(tc), ' Test auc: {}'.format(roc_auc_score(Y_test, to)))
