# A Combined end-to-end training model for NLP classify problem

In [1]:
import numpy as np
import random
import threading
import warnings
import os
import sys
import nltk
sys.path.append('../../common/')

from train_log import train_log

os.environ["CUDA_VISIBLE_DEVICES"]="1,2" 
os.environ["TF_CPP_MIN_LOG_LEVEL"] = "3"
warnings.filterwarnings("ignore")
import tensorflow as tf
tf.logging.set_verbosity(tf.logging.FATAL)

FILE_NAME = 'News_Category_Dataset_v2.json'

from gensim.test.utils import datapath, get_tmpfile
from gensim.models import KeyedVectors
from gensim.scripts.glove2word2vec import glove2word2vec

glove_file = datapath('/home/ubuntu/Notebooks/GloVe/glove.6B.100d.txt')
word2vec_glove_file = get_tmpfile("glove.6B.100d.word2vec.txt")
glove2word2vec(glove_file, word2vec_glove_file)

model = KeyedVectors.load_word2vec_format(word2vec_glove_file)
    
import NewsCategoryData as ncd 
from NewsCategoryData import NewsCategoryTrainTestSet
from NewsCategoryData import LABEL_LIST
import Run_Prediction as rp

CONTINUE = 0
start_step = 0

[nltk_data] Downloading package punkt to /home/ubuntu/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [2]:
batch_size = 64
state_size = 64
max_length = 100
n_classes = 41
word2vector_len = 100  
data = NewsCategoryTrainTestSet(batch_size=batch_size,max_length=max_length)
word_matrix_file = "naive_bayes_word_matrix_ver1.csv"
word_matrix, word_list = rp.read_naive_bayes_word_vector(word_matrix_file)

ckpt_dir = './ckpt_save_combine_model/'
if not os.path.exists(ckpt_dir):
    os.makedirs(ckpt_dir)
log_dir = './log_combine_model/'
log = train_log(log_dir)
if not os.path.exists(log_dir):
    os.makedirs(log_dir)

Total 200847 recorders are read. 180787 train data and 20060 test data.
Load the naive bayes word vectors. It takes a few minutes.


In [3]:
with tf.variable_scope('input'):
    x_rnn = tf.placeholder(tf.float32, [None, max_length, word2vector_len], name="x_rnn")
    x_naive_bayes = tf.placeholder(tf.float32, [None, n_classes], name="x_naive_bayes")
    x_length = tf.placeholder(tf.int32, [None], name="x_length")
    y_ = tf.placeholder(tf.int64, [None],  name="y_")
    
    #RNN的初始化状态，全设为零。注意state是与input保持一致，接下来会有concat操作，所以这里要有batch的维度。即每个样本都要有隐层状态
    init_state = tf.zeros([batch_size, state_size], name="init_state")

with tf.variable_scope('rnn'):
    #定义rnn_cell的权重参数，
    cell = tf.contrib.rnn.BasicRNNCell(state_size, name = "cell")
    #rnn_outputs, final_state = tf.nn.dynamic_rnn(cell, rnn_inputs, initial_state=init_stateu
    rnn_outputs, final_state = tf.nn.dynamic_rnn(cell, inputs = x_rnn, 
                                                 dtype=tf.float32,
                                                 #initial_state=init_state,
                                                 sequence_length = x_length)
with tf.variable_scope('fclayer'):
    w1 = tf.Variable(tf.random_uniform([state_size, int(state_size*9)],0,1.0),name="w1")
    b1= tf.Variable(tf.zeros([int(state_size*9)]),name="b1")
    hidden = tf.sigmoid(tf.add(tf.matmul(final_state, w1), b1),name="hidden")
    print("fclayer/hidden", hidden.shape)

with tf.variable_scope('rnn_output'):
    w2 = tf.Variable(tf.random_uniform([int(state_size*9), n_classes],0,1.0),name="w2")
    b2= tf.Variable(tf.zeros([n_classes]),name="b2")
    y_rnn_logits = tf.add(tf.matmul(hidden, w2), b2,name="y_rnn_logits")
    y_rnn = tf.nn.softmax(y_rnn_logits, name="y_rnn")
    print("rnn_output/y_rnn", y_rnn.shape)

with tf.variable_scope('combine_output'):   
    w = tf.Variable(tf.random_uniform([n_classes*2, n_classes],0,1.0),name="w")
    b= tf.Variable(tf.zeros([n_classes]),name="b")
    y = tf.add(tf.matmul(tf.concat([y_rnn,x_naive_bayes],axis=1), w), b,name="y")
    print("combine_output/y",y.shape)
            
with tf.variable_scope('loss'):
    cross_entropy = tf.reduce_mean(tf.nn.sparse_softmax_cross_entropy_with_logits(logits=y,
                                                                                  labels=y_, 
                                                                                  name="cross_entropy_loss"))
    #cross_entropy_3 = tf.cast(tf.one_hot(y_-1,n_classes),tf.float32)*tf.log(tf.nn.softmax(y))
with tf.variable_scope('Prediction'):
    prediction = tf.argmax(tf.nn.softmax(y),1, name="prediction")
    cal_accuracy = tf.reduce_mean(tf.cast(tf.equal(prediction,y_), tf.float32),name="cal_accuracy")
    
with tf.variable_scope('Train'):
    #train_step = tf.train.GradientDescentOptimizer(0.01).minimize(cross_entropy)
    train_step = tf.train.AdamOptimizer(0.0001).minimize(cross_entropy)

fclayer/hidden (?, 576)
rnn_output/y_rnn (?, 41)
combine_output/y (?, 41)


In [4]:
saver = tf.train.Saver(max_to_keep = 1)

with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    coord = tf.train.Coordinator()
    if CONTINUE != 0:
        model_file=tf.train.latest_checkpoint(ckpt_dir)
        saver.restore(sess,model_file)
    threads = tf.train.start_queue_runners(sess, coord)
    try:
        i = start_step
        avg_accuracy = 0
        avg_loss = 0
        while i < 100000:    
            batch_data, batch_label = data.batch_train_set()  
            batch_data_vec, data_length = ncd.word2vector(batch_data, model,max_length)
            naive_bayes_input = rp.naive_bayes_model(batch_data,word_matrix, word_list)
            y, loss, accuracy, _ = sess.run([prediction, cross_entropy, cal_accuracy, train_step],
                                            feed_dict={x_rnn:batch_data_vec, 
                                                       x_naive_bayes:naive_bayes_input,
                                                       y_:batch_label, 
                                                       x_length:data_length})
         
            avg_accuracy += accuracy
            avg_loss += loss
            i += 1
            #print(".",end="")
            if i%1000 == 0:
                print("-----------%d-----------"%i)
                print("Accuracy at %d: %.2f%%"%(i,avg_accuracy/10))
                print("Loss at %d：%.4f"%(i, avg_loss/1000))
                saver.save(sess,ckpt_dir+'news_category',global_step=i)
                log.add_log('train_accuracy',i, avg_accuracy/1000)
                log.add_log('train_loss',i, avg_loss/1000)
                log.SaveToFile() 
                avg_accuracy = 0
                avg_loss = 0

    except tf.errors.OutOfRangeError:
        print('done')
    finally:
        coord.request_stop()
        coord.join(threads)
        print('quit')

-----------1000-----------
Accuracy at 1000: 5.08%
Loss at 1000：3.5757
-----------2000-----------
Accuracy at 2000: 14.90%
Loss at 2000：3.3629
-----------3000-----------
Accuracy at 3000: 29.98%
Loss at 3000：3.1922
-----------4000-----------
Accuracy at 4000: 34.29%
Loss at 4000：3.0503
-----------5000-----------
Accuracy at 5000: 35.05%
Loss at 5000：2.9072
-----------6000-----------
Accuracy at 6000: 35.72%
Loss at 6000：2.7801
-----------7000-----------
Accuracy at 7000: 36.66%
Loss at 7000：2.6770
-----------8000-----------
Accuracy at 8000: 38.65%
Loss at 8000：2.5599
-----------9000-----------
Accuracy at 9000: 40.29%
Loss at 9000：2.4629
-----------10000-----------
Accuracy at 10000: 41.80%
Loss at 10000：2.3851
-----------11000-----------
Accuracy at 11000: 43.78%
Loss at 11000：2.2914
-----------12000-----------
Accuracy at 12000: 45.89%
Loss at 12000：2.2087
-----------13000-----------
Accuracy at 13000: 47.29%
Loss at 13000：2.1479
-----------14000-----------
Accuracy at 14000: 49.94%

In [5]:
with tf.Session() as sess:
    model_file=tf.train.latest_checkpoint(ckpt_dir)
    saver.restore(sess,model_file)
    total_accuracy = 0
    total_loss = 0
    #print(int(data.test_size/data.batch_size)+1)
    for i in range(int(data.test_size/data.batch_size)+1):
        batch_data, batch_label = data.batch_test_set()  
        batch_data_vec, data_length = ncd.word2vector(batch_data, model,max_length)
        naive_bayes_input = rp.naive_bayes_model(batch_data,word_matrix, word_list)
        y, loss, accuracy = sess.run([prediction, cross_entropy, cal_accuracy],
                                            feed_dict={x_rnn:batch_data_vec, 
                                                       x_naive_bayes:naive_bayes_input,
                                                       y_:batch_label, 
                                                       x_length:data_length})
        
        total_accuracy += accuracy
        total_loss += loss
        #print(i,accuracy,loss)
        
    total_accuracy = total_accuracy/i
    total_loss = total_loss/i
    print("Test set accuracy: %.2f %%"%(total_accuracy*100))
    print("Test set loss: %.2f "%(total_loss))
    

Test set accuracy: 51.31 %
Test set loss: 1.98 
