# A Combined end-to-end training model for NLP classify problem

In [11]:
import numpy as np
import random
import threading
import warnings
import os
import sys
import nltk
sys.path.append('../../common/')

from train_log import train_log

os.environ["CUDA_VISIBLE_DEVICES"]="1,2" 
os.environ["TF_CPP_MIN_LOG_LEVEL"] = "3"
warnings.filterwarnings("ignore")
import tensorflow as tf
tf.logging.set_verbosity(tf.logging.FATAL)

FILE_NAME = 'News_Category_Dataset_v2.json'

from gensim.test.utils import datapath, get_tmpfile
from gensim.models import KeyedVectors
from gensim.scripts.glove2word2vec import glove2word2vec

glove_file = datapath('/home/ubuntu/Notebooks/GloVe/glove.6B.100d.txt')
word2vec_glove_file = get_tmpfile("glove.6B.100d.word2vec.txt")
glove2word2vec(glove_file, word2vec_glove_file)

model = KeyedVectors.load_word2vec_format(word2vec_glove_file)
    
import NewsCategoryData as ncd 
from NewsCategoryData import NewsCategoryTrainTestSet
from NewsCategoryData import LABEL_LIST
import Run_Prediction as rp

CONTINUE = 1
start_step = 200000

In [2]:
batch_size = 64
state_size = 64
max_length = 100
n_classes = 41
word2vector_len = 100  
data = NewsCategoryTrainTestSet(batch_size=batch_size,max_length=max_length)
word_matrix_file = "naive_bayes_word_matrix_ver1.csv"
word_matrix, word_list = rp.read_naive_bayes_word_vector(word_matrix_file)

ckpt_dir = './ckpt_save_combine_model/'
if not os.path.exists(ckpt_dir):
    os.makedirs(ckpt_dir)
log_dir = './log_combine_model/'
log = train_log(log_dir)
if not os.path.exists(log_dir):
    os.makedirs(log_dir)

Total 200847 recorders are read. 180787 train data and 20060 test data.
Load the naive bayes word vectors. It takes a few minutes.


In [3]:
def find_window(word, title, window_size = 2):
    '''
    word: is the word you are looking for.
    title: is a word list which contains the given word
    window_size: decide how much word you are going to return, 
    for example 2 means only return the left and right words, 
    3 means return the left 2 and right 2 words. 

    '''
    close_word_list = []
    i = 0 
    for i in range(len(title)):
        if title[i].lower() == word:
            if i > 0:
                close_word_list.extend(title[max(i-window_size+1,0):i])
            if i < len(title)-1:
                close_word_list.extend(title[i+1:min(i+window_size,len(title))])
    return close_word_list

unknown_word = {}
for title in data.train_data:
    for word in title:
        if model.vocab.get(word.lower(),"NaN") == "NaN":
            if word.lower() in unknown_word:
                unknown_word[word.lower()].append(title)
            else:
                unknown_word[word.lower()] = []
                unknown_word[word.lower()].append(title)

close_word_dict = {}
for word in unknown_word:
    close_word_dict[word] = []
    for title in unknown_word[word]:
        close_word_list = find_window(word, title, window_size = 3)
        close_word_dict[word].extend(close_word_list)
      
    
word_vector_dic = {}
for word in close_word_dict:
    word_vector_dic[word] =np.zeros(100)
    for close_word in close_word_dict[word]:
        if model.vocab.get(close_word.lower(),"NaN") != "NaN":
            word_vector_dic[word] = np.add(word_vector_dic[word],model[close_word.lower()])/2

## Train the out-of-vocabury words with the close words in the dataset


In [4]:
def getvector(word, model):
    if model.vocab.get(word.lower(),"NaN") == "NaN":
        lst = nltk.stem.LancasterStemmer()
        if model.vocab.get(lst.stem(word).lower(),"NaN") == "NaN":
            if word[-1] == "s" and model.vocab.get(word[:-1].lower(),"NaN") != "NaN":
                return model[word[:-1].lower()]
            elif word.lower() in word_vector_dic:
                return word_vector_dic[word.lower()]
            else:
                return model["unk"]
        else:
            return model[lst.stem(word).lower()]
    else:
        return model[word.lower()]

def word2vector(reviews, model, max_length=1000):
    vector_data = np.zeros((len(reviews), max_length,100))
    x_length = []
    i = 0
    for review in reviews:
        j = 0 
        if len(review) > max_length:
            print("The length of the reviews is %dwhich is larger than max_length (%d)"%(len(review),max_length))
            print(review)
            print("-"*10)
        for word in review:
            vector_data[i,j] = getvector(word,model)
            if str(vector_data[i,j,0])=='nan':
                print(word)
                break
            j += 1
        x_length.append(j)
        i += 1
    return vector_data, np.asarray(x_length)

In [5]:
graph = tf.Graph()
with graph.as_default():
    with tf.variable_scope('input'):
        x_rnn = tf.placeholder(tf.float32, [None, max_length, word2vector_len], name="x_rnn")
        x_naive_bayes = tf.placeholder(tf.float32, [None, n_classes], name="x_naive_bayes")
        x_length = tf.placeholder(tf.int32, [None], name="x_length")
        y_ = tf.placeholder(tf.int64, [None],  name="y_")

        #RNN的初始化状态，全设为零。注意state是与input保持一致，接下来会有concat操作，所以这里要有batch的维度。即每个样本都要有隐层状态
        init_state = tf.zeros([batch_size, state_size], name="init_state")

    with tf.variable_scope('rnn'):
        #定义rnn_cell的权重参数，
        #cell = tf.contrib.rnn.BasicRNNCell(state_size, name = "cell")
        cell = tf.contrib.rnn.BasicLSTMCell(state_size, forget_bias = 0, name = "cell")
        #rnn_outputs, final_state = tf.nn.dynamic_rnn(cell, rnn_inputs, initial_state=init_stateu
        rnn_outputs, final_state = tf.nn.dynamic_rnn(cell, inputs = x_rnn, 
                                                     dtype=tf.float32,
                                                     #initial_state=init_state,
                                                     sequence_length = x_length)
        print("rnn/final_state", final_state)
        #print("rnn/rnn_outputs", rnn_outputs)
        #last = rnn_outputs[:, -1, :]
        #print("rnn/last",last.shape)

    with tf.variable_scope('fclayer'):
        w1 = tf.Variable(tf.random_uniform([state_size*2, int(state_size*9)],0,1.0),name="w1")
        b1= tf.Variable(tf.zeros([int(state_size*9)]),name="b1")
        hidden = tf.sigmoid(tf.add(tf.matmul(tf.concat([final_state[0],final_state[1]],axis=1), w1), b1),name="hidden")
        print("fclayer/hidden", hidden.shape)

    with tf.variable_scope('rnn_output'):
        w2 = tf.Variable(tf.random_uniform([int(state_size*9), n_classes],0,1.0),name="w2")
        b2= tf.Variable(tf.zeros([n_classes]),name="b2")
        y_rnn_logits = tf.add(tf.matmul(hidden, w2), b2,name="y_rnn_logits")
        y_rnn = tf.nn.softmax(y_rnn_logits, name="y_rnn")
        print("rnn_output/y_rnn", y_rnn.shape)

    with tf.variable_scope('combine_output'):   
        w = tf.Variable(tf.random_uniform([n_classes*2, n_classes],0,1.0),name="w")
        b= tf.Variable(tf.zeros([n_classes]),name="b")
        y = tf.add(tf.matmul(tf.concat([y_rnn,x_naive_bayes],axis=1), w), b,name="y")
        print("combine_output/y",y.shape)

    with tf.variable_scope('loss'):
        cross_entropy = tf.reduce_mean(tf.nn.sparse_softmax_cross_entropy_with_logits(logits=y,
                                                                                      labels=y_, 
                                                                                      name="cross_entropy_loss"))
        #cross_entropy_3 = tf.cast(tf.one_hot(y_-1,n_classes),tf.float32)*tf.log(tf.nn.softmax(y))
    with tf.variable_scope('Prediction'):
        prediction = tf.argmax(tf.nn.softmax(y),1, name="prediction")
        cal_accuracy = tf.reduce_mean(tf.cast(tf.equal(prediction,y_), tf.float32),name="cal_accuracy")

    with tf.variable_scope('Train'):
        #train_step = tf.train.GradientDescentOptimizer(0.01).minimize(cross_entropy)
        train_step = tf.train.AdamOptimizer(0.00000001).minimize(cross_entropy)

rnn/final_state LSTMStateTuple(c=<tf.Tensor 'rnn/rnn/while/Exit_3:0' shape=(?, 64) dtype=float32>, h=<tf.Tensor 'rnn/rnn/while/Exit_4:0' shape=(?, 64) dtype=float32>)
fclayer/hidden (?, 576)
rnn_output/y_rnn (?, 41)
combine_output/y (?, 41)


In [12]:
with graph.as_default():
    saver = tf.train.Saver(max_to_keep = 1)
    max_output_count = 0

    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())
        coord = tf.train.Coordinator()
        if CONTINUE != 0:
            model_file=tf.train.latest_checkpoint(ckpt_dir)
            saver.restore(sess,model_file)
        threads = tf.train.start_queue_runners(sess, coord)
        try:
            i = start_step
            avg_accuracy = 0
            avg_loss = 0
            j = 0
            while i < 220000:    
                batch_data, batch_label = data.batch_train_set() 
                batch_data_vec, data_length = word2vector(batch_data, model,max_length)
                naive_bayes_input = rp.naive_bayes_model(batch_data,word_matrix, word_list)
                rnn_output_, final_state_, y, loss, accuracy, _ = sess.run([rnn_outputs, final_state, prediction, cross_entropy, cal_accuracy, train_step],
                                                feed_dict={x_rnn:batch_data_vec, 
                                                           x_naive_bayes:naive_bayes_input,
                                                           y_:batch_label, 
                                                           x_length:data_length})

                #print()
                #print(rnn_output_)
                #print(final_state_)
                #print()
                avg_accuracy += accuracy
                avg_loss += loss
                i += 1
                #print(".",end="")
                if i%1000 == 0:
                    print("-----------%d-----------"%i)
                    print("Accuracy at %d: %.2f%%"%(i,avg_accuracy/10))
                    print("Loss at %d：%.4f"%(i, avg_loss/1000))
                    saver.save(sess,ckpt_dir+'news_category',global_step=i)
                    log.add_log('train_accuracy',i, avg_accuracy/1000)
                    log.add_log('train_loss',i, avg_loss/1000)
                    log.SaveToFile() 
                    avg_accuracy = 0
                    avg_loss = 0

        except tf.errors.OutOfRangeError:
            print('done')
        finally:
            coord.request_stop()
            coord.join(threads)
            print('quit')

-----------201000-----------
Accuracy at 201000: 76.69%
Loss at 201000：0.9161
-----------202000-----------
Accuracy at 202000: 77.14%
Loss at 202000：0.8979
-----------203000-----------
Accuracy at 203000: 77.00%
Loss at 203000：0.9003
-----------204000-----------
Accuracy at 204000: 76.61%
Loss at 204000：0.9204
-----------205000-----------
Accuracy at 205000: 77.29%
Loss at 205000：0.8905
-----------206000-----------
Accuracy at 206000: 76.89%
Loss at 206000：0.9046
-----------207000-----------
Accuracy at 207000: 76.72%
Loss at 207000：0.9142
-----------208000-----------
Accuracy at 208000: 77.29%
Loss at 208000：0.8912
-----------209000-----------
Accuracy at 209000: 76.82%
Loss at 209000：0.9068
-----------210000-----------
Accuracy at 210000: 76.84%
Loss at 210000：0.9103
-----------211000-----------
Accuracy at 211000: 77.25%
Loss at 211000：0.8919
-----------212000-----------
Accuracy at 212000: 76.86%
Loss at 212000：0.9089
-----------213000-----------
Accuracy at 213000: 76.82%
Loss at 

## Run the model for the test dataset
Output the accuracy and the loss of the test dataset

In [13]:
with graph.as_default():
    with tf.Session() as sess:
        model_file=tf.train.latest_checkpoint(ckpt_dir)
        saver.restore(sess,model_file)
        total_accuracy = 0
        total_loss = 0
        #print(int(data.test_size/data.batch_size)+1)
        for i in range(int(data.test_size/data.batch_size)+1):
            batch_data, batch_label = data.batch_test_set()
            batch_data_vec, data_length = word2vector(batch_data, model,max_length)
            naive_bayes_input = rp.naive_bayes_model(batch_data,word_matrix, word_list)
            y, loss, accuracy = sess.run([prediction, cross_entropy, cal_accuracy],
                                                feed_dict={x_rnn:batch_data_vec, 
                                                           x_naive_bayes:naive_bayes_input,
                                                           y_:batch_label, 
                                                           x_length:data_length})

            total_accuracy += accuracy
            total_loss += loss
            #print(i,accuracy,loss)

        total_accuracy = total_accuracy/i
        total_loss = total_loss/i
        print("Test set accuracy: %.2f %%"%(total_accuracy*100))
        print("Test set loss: %.2f "%(total_loss))
    

Test set accuracy: 51.78 %
Test set loss: 1.96 


## Run the model for the train dataset
Output the accuracy and the loss based on the whole train dataset

In [None]:
with graph.as_default():
    with tf.Session() as sess:
        model_file=tf.train.latest_checkpoint(ckpt_dir)
        saver.restore(sess,model_file)
        total_accuracy = 0
        total_loss = 0
        #print(int(data.test_size/data.batch_size)+1)
        for i in range(int(data.train_size/data.batch_size)+1):
            batch_data, batch_label = data.batch_train_set()  
            batch_data_vec, data_length = word2vector(batch_data, model,max_length)
            naive_bayes_input = rp.naive_bayes_model(batch_data,word_matrix, word_list)
            y, loss, accuracy = sess.run([prediction, cross_entropy, cal_accuracy],
                                                feed_dict={x_rnn:batch_data_vec, 
                                                           x_naive_bayes:naive_bayes_input,
                                                           y_:batch_label, 
                                                           x_length:data_length})

            total_accuracy += accuracy
            total_loss += loss
            #print(i,accuracy,loss)

        total_accuracy = total_accuracy/i
        total_loss = total_loss/i
        print("Train set accuracy: %.2f %%"%(total_accuracy*100))
        print("Train set loss: %.2f "%(total_loss))
    

## Train my own word vectors

In [None]:
    
def distinct_words(corpus):
    """ Determine a list of distinct words for the corpus.
        Params:
            corpus (list of list of strings): corpus of documents
        Return:
            corpus_words (list of strings): list of distinct words across the corpus, sorted (using python 'sorted' function)
            num_corpus_words (integer): number of distinct words across the corpus
    """
    corpus_words = []
    num_corpus_words = -1
    
    # ------------------
    # Write your implementation here.
    flat_corpus = [y for x in corpus for y in x]
    for x in flat_corpus: 
        if x not in corpus_words: 
            corpus_words.append(x)
    corpus_words.sort()
    num_corpus_words = len(corpus_words)
    #print(corpus_words)
    # ------------------

    return corpus_words, num_corpus_words

def compute_co_occurrence_matrix(corpus, window_size=4):
    """ Compute co-occurrence matrix for the given corpus and window_size (default of 4).
    
        Note: Each word in a document should be at the center of a window. Words near edges will have a smaller
              number of co-occurring words.
              
              For example, if we take the document "START All that glitters is not gold END" with window size of 4,
              "All" will co-occur with "START", "that", "glitters", "is", and "not".
    
        Params:
            corpus (list of list of strings): corpus of documents
            window_size (int): size of context window
        Return:
            M (numpy matrix of shape (number of corpus words, number of corpus words)): 
                Co-occurence matrix of word counts. 
                The ordering of the words in the rows/columns should be the same as the ordering of the words given by the distinct_words function.
            word2Ind (dict): dictionary that maps word to index (i.e. row/column number) for matrix M.
    """
    words, num_words = distinct_words(corpus)
    print('Total %d distinct words in the corpus.'%num_words)
    M = None
    word2Ind = {}
    
    # ------------------
    # Write your implementation here.
    M = [[0 for i in range(num_words)] for j in range (num_words)]
    index = 0
    for w in words:
        word2Ind[w]=index
        index += 1
    for s in corpus:
        pos = 0
        for c in s:
            substr = []
            for i in range(1,window_size+1):                
                if pos-i >= 0 and s[pos-i] not in substr:
                    substr.append(s[pos-i])
                if pos+i <len(s) and s[pos+i] not in substr:           
                    substr.append(s[pos+i])
            for n in substr:
                M[word2Ind[c]][word2Ind[n]] += 1
            pos += 1
    # ------------------
    print("M shape", len(M), len(M[0]))
    M = np.array(M)
    return M, word2Ind

START_TOKEN = '<START>'
END_TOKEN = '<END>'
corpus = []

def reduce_to_k_dim(M, k=2):
    """ Reduce a co-occurence count matrix of dimensionality (num_corpus_words, num_corpus_words)
        to a matrix of dimensionality (num_corpus_words, k) using the following SVD function from Scikit-Learn:
            - http://scikit-learn.org/stable/modules/generated/sklearn.decomposition.TruncatedSVD.html
    
        Params:
            M (numpy matrix of shape (number of corpus words, number of corpus words)): co-occurence matrix of word counts
            k (int): embedding size of each word after dimension reduction
        Return:
            M_reduced (numpy matrix of shape (number of corpus words, k)): matrix of k-dimensioal word embeddings.
                    In terms of the SVD from math class, this actually returns U * S
    """    
    n_iters = 10     # Use this parameter in your call to `TruncatedSVD`
    M_reduced = None
    print("Running Truncated SVD over %i words..." % (M.shape[0]))
    
    # ------------------
    # Write your implementation here.
    svd = TruncatedSVD(n_components=k)
    M_reduced=svd.fit_transform(M)
    # ------------------

    print("Done.")
    return M_reduced

for title in data.train_data:
    corpus.append([START_TOKEN] + title + [END_TOKEN])
matrix, word2ind = compute_co_occurrence_matrix(corpus, window_size=2)
print(matrix[0])
print(len(word2ind))