In [2]:
import numpy as np
import random
import threading
import warnings
import os
import sys
import nltk
sys.path.append('../../common/')

from train_log import train_log

os.environ["CUDA_VISIBLE_DEVICES"]="1,2" 
os.environ["TF_CPP_MIN_LOG_LEVEL"] = "3"
warnings.filterwarnings("ignore")
import tensorflow as tf
tf.logging.set_verbosity(tf.logging.FATAL)

FILE_NAME = 'News_Category_Dataset_v2.json'

from gensim.test.utils import datapath, get_tmpfile
from gensim.models import KeyedVectors
from gensim.scripts.glove2word2vec import glove2word2vec

glove_file = datapath('/home/ubuntu/Notebooks/GloVe/glove.6B.100d.txt')
word2vec_glove_file = get_tmpfile("glove.6B.100d.word2vec.txt")
glove2word2vec(glove_file, word2vec_glove_file)

model = KeyedVectors.load_word2vec_format(word2vec_glove_file)

ckpt_dir = './ckpt_save_cnn/'
if not os.path.exists(ckpt_dir):
    os.makedirs(ckpt_dir)
log_dir = './log_cnn/'
log = train_log(log_dir)
if not os.path.exists(log_dir):
    os.makedirs(log_dir)

CONTINUE = 1
start_step = 150000

In [3]:
def getvector(word, model=model):
    if model.vocab.get(word.lower(),"NaN") == "NaN":
        lst = nltk.stem.LancasterStemmer()
        if model.vocab.get(lst.stem(word).lower(),"NaN") == "NaN":
            if word[-1] == "s" and model.vocab.get(word[:-1].lower(),"NaN") != "NaN":
                return model[word[:-1].lower()]
            else:
                return model["unk"]
        else:
            return model[lst.stem(word).lower()]
    else:
        return model[word.lower()]

def word2vector(reviews, model=model, max_length=1000):
    vector_data = np.zeros((len(reviews), max_length,100))
    x_length = []
    i = 0
    for review in reviews:
        j = 0 
        if len(review) > max_length:
            print("The length of the reviews is %dwhich is larger than max_length (%d)"%(len(review),max_length))
            print(review)
            print("-"*10)
        for word in review:
            vector_data[i,j] = getvector(word,model)
            if str(vector_data[i,j,0])=='nan':
                print(word)
                break
            j += 1
        x_length.append(j)
        i += 1
    return vector_data, np.asarray(x_length)


In [11]:
from NewsCategoryData import NewsCategory

'''
data = NewsCategory(batch_size=128)

for i in range(10):
    train_data, train_label = data.get_batch_data()
    batch_data, data_length = word2vector(train_data, model)
    #print(len(train_data),len(train_label))
    #print("-"*20)
    #print(train_data, train_label)
  
max_len = 0
for text in data.data:
    if len(text) > max_len:
        max_len = len(text)
print(max_len)
'''
def getvector2(word, model=model):
    if model.vocab.get(word.lower(),"NaN") == "NaN":
        lst = nltk.stem.LancasterStemmer()
        if model.vocab.get(lst.stem(word).lower(),"NaN") == "NaN":
            if word[-1] == "s" and model.vocab.get(word[:-1].lower(),"NaN") != "NaN":
                #print(word[:-1])
                return word[:-1].lower()
            else:
                return "unk"
        else:
            return lst.stem(word).lower()
    else:
        return word.lower()
#print(getvector2("2018s",model))
'''
UNK_DIC = []
KNW_DIC = []
count = 0
for item in data.data:
    for word in item:
        word_vec = getvector2(word, model)
        if word_vec == "unk":
            if word not in UNK_DIC:
                UNK_DIC.append(word)
        else:
            if word not in KNW_DIC:
                KNW_DIC.append(word)
        #print(word_vec,end=" ")
    #print()
    count += 1

print("Unknown words %d in total %d words."%(len(UNK_DIC),len(KNW_DIC)))

for word in UNK_DIC:
    print(word,end=", ")'''


'\nUNK_DIC = []\nKNW_DIC = []\ncount = 0\nfor item in data.data:\n    for word in item:\n        word_vec = getvector2(word, model)\n        if word_vec == "unk":\n            if word not in UNK_DIC:\n                UNK_DIC.append(word)\n        else:\n            if word not in KNW_DIC:\n                KNW_DIC.append(word)\n        #print(word_vec,end=" ")\n    #print()\n    count += 1\n\nprint("Unknown words %d in total %d words."%(len(UNK_DIC),len(KNW_DIC)))\n\nfor word in UNK_DIC:\n    print(word,end=", ")'

In [6]:
batch_size = 64
state_size = 64
max_length = 100
n_classes = 41
word2vector_len = 100

with tf.variable_scope('Input'):
    x = tf.placeholder(tf.float32, [None, max_length, word2vector_len], name="x")
    x_length = tf.placeholder(tf.int32, [None], name="x_length")
    y_ = tf.placeholder(tf.int64, [None],  name="y_")
    
    #RNN的初始化状态，全设为零。注意state是与input保持一致，接下来会有concat操作，所以这里要有batch的维度。即每个样本都要有隐层状态
    init_state = tf.zeros([batch_size, state_size], name="init_state")

with tf.variable_scope('RNN'):
    #定义rnn_cell的权重参数，
    cell = tf.contrib.rnn.BasicRNNCell(state_size, name = "cell")
    #rnn_outputs, final_state = tf.nn.dynamic_rnn(cell, rnn_inputs, initial_state=init_stateu
    rnn_outputs, final_state = tf.nn.dynamic_rnn(cell, inputs = x, 
                                                 dtype=tf.float32,
                                                 #initial_state=init_state,
                                                 sequence_length = x_length)
with tf.variable_scope('fclayer'):
    w1 = tf.Variable(tf.random_uniform([state_size, int(state_size*9)],0,1.0),name="w1")
    b1= tf.Variable(tf.zeros([int(state_size*9)]),name="b1")
    hidden = tf.sigmoid(tf.add(tf.matmul(final_state, w1), b1),name="hidden")
    #print(y.shape)

with tf.variable_scope('Output'):
    w2 = tf.Variable(tf.random_uniform([int(state_size*9), n_classes],0,1.0),name="w2")
    b2= tf.Variable(tf.zeros([n_classes]),name="b2")
    y = tf.add(tf.matmul(hidden, w2), b2,name="y")
    
with tf.variable_scope('Loss'):
    cross_entropy = tf.reduce_mean(tf.nn.sparse_softmax_cross_entropy_with_logits(logits=y,
                                                                                  labels=y_, 
                                                                                  name="cross_entropy_loss"))
    #cross_entropy_3 = tf.cast(tf.one_hot(y_-1,n_classes),tf.float32)*tf.log(tf.nn.softmax(y))
with tf.variable_scope('Prediction'):
    prediction = tf.argmax(tf.nn.softmax(y),1, name="prediction")
    cal_accuracy = tf.reduce_mean(tf.cast(tf.equal(prediction,y_), tf.float32),name="cal_accuracy")
    
with tf.variable_scope('Train'):
    #train_step = tf.train.GradientDescentOptimizer(0.01).minimize(cross_entropy)
    train_step = tf.train.AdamOptimizer(0.000000001).minimize(cross_entropy)
    



In [5]:
print(rnn_outputs.shape)
print(final_state.shape)
print(y.shape)

(?, 100, 64)
(?, 64)
(?, 41)


In [9]:
saver = tf.train.Saver(max_to_keep = 1)

with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    coord = tf.train.Coordinator()
    if CONTINUE != 0:
        model_file=tf.train.latest_checkpoint(ckpt_dir)
        saver.restore(sess,model_file)
    threads = tf.train.start_queue_runners(sess, coord)
    try:
        data = NewsCategory(batch_size=batch_size,max_length=max_length)
        i = start_step
        avg_accuracy = 0
        avg_loss = 0
        while i < 200000:    
            batch_data, batch_label = data.get_batch_data()  
            batch_data, data_length = word2vector(batch_data, model,max_length)
            rnn_state, state, pred_value, y_value, loss, accuracy, _ = sess.run([rnn_outputs, final_state, y, prediction, cross_entropy, cal_accuracy, train_step],
                                            feed_dict={x:batch_data, y_:batch_label, x_length:data_length})
            '''
            print("-----------------------------")
            print("batch_data")
            print(batch_data)
            print("rnn_state")
            print(rnn_state)
            print("state")
            print(state)
            print("y_value")
            print(y_value)
            print("batch_label")
            print(batch_label)
            print("pred_value")
            print(pred_value)
            '''
            avg_accuracy += accuracy
            avg_loss += loss
            i += 1
            #print(".",end="")
            if i%1000 == 0:
                print("-----------%d-----------"%i)
                print("Accuracy at %d: %.2f%%"%(i,avg_accuracy/10))
                print("Loss at %d：%.4f"%(i, avg_loss/1000))
                saver.save(sess,ckpt_dir+'news_category',global_step=i)
                log.add_log('train_accuracy',i, avg_accuracy/1000)
                log.add_log('train_loss',i, avg_loss/1000)
                log.SaveToFile() 
                avg_accuracy = 0
                avg_loss = 0

    except tf.errors.OutOfRangeError:
        print('done')
    finally:
        coord.request_stop()
        coord.join(threads)
        print('quit')
        

ValueError: At least two variables have the same name: Output/b2

In [8]:
batch_size = 16
#max_length = 500
with tf.Session() as sess:
    graph = tf.get_default_graph()
    saver = tf.train.import_meta_graph(ckpt_dir+'news_category-100000.meta')
    model_file=tf.train.latest_checkpoint(ckpt_dir)
    saver.restore(sess,model_file)
    
    x = graph.get_tensor_by_name("Input/x:0")
    y_ = graph.get_tensor_by_name("Input/y_:0")
    x_length = graph.get_tensor_by_name("Input/x_length:0")
    y = graph.get_tensor_by_name("Output/y:0")
    prediction = graph.get_tensor_by_name("Prediction/prediction:0")
    #cal_accuracy = graph.get_tensor_by_name("Prediction/cal_accuracy:0")
    data = NewsCategory(batch_size=batch_size,max_length=max_length)
    i = 0
    avg_accuracy = 0
    while i < 10:
        batch_data, batch_label = data.get_batch_data()  
        batch_data_vec, data_length = word2vector(batch_data, model,max_length)
        logits, pred_cate = sess.run([y, prediction],feed_dict={x:batch_data_vec, y_:batch_label, x_length:data_length})
        for j in range(batch_size):
            #print("logits:",logits[j])
            if batch_label[j] != pred_cate[j]:
                #print("batch data:",batch_data[j])
                print("label, predicton: ",batch_label[j], pred_cate[j])
        #avg_accuracy += accuracy[0]
        i += 1
        

label, predicton:  25 31
label, predicton:  26 2
label, predicton:  20 32
label, predicton:  10 4
label, predicton:  24 37
label, predicton:  10 1
label, predicton:  21 19
label, predicton:  14 32
label, predicton:  38 4
label, predicton:  13 4
label, predicton:  8 4
label, predicton:  35 8
label, predicton:  0 4
label, predicton:  25 31
label, predicton:  32 1
label, predicton:  3 31
label, predicton:  25 31
label, predicton:  6 9
label, predicton:  21 16
label, predicton:  9 1
label, predicton:  1 16
label, predicton:  4 1
label, predicton:  12 33
label, predicton:  11 31
label, predicton:  7 31
label, predicton:  2 4
label, predicton:  0 4
label, predicton:  2 4
label, predicton:  36 34
label, predicton:  9 10
label, predicton:  33 31
label, predicton:  4 12
label, predicton:  4 0
label, predicton:  32 13
label, predicton:  4 8
label, predicton:  11 31
label, predicton:  36 37
label, predicton:  13 4
label, predicton:  24 12
label, predicton:  1 0
label, predicton:  3 5
label, predi