# Journalist Classification with TensorFlow

In [1]:
import tensorflow as tf
import numpy as np
import pandas as pd
from tensorflow import keras

  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])
  from ._conv import register_converters as _register_converters
  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])


import the dataset

In [2]:
articles = pd.read_csv("articles.csv")

Split into X and y

In [10]:
y = articles["author"]
X = articles["article"]

Define function next_batch (obtained stack_overflow)

In [11]:
def next_batch(num, data, labels):
    '''
    Return a total of `num` random samples and labels. 
    '''
    idx = np.arange(0 , len(data))
    np.random.shuffle(idx)
    idx = idx[:num]
    data_shuffle = [data[i] for i in idx]
    labels_shuffle = [labels[i] for i in idx]

    return np.asarray(data_shuffle), np.asarray(labels_shuffle)


Now, we've had all ingredients to build a logistic or softmax classification. Let's build a simple softmax classification.

In [12]:
# Importing necessary libraries
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
# 80-20 splitting the dataset (80%->Training and 20%->Validation)

X_train, X_test, y_train, y_test = train_test_split(X, y
                                   ,test_size=0.2, random_state=1234)
y_train = np.array(y_train)
y_test = np.array(y_test)

# defining the bag-of-words transformer on the text-processed corpus # i.e., text_process() declared in II is executed...
bow_transformer=CountVectorizer(max_features = 2500).fit(X_train)
# transforming into Bag-of-Words and hence textual data to numeric..
text_bow_train=bow_transformer.transform(X_train).toarray()#ONLY TRAINING DATA

# transforming into Bag-of-Words and hence textual data to numeric..
text_bow_test=bow_transformer.transform(X_test).toarray()#TEST DATA

In [13]:
tf.reset_default_graph()

n_inputs = 1 * 2500
n_hidden1 = 300
n_hidden2 = 100
n_outputs = 58
learning_rate = 0.01

X = tf.placeholder(tf.float32, shape=[None, n_inputs], name='X')
y = tf.placeholder(tf.int64, shape=[None], name='y')

W = tf.Variable(tf.truncated_normal([n_inputs, n_outputs], stddev=0.02), name='weights')
b = tf.Variable(tf.zeros([n_outputs]), name='biases')

logits = tf.add(tf.matmul(X, W), b, name='logits')

with tf.name_scope('evaluation'):
    xentropy = tf.nn.sparse_softmax_cross_entropy_with_logits(labels=y, logits=logits, name='xentropy')
    loss = tf.reduce_mean(xentropy, name='loss')
    correct = tf.nn.in_top_k(logits, y, 1)
    accuracy = tf.reduce_mean(tf.cast(correct, tf.float32), name='accuracy')

with tf.name_scope("train"):
    grad_W, grad_b = tf.gradients(loss, [W, b])
    update_W = tf.assign(W, W - learning_rate * grad_W)
    update_b = tf.assign(b, b - learning_rate * grad_b)

init = tf.global_variables_initializer()

In [14]:
n_epochs = 200
batch_size = 100

with tf.Session() as sess:
    init.run()
    print("Epoch\tTrain accuracy\tTest accuracy")
    for epoch in range(n_epochs):
        for iteration in range(len(text_bow_train) // batch_size):
            X_batch, y_batch = next_batch(batch_size,text_bow_train,y_train)
            sess.run([update_W, update_b], feed_dict={X: X_batch, y: y_batch})
        acc_train = accuracy.eval(feed_dict={X: X_batch, y: y_batch})
        acc_test = accuracy.eval(feed_dict={X: text_bow_test, y: y_test})
        print('%d\t%f\t%f' % (epoch, acc_train, acc_test))
    classification = tf.run(y, feed_dict)
    print(classification)
        

Epoch	Train accuracy	Test accuracy
0	0.230000	0.217593
1	0.320000	0.268519
2	0.410000	0.310185
3	0.420000	0.337963
4	0.530000	0.365741
5	0.520000	0.388889
6	0.520000	0.398148
7	0.480000	0.421296
8	0.600000	0.421296
9	0.550000	0.444444
10	0.660000	0.449074
11	0.640000	0.467593
12	0.680000	0.481481
13	0.650000	0.495370
14	0.710000	0.500000
15	0.670000	0.504630
16	0.790000	0.509259
17	0.790000	0.513889
18	0.740000	0.537037
19	0.800000	0.541667
20	0.820000	0.550926
21	0.750000	0.550926
22	0.790000	0.555556
23	0.800000	0.564815
24	0.850000	0.574074
25	0.810000	0.569444
26	0.790000	0.587963
27	0.780000	0.583333
28	0.840000	0.583333
29	0.820000	0.587963
30	0.830000	0.597222
31	0.880000	0.597222
32	0.820000	0.601852
33	0.860000	0.597222
34	0.900000	0.597222
35	0.870000	0.606481
36	0.840000	0.615741
37	0.820000	0.620370
38	0.890000	0.615741
39	0.900000	0.615741
40	0.880000	0.634259
41	0.890000	0.634259
42	0.870000	0.634259
43	0.910000	0.638889
44	0.910000	0.643519
45	0.870000	0.634259
46	0.9300

AttributeError: module 'tensorflow' has no attribute 'run'

The accuracy was not bad, roughly 64%. Compared to the 49% from the Naive Bayes, this is a tremendeous increase. However, Now let's try it with tensorflow

In [21]:
import tensorflow as tf
tf.reset_default_graph()

n_inputs = 1 * 5000
n_hidden1 = 300
n_hidden2 = 100
n_outputs = 58
learning_rate = 0.01

X = tf.placeholder(tf.float32, shape=[None, n_inputs], name='X')
y = tf.placeholder(tf.int64, shape=[None], name='y')

W = tf.Variable(tf.truncated_normal([n_inputs, n_outputs], stddev=0.02), name='weights')
b = tf.Variable(tf.zeros([n_outputs]), name='biases')

logits = tf.add(tf.matmul(X, W), b, name='logits')

with tf.name_scope('evaluation'):
    xentropy = tf.nn.sparse_softmax_cross_entropy_with_logits(labels=y, logits=logits, name='xentropy')
    loss = tf.reduce_mean(xentropy, name='loss')
    correct = tf.nn.in_top_k(logits, y, 1)
    accuracy = tf.reduce_mean(tf.cast(correct, tf.float32), name='accuracy')

with tf.name_scope("train"):
    optimizer = tf.train.GradientDescentOptimizer(learning_rate)
    training_op = optimizer.minimize(loss)

init = tf.global_variables_initializer()

In [22]:
n_epochs = 200
batch_size = 50

with tf.Session() as sess:
    init.run()
    print("Epoch\tTrain accuracy\tTest accuracy")
    for epoch in range(n_epochs):
        for iteration in range(len(text_bow_train) // batch_size):
            X_batch, y_batch = next_batch(batch_size,text_bow_train,y_train)
            sess.run(training_op, feed_dict={X: X_batch, y: y_batch})
        acc_train = accuracy.eval(feed_dict={X: X_batch, y: y_batch}) 
        acc_test = accuracy.eval(feed_dict={X: text_bow_test, y: y_test})
        if epoch % 10 == 0:
            print("{}\t{}\t{}".format(epoch, acc_train, acc_test))   
        

Epoch	Train accuracy	Test accuracy
0	0.420000	0.273148
1	0.480000	0.310185
2	0.560000	0.351852
3	0.620000	0.416667
4	0.720000	0.444444
5	0.560000	0.458333
6	0.720000	0.472222
7	0.840000	0.472222
8	0.900000	0.490741
9	0.780000	0.504630
10	0.820000	0.523148
11	0.900000	0.523148
12	0.940000	0.541667
13	0.900000	0.541667
14	0.920000	0.537037
15	0.960000	0.564815
16	0.940000	0.569444
17	0.920000	0.555556
18	0.960000	0.560185
19	0.820000	0.583333
20	0.900000	0.583333
21	0.940000	0.587963
22	0.880000	0.574074
23	0.940000	0.597222
24	0.860000	0.601852
25	0.940000	0.587963
26	0.940000	0.597222
27	0.980000	0.601852
28	0.960000	0.601852
29	0.940000	0.620370
30	0.940000	0.620370
31	0.960000	0.620370
32	0.980000	0.620370
33	0.960000	0.620370
34	0.900000	0.625000
35	0.900000	0.615741
36	0.940000	0.629630
37	0.960000	0.634259
38	0.980000	0.629630
39	0.940000	0.634259
40	0.920000	0.629630
41	0.940000	0.643519
42	0.980000	0.652778
43	0.940000	0.634259
44	0.960000	0.648148
45	1.000000	0.643519
46	1.0000

Another Optimizer:

In [17]:
import tensorflow as tf
tf.reset_default_graph()

n_inputs = 1 * 2500
n_hidden1 = 300
n_hidden2 = 100
n_outputs = 58
learning_rate = 0.01

X = tf.placeholder(tf.float32, shape=[None, n_inputs], name='X')
y = tf.placeholder(tf.int64, shape=[None], name='y')

W = tf.Variable(tf.truncated_normal([n_inputs, n_outputs], stddev=0.02), name='weights')
b = tf.Variable(tf.zeros([n_outputs]), name='biases')

logits = tf.add(tf.matmul(X, W), b, name='logits')

with tf.name_scope('evaluation'):
    xentropy = tf.nn.sparse_softmax_cross_entropy_with_logits(labels=y, logits=logits, name='xentropy')
    loss = tf.reduce_mean(xentropy, name='loss')
    correct = tf.nn.in_top_k(logits, y, 1)
    accuracy = tf.reduce_mean(tf.cast(correct, tf.float32), name='accuracy')

with tf.name_scope("train"):
    optimizer = tf.train.AdamOptimizer(learning_rate)
    training_op = optimizer.minimize(loss)

init = tf.global_variables_initializer()

Train the model

In [22]:
n_epochs = 50
batch_size = 50

with tf.Session() as sess:
    init.run()
    print("Epoch\tTrain accuracy\tTest accuracy")
    for epoch in range(n_epochs):
        for iteration in range(len(text_bow_train) // batch_size):
            X_batch, y_batch = next_batch(batch_size,text_bow_train,y_train)
            sess.run(training_op, feed_dict={X: X_batch, y: y_batch})
        acc_train = accuracy.eval(feed_dict={X: X_batch, y: y_batch})
        acc_test = accuracy.eval(feed_dict={X: text_bow_test, y: y_test})
        if epoch % 10 == 0:
            print("{}\t{}\t{}".format(epoch, acc_train, acc_test))   
#     confusion = tf.confusion_matrix(labels=y_, predictions=y, num_classes=num_classes)
#     print(confusion)
    

Epoch	Train accuracy	Test accuracy
0	0.8600000143051147	0.5416666865348816
10	1.0	0.6574074029922485
20	1.0	0.6481481194496155
30	1.0	0.6435185074806213
40	1.0	0.6527777910232544


So with the Gradient Descent Models we get somewhere aroound 65%. As Next, I will check how the accuracy can be improved by a Deep Forward Neural Network








# Deep Forward Neural Network
It will be done with the standard settings

In [26]:
import tensorflow as tf
import numpy as np

tf.reset_default_graph()

num_inputs = 1 * 2500
num_hidden1 = 300
num_hidden2 = 100
num_outputs = 58
learning_rate = 0.01

x = tf.placeholder(tf.float32, shape=(None, num_inputs), name="x")
y = tf.placeholder(tf.int64, shape=(None), name="y")

with tf.name_scope("dnn"):
    hidden1 = tf.layers.dense(x, num_hidden1, name="hidden1", activation=tf.nn.relu)
    hidden2 = tf.layers.dense(hidden1, num_hidden2, name="hidden2", activation=tf.nn.relu)
    hidden3 = tf.layers.dense(hidden1, num_hidden2, name="hidden3", activation=tf.nn.relu)
    logits = tf.layers.dense(hidden3, num_outputs, name="outputs")

with tf.name_scope("loss"):
    xentropy = tf.nn.sparse_softmax_cross_entropy_with_logits(labels=y, logits=logits)
    loss = tf.reduce_mean(xentropy, name="loss")

with tf.name_scope("train"):
    optimizer = tf.train.GradientDescentOptimizer(learning_rate)
    grads = optimizer.compute_gradients(loss)    
    training_op = optimizer.apply_gradients(grads)
    
    for var in tf.trainable_variables():
        tf.summary.histogram(var.op.name + "/values", var)
        
    for grad, var in grads:
        if grad is not None:
            tf.summary.histogram(var.op.name + "/gradients", grad)

with tf.name_scope("eval"):
    correct = tf.nn.in_top_k(logits, y, 1)
    accuracy = tf.reduce_mean(tf.cast(correct, tf.float32))
    


# summary
accuracy_summary = tf.summary.scalar('accuracy', accuracy)

# merge all summary
tf.summary.histogram('hidden1/activations', hidden1)
tf.summary.histogram('hidden2/activations', hidden2)

merged = tf.summary.merge_all()

init = tf.global_variables_initializer()
saver = tf.train.Saver()

from datetime import datetime
now = datetime.utcnow().strftime("%Y%m%d%H%M%S")
root_logdir = "tf_logs/example03/dnn_final"
logdir = "{}/run-{}/".format(root_logdir, now)

train_writer = tf.summary.FileWriter(logdir + 'train', tf.get_default_graph())
test_writer = tf.summary.FileWriter(logdir + 'test', tf.get_default_graph())

In [27]:
num_epochs = 200
batch_size = 100

with tf.Session() as sess:
    init.run()
    print("Epoch\tTrain accuracy\tTest accuracy")
    for epoch in range(num_epochs):
        for iteration in range(len(text_bow_train) // batch_size):
            x_batch, y_batch = next_batch(batch_size,text_bow_train,y_train)
            sess.run(training_op, feed_dict={x: x_batch, y: y_batch})
            
        train_summary, acc_train = sess.run([merged, accuracy],
                                             feed_dict={x: x_batch, y: y_batch})
        
        test_summary, acc_test = sess.run([accuracy_summary, accuracy],
                                          feed_dict={x: text_bow_test, y: y_test})
        
        train_writer.add_summary(train_summary, epoch)
        test_writer.add_summary(test_summary, epoch)
        if epoch % 10 == 0:
            print("{}\t{}\t{}".format(epoch, acc_train, acc_test))   
        
    save_path = saver.save(sess, "models/example03/dnn_final.ckpt")
    
train_writer.close()
test_writer.close()

Epoch	Train accuracy	Test accuracy
0	0.10999999940395355	0.06481481343507767
10	0.5	0.33796295523643494
20	0.6899999976158142	0.4166666567325592
30	0.8199999928474426	0.5138888955116272
40	0.9100000262260437	0.5694444179534912
50	0.949999988079071	0.5648148059844971
60	0.949999988079071	0.5694444179534912
70	0.9700000286102295	0.5833333134651184
80	0.9800000190734863	0.5925925970077515
90	0.9700000286102295	0.6064814925193787
100	0.9599999785423279	0.6111111044883728
110	0.9900000095367432	0.6157407164573669
120	0.9700000286102295	0.6342592835426331
130	0.9900000095367432	0.6388888955116272
140	0.9900000095367432	0.625
150	1.0	0.6296296119689941
160	1.0	0.6296296119689941
170	0.9900000095367432	0.6342592835426331
180	0.9900000095367432	0.6342592835426331
190	1.0	0.6296296119689941


The DNN was finally equally good with article length of 5000
