In [1]:
import tensorflow as tf
import numpy as np
from tensorflow.contrib import rnn
from random import randint
import gensim
import pickle
from sklearn.model_selection import train_test_split
from nltk.tokenize import word_tokenize
from sklearn.preprocessing import OneHotEncoder
import pandas as pd

This notebook shows the development of an LSTM model using TensorFlow. Data is read from preprocessed texts in a CSV, transformed into the appropriate form for the model, then fed into an LSTM.

Note that due to computational constraints, this was entirely done on the University of Potsdam GPU servers.

# Import Data

In [2]:
%cd '/data/hyperpartisan-news-detection'

/data/hyperpartisan-news-detection


In [3]:
X_test = pd.read_csv('df/test_df.csv')['text']
y_test = pd.read_csv('df/test_df.csv')['label']
X_train = pd.read_csv('df/train_df.csv')['text']
y_train = pd.read_csv('df/train_df.csv')['label']
X_byarticle = pd.read_csv('df/byarticle_df.csv')['text']
y_byarticle = pd.read_csv('df/byarticle_df.csv')['label']

In [3]:
X_test = pd.read_csv('df/test_agree_df.csv')['text']
y_test = pd.read_csv('df/test_agree_df.csv')['label']
X_train = pd.read_csv('df/train_agree_df.csv')['text']
y_train = pd.read_csv('df/train_agree_df.csv')['label']
X_byarticle = pd.read_csv('df/byarticle_df.csv')['text']
y_byarticle = pd.read_csv('df/byarticle_df.csv')['label']
#X_byarticle = pd.read_csv('df/byarticle_agree_df.csv')['text']
#y_byarticle = pd.read_csv('df/byarticle_agree_df.csv')['label']

# finish data input

In [4]:
enc = OneHotEncoder(handle_unknown='ignore', sparse=False)
y_test = np.array(enc.fit_transform(np.array(y_test).reshape(-1, 1)))
y_train = np.array(enc.fit_transform(np.array(y_train).reshape(-1, 1)))
#y_train_main = np.array(enc.fit_transform(np.array(y_train_main).reshape(-1, 1)))
#y_train_holdout = np.array(enc.fit_transform(np.array(y_train_holdout).reshape(-1, 1)))
y_byarticle = np.array(enc.fit_transform(np.array(y_byarticle).reshape(-1, 1)))
#y_clean = np.array(enc.fit_transform(np.array(y_clean).reshape(-1, 1)))

In [5]:
#X_total = np.array(X_total)
#X_clean = np.array(X_clean)
X_train = np.array(X_train)
X_test = np.array(X_test)
#X_train_main = np.array(X_train_main)
#X_train_holdout = np.array(X_train_holdout)
X_byarticle = np.array(X_byarticle)

In [6]:
X_total = np.concatenate((X_train, X_test, X_byarticle))
y_total = np.concatenate((y_train, y_test, y_byarticle))

## Prepare word embedding matrices (one-time necessity)

# Model

In [7]:
batch_size = 16
lstm_units = 50
num_classes = 2
max_sent_length = 100
iterations = 16000 #8k for train-main, 12k for total

In [8]:
#embedding_matrix = np.load('word-embeddings/glove100_vectors.npy')
#word2idx = pickle.load(open("word-embeddings/glove100_indices.pkl", "rb" ))
embedding_matrix = np.load('word-embeddings/embed_total_vec.npy')
word2idx = pickle.load(open("word-embeddings/embed_total_idx.pkl", "rb" ))

word_embed_size = embedding_matrix.shape[1]
embed_vocab_size = embedding_matrix.shape[0]

embedding_matrix = np.append(embedding_matrix, np.zeros([1, word_embed_size]), axis=0)
embedding_matrix.shape

(457197, 50)

In [9]:
def nextBatch(batch_size, data, labels):
    '''
    Return a total of `num` random samples and labels. 
    Takes list of words, transforms to padded, uniform length vectors of word indices in embedding.
    '''
    
    data_shuffle = np.zeros([batch_size, max_sent_length])
    labels_shuffle = np.zeros([batch_size, 2])
    
    idx = np.arange(0, len(data))
    np.random.shuffle(idx)
    idx = idx[:batch_size]
    
    for i in range(batch_size):
        if type(data[idx[i]]) == str:
            sent = word_tokenize(data[idx[i]])
        elif type(data[idx[i]]) == np.str_:
            sent = word_tokenize(data[idx[i]])
        else:
            sent = []
            
        for word_idx in range(max_sent_length):
            try:
                word = str(sent[word_idx])
                data_shuffle[i][word_idx] = word2idx[word]
            except KeyError:
                data_shuffle[i][word_idx] = embed_vocab_size
            except IndexError:
                data_shuffle[i][word_idx] = embed_vocab_size
        labels_shuffle[i] = labels[idx[i]]
        
    return data_shuffle, labels_shuffle

In [10]:
tf.reset_default_graph()

with tf.device("/cpu:0"):
    embedding_weights = tf.Variable(tf.constant(0.0, shape=[embed_vocab_size+1, word_embed_size]),trainable=True, name="embedding_weights") 
    embedding_placeholder = tf.placeholder(tf.float32, [embed_vocab_size+1, word_embed_size])
    embedding_init = embedding_weights.assign(embedding_placeholder)

x_input = tf.placeholder(tf.int64, [None, max_sent_length])
x = tf.nn.embedding_lookup(embedding_weights, x_input)
y = tf.placeholder(tf.float32, [None, num_classes], name="y")

lstm_cell = tf.contrib.rnn.LSTMCell(lstm_units, name="lstm_cell")
lstm_cell = tf.contrib.rnn.DropoutWrapper(cell=lstm_cell, output_keep_prob=0.8)
value, _ = tf.nn.dynamic_rnn(lstm_cell, x, dtype=tf.float32)

weights = tf.Variable(tf.truncated_normal([lstm_units, num_classes]), name="weights")
bias = tf.Variable(tf.constant(0.1, shape=[num_classes]), name="bias")
value = tf.transpose(value, [1, 0, 2])
last = tf.gather(value, int(value.get_shape()[0]) - 1)
prediction = (tf.matmul(last, weights) + bias)
pred_val = tf.argmax(prediction,1)

correct = tf.equal(pred_val, tf.argmax(y,1))
accuracy = tf.reduce_mean(tf.cast(correct, tf.float32))

loss = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits_v2(logits=prediction, labels=y))
optimizer = tf.train.AdamOptimizer().minimize(loss)

  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "


In [11]:
sess = tf.InteractiveSession()
sess.run(tf.global_variables_initializer())
sess.run(embedding_init, feed_dict={embedding_placeholder: embedding_matrix})

array([[-2.7258053e+00, -8.9037985e-01, -3.3093975e+00, ...,
        -3.6481032e+00,  9.2985529e-01, -3.1523368e+00],
       [-4.7811241e+00,  2.8778939e+00, -3.1557155e+00, ...,
         2.4104493e+00, -6.6318178e+00, -2.5443816e+00],
       [-4.0230713e+00, -6.2868729e-02, -3.0445645e+00, ...,
        -1.4144707e+00, -2.5390584e+00, -5.9339542e+00],
       ...,
       [ 1.1209801e-03,  4.1167468e-02, -9.9538632e-02, ...,
         5.9944548e-02,  8.0686755e-02,  1.2080799e-01],
       [-4.2032246e-03, -1.6227258e-02,  8.6567029e-03, ...,
         4.1860022e-02,  9.4838940e-02,  6.2755957e-02],
       [ 0.0000000e+00,  0.0000000e+00,  0.0000000e+00, ...,
         0.0000000e+00,  0.0000000e+00,  0.0000000e+00]], dtype=float32)

In [12]:
for i in range(iterations):
    batch_titles, batch_lb = nextBatch(batch_size, X_total, y_total)
    sess.run(optimizer, feed_dict = {x_input: batch_titles, y: batch_lb})
    
    if i % 500 == 0:
        print('iteration ' + str(i))
        batch_titles, batch_lb = nextBatch(500, X_train, y_train)
        print(sess.run(accuracy, feed_dict={x_input: batch_titles, y: batch_lb}))
        batch_titles, batch_lb = nextBatch(500, X_test, y_test)
        print(sess.run(accuracy, feed_dict={x_input: batch_titles, y: batch_lb}))
        batch_titles, batch_lb = nextBatch(len(X_byarticle), X_byarticle, y_byarticle)
        print(sess.run(accuracy, feed_dict={x_input: batch_titles, y: batch_lb}))

iteration 0
0.598
0.54
0.5472868
iteration 500
0.854
0.812
0.5534884
iteration 1000
0.912
0.888
0.5674419
iteration 1500
0.918
0.838
0.6139535
iteration 2000
0.946
0.93
0.5782946
iteration 2500
0.956
0.92
0.6139535
iteration 3000
0.954
0.908
0.5379845
iteration 3500
0.97
0.94
0.5922481
iteration 4000
0.958
0.946
0.56434107
iteration 4500
0.956
0.946
0.58294576
iteration 5000
0.962
0.944
0.56589144


KeyboardInterrupt: 

In [41]:
tf.saved_model.simple_save(sess,
            "/project/cramerus/LSTM-final",
            inputs={"Text": x_input},
            outputs={"Prediction": pred_val})

Instructions for updating:
Pass your op to the equivalent parameter main_op instead.
INFO:tensorflow:Assets added to graph.
INFO:tensorflow:No assets to write.
INFO:tensorflow:SavedModel written to: /project/cramerus/LSTM-final/saved_model.pb


In [None]:
# calculate final accuracy ratings
print('Accuracy on training set:') #480,000
correct = 0.0
i = 0
while i < len(X_train):
    batch_titles, batch_lb = nextBatch(1000, np.array(X_train)[i:i+1000], y_train[i:i+1000])
    acc = sess.run(accuracy, feed_dict={x_input: batch_titles, y: batch_lb})
    correct += 1000 * acc
    i += 1000
print(str(correct / len(X_train)))

#print('Accuracy on holdout training set:') #120,000
#correct = 0.0
#i = 0
#while i < len(X_train_holdout):
#    batch_titles, batch_lb = nextBatch(1000, np.array(X_train_holdout)[i:i+1000], y_train_holdout[i:i+1000])
#    acc = sess.run(accuracy, feed_dict={x_input: batch_titles, y: batch_lb})
#    correct += 1000 * acc
#    i += 1000
#print(str(correct / len(X_train_holdout)))

print('Accuracy on test set:') #150,000
correct = 0.0
i = 0
while i < len(X_test):
    batch_titles, batch_lb = nextBatch(1000, np.array(X_test)[i:i+1000], y_test[i:i+1000])
    acc = sess.run(accuracy, feed_dict={x_input: batch_titles, y: batch_lb})
    correct += 1000 * acc
    i += 1000
print(str(correct / len(X_test)))

print('Accuracy on baby set:')
batch_titles, batch_lb = nextBatch(len(X_byarticle), np.array(X_byarticle), y_byarticle)
print(sess.run(accuracy, feed_dict={x_input: batch_titles, y: batch_lb}))

Accuracy on training set:


In [37]:
sess.close()