In [1]:
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import argparse
import sys

import sqlite3

import numpy as np
import pandas as pd
from sklearn import metrics
import tensorflow as tf

FLAGS = None

MAX_DOCUMENT_LENGTH = 20
EMBEDDING_SIZE = 100
n_words = 0
MAX_LABEL = 30
WORDS_FEATURE = 'words'  # Name of the input words feature.


In [2]:
def estimator_spec_for_softmax_classification(
    logits, labels, mode):
  """Returns EstimatorSpec instance for softmax classification."""
  predicted_classes = tf.argmax(logits, 1)
  if mode == tf.estimator.ModeKeys.PREDICT:
    return tf.estimator.EstimatorSpec(
        mode=mode,
        predictions={
            'class': predicted_classes,
            'prob': tf.nn.softmax(logits)
        })

  onehot_labels = tf.one_hot(labels, MAX_LABEL, 1, 0)
  loss = tf.losses.softmax_cross_entropy(
      onehot_labels=onehot_labels, logits=logits)
  if mode == tf.estimator.ModeKeys.TRAIN:
    optimizer = tf.train.AdamOptimizer(learning_rate=0.01)
    train_op = optimizer.minimize(loss, global_step=tf.train.get_global_step())
    return tf.estimator.EstimatorSpec(mode, loss=loss, train_op=train_op)

  eval_metric_ops = {
      'accuracy': tf.metrics.accuracy(
          labels=labels, predictions=predicted_classes)
  }
  return tf.estimator.EstimatorSpec(
      mode=mode, loss=loss, eval_metric_ops=eval_metric_ops)

In [3]:
def bag_of_words_model(features, labels, mode):
  """A bag-of-words model. Note it disregards the word order in the text."""
  bow_column = tf.feature_column.categorical_column_with_identity(
      WORDS_FEATURE, num_buckets=n_words)
  bow_embedding_column = tf.feature_column.embedding_column(
      bow_column, dimension=EMBEDDING_SIZE)
  bow = tf.feature_column.input_layer(
      features,
      feature_columns=[bow_embedding_column])
  logits = tf.layers.dense(bow, MAX_LABEL, activation=None)

  return estimator_spec_for_softmax_classification(
      logits=logits, labels=labels, mode=mode)


In [4]:
def rnn_model(features, labels, mode):
  """RNN model to predict from sequence of words to a class."""
  # Convert indexes of words into embeddings.
  # This creates embeddings matrix of [n_words, EMBEDDING_SIZE] and then
  # maps word indexes of the sequence into [batch_size, sequence_length,
  # EMBEDDING_SIZE].
  word_vectors = tf.contrib.layers.embed_sequence(
      features[WORDS_FEATURE], vocab_size=n_words, embed_dim=EMBEDDING_SIZE)

  # Split into list of embedding per word, while removing doc length dim.
  # word_list results to be a list of tensors [batch_size, EMBEDDING_SIZE].
  word_list = tf.unstack(word_vectors, axis=1)

  # Create a Gated Recurrent Unit cell with hidden size of EMBEDDING_SIZE.
  cell = tf.contrib.rnn.GRUCell(EMBEDDING_SIZE)

  # Create an unrolled Recurrent Neural Networks to length of
  # MAX_DOCUMENT_LENGTH and passes word_list as inputs for each unit.
  _, encoding = tf.contrib.rnn.static_rnn(cell, word_list, dtype=tf.float32)

  # Given encoding of RNN, take encoding of last step (e.g hidden size of the
  # neural network of last step) and pass it as features for softmax
  # classification over output classes.
  logits = tf.layers.dense(encoding, MAX_LABEL, activation=None)
  return estimator_spec_for_softmax_classification(
      logits=logits, labels=labels, mode=mode)

In [5]:
def main(unused_argv):
  global n_words
  # Prepare training and testing data
  conn = sqlite3.connect('papers.db')
  test = pd.read_sql_query('SELECT * FROM papers', conn)
  target = pd.read_csv('ipfjes-case-control-studies.csv', usecols=['pmid','title','firstauthor','lastauthor',
                                                             'journal','pubdate','pubtype',
                                                             'abstract', 'keywords', 'rcr', 'citedby', 'cites'])

  x_train = pd.concat([test.head(9), target.head(9)])['abstract']
  y_train = pd.Series([0 for i in range(9)] + [1 for i in range(9)])
  x_test = pd.concat([test, target.tail(5)])['abstract']
  y_test = pd.Series([0 for i in range(len(test))] + [1 for i in range(5)])
  # Process vocabulary
  vocab_processor = tf.contrib.learn.preprocessing.VocabularyProcessor(
      MAX_DOCUMENT_LENGTH)

  x_transform_train = vocab_processor.fit_transform(x_train)
  x_transform_test = vocab_processor.transform(x_test)

  x_train = np.array(list(x_transform_train))
  x_test = np.array(list(x_transform_test))

  n_words = len(vocab_processor.vocabulary_)
  print('Total words: %d' % n_words)

  # Build model
  # Switch between rnn_model and bag_of_words_model to test different models.
  model_fn = rnn_model
  if FLAGS.bow_model:
    # Subtract 1 because VocabularyProcessor outputs a word-id matrix where word
    # ids start from 1 and 0 means 'no word'. But
    # categorical_column_with_identity assumes 0-based count and uses -1 for
    # missing word.
    x_train -= 1
    x_test -= 1
    model_fn = bag_of_words_model
  classifier = tf.estimator.Estimator(model_fn=model_fn)

  # Train.
  train_input_fn = tf.estimator.inputs.numpy_input_fn(
      x={WORDS_FEATURE: x_train},
      y=y_train,
      batch_size=len(x_train),
      num_epochs=None,
      shuffle=True)
  classifier.train(input_fn=train_input_fn, steps=1000)

  # Predict.
  test_input_fn = tf.estimator.inputs.numpy_input_fn(
      x={WORDS_FEATURE: x_test},
      y=y_test,
      num_epochs=1,
      shuffle=False)
  predictions = classifier.predict(input_fn=test_input_fn)
  y_predicted = np.array(list(p['class'] for p in predictions))
  y_predicted = y_predicted.reshape(np.array(y_test).shape)

  # Score with sklearn.
  score = metrics.accuracy_score(y_test, y_predicted)
  print('Accuracy (sklearn): {0:f}'.format(score))

  # Score with tensorflow.
  scores = classifier.evaluate(input_fn=test_input_fn)
  print('Accuracy (tensorflow): {0:f}'.format(scores['accuracy']))
  return y_predicted, score

In [6]:
if __name__ == '__main__':
  parser = argparse.ArgumentParser()
  parser.add_argument(
      '--test_with_fake_data',
      default=False,
      help='Test the example code with fake data.',
      action='store_true')
  parser.add_argument(
      '--bow_model',
      default=False,
      help='Run with BOW model instead of RNN.',
      action='store_true')
  FLAGS, unparsed = parser.parse_known_args()
  tf.app.run(main=main, argv=[sys.argv[0]] + unparsed)

Total words: 1127
INFO:tensorflow:Using default config.
INFO:tensorflow:Using config: {'_keep_checkpoint_max': 5, '_keep_checkpoint_every_n_hours': 10000, '_tf_random_seed': 1, '_model_dir': '/tmp/tmpbj0xavnr', '_save_checkpoints_steps': None, '_save_summary_steps': 100, '_save_checkpoints_secs': 600, '_session_config': None}
INFO:tensorflow:Create CheckpointSaverHook.
INFO:tensorflow:Saving checkpoints for 1 into /tmp/tmpbj0xavnr/model.ckpt.
INFO:tensorflow:loss = 3.4121, step = 1
INFO:tensorflow:global_step/sec: 59.4199
INFO:tensorflow:loss = 0.000477391, step = 101 (1.684 sec)
INFO:tensorflow:global_step/sec: 62.7956
INFO:tensorflow:loss = 0.000199933, step = 201 (1.593 sec)
INFO:tensorflow:global_step/sec: 65.5795
INFO:tensorflow:loss = 0.000163876, step = 301 (1.525 sec)
INFO:tensorflow:global_step/sec: 63.1993
INFO:tensorflow:loss = 7.1642e-05, step = 401 (1.582 sec)
INFO:tensorflow:global_step/sec: 61.4125
INFO:tensorflow:loss = 5.95958e-05, step = 501 (1.628 sec)
INFO:tensorflo

SystemExit: (array([0, 0, 0, ..., 1, 1, 1]), 0.28087954110898661)

  warn("To exit: use 'exit', 'quit', or Ctrl-D.", stacklevel=1)


In [7]:
for i in range(100):
    y_predicted, score = main(0)
    if score > 0.9: break

Total words: 1127
INFO:tensorflow:Using default config.
INFO:tensorflow:Using config: {'_keep_checkpoint_max': 5, '_keep_checkpoint_every_n_hours': 10000, '_save_checkpoints_secs': 600, '_model_dir': '/tmp/tmp3fb7l6e6', '_save_checkpoints_steps': None, '_save_summary_steps': 100, '_tf_random_seed': 1, '_session_config': None}
INFO:tensorflow:Create CheckpointSaverHook.
INFO:tensorflow:Saving checkpoints for 1 into /tmp/tmp3fb7l6e6/model.ckpt.
INFO:tensorflow:loss = 3.41072, step = 1
INFO:tensorflow:global_step/sec: 52.5894
INFO:tensorflow:loss = 0.000809001, step = 101 (1.902 sec)
INFO:tensorflow:global_step/sec: 62.6981
INFO:tensorflow:loss = 0.000340006, step = 201 (1.595 sec)
INFO:tensorflow:global_step/sec: 60.2917
INFO:tensorflow:loss = 0.000179821, step = 301 (1.659 sec)
INFO:tensorflow:global_step/sec: 62.3484
INFO:tensorflow:loss = 0.000163363, step = 401 (1.604 sec)
INFO:tensorflow:global_step/sec: 62.4272
INFO:tensorflow:loss = 7.98524e-05, step = 501 (1.602 sec)
INFO:tensorf

INFO:tensorflow:Using config: {'_keep_checkpoint_max': 5, '_keep_checkpoint_every_n_hours': 10000, '_save_checkpoints_secs': 600, '_model_dir': '/tmp/tmpdnbjyye2', '_save_checkpoints_steps': None, '_save_summary_steps': 100, '_tf_random_seed': 1, '_session_config': None}
INFO:tensorflow:Create CheckpointSaverHook.
INFO:tensorflow:Saving checkpoints for 1 into /tmp/tmpdnbjyye2/model.ckpt.
INFO:tensorflow:loss = 3.40119, step = 1
INFO:tensorflow:global_step/sec: 36.4521
INFO:tensorflow:loss = 0.000197119, step = 101 (2.745 sec)
INFO:tensorflow:global_step/sec: 50.9318
INFO:tensorflow:loss = 0.000142323, step = 201 (1.963 sec)
INFO:tensorflow:global_step/sec: 61.1325
INFO:tensorflow:loss = 8.36345e-05, step = 301 (1.638 sec)
INFO:tensorflow:global_step/sec: 61.1355
INFO:tensorflow:loss = 4.85234e-05, step = 401 (1.633 sec)
INFO:tensorflow:global_step/sec: 59.4449
INFO:tensorflow:loss = 4.25299e-05, step = 501 (1.683 sec)
INFO:tensorflow:global_step/sec: 54.1778
INFO:tensorflow:loss = 3.07

INFO:tensorflow:Create CheckpointSaverHook.
INFO:tensorflow:Saving checkpoints for 1 into /tmp/tmp3t0bffw7/model.ckpt.
INFO:tensorflow:loss = 3.40137, step = 1
INFO:tensorflow:global_step/sec: 38.3699
INFO:tensorflow:loss = 0.000496889, step = 101 (2.610 sec)
INFO:tensorflow:global_step/sec: 36.2619
INFO:tensorflow:loss = 0.000213365, step = 201 (2.756 sec)
INFO:tensorflow:global_step/sec: 46.6979
INFO:tensorflow:loss = 0.000101083, step = 301 (2.141 sec)
INFO:tensorflow:global_step/sec: 48.1866
INFO:tensorflow:loss = 8.39788e-05, step = 401 (2.075 sec)
INFO:tensorflow:global_step/sec: 45.2371
INFO:tensorflow:loss = 5.89734e-05, step = 501 (2.210 sec)
INFO:tensorflow:global_step/sec: 53.5306
INFO:tensorflow:loss = 5.79536e-05, step = 601 (1.869 sec)
INFO:tensorflow:global_step/sec: 41.8288
INFO:tensorflow:loss = 4.09408e-05, step = 701 (2.390 sec)
INFO:tensorflow:global_step/sec: 44.8848
INFO:tensorflow:loss = 2.83118e-05, step = 801 (2.227 sec)
INFO:tensorflow:global_step/sec: 56.6304

INFO:tensorflow:loss = 3.409, step = 1
INFO:tensorflow:global_step/sec: 51.9291
INFO:tensorflow:loss = 0.0001813, step = 101 (1.927 sec)
INFO:tensorflow:global_step/sec: 55.9157
INFO:tensorflow:loss = 8.53762e-05, step = 201 (1.788 sec)
INFO:tensorflow:global_step/sec: 55.0123
INFO:tensorflow:loss = 4.35898e-05, step = 301 (1.818 sec)
INFO:tensorflow:global_step/sec: 65.0177
INFO:tensorflow:loss = 4.03514e-05, step = 401 (1.538 sec)
INFO:tensorflow:global_step/sec: 57.4519
INFO:tensorflow:loss = 3.00204e-05, step = 501 (1.741 sec)
INFO:tensorflow:global_step/sec: 64.6471
INFO:tensorflow:loss = 1.98746e-05, step = 601 (1.547 sec)
INFO:tensorflow:global_step/sec: 57.9048
INFO:tensorflow:loss = 1.74839e-05, step = 701 (1.727 sec)
INFO:tensorflow:global_step/sec: 63.5818
INFO:tensorflow:loss = 1.22387e-05, step = 801 (1.573 sec)
INFO:tensorflow:global_step/sec: 59.8972
INFO:tensorflow:loss = 1.11195e-05, step = 901 (1.670 sec)
INFO:tensorflow:Saving checkpoints for 1000 into /tmp/tmpshuxq9

INFO:tensorflow:global_step/sec: 49.8684
INFO:tensorflow:loss = 0.000228222, step = 201 (2.005 sec)
INFO:tensorflow:global_step/sec: 62.9048
INFO:tensorflow:loss = 0.000130517, step = 301 (1.590 sec)
INFO:tensorflow:global_step/sec: 64.0917
INFO:tensorflow:loss = 7.25689e-05, step = 401 (1.560 sec)
INFO:tensorflow:global_step/sec: 64.3894
INFO:tensorflow:loss = 6.68405e-05, step = 501 (1.553 sec)
INFO:tensorflow:global_step/sec: 62.9661
INFO:tensorflow:loss = 5.42254e-05, step = 601 (1.588 sec)
INFO:tensorflow:global_step/sec: 61.3305
INFO:tensorflow:loss = 4.07619e-05, step = 701 (1.631 sec)
INFO:tensorflow:global_step/sec: 61.9635
INFO:tensorflow:loss = 3.39541e-05, step = 801 (1.614 sec)
INFO:tensorflow:global_step/sec: 61.7649
INFO:tensorflow:loss = 2.99806e-05, step = 901 (1.619 sec)
INFO:tensorflow:Saving checkpoints for 1000 into /tmp/tmpfklwuf7e/model.ckpt.
INFO:tensorflow:Loss for final step: 2.05038e-05.
INFO:tensorflow:Restoring parameters from /tmp/tmpfklwuf7e/model.ckpt-10

INFO:tensorflow:global_step/sec: 62.0422
INFO:tensorflow:loss = 0.000229375, step = 301 (1.611 sec)
INFO:tensorflow:global_step/sec: 61.5086
INFO:tensorflow:loss = 0.000129631, step = 401 (1.626 sec)
INFO:tensorflow:global_step/sec: 53.3782
INFO:tensorflow:loss = 9.27066e-05, step = 501 (1.873 sec)
INFO:tensorflow:global_step/sec: 52.1397
INFO:tensorflow:loss = 7.15489e-05, step = 601 (1.918 sec)
INFO:tensorflow:global_step/sec: 48.7533
INFO:tensorflow:loss = 5.41458e-05, step = 701 (2.051 sec)
INFO:tensorflow:global_step/sec: 55.2506
INFO:tensorflow:loss = 5.02453e-05, step = 801 (1.810 sec)
INFO:tensorflow:global_step/sec: 47.5081
INFO:tensorflow:loss = 3.74839e-05, step = 901 (2.106 sec)
INFO:tensorflow:Saving checkpoints for 1000 into /tmp/tmp_qikblgv/model.ckpt.
INFO:tensorflow:Loss for final step: 3.32853e-05.
INFO:tensorflow:Restoring parameters from /tmp/tmp_qikblgv/model.ckpt-1000
Accuracy (sklearn): 0.527342
INFO:tensorflow:Starting evaluation at 2017-07-18-23:03:39
INFO:tens

INFO:tensorflow:global_step/sec: 63.5183
INFO:tensorflow:loss = 0.000129631, step = 401 (1.574 sec)
INFO:tensorflow:global_step/sec: 64.3451
INFO:tensorflow:loss = 9.27066e-05, step = 501 (1.554 sec)
INFO:tensorflow:global_step/sec: 63.0772
INFO:tensorflow:loss = 7.15489e-05, step = 601 (1.585 sec)
INFO:tensorflow:global_step/sec: 63.06
INFO:tensorflow:loss = 5.41458e-05, step = 701 (1.586 sec)
INFO:tensorflow:global_step/sec: 62.0488
INFO:tensorflow:loss = 5.02453e-05, step = 801 (1.612 sec)
INFO:tensorflow:global_step/sec: 59.7624
INFO:tensorflow:loss = 3.74839e-05, step = 901 (1.673 sec)
INFO:tensorflow:Saving checkpoints for 1000 into /tmp/tmpwf1jyfs9/model.ckpt.
INFO:tensorflow:Loss for final step: 3.32853e-05.
INFO:tensorflow:Restoring parameters from /tmp/tmpwf1jyfs9/model.ckpt-1000
Accuracy (sklearn): 0.527342
INFO:tensorflow:Starting evaluation at 2017-07-18-23:05:29
INFO:tensorflow:Restoring parameters from /tmp/tmpwf1jyfs9/model.ckpt-1000
INFO:tensorflow:Finished evaluation 

INFO:tensorflow:global_step/sec: 55.9109
INFO:tensorflow:loss = 4.25299e-05, step = 501 (1.789 sec)
INFO:tensorflow:global_step/sec: 58.5364
INFO:tensorflow:loss = 3.07554e-05, step = 601 (1.708 sec)
INFO:tensorflow:global_step/sec: 62.7378
INFO:tensorflow:loss = 2.97554e-05, step = 701 (1.594 sec)
INFO:tensorflow:global_step/sec: 60.7594
INFO:tensorflow:loss = 2.51329e-05, step = 801 (1.646 sec)
INFO:tensorflow:global_step/sec: 59.8647
INFO:tensorflow:loss = 1.91528e-05, step = 901 (1.670 sec)
INFO:tensorflow:Saving checkpoints for 1000 into /tmp/tmp7j3r88kh/model.ckpt.
INFO:tensorflow:Loss for final step: 1.93448e-05.
INFO:tensorflow:Restoring parameters from /tmp/tmp7j3r88kh/model.ckpt-1000
Accuracy (sklearn): 0.415679
INFO:tensorflow:Starting evaluation at 2017-07-18-23:07:09
INFO:tensorflow:Restoring parameters from /tmp/tmp7j3r88kh/model.ckpt-1000
INFO:tensorflow:Finished evaluation at 2017-07-18-23:07:10
INFO:tensorflow:Saving dict for global step 1000: accuracy = 0.415679, glob

INFO:tensorflow:global_step/sec: 62.5415
INFO:tensorflow:loss = 1.62521e-05, step = 601 (1.599 sec)
INFO:tensorflow:global_step/sec: 63.1958
INFO:tensorflow:loss = 1.48017e-05, step = 701 (1.582 sec)
INFO:tensorflow:global_step/sec: 59.4306
INFO:tensorflow:loss = 1.22122e-05, step = 801 (1.683 sec)
INFO:tensorflow:global_step/sec: 58.9621
INFO:tensorflow:loss = 1.21195e-05, step = 901 (1.695 sec)
INFO:tensorflow:Saving checkpoints for 1000 into /tmp/tmpqo0d5b9j/model.ckpt.
INFO:tensorflow:Loss for final step: 7.54327e-06.
INFO:tensorflow:Restoring parameters from /tmp/tmpqo0d5b9j/model.ckpt-1000
Accuracy (sklearn): 0.908987
INFO:tensorflow:Starting evaluation at 2017-07-18-23:08:57
INFO:tensorflow:Restoring parameters from /tmp/tmpqo0d5b9j/model.ckpt-1000
INFO:tensorflow:Finished evaluation at 2017-07-18-23:08:59
INFO:tensorflow:Saving dict for global step 1000: accuracy = 0.908987, global_step = 1000, loss = 0.584568
Accuracy (tensorflow): 0.908987


In [8]:
len(y_predicted)

5230

In [9]:
conn = sqlite3.connect('papers.db')
test = pd.read_sql_query('SELECT * FROM papers', conn)
len(test)

5225

In [10]:
test['pred'] = y_predicted[:5225]

In [11]:
len(test[test['pred'] == 1])

474

In [13]:
test[test['pred'] == 1].to_csv('ipf-rnn-model.csv')

In [15]:
result = test[test['pred'] == 1].head()

In [27]:
pd.set_option('max_colwidth',1000)
result[['pmid', 'date', 'author', 'title', 'journal']]

Unnamed: 0,pmid,date,author,title,journal
39,26724656,2016 Jan,"Nakagawa H, Nagatani Y, Takahashi M, Ogawa E, Tho NV, Ryujin Y, Nagao T, Nakano Y",Quantitative CT analysis of honeycombing area in idiopathic pulmonary fibrosis: Correlations with pulmonary function tests.,European journal of radiology
53,26359263,2015 Dec,"Umeda Y, Demura Y, Morikawa M, Anzai M, Kadowaki M, Ameshima S, Tsuchida T, Tsujikawa T, Kiyono Y, Okazawa H, Ishizaki T, Ishizuka T",Prognostic Value of Dual-Time-Point 18F-FDG PET for Idiopathic Pulmonary Fibrosis.,"Journal of nuclear medicine : official publication, Society of Nuclear Medicine"
58,26415518,2015 Sep 29,"Sokai A, Handa T, Tanizawa K, Oga T, Uno K, Tsuruyama T, Kubo T, Ikezoe K, Nakatsuka Y, Tanimura K, Muro S, Hirai T, Nagai S, Chin K, Mishima M",Matrix metalloproteinase-10: a novel biomarker for idiopathic pulmonary fibrosis.,Respiratory research
73,26324805,2015 Sep,"Caminati A, Madotto F, Cesana G, Conti S, Harari S",Epidemiological studies in idiopathic pulmonary fibrosis: pitfalls in methodologies and data interpretation.,European respiratory review : an official journal of the European Respiratory Society
119,25260757,2014 Nov 01,"Ley B, Brown KK, Collard HR",Molecular biomarkers in idiopathic pulmonary fibrosis.,American journal of physiology. Lung cellular and molecular physiology
