In [1]:
import tensorflow as tf
import numpy as np
import pandas as pd

  from ._conv import register_converters as _register_converters


In [2]:
# Both splits are tab-separated UTF-8 files without a header row. The two
# columns are named 'source' (used as the label column) and 'title' (the text).
df_train = pd.read_csv('train.csv', sep='\t', header=None, encoding='utf-8', names=['source', 'title'])
df_val = pd.read_csv('eval.csv', sep='\t', header=None, encoding='utf-8', names=['source', 'title'])

In [3]:
# Number of token columns every title is padded/truncated to in text_as_idx.
MAX_DOC_LEN = 20
# A word must occur more than THRESHOLD times in the training titles to be kept
# by create_vocab.
THRESHOLD = 10
# Vocabulary file written by create_vocab and read back by text_as_idx.
VOC_FILE = 'vocab.tsv'
# Names of the label column and the text column in the data frames.
TARGET_COL = 'source'
TEXT_COL = 'title'
# Padding token: first line of the vocabulary file and fill value for short titles.
PADWORD = 'ZYXW'
# Embedding dimension used by text_cnn (the first convolution uses EMB_SIZE // 2 filters).
EMB_SIZE = 10
# Class labels; encode_labels looks labels up against this list.
TARGETS = ['github', 'nytimes', 'techcrunch']
# Number of steps passed to the estimator's train() call.
STEPS = 36000
# Dropout rate applied to the embedded tokens in text_cnn.
DROP_K = .2

In [4]:
def create_vocab(trainset, textcol, vocabfilename, threshold, padword=None):
    """Write a vocabulary file from the texts of ``trainset`` and return its size.

    Every value of ``trainset[textcol]`` is converted with ``str`` and split on
    single spaces. A word is kept when it is non-empty and occurs strictly
    more than ``threshold`` times. The file contains the padding word on the
    first line followed by the kept words, one per line, in sorted order.

    Args:
        trainset: object supporting ``trainset[textcol].values`` (e.g. a
            pandas DataFrame).
        textcol: name of the column holding the texts.
        vocabfilename: path of the vocabulary file to (over)write.
        threshold: a word must occur more than this many times to be kept.
        padword: padding token written as the first line; defaults to the
            module-level ``PADWORD``.

    Returns:
        ``len(kept words) + 2``. The extra 2 covers the padding-word line and
        the single out-of-vocabulary bucket that text_as_idx adds on lookup.
    """
    from collections import Counter

    if padword is None:
        padword = PADWORD

    word_counter = Counter(
        word
        for text in trainset[textcol].values
        for word in str(text).split(' ')
    )

    # Iterate over the unique words only, and sort them: set() ordering varies
    # between interpreter runs, which would silently change the word -> index
    # mapping and no longer match embeddings stored in an existing checkpoint.
    # The padding word is skipped so that it never appears twice in the file.
    vocab_list = sorted(
        word
        for word, count in word_counter.items()
        if count > threshold and word != '' and word != padword
    )

    # The corpus is read as UTF-8, so write the vocabulary as UTF-8 as well
    # instead of relying on the platform's default encoding.
    with open(vocabfilename, 'w', encoding='utf-8') as vfile:
        vfile.write('{}\n'.format(padword))
        for word in vocab_list:
            vfile.write("{}\n".format(word))

    voc_size = len(vocab_list) + 2
    print('Vocabulary with {} word(s) created'.format(voc_size))
    return voc_size

In [5]:
# Build the vocabulary file from the training titles; voc_size is later passed
# to the embedding layer in text_cnn as the vocabulary size.
voc_size = create_vocab(df_train, TEXT_COL, VOC_FILE, THRESHOLD)

Vocabulary with 6221 word(s) created


In [6]:
def text_as_idx(dataset, textcol):
    """Convert the texts in ``dataset[textcol]`` to a fixed-width matrix of word indices.

    Each text is split into words, every word is looked up in ``VOC_FILE``
    (words missing from the file go to the single out-of-vocabulary bucket),
    and every row is padded with index 0 and cut to ``MAX_DOC_LEN`` columns.
    Index 0 is the first line of the vocabulary file, i.e. the padding word
    written by create_vocab.

    Args:
        dataset: object supporting ``dataset[textcol].values`` (e.g. a pandas
            DataFrame).
        textcol: name of the column holding the texts.

    Returns:
        A 2-D numpy array with one row per text and ``MAX_DOC_LEN`` columns.
    """
    titles = dataset[textcol].values

    # Build the lookup ops in a private graph: this function returns plain
    # numpy values, so nothing it creates needs to live in the caller's graph
    # (repeated calls would otherwise keep adding tables and constants to it).
    with tf.Graph().as_default():
        table = tf.contrib.lookup.index_table_from_file(VOC_FILE, num_oov_buckets=1, default_value=-1)
        words = tf.string_split(titles)
        # Rows shorter than the longest title are filled with PADWORD.
        densewords = tf.sparse_tensor_to_dense(words, default_value=PADWORD)
        idx = table.lookup(densewords)
        # Append MAX_DOC_LEN zero columns so that every row is at least
        # MAX_DOC_LEN wide, then keep exactly the first MAX_DOC_LEN columns.
        padding = tf.constant([[0, 0], [0, MAX_DOC_LEN]])
        padded = tf.pad(idx, padding)
        sliced = tf.slice(padded, [0, 0], [-1, MAX_DOC_LEN])
        # `with tf.Session()` closes the session on exit; the previous
        # `tf.Session().as_default()` only installed it as default and leaked it.
        with tf.Session() as sess:
            sess.run(tf.tables_initializer())
            return sess.run(sliced)

In [7]:
def encode_labels(dataset, targetcol, targets=None):
    """Map the labels in ``dataset[targetcol]`` to integer class ids.

    A label's id is its position in ``targets``; labels that are not listed
    map to -1. The lookup is done in plain Python/numpy, so no TensorFlow
    graph nodes or sessions are created (the earlier implementation opened a
    session per call and never closed it).

    Args:
        dataset: object supporting ``dataset[targetcol].values`` (e.g. a
            pandas DataFrame).
        targetcol: name of the column holding the labels.
        targets: ordered list of known labels; defaults to the module-level
            ``TARGETS``.

    Returns:
        A 1-D ``int64`` numpy array with one class id per row.
    """
    if targets is None:
        targets = TARGETS
    class_ids = {label: idx for idx, label in enumerate(targets)}
    labels = dataset[targetcol].values
    return np.array([class_ids.get(label, -1) for label in labels], dtype=np.int64)

In [8]:
def _n_tokens(title):
    """Count the non-empty, space-separated tokens of ``title``, capped at MAX_DOC_LEN."""
    return min(len([word for word in str(title).split(' ') if word]), MAX_DOC_LEN)

# Per-title sequence lengths. These are counted in tokens, the unit in which
# text_as_idx truncates titles to MAX_DOC_LEN; counting characters (len(title))
# saturates at MAX_DOC_LEN for almost every title.
train_len = np.array([_n_tokens(title) for title in df_train[TEXT_COL].values])
eval_len = np.array([_n_tokens(title) for title in df_val[TEXT_COL].values])

def parser(x, length, y):
    """Pack a dataset element into a ``(features, labels)`` pair.

    The features dict exposes ``x`` under the key ``"x"`` and ``length`` under
    the key ``"len"``; ``y`` is passed through unchanged as the labels.
    """
    return {"x": x, "len": length}, y
  
def train_input_fn():
    """Input function for training.

    Slices the training token ids, lengths and encoded labels into examples,
    shuffles them with a buffer as large as the training set, groups them into
    batches of 100, converts each batch with ``parser`` and repeats
    indefinitely.

    Returns:
        The ``(features, labels)`` tensors of a one-shot iterator over that
        dataset.
    """
    token_ids = text_as_idx(df_train, TEXT_COL)
    label_ids = encode_labels(df_train, TARGET_COL)
    n_examples = len(df_train[TEXT_COL].values)

    batches = (
        tf.data.Dataset.from_tensor_slices((token_ids, train_len, label_ids))
        .shuffle(buffer_size=n_examples)
        .batch(100)
        .map(parser)
        .repeat()
    )
    return batches.make_one_shot_iterator().get_next()

def eval_input_fn():
    """Input function for evaluation and prediction on the validation set.

    Slices the validation token ids, lengths and encoded labels into examples,
    groups them into batches of 100 in their original order and converts each
    batch with ``parser``. The dataset is iterated once (no repeat).

    Returns:
        The ``(features, labels)`` tensors of a one-shot iterator over that
        dataset.
    """
    token_ids = text_as_idx(df_val, TEXT_COL)
    label_ids = encode_labels(df_val, TARGET_COL)

    batches = (
        tf.data.Dataset.from_tensor_slices((token_ids, eval_len, label_ids))
        .batch(100)
        .map(parser)
    )
    return batches.make_one_shot_iterator().get_next()

In [9]:
def training_and_validation(cls):
    """Train ``cls`` for ``STEPS`` steps, then evaluate and predict on the validation set.

    Args:
        cls: an estimator-like object exposing ``train``, ``evaluate`` and
            ``predict`` methods that accept an ``input_fn``.

    Returns:
        A tuple ``(evaluation, preds)``: the value returned by
        ``cls.evaluate`` and a numpy array holding every item yielded by
        ``cls.predict`` on the validation input. These results used to be
        computed and then discarded.
    """
    cls.train(input_fn=train_input_fn, steps=STEPS)
    evaluation = cls.evaluate(input_fn=eval_input_fn)
    preds = np.array(list(cls.predict(input_fn=eval_input_fn)))

    tf.reset_default_graph() #to reuse variables
    return evaluation, preds

In [10]:
# Classification head shared by the model function below; it is configured
# with one class per entry of TARGETS and builds the EstimatorSpec from logits.
head = tf.contrib.estimator.multi_class_head(n_classes=len(TARGETS))

def text_cnn(features, labels, mode, params):
    """Estimator ``model_fn``: embedding -> dropout -> two 1-D convolutions -> dense logits.

    Args:
        features: dict of input tensors; only ``features['x']`` is read here.
        labels: label tensor, or ``None``.
        mode: a ``tf.estimator.ModeKeys`` value; dropout is active only for
            ``TRAIN``.
        params: dict that must provide ``'embedding_initializer'``.

    Returns:
        The spec produced by ``head.create_estimator_spec`` for the computed
        logits.
    """
    is_training = mode == tf.estimator.ModeKeys.TRAIN

    # Layers are created in a fixed order (embedding, conv, conv, dense) so the
    # automatically generated variable names stay stable across runs.
    embedded = tf.contrib.layers.embed_sequence(
        features['x'],
        voc_size,
        EMB_SIZE,
        initializer=params['embedding_initializer'],
    )
    embedded = tf.layers.dropout(inputs=embedded, rate=DROP_K, training=is_training)

    # First convolution: EMB_SIZE // 2 filters over windows of 5 positions.
    hidden = tf.layers.conv1d(
        inputs=embedded,
        filters=EMB_SIZE // 2,
        kernel_size=5,
        padding='SAME',
    )
    hidden = tf.nn.relu(hidden)

    # Second convolution: a single filter whose window is MAX_DOC_LEN wide.
    hidden = tf.layers.conv1d(
        inputs=hidden,
        filters=1,
        kernel_size=MAX_DOC_LEN,
        padding='SAME',
    )
    hidden = tf.nn.relu(hidden)

    # Drop the size-1 filter axis before the final dense layer.
    flat = tf.squeeze(hidden, [2])
    logits = tf.layers.dense(flat, len(TARGETS), activation=None)

    if labels is not None:
        # Labels are reshaped to a single column (presumably the shape the
        # head expects -- confirm against the head's documentation).
        labels = tf.reshape(labels, [-1, 1])

    adam = tf.train.AdamOptimizer()

    def _train_op_fn(loss):
        """Minimise ``loss`` with Adam and advance the global step."""
        return adam.minimize(loss=loss, global_step=tf.train.get_global_step())

    return head.create_estimator_spec(
        features=features,
        labels=labels,
        mode=mode,
        logits=logits,
        train_op_fn=_train_op_fn,
    )

In [11]:
import os 
# Checkpoints and summaries are stored under <model_dir>/text_cnn, relative to
# the current working directory.
model_dir = '.'
# Embedding weights start uniformly distributed in [-1, 1]; text_cnn reads this
# initializer from params['embedding_initializer'].
cparams = {'embedding_initializer': tf.random_uniform_initializer(-1.0, 1.0)}
text_cnn_cls = tf.estimator.Estimator(model_fn=text_cnn,
                                        model_dir=os.path.join(model_dir, 'text_cnn'),
                                        params=cparams)

INFO:tensorflow:Using default config.
INFO:tensorflow:Using config: {'_global_id_in_cluster': 0, '_is_chief': True, '_task_type': 'worker', '_session_config': None, '_task_id': 0, '_master': '', '_evaluation_master': '', '_num_ps_replicas': 0, '_model_dir': './text_cnn', '_num_worker_replicas': 1, '_save_summary_steps': 100, '_cluster_spec': <tensorflow.python.training.server_lib.ClusterSpec object at 0x7f0fa2faf0f0>, '_save_checkpoints_steps': None, '_log_step_count_steps': 100, '_keep_checkpoint_every_n_hours': 10000, '_save_checkpoints_secs': 600, '_tf_random_seed': None, '_keep_checkpoint_max': 5, '_service': None, '_train_distribute': None}


In [12]:
# Train the CNN estimator for STEPS steps, then run evaluation and prediction
# on the validation set.
training_and_validation(text_cnn_cls)

INFO:tensorflow:Calling model_fn.
INFO:tensorflow:Done calling model_fn.
INFO:tensorflow:Create CheckpointSaverHook.
INFO:tensorflow:Graph was finalized.
INFO:tensorflow:Running local_init_op.
INFO:tensorflow:Done running local_init_op.
INFO:tensorflow:Saving checkpoints for 1 into ./text_cnn/model.ckpt.
INFO:tensorflow:loss = 1.2217253, step = 1
INFO:tensorflow:global_step/sec: 96.5802
INFO:tensorflow:loss = 1.0871978, step = 101 (1.041 sec)
INFO:tensorflow:global_step/sec: 95.8234
INFO:tensorflow:loss = 1.1111512, step = 201 (1.044 sec)
INFO:tensorflow:global_step/sec: 120.09
INFO:tensorflow:loss = 1.0754431, step = 301 (0.831 sec)
INFO:tensorflow:global_step/sec: 123.009
INFO:tensorflow:loss = 0.9325049, step = 401 (0.813 sec)
INFO:tensorflow:global_step/sec: 124.018
INFO:tensorflow:loss = 0.9768161, step = 501 (0.806 sec)
INFO:tensorflow:global_step/sec: 125.025
INFO:tensorflow:loss = 0.88601774, step = 601 (0.800 sec)
INFO:tensorflow:global_step/sec: 125.48
INFO:tensorflow:loss = 

In [13]:
def predict_input_fn(titles):
    """Turn raw title strings into the index matrix produced by text_as_idx.

    Args:
        titles: titles to encode, in a form accepted by ``pd.DataFrame`` as
            the values of a single column.

    Returns:
        Whatever ``text_as_idx`` returns for a one-column frame of ``titles``.
    """
    titles_frame = pd.DataFrame(titles, columns=[TEXT_COL])
    return text_as_idx(titles_frame, TEXT_COL)

In [14]:
# Three sample headlines used below to try out the trained classifier.
titles_to_pred = ['Supreme Court to Hear Major Case on Partisan Districts', 
                  'Furan -- build and push Docker images from GitHub to target', 
                  'Time Warner will spend $100M on Snapchat original shows and ads']

In [15]:
def prediction(clf, data):
    """Run ``clf.predict`` on already-encoded inputs and collect every result.

    Args:
        clf: estimator-like object exposing ``predict(input_fn=...)``.
        data: encoded inputs fed to the model as feature ``"x"``; the length
            of each row, capped at ``MAX_DOC_LEN``, is fed as feature ``"len"``.

    Returns:
        A list with every item yielded by ``clf.predict``, in input order
        (shuffling is disabled).
    """
    row_lengths = np.array([min(len(row), MAX_DOC_LEN) for row in data])
    input_fn = tf.estimator.inputs.numpy_input_fn(
        x={"x": data, "len": row_lengths},
        shuffle=False,
    )
    return list(clf.predict(input_fn=input_fn))

In [16]:
# Encode the sample headlines and display the classifier's predictions for them.
prediction(text_cnn_cls, predict_input_fn(titles_to_pred))

INFO:tensorflow:Calling model_fn.
INFO:tensorflow:Done calling model_fn.
INFO:tensorflow:Graph was finalized.
INFO:tensorflow:Restoring parameters from ./text_cnn/model.ckpt-36000
INFO:tensorflow:Running local_init_op.
INFO:tensorflow:Done running local_init_op.


[{'class_ids': array([1]),
  'classes': array([b'1'], dtype=object),
  'logits': array([-4.064535 ,  4.6013837, -1.025801 ], dtype=float32),
  'probabilities': array([1.7171363e-04, 9.9624312e-01, 3.5851721e-03], dtype=float32)},
 {'class_ids': array([0]),
  'classes': array([b'0'], dtype=object),
  'logits': array([ 4.685987 , -8.192994 , -3.9078164], dtype=float32),
  'probabilities': array([9.9981230e-01, 2.5506347e-06, 1.8521539e-04], dtype=float32)},
 {'class_ids': array([2]),
  'classes': array([b'2'], dtype=object),
  'logits': array([-10.685006 ,  -1.6643807,   0.5614947], dtype=float32),
  'probabilities': array([1.1780736e-05, 9.7449660e-02, 9.0253848e-01], dtype=float32)}]