# Yoon-Kim Model  (Convolutional Neural Networks for Sentence Classification)

### paper : https://www.aclweb.org/anthology/D14-1181

In [0]:
import os
from datetime import datetime
import tensorflow as tf
import numpy as np
import json
from sklearn.model_selection import train_test_split

In [2]:
from google.colab import auth
auth.authenticate_user()

from google.colab import drive
drive.mount('/content/gdrive')

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


In [0]:
DATA_IN_PATH = './data_in/'
DATA_OUT_PATH = './data_out/'
INPUT_TRAIN_DATA = 'nsmc_train_input.npy'
LABEL_TRAIN_DATA = 'nsmc_train_label.npy'
DATA_CONFIGS = 'data_configs.json'

input_data = np.load(open(DATA_IN_PATH + INPUT_TRAIN_DATA, 'rb'))
label_data = np.load(open(DATA_IN_PATH + LABEL_TRAIN_DATA, 'rb'))
prepro_configs = json.load(open(DATA_IN_PATH + DATA_CONFIGS, 'r'))

In [0]:
TEST_SPLIT = 0.1
RNG_SEED = 13371447
VOCAB_SIZE = prepro_configs['vocab_size']
EMB_SIZE = 128
BATCH_SIZE = 50 # 16 -> 50
NUM_EPOCHS = 1

input_train, input_eval, label_train, label_eval = train_test_split(input_data, label_data, test_size=TEST_SPLIT, random_state=RNG_SEED)

In [0]:
def mapping_fn(X, Y):
    input, label = {'x': X}, Y
    return input, label

def train_input_fn():
    dataset = tf.data.Dataset.from_tensor_slices((input_train, label_train))
    dataset = dataset.shuffle(buffer_size=len(input_train))
    dataset = dataset.batch(BATCH_SIZE)
    dataset = dataset.map(mapping_fn)
    dataset = dataset.repeat(count=NUM_EPOCHS)
    iterator = dataset.make_one_shot_iterator()
    
    return iterator.get_next()

def eval_input_fn():
    dataset = tf.data.Dataset.from_tensor_slices((input_eval, label_eval))
    dataset = dataset.shuffle(buffer_size=len(input_eval))
    dataset = dataset.batch(50) # 16 -> 50
    dataset = dataset.map(mapping_fn)
    iterator = dataset.make_one_shot_iterator()
    
    return iterator.get_next()

In [0]:
def model_fn(features, labels, mode, params):
    TRAIN = mode == tf.estimator.ModeKeys.TRAIN
    EVAL = mode == tf.estimator.ModeKeys.EVAL
    PREDICT = mode == tf.estimator.ModeKeys.PREDICT

    embedding_layer = tf.keras.layers.Embedding(
                    VOCAB_SIZE,
                    EMB_SIZE)(features['x'])
    
    dropout_emb = tf.keras.layers.Dropout(rate = 0.5)(embedding_layer) 
    
    filter_sizes = [3, 4, 5]
    pooled_outputs = []
    for filter_size in filter_sizes:
      with tf.name_scope("conv-maxpool-%s" % filter_size):
        conv = tf.keras.layers.Conv1D(
          filters=100, 
          kernel_size= filter_size, 
          padding='valid',
          activation=tf.nn.relu,
          kernel_constraint = tf.keras.constraints.max_norm(3.))(dropout_emb)

        pool = tf.keras.layers.GlobalMaxPool1D()(conv)
        pooled_outputs.append(pool)

    h_pool = tf.concat(pooled_outputs, axis=1)
        
    hidden = tf.keras.layers.Dense(units=250, activation=tf.nn.relu, kernel_constraint=tf.keras.constraints.max_norm(3.))(h_pool) 
    dropout_hidden = tf.keras.layers.Dropout(rate=0.5)(hidden, training = TRAIN) # dropout 0.2 -> 0.5
    logits = tf.keras.layers.Dense(units=1)(dropout_hidden)

    if labels is not None:
        labels = tf.reshape(labels, [-1, 1])
        
    if TRAIN:
        global_step = tf.train.get_global_step()
        loss = tf.losses.sigmoid_cross_entropy(labels, logits)
        train_op = tf.train.AdamOptimizer(0.001).minimize(loss, global_step)

        return tf.estimator.EstimatorSpec(mode=mode, train_op=train_op, loss = loss)
    
    elif EVAL:
        loss = tf.losses.sigmoid_cross_entropy(labels, logits)
        pred = tf.nn.sigmoid(logits)
        accuracy = tf.metrics.accuracy(labels, tf.round(pred))
        return tf.estimator.EstimatorSpec(mode=mode, loss=loss, eval_metric_ops={'acc': accuracy})
        
    elif PREDICT:
        return tf.estimator.EstimatorSpec(
            mode=mode,
            predictions={
                'prob': tf.nn.sigmoid(logits),
            }
        )

In [12]:
!rm -rf data_out;ls;ls -al ./data_in

adc.json  data_in  gdrive  sample_data
total 46768
drwxr-xr-x 2 root root     4096 Jun 13 16:01 .
drwxr-xr-x 1 root root     4096 Jun 13 16:05 ..
-rw-r--r-- 1 root root   830764 Jun 13 16:01 data_configs.json
-rw-r--r-- 1 root root  1600128 Jun 13 16:01 nsmc_test_input.npy
-rw-r--r-- 1 root root   400128 Jun 13 16:01 nsmc_test_label.npy
-rw-r--r-- 1 root root  4800128 Jun 13 16:01 nsmc_train_input.npy
-rw-r--r-- 1 root root  1200128 Jun 13 16:01 nsmc_train_label.npy
-rw-r--r-- 1 root root  4893335 Jun 13 15:28 ratings_test.txt
-rw-r--r-- 1 root root 14628807 Jun 13 15:28 ratings_train.txt
-rw-r--r-- 1 root root 19515078 Jun 13 15:28 ratings.txt


In [13]:
est = tf.estimator.Estimator(model_fn, model_dir="data_out/checkpoint/yoon_kim")

INFO:tensorflow:Using default config.
INFO:tensorflow:Using config: {'_model_dir': 'data_out/checkpoint/yoon_kim', '_tf_random_seed': None, '_save_summary_steps': 100, '_save_checkpoints_steps': None, '_save_checkpoints_secs': 600, '_session_config': allow_soft_placement: true
graph_options {
  rewrite_options {
    meta_optimizer_iterations: ONE
  }
}
, '_keep_checkpoint_max': 5, '_keep_checkpoint_every_n_hours': 10000, '_log_step_count_steps': 100, '_train_distribute': None, '_device_fn': None, '_protocol': None, '_eval_distribute': None, '_experimental_distribute': None, '_service': None, '_cluster_spec': <tensorflow.python.training.server_lib.ClusterSpec object at 0x7f8d16c89630>, '_task_type': 'worker', '_task_id': 0, '_global_id_in_cluster': 0, '_master': '', '_evaluation_master': '', '_is_chief': True, '_num_ps_replicas': 0, '_num_worker_replicas': 1}


In [14]:
time_start = datetime.utcnow()
print("Experiment started at {}".format(time_start.strftime("%H:%M:%S")))
print(".......................................") 

est.train(train_input_fn)

time_end = datetime.utcnow()
print(".......................................")
print("Experiment finished at {}".format(time_end.strftime("%H:%M:%S")))
print("")
time_elapsed = time_end - time_start
print("Experiment elapsed time: {} seconds".format(time_elapsed.total_seconds()))

Experiment started at 16:05:26
.......................................
INFO:tensorflow:Calling model_fn.
Instructions for updating:
Use tf.cast instead.
Instructions for updating:
Use tf.cast instead.
Instructions for updating:
Deprecated in favor of operator or tf.math.divide.
INFO:tensorflow:Done calling model_fn.
INFO:tensorflow:Create CheckpointSaverHook.
INFO:tensorflow:Graph was finalized.
INFO:tensorflow:Running local_init_op.
INFO:tensorflow:Done running local_init_op.
INFO:tensorflow:Saving checkpoints for 0 into data_out/checkpoint/yoon_kim/model.ckpt.
INFO:tensorflow:loss = 0.6941691, step = 1
INFO:tensorflow:global_step/sec: 9.37608
INFO:tensorflow:loss = 0.5220495, step = 101 (10.667 sec)
INFO:tensorflow:global_step/sec: 9.6836
INFO:tensorflow:loss = 0.52051526, step = 201 (10.326 sec)
INFO:tensorflow:global_step/sec: 9.79918
INFO:tensorflow:loss = 0.34156135, step = 301 (10.205 sec)
INFO:tensorflow:global_step/sec: 9.70905
INFO:tensorflow:loss = 0.42577058, step = 401 (10

In [15]:
valid = est.evaluate(eval_input_fn)

INFO:tensorflow:Calling model_fn.
INFO:tensorflow:Done calling model_fn.
INFO:tensorflow:Starting evaluation at 2019-06-13T16:30:18Z
INFO:tensorflow:Graph was finalized.
Instructions for updating:
Use standard file APIs to check for files with this prefix.
INFO:tensorflow:Restoring parameters from data_out/checkpoint/yoon_kim/model.ckpt-2700
INFO:tensorflow:Running local_init_op.
INFO:tensorflow:Done running local_init_op.
INFO:tensorflow:Finished evaluation at 2019-06-13-16:30:20
INFO:tensorflow:Saving dict for global step 2700: acc = 0.82766664, global_step = 2700, loss = 0.38894233
INFO:tensorflow:Saving 'checkpoint_path' summary for global step 2700: data_out/checkpoint/yoon_kim/model.ckpt-2700


In [0]:
INPUT_TEST_DATA = 'nsmc_test_input.npy'
LABEL_TEST_DATA = 'nsmc_test_label.npy'

test_input_data = np.load(open(DATA_IN_PATH + INPUT_TEST_DATA, 'rb'))
test_label_data = np.load(open(DATA_IN_PATH + LABEL_TEST_DATA, 'rb'))

In [0]:
def test_input_fn():
    dataset = tf.data.Dataset.from_tensor_slices((test_input_data, test_label_data))
    dataset = dataset.batch(16)
    dataset = dataset.map(mapping_fn)
    iterator = dataset.make_one_shot_iterator()
    
    return iterator.get_next()

In [19]:
test_output = [pred['prob'] for pred in est.predict(test_input_fn)]

INFO:tensorflow:Calling model_fn.
INFO:tensorflow:Done calling model_fn.
INFO:tensorflow:Graph was finalized.
INFO:tensorflow:Restoring parameters from data_out/checkpoint/yoon_kim/model.ckpt-2700
INFO:tensorflow:Running local_init_op.
INFO:tensorflow:Done running local_init_op.


In [0]:
test_output = np.array(test_output)

In [22]:
test_output

array([[0.98701674],
       [0.5009283 ],
       [0.43605068],
       ...,
       [0.7733355 ],
       [0.4220454 ],
       [0.4441663 ]], dtype=float32)