# Ma LSTM

In [0]:
import sys
import tensorflow as tf
import numpy as np
import os
import pandas as pd

from sklearn.model_selection import train_test_split

import json

In [0]:
from google.colab import auth
auth.authenticate_user()

from google.colab import drive
drive.mount('/content/gdrive')

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


In [0]:
DATA_IN_PATH = './data_in/'
DATA_OUT_PATH = './data_out/'

In [0]:
!ls -al

total 40
drwxr-xr-x 1 root root 4096 Jun 21 02:43 .
drwxr-xr-x 1 root root 4096 Jun 21 02:42 ..
-rw-r--r-- 1 root root 2487 Jun 21 02:43 adc.json
drwxr-xr-x 1 root root 4096 Jun 21 02:43 .config
drwxr-xr-x 2 root root 4096 Jun 21 02:48 data_in
drwxr-xr-x 3 root root 4096 Jun 21 03:00 data_out
drwx------ 3 root root 4096 Jun 21 02:43 gdrive
drwxr-xr-x 1 root root 4096 Jun 18 16:14 sample_data


In [0]:
TRAIN_Q1_DATA_FILE = 'train_q1.npy'
TRAIN_Q2_DATA_FILE = 'train_q2.npy'
TRAIN_LABEL_DATA_FILE = 'train_label.npy'
NB_WORDS_DATA_FILE = 'data_configs.json'

## 학습에 필요한 파라메터들에 대해서 지정하는 부분이다.

BATCH_SIZE = 4096
EPOCH = 2
HIDDEN = 64

DROPOUT_RATIO = 0.3

TEST_SPLIT = 0.1
RNG_SEED = 13371447
EMBEDDING_DIM = 128
MAX_SEQ_LEN = 31

In [0]:
## 데이터를 불러오는 부분이다. 효과적인 데이터 불러오기를 위해, 미리 넘파이 형태로 저장시킨 데이터를 로드한다.

q1_data = np.load(open(DATA_IN_PATH + TRAIN_Q1_DATA_FILE, 'rb'))
q2_data = np.load(open(DATA_IN_PATH + TRAIN_Q2_DATA_FILE, 'rb'))
labels = np.load(open(DATA_IN_PATH + TRAIN_LABEL_DATA_FILE, 'rb'))
prepro_configs = None

with open(DATA_IN_PATH + NB_WORDS_DATA_FILE, 'r') as f:
    prepro_configs = json.load(f)

In [0]:
VOCAB_SIZE = prepro_configs['vocab_size']
VOCAB_SIZE

76558

### Split train and test dataset

In [0]:
q1_data_len = np.array([min(len(x), MAX_SEQ_LEN) for x in q1_data])
q2_data_len = np.array([min(len(x), MAX_SEQ_LEN) for x in q2_data])

In [0]:
## 데이터를 나누어 저장하자. sklearn의 train_test_split을 사용하면 유용하다. 하지만, 쿼라 데이터의 경우는
## 입력이 1개가 아니라 2개이다. 따라서, np.stack을 사용하여 두개를 하나로 쌓은다음 활용하여 분류한다.

X = np.stack((q1_data, q2_data), axis=1)
y = labels
train_X, test_X, train_y, test_y = train_test_split(X, y, test_size=TEST_SPLIT, random_state=RNG_SEED)

train_Q1 = train_X[:,0]
train_Q2 = train_X[:,1]
test_Q1 = test_X[:,0]
test_Q2 = test_X[:,1]

In [0]:

def rearrange(base, hypothesis, labels):
    features = {"base": base, "hypothesis": hypothesis}
    return features, labels

def train_input_fn():
    dataset = tf.data.Dataset.from_tensor_slices((train_Q1, train_Q2, train_y))
    dataset = dataset.shuffle(buffer_size=len(train_Q1))
    dataset = dataset.batch(BATCH_SIZE) #4096
    dataset = dataset.map(rearrange)
    dataset = dataset.repeat(EPOCH) # 2
    iterator = dataset.make_one_shot_iterator()
    
    return iterator.get_next()

def eval_input_fn():
    dataset = tf.data.Dataset.from_tensor_slices((test_Q1, test_Q2, test_y))
    dataset = dataset.batch(BATCH_SIZE) #4096
    dataset = dataset.map(rearrange)
    iterator = dataset.make_one_shot_iterator()
    
    return iterator.get_next()

### Model Setup

In [0]:
def Malstm(features, labels, mode):
        
    TRAIN = mode == tf.estimator.ModeKeys.TRAIN
    EVAL = mode == tf.estimator.ModeKeys.EVAL
    PREDICT = mode == tf.estimator.ModeKeys.PREDICT
    # VOCAB_SIZE : 76558   EMBEDDING_DIM : 128       
    embedding = tf.keras.layers.Embedding(VOCAB_SIZE, EMBEDDING_DIM) 
    
    base_embedded_matrix = embedding(features['base']) # (?, 31, 128)
    
    hypothesis_embedded_matrix = embedding(features['hypothesis']) # (?, 31, 128)
    
    q_lstm_cell = tf.nn.rnn_cell.BasicLSTMCell(num_units = HIDDEN, activation = tf.nn.tanh , state_is_tuple=False)
    
    q, q_output_states = tf.nn.dynamic_rnn(cell = q_lstm_cell, #  LSTMStateTuple(c(?, 64), h(?, 64))
                                           inputs = base_embedded_matrix, # shape=(?, 128)
                                           dtype = tf.float32,
                                           scope='query')       
    
    s_lstm_cell = tf.nn.rnn_cell.BasicLSTMCell(num_units = HIDDEN, activation = tf.nn.tanh, state_is_tuple=False) 

    s, s_output_states = tf.nn.dynamic_rnn(cell = s_lstm_cell, 
                                           inputs = base_embedded_matrix, 
                                           dtype = tf.float32,
                                           scope='sim_query')     

    with tf.variable_scope('output_layer'):
        logit_layer = tf.exp(-tf.reduce_sum(tf.abs(q_output_states - s_output_states), axis=1, keepdims=True)) # (?, 1)
        logit_layer = tf.squeeze(logit_layer, axis=-1) #(?,)
      
    if PREDICT:
        return tf.estimator.EstimatorSpec(
                  mode=mode,
                  predictions={
                      'is_duplicate':logit_layer
                  })
    
    #prediction 진행 시, None
    if labels is not None:
        labels = tf.to_float(labels)
    
    loss = tf.losses.mean_squared_error(labels=labels, predictions=logit_layer)

    if EVAL:
        accuracy = tf.metrics.accuracy(labels, tf.round(logit_layer))
        eval_metric_ops = {'acc': accuracy}
        return tf.estimator.EstimatorSpec(
                  mode=mode,
                  eval_metric_ops= eval_metric_ops,
                  loss=loss)

    elif TRAIN:

        global_step = tf.train.get_global_step()
        train_op = tf.train.AdamOptimizer(1e-3).minimize(loss, global_step)

        return tf.estimator.EstimatorSpec(
                  mode=mode,
                  train_op=train_op,
                  loss=loss)

In [0]:
!rm -rf /checkpoint/malstm/

In [0]:
model_dir = os.path.join(os.getcwd(), DATA_OUT_PATH + "/checkpoint/malstm/")
os.makedirs(model_dir, exist_ok=True)

config_tf = tf.estimator.RunConfig()

lstm_est = tf.estimator.Estimator(Malstm, model_dir=model_dir)

### 아래 루틴 엄청 오래 걸림

In [0]:
lstm_est.train(train_input_fn)

W0621 03:54:24.027960 140177333548928 rnn_cell_impl.py:697] <tensorflow.python.ops.rnn_cell_impl.BasicLSTMCell object at 0x7f7d4881c518>: Using a concatenated state is slower and will soon be deprecated.  Use state_is_tuple=True.


embedding:  <tensorflow.python.keras.layers.embeddings.Embedding object at 0x7f7d488615c0>
base_embedded_matrix:  Tensor("embedding/embedding_lookup/Identity_1:0", shape=(?, 31, 128), dtype=float32)
hypothesis_embedded_matrix:  Tensor("embedding_1/embedding_lookup/Identity_1:0", shape=(?, 31, 128), dtype=float32)
q_lstm_cell:  <tensorflow.python.ops.rnn_cell_impl.BasicLSTMCell object at 0x7f7d4881c518>


W0621 03:54:24.393178 140177333548928 rnn_cell_impl.py:697] <tensorflow.python.ops.rnn_cell_impl.BasicLSTMCell object at 0x7f7d488297f0>: Using a concatenated state is slower and will soon be deprecated.  Use state_is_tuple=True.


q:  Tensor("query/transpose_1:0", shape=(?, 31, 64), dtype=float32)
q_output_states:  Tensor("query/while/Exit_3:0", shape=(?, 128), dtype=float32)
s_lstm_cell:  <tensorflow.python.ops.rnn_cell_impl.BasicLSTMCell object at 0x7f7d488297f0>
s:  Tensor("sim_query/transpose_1:0", shape=(?, 31, 64), dtype=float32)
s_output_states:  Tensor("sim_query/while/Exit_3:0", shape=(?, 128), dtype=float32)
logit_layer:  Tensor("output_layer/Exp:0", shape=(?, 1), dtype=float32)
logit_layer:  Tensor("output_layer/Squeeze:0", shape=(?,), dtype=float32)
loss:  Tensor("mean_squared_error/value:0", shape=(), dtype=float32)


W0621 03:54:25.687252 140177333548928 deprecation.py:323] From /usr/local/lib/python3.6/dist-packages/tensorflow/python/training/saver.py:1066: get_checkpoint_mtimes (from tensorflow.python.training.checkpoint_management) is deprecated and will be removed in a future version.
Instructions for updating:
Use standard file utilities to get mtimes.


<tensorflow_estimator.python.estimator.estimator.Estimator at 0x7f7d4a0bf080>

In [0]:
lstm_est.evaluate(eval_input_fn)

W0621 04:02:13.202722 140177333548928 rnn_cell_impl.py:697] <tensorflow.python.ops.rnn_cell_impl.BasicLSTMCell object at 0x7f7d4bd9dd68>: Using a concatenated state is slower and will soon be deprecated.  Use state_is_tuple=True.


embedding:  <tensorflow.python.keras.layers.embeddings.Embedding object at 0x7f7d51a2e278>
base_embedded_matrix:  Tensor("embedding/embedding_lookup/Identity_1:0", shape=(?, 31, 128), dtype=float32)
hypothesis_embedded_matrix:  Tensor("embedding_1/embedding_lookup/Identity_1:0", shape=(?, 31, 128), dtype=float32)
q_lstm_cell:  <tensorflow.python.ops.rnn_cell_impl.BasicLSTMCell object at 0x7f7d4bd9dd68>


W0621 04:02:13.590102 140177333548928 rnn_cell_impl.py:697] <tensorflow.python.ops.rnn_cell_impl.BasicLSTMCell object at 0x7f7d4bd96978>: Using a concatenated state is slower and will soon be deprecated.  Use state_is_tuple=True.


q:  Tensor("query/transpose_1:0", shape=(?, 31, 64), dtype=float32)
q_output_states:  Tensor("query/while/Exit_3:0", shape=(?, 128), dtype=float32)
s_lstm_cell:  <tensorflow.python.ops.rnn_cell_impl.BasicLSTMCell object at 0x7f7d4bd96978>
s:  Tensor("sim_query/transpose_1:0", shape=(?, 31, 64), dtype=float32)
s_output_states:  Tensor("sim_query/while/Exit_3:0", shape=(?, 128), dtype=float32)
logit_layer:  Tensor("output_layer/Exp:0", shape=(?, 1), dtype=float32)
logit_layer:  Tensor("output_layer/Squeeze:0", shape=(?,), dtype=float32)
loss:  Tensor("mean_squared_error/value:0", shape=(), dtype=float32)


{'acc': 0.72096604, 'global_step': 396, 'loss': 0.18823466}

In [0]:
TEST_Q1_DATA_FILE = 'test_q1.npy'
TEST_Q2_DATA_FILE = 'test_q2.npy'
TEST_ID_DATA_FILE = 'test_id.npy'

test_q1_data = np.load(open(DATA_IN_PATH + TEST_Q1_DATA_FILE, 'rb'))
test_q2_data = np.load(open(DATA_IN_PATH + TEST_Q2_DATA_FILE, 'rb'))
test_id_data = np.load(open(DATA_IN_PATH + TEST_ID_DATA_FILE, 'rb'))

In [0]:
predict_input_fn = tf.estimator.inputs.numpy_input_fn(x={"base":test_q1_data, 
                                                         "hypothesis":test_q2_data}, 
                                                      shuffle=False)

predictions = np.array([p['is_duplicate'] for p in lstm_est.predict(input_fn=predict_input_fn)])

W0621 04:11:10.219111 140177333548928 deprecation.py:323] From /usr/local/lib/python3.6/dist-packages/tensorflow_estimator/python/estimator/inputs/queues/feeding_queue_runner.py:62: QueueRunner.__init__ (from tensorflow.python.training.queue_runner_impl) is deprecated and will be removed in a future version.
Instructions for updating:
To construct input pipelines, use the `tf.data` module.
W0621 04:11:10.222349 140177333548928 deprecation.py:323] From /usr/local/lib/python3.6/dist-packages/tensorflow_estimator/python/estimator/inputs/queues/feeding_functions.py:500: add_queue_runner (from tensorflow.python.training.queue_runner_impl) is deprecated and will be removed in a future version.
Instructions for updating:
To construct input pipelines, use the `tf.data` module.
W0621 04:11:10.259637 140177333548928 rnn_cell_impl.py:697] <tensorflow.python.ops.rnn_cell_impl.BasicLSTMCell object at 0x7f7d50c74e48>: Using a concatenated state is slower and will soon be deprecated.  Use state_is_tu

embedding:  <tensorflow.python.keras.layers.embeddings.Embedding object at 0x7f7d523a4668>
base_embedded_matrix:  Tensor("embedding/embedding_lookup/Identity_1:0", shape=(?, 31, 128), dtype=float32)
hypothesis_embedded_matrix:  Tensor("embedding_1/embedding_lookup/Identity_1:0", shape=(?, 31, 128), dtype=float32)
q_lstm_cell:  <tensorflow.python.ops.rnn_cell_impl.BasicLSTMCell object at 0x7f7d50c74e48>
q:  Tensor("query/transpose_1:0", shape=(?, 31, 64), dtype=float32)
q_output_states:  Tensor("query/while/Exit_3:0", shape=(?, 128), dtype=float32)
s_lstm_cell:  <tensorflow.python.ops.rnn_cell_impl.BasicLSTMCell object at 0x7f7d50c74e10>
s:  Tensor("sim_query/transpose_1:0", shape=(?, 31, 64), dtype=float32)
s_output_states:  Tensor("sim_query/while/Exit_3:0", shape=(?, 128), dtype=float32)
logit_layer:  Tensor("output_layer/Exp:0", shape=(?, 1), dtype=float32)
logit_layer:  Tensor("output_layer/Squeeze:0", shape=(?,), dtype=float32)


W0621 04:11:10.743502 140177333548928 deprecation.py:323] From /usr/local/lib/python3.6/dist-packages/tensorflow/python/training/monitored_session.py:875: start_queue_runners (from tensorflow.python.training.queue_runner_impl) is deprecated and will be removed in a future version.
Instructions for updating:
To construct input pipelines, use the `tf.data` module.


In [0]:
print(len(predictions)) #2345796

output = pd.DataFrame( data={"test_id":test_id_data, "is_duplicate": list(predictions)} )
output.to_csv( "/content/gdrive/My Drive/Colab Notebooks/Data/rnn_predict.csv", index=False, quoting=3 )

2345796
