# Quora Question Pairs Estimator Model MaLSTM

# Import the required packages

In [1]:
# tensorflow import
import tensorflow as tf

# others import
import json
from sklearn.model_selection import train_test_split
import numpy as np

In [2]:
# File names of the preprocessed training arrays and of the preprocessing
# metadata; each one is appended to DATA_PATH when it is opened.
Q1_TRAINING_DATA_FILE = 'q1_train.npy'
Q2_TRAINING_DATA_FILE = 'q2_train.npy'
LABEL_TRAINING_DATA_FILE = 'label_train.npy'
NB_WORDS_DATA_FILE = 'nb_words.json'
# Paths are built by plain string concatenation, so the trailing slash matters.
DATA_PATH = '/home/evo_mind/.kaggle/competitions/quora-question-pairs/'

BATCH_SIZE = 1024  # examples per batch produced by the input functions
EPOCH = 50  # NOTE(review): not referenced anywhere in the visible code
HIDDEN = 50  # number of units of the LSTM cell built in model_fn
BUFFER_SIZE = 2048  # shuffle buffer size used by the input functions

TEST_SPLIT = 0.1  # fraction of the data held out by train_test_split
RNG_SEED = 13371447  # random_state passed to train_test_split

WORD_EMBEDDING_DIM = 100  # width of each row of the word-embedding table
# The settings below are disabled and not used by the visible code.
# CONV_FEATURE_DIM = 300
# CONV_OUTPUT_DIM = 128
# CONV_WINDOW_SIZE = 3

# SIMILARITY_DENSE_FEATURE_DIM = 200


In [3]:
# Hand np.load the path instead of an open() handle: NumPy then opens and
# closes the .npy file itself, so no file descriptor is left dangling.
q1Data = np.load(DATA_PATH + Q1_TRAINING_DATA_FILE)
q2Data = np.load(DATA_PATH + Q2_TRAINING_DATA_FILE)
labels = np.load(DATA_PATH + LABEL_TRAINING_DATA_FILE)

In [4]:
# Read the preprocessing metadata (a JSON object). The encoding is given
# explicitly so decoding does not depend on the machine's locale; the `with`
# block either binds prepro_configs or raises, so no placeholder is needed.
with open(DATA_PATH + NB_WORDS_DATA_FILE, 'r', encoding='utf-8') as config_file:
    prepro_configs = json.load(config_file)

In [5]:
# Pair the two question arrays along a new axis 1 so that one split keeps each
# question pair aligned with its label.
X = np.stack([q1Data, q2Data], axis=1)
Y = labels

X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    test_size=TEST_SPLIT,
    random_state=RNG_SEED,
)

# Undo the pairing: index 0 on axis 1 is question 1, index 1 is question 2.
Q1Train, Q2Train = X_train[:, 0], X_train[:, 1]
Q1Test, Q2Test = X_test[:, 0], X_test[:, 1]

In [6]:
def contrastive_loss(y, d):
    """Return the contrastive loss of distances `d` against labels `y`.

    Computes sum(y * d^2 + (1 - y) * max(1 - d, 0)^2) / (2 * batch), i.e.
    pairs with y == 1 are penalised by their squared distance and pairs with
    y == 0 are penalised only while they are closer than a margin of 1.

    Args:
        y: label tensor; cast to the dtype of `d` so integer labels can be
            multiplied with float distances.
        d: distance tensor whose first dimension is the batch dimension.

    Returns:
        A scalar tensor holding the loss.
    """
    d = tf.convert_to_tensor(d)
    y = tf.cast(y, d.dtype)
    pull_term = y * tf.square(d)
    push_term = (1 - y) * tf.square(tf.maximum(1 - d, 0))
    # Take the batch size from the input itself: the previous code divided by
    # an undefined global `batch_size`, and the final batch of an epoch can be
    # smaller than the configured size anyway.
    batch = tf.cast(tf.shape(d)[0], d.dtype)
    return tf.reduce_sum(pull_term + push_term) / batch / 2

In [7]:
def rearrange(base, hypothesis, label):
    """Pack the two inputs into a feature dict and pair it with the label.

    Returns:
        A `(features, label)` tuple where `features` maps 'base' and
        'hypothesis' to the corresponding arguments.
    """
    return dict(base=base, hypothesis=hypothesis), label

In [8]:
def train_input_fn():
    """Return the next-batch tensors of the training input pipeline.

    The training arrays are sliced per example, shuffled with a buffer of
    BUFFER_SIZE, grouped into batches of BATCH_SIZE, converted to
    `(features, label)` by `rearrange`, and repeated without end.
    """
    pipeline = (
        tf.data.Dataset.from_tensor_slices((Q1Train, Q2Train, Y_train))
        .shuffle(buffer_size=BUFFER_SIZE)
        .batch(BATCH_SIZE)
        .map(rearrange)
        .repeat()
    )
    return pipeline.make_one_shot_iterator().get_next()

In [9]:
def test_input_fn():
    """Return the next-batch tensors for one ordered pass over the held-out data.

    Unlike a training pipeline, this one neither shuffles nor repeats:
    * without `repeat()` the iterator signals end-of-input after one pass, so
      `Estimator.evaluate` / `Estimator.predict` called without `steps`
      terminate instead of looping forever;
    * without `shuffle()` predictions come back in the same order as the
      held-out rows, so they can be matched to their inputs.
    """
    dataset = tf.data.Dataset.from_tensor_slices((Q1Test, Q2Test, Y_test))
    dataset = dataset.batch(BATCH_SIZE)
    dataset = dataset.map(rearrange)
    iterator = dataset.make_one_shot_iterator()

    return iterator.get_next()

In [10]:
# Value stored under 'nbWords' in the preprocessing metadata; model_fn uses it
# as the number of rows of the word-embedding table.
VOCAB_SIZE = prepro_configs['nbWords']

In [11]:
def model_fn(features, labels, mode, params):
    """Build the Siamese LSTM (MaLSTM) graph for one Estimator mode.

    Both inputs are embedded with one shared table and encoded by the same
    LSTM. The pair is scored with the MaLSTM similarity
    exp(-||h_base - h_hypothesis||_1), which lies in (0, 1] and equals 1 when
    the two encodings coincide, and is regressed onto the labels with a
    mean-squared-error loss.

    Variable names and shapes match the earlier Euclidean-distance version, so
    existing checkpoints still restore; weights trained against that output
    should be retrained.

    Args:
        features: dict with 'base' and 'hypothesis' entries, used as indices
            into the embedding table (assumes integer word ids shaped
            (batch, seq_len) - TODO confirm against the preprocessing step).
        labels: targets for the similarity; not used in PREDICT mode.
            NOTE(review): presumably 1 = duplicate pair, 0 = not - confirm.
        mode: a `tf.estimator.ModeKeys` value.
        params: not used; kept so the signature stays unchanged.

    Returns:
        A `tf.estimator.EstimatorSpec` for the requested mode. Any mode other
        than TRAIN and PREDICT is handled as evaluation.
    """
    wordEmbeddings = tf.get_variable('wordEmbeddings', [VOCAB_SIZE, WORD_EMBEDDING_DIM])

    def lstm_network(inputs, reuse=tf.AUTO_REUSE):
        # AUTO_REUSE lets the second call share the weights created by the first.
        rnn_cell = tf.nn.rnn_cell.BasicLSTMCell(HIDDEN, reuse=reuse)
        outputs, _ = tf.nn.dynamic_rnn(rnn_cell, inputs, dtype=tf.float32)
        # NOTE(review): no sequence_length is passed, so this is the output at
        # the final position even if that position is padding - confirm.
        return outputs[:, -1, :]

    baseEncoding = lstm_network(tf.nn.embedding_lookup(wordEmbeddings, features['base']))
    hypothesisEncoding = lstm_network(tf.nn.embedding_lookup(wordEmbeddings, features['hypothesis']))

    # exp(-L1) replaces the former sqrt(sum(diff^2)): the square root has an
    # infinite gradient at zero (identical questions yield exactly zero), and
    # an unbounded distance is not a meaningful value for the 'prob' output.
    similarity = tf.exp(-tf.reduce_sum(tf.abs(baseEncoding - hypothesisEncoding), axis=1))

    if mode == tf.estimator.ModeKeys.PREDICT:
        return tf.estimator.EstimatorSpec(
            mode=mode,
            predictions={
                'prob': similarity
            }
        )

    targets = tf.cast(labels, tf.float32)
    loss = tf.losses.mean_squared_error(targets, similarity)

    if mode == tf.estimator.ModeKeys.TRAIN:
        globalStep = tf.train.get_global_step()
        trainOp = tf.train.AdadeltaOptimizer(1e-4).minimize(loss, globalStep)
        return tf.estimator.EstimatorSpec(
            mode=mode,
            train_op=trainOp,
            loss=loss
        )

    # Evaluation: previously this branch returned the integer 0, which is not
    # a valid EstimatorSpec. Accuracy is measured on the rounded similarity,
    # because comparing raw floats with the labels for equality almost never
    # matches.
    accuracy = tf.metrics.accuracy(targets, tf.round(similarity))
    return tf.estimator.EstimatorSpec(
        mode=mode,
        loss=loss,
        eval_metric_ops={'acc': accuracy}
    )

In [12]:
# Wrap model_fn in an Estimator that uses 'models_3' as its model directory
# (the training log below restores a checkpoint from that directory).
estimator = tf.estimator.Estimator(model_fn, model_dir='models_3')

INFO:tensorflow:Using default config.
INFO:tensorflow:Using config: {'_model_dir': 'models_3', '_tf_random_seed': None, '_save_summary_steps': 100, '_save_checkpoints_steps': None, '_save_checkpoints_secs': 600, '_session_config': None, '_keep_checkpoint_max': 5, '_keep_checkpoint_every_n_hours': 10000, '_log_step_count_steps': 100, '_train_distribute': None, '_service': None, '_cluster_spec': <tensorflow.python.training.server_lib.ClusterSpec object at 0x7f12569ca908>, '_task_type': 'worker', '_task_id': 0, '_global_id_in_cluster': 0, '_master': '', '_evaluation_master': '', '_is_chief': True, '_num_ps_replicas': 0, '_num_worker_replicas': 1}


In [None]:
# NOTE(review): no `steps`/`max_steps` is given and train_input_fn repeats its
# dataset without end, so this call looks unbounded (EPOCH is never applied) -
# confirm whether a step limit was intended.
estimator.train(train_input_fn)

INFO:tensorflow:Calling model_fn.
Instructions for updating:
keep_dims is deprecated, use keepdims instead
INFO:tensorflow:Done calling model_fn.
INFO:tensorflow:Create CheckpointSaverHook.
INFO:tensorflow:Graph was finalized.
INFO:tensorflow:Restoring parameters from models_3/model.ckpt-2
INFO:tensorflow:Running local_init_op.
INFO:tensorflow:Done running local_init_op.
