# Only for inference

In [5]:
import h5py
import pandas as pd
import numpy as np
import tensorflow as tf
from tqdm import tqdm_notebook as tqdm

In [6]:
import _pickle
y_vocab = _pickle.load(open('./data_org/y_vocab.py3.cPickle', 'rb'))
y_vocab['43>109>1576>-1']
print(len(y_vocab))

token_to_cate = {}
for it in y_vocab.items():
     token_to_cate[it[1]] = it[0]

4215


In [7]:
x_vocab = pd.read_csv('data_org/char.csv', encoding='utf8')
x_vocab = x_vocab.as_matrix()
#  3000개 = 2999 + unk
x_vocab = {
    e[0]:i  for i,e in enumerate(x_vocab[:2998])
}
x_vocab['<PAD>'] = 2998
x_vocab['<UNK>'] = 2999

## import utils

In [12]:
import random
import h5py
import mmh3

seq_len=128
random.seed(2018)

def set_test_data(chunk_no):
    path_x = 'data_org/dev.chunk.0%d'%(chunk_no)
    h = h5py.File(path_x, 'r')
    
    mode = 'dev'
    cols = ['pid', 'product', 'model', 'brand', 'maker', 'price', 'updttm', 'bcateid', 'mcateid', 'scateid', 'dcateid']
    data = {
        c: h[mode][c][: 1000000] for c in cols
    }
    df = pd.DataFrame(data)

    for i in ['pid', 'product', 'model', 'brand', 'maker', 'updttm']:
        df[i] = df[i].apply(lambda x: x.decode('utf8'))

    df2 = pd.DataFrame(h[mode]['img_feat'][:1000000])
    return df, df2

## make model

In [9]:
# make DAG
tf.reset_default_graph()
tf.set_random_seed(2018)

# train Parameters
output_dim = 4215 # 1번 트레이닝셋만

epoch = 100
# seq_len=100
vocabulary_size = len(x_vocab) # x_vocab length
# embedding_size = 512
embedding_size = 256
features = 200


# =============================================================================
is_training = tf.placeholder(tf.bool) # 배치놈 위한 트레이닝/테스트 구분 불리언

Y = tf.placeholder(tf.int16, [None, output_dim], name="label")
lr = tf.placeholder(tf.float32, [], name='learning_rate')
keep_prob = tf.placeholder(tf.float32, [], name="keep_prob"
                          )

#  embedding
X_fw = tf.placeholder(tf.int32, [None, seq_len], name="word_tokens_fw") 
X_bw = tf.placeholder(tf.int32, [None, seq_len], name="word_tokens_bw") 
word_embeddings = tf.get_variable("word_embeddings",
    [vocabulary_size, embedding_size], initializer=tf.contrib.layers.xavier_initializer())
embedded_fw = tf.nn.embedding_lookup(word_embeddings, X_fw) # batch * seq * embeddding
embedded_bw = tf.nn.embedding_lookup(word_embeddings, X_bw) # batch * seq * embeddding
    
# image features
Xm = tf.placeholder(tf.float32, [None, 2048], name="img_feat") 

# price feature
Xp = tf.placeholder(tf.float32, [None, 2], name="price") 

"""
#  dropout layer
def _sequence_dropout(step_inputs, keep_prob):
        # apply dropout to each input
        # input : a list of input tensor which shape is [None, input_dim]
        with tf.name_scope('sequence_dropout') as scope:
            step_outputs = []
            for t, _input in enumerate(step_inputs):
                step_outputs.append( tf.nn.dropout(_input, keep_prob) )
        return step_outputs

embedded_fw = tf.unstack(embedded_fw, axis=1)
step_inputs = _sequence_dropout(embedded_fw, keep_prob) # seq * batch * embedding

#  FCN layer
doc_mean = tf.reduce_mean(step_inputs, axis=0) # batch * embedding (mean)
# hint = tf.placeholder(tf.float32, [None, 609], name='hint') # previous category
# bf_lenear = tf.concat([doc_mean, hint], axis=1) # batch * (embediing + hint)
# bf_lenear = tf.nn.dropout(bf_lenear, keep_prob)
"""

embedded_fw = tf.reshape(embedded_fw, [-1, seq_len, embedding_size, 1])
embedded_bw = tf.reshape(embedded_bw, [-1, seq_len, embedding_size, 1])
# CNN layer
def char_cnn(no, embedded, name):
  F = tf.get_variable(name, [no, embedding_size, 1, features], 
                     initializer=tf.contrib.layers.xavier_initializer()) # (row, column, channel, kernels)
  L = tf.nn.conv2d(embedded, F, strides=[1, 1, 1, 1], padding='VALID')
  L = tf.layers.batch_normalization(L, training=is_training)
  L = tf.nn.tanh(L)
  return L

C2f = char_cnn(2, embedded_fw, name='2gram-filter-fw') # batch, seq-1, 1, filters
C2fm = tf.nn.max_pool(C2f, ksize=[1, seq_len-(2-1), 1, 1], strides=[1, 1, 1, 1], padding='VALID')
C2b = char_cnn(2, embedded_bw, name='2gram-filter-bw')

C3f = char_cnn(3, embedded_fw, name='3gram-filter-fw')
C3fm = tf.nn.max_pool(C3f, ksize=[1, seq_len-(3-1), 1, 1], strides=[1, 1, 1, 1], padding='VALID')
C3b = char_cnn(3, embedded_bw, name='3gram-filter-bw')

C4f = char_cnn(4, embedded_fw, name='4gram-filter-fw')
C4fm = tf.nn.max_pool(C4f, ksize=[1, seq_len-(4-1), 1, 1], strides=[1, 1, 1, 1], padding='VALID')
C4b = char_cnn(4, embedded_bw, name='4gram-filter-bw')

# C5f = char_cnn(5, embedded_fw, name='5gram-filter-fw')
# C5fm = tf.nn.max_pool(C5f, ksize=[1, seq_len-(5-1), 1, 1], strides=[1, 1, 1, 1], padding='VALID')
# C5b = char_cnn(5, embedded_bw, name='5gram-filter-bw')

# 1*1 conv
def one_cnn(fw, bw, name):
    Concat = tf.concat([fw, bw], 3)
    F = tf.get_variable(name, [1, 1, 2*features, 1], 
                     initializer=tf.contrib.layers.xavier_initializer()) # (row, column, channel, kernels)
    L = tf.nn.conv2d(Concat, F, strides=[1, 1, 1, 1], padding='VALID')
    L = tf.layers.batch_normalization(L, training=is_training)
    L = tf.nn.tanh(L)
    return L

C2all = one_cnn(C2f, C2b, '2gram-1by1-filter')
C3all = one_cnn(C3f, C3b, '3gram-1by1-filter')
C4all = one_cnn(C4f, C4b, '4gram-1by1-filter')
C_all = tf.concat([C2all, C3all, C4all], 1)
C_all = tf.squeeze(C_all, axis=[2,3]) # (?, seq_len*3 - some)

conv_list = [C2fm, C3fm, C4fm]
C_cat = tf.concat(conv_list, 3) # (?, 1, 1, features*3) --> to attention
C_flat = tf.reshape(C_cat, [-1, len(conv_list)*features]) # (?, features*3)
C_flat = tf.contrib.layers.fully_connected(C_flat, len(conv_list)*50, activation_fn=tf.nn.elu) #(?, 50*3)

# 이미지
# Xm = tf.nn.dropout(Xm, keep_prob)
Xm2 = tf.contrib.layers.fully_connected(Xm, features, activation_fn=tf.nn.elu) # (?, features*3)

# dot product attention
Xm2_3d = tf.reshape(Xm2, [-1, features, 1])
C_cat_3d = tf.reshape(C_cat, [-1, len(conv_list), features])
A = tf.matmul(C_cat_3d, Xm2_3d) # ?, 3, 1
A_soft = tf.nn.softmax(A, axis=1)
ww = A_soft*C_cat_3d # ?, 3, 200
att = tf.reduce_sum(ww, axis=1) # ?, 200

C_flat_final = tf.concat([C_flat, C_all, att, Xm2, Xp], axis=1)

# 오캄의 면도날... 굳이 없어도 되는 레이어인듯
# L_linear = tf.contrib.layers.fully_connected(C_flat_final, 512, activation_fn=tf.nn.relu)
# L_linear = tf.nn.dropout(L_linear, keep_prob)
L_linear = tf.nn.dropout(C_flat_final, keep_prob)


# Y_pred = tf.contrib.layers.fully_connected(bf_lenear, output_dim, activation_fn=tf.nn.relu)  # We use the last cell's output
L_linear2 = tf.contrib.layers.fully_connected(L_linear, 1024, activation_fn=tf.nn.elu)  # We use the last cell's output
Y_pred = tf.contrib.layers.fully_connected(L_linear2, output_dim, activation_fn=None)  # We use the last cell's output


# # image feature
# X = tf.placeholder(tf.float32, [None, len(data_x[0])], name="img_feat")
# X = tf.nn.dropout(X, keep_prob)
# Y_pred = tf.contrib.layers.fully_connected(X, output_dim,
#                                            activation_fn=tf.nn.relu, weights_initializer=tf.contrib.layers.xavier_initializer())  # We use the last cell's output

# for batch norm
update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS)

# optimize
cost =tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits_v2(
    logits=Y_pred, labels=Y, name='cross_entropy'))
optimizer = tf.train.AdamOptimizer(lr)
with tf.control_dependencies(update_ops):
    train_step = optimizer.minimize(cost)

# prediction
predicted = tf.argmax(Y_pred, 1)
accuracy = tf.reduce_mean(tf.cast(tf.equal(predicted, tf.argmax(Y, 1)), dtype=tf.float32))

# saver
name_to_var_map = {var.op.name: var for var in tf.global_variables()}
saver = tf.train.Saver(name_to_var_map, name='my_saver', max_to_keep=30)

In [10]:
# restore model
if 'sess' in globals(): sess.close()
sess = tf.InteractiveSession()

ckpt_path = './small_model/small'
saver.restore(sess, ckpt_path+'-71369')

INFO:tensorflow:Restoring parameters from ./small_model/small-71369


## Make .tsv file

In [13]:
from math import ceil

chunk_size = 5000
cate_predicted = []
pid = []

for no in [1]:
    # set test data file
    df, df2 = set_test_data(no)

    # df to data - char to token
    # df to data - char to token
    data_x_fw = []
    data_x_bw = []
    for i in tqdm(list(zip(df_val['product'], df_val['brand'], df_val['model'], df_val['maker']))):
        sentence = ' '.join(i)
        sentence = list(sentence)    
        # hash --> word to id
        word_ids_fw = [x_vocab[k] if k in x_vocab.keys() else 2999 for k in sentence][:seq_len]
        word_ids_fw = np.pad(word_ids_fw, (0,seq_len-len(word_ids_fw)), 'constant', constant_values=(2998)) # pad=2998
        data_x_fw.append(word_ids_fw)
        # revcerse
        sentence.reverse()
        word_ids_bw = [x_vocab[k] if k in x_vocab.keys() else 2999 for k in sentence][:seq_len]
        word_ids_bw = np.pad(word_ids_bw, (0,seq_len-len(word_ids_bw)), 'constant', constant_values=(2998)) # pad=2998
        data_x_bw.append(word_ids_bw)

    X_fw_val = np.array(data_x_fw)[:2000]
    X_bw_val = np.array(data_x_bw)[:2000] 
    
    iter_num = ceil(len(data_x)/chunk_size)
    pid += list(df['pid'])
    
    for cnt in range(iter_num):
        dev = sess.run([predicted], feed_dict={X:data_x[chunk_size*cnt:chunk_size*(cnt+1)], Xm:data_xm[chunk_size*cnt:chunk_size*(cnt+1)],
                                               Xp:data_xp[chunk_size*cnt:chunk_size*(cnt+1)], keep_prob:1, is_training:False})
        for i in dev[0]:
            tmp = token_to_cate[i]
            cate_predicted.append(tmp.replace('>', '\t'))

assert len(cate_predicted) == len(pid)
with open("final.predict.tsv", "w") as f:
    for el in zip(pid, cate_predicted):
#         print(el)
        tmp = '\t'.join(el)      
        f.write(tmp+'\n')

A Jupyter Widget






NameError: name 'X' is not defined