# Only for inference

In [1]:
import h5py
import pandas as pd
import numpy as np
from tqdm import tqdm_notebook as tqdm

  from ._conv import register_converters as _register_converters


In [2]:
import _pickle
# Load the pickled label vocabulary. The file is opened in a `with` block so the
# handle is closed as soon as loading finishes instead of being left dangling.
# NOTE: unpickling can execute arbitrary code - only load vocabulary files you trust.
with open('./data/y_vocab.py3.cPickle', 'rb') as vocab_file:
    y_vocab = _pickle.load(vocab_file)
# Sanity lookup of one known key; raises KeyError if the vocabulary lacks it.
y_vocab['43>109>1576>-1']
print(len(y_vocab))

4215


In [3]:
# Inverse of y_vocab: every value of y_vocab becomes a key that maps back to
# the y_vocab key it came from.
token_to_cate = {token: cate for cate, token in y_vocab.items()}

## import utils

In [4]:
from myUtils.myUtils import *

In [5]:
# for validation
path = './data_org/dev.chunk.01'
# Read-only handle; it is intentionally left open because a later cell reads
# len(h['dev']['pid']) from it.
h = h5py.File(path, mode='r')

# Preview: the first ten entries of the 'product' dataset, decoded as UTF-8.
[raw_name.decode('utf8') for raw_name in h['dev']['product'][:10]]

['GIGABYTE 미니PC GB-BACE-3160 (램 4G+HDD 500GB) w',
 '와코루 [WACOAL]와코루 튤레이스 홑겹 B컵브라 2칼라 (NB.SP)-DBR0156',
 '카렉스  블랙스2 핸들커버(실버) 아반떼XD',
 '[뉴에라]MLB 도트 프린트 뉴욕 양키스 티셔츠 화이트(11502825)',
 '[플러그피트니스] 네오플랜 삼각아령5kg/아령/여자아령/여성아령/팔운동/여성덤벨 [FROG]',
 '[보리보리] 아트박스 POOM(문구,팬시)# (스켓쥐) (옵티머스G PRO) Eunme-Woolf 디자인 하드케이스',
 '[신한 6% 청구할인][ 더 큰 할인] 스타일엔터 베지터블워싱 포켓시스템 남자라이더자켓 가죽자켓 가죽점퍼 (핸드메이드 맞춤제작)',
 '블루독베이비[롯데백화점]도트BI내의[47A70-054-130]',
 '머레이 음표 블루투스 스피커 QP-1002',
 '타라바 루퍼스-103 스키/스노우보드 고글']

In [6]:
%%time
ht = Reader(path)
df = ht.makeDF(0,len(h['dev']['pid']), mode='dev')
df.head()

Wall time: 10.4 s


In [7]:
# 664848
# For now this has to match the number of words used in training,
# otherwise the model no longer lines up.
# Only the length of this array is used below (len(vocab)).
vocab = np.zeros((664848,))
len(vocab)

664848

In [8]:
from collections import Counter
import tensorflow as tf
import matplotlib.pyplot as plt
import mmh3
import re

In [9]:
# word to id
seq_len = 30  # number of token ids kept per product (shorter rows are zero-padded)

data_x = []
text_fields = zip(df['product'], df['model'], df['brand'], df['maker'])
for fields in tqdm(text_fields, total=len(df['product'])):
    # Join the four text columns, turn every non-word character into a space,
    # then split on single spaces (empty strings are skipped below).
    tokens = re.sub(r'[^\w]', ' ', ' '.join(fields)).split(' ')

    # De-duplicate while keeping first-occurrence order. Iterating a set() of
    # strings depends on per-process hash randomization, so truncating such a
    # set to seq_len could keep different tokens on every kernel restart.
    seen = set()
    unique_tokens = []
    for token in tokens:
        if token and token not in seen:
            seen.add(token)
            unique_tokens.append(token)
            if len(unique_tokens) == seq_len:
                break

    # hash --> word to id
    # NOTE(review): the modulus is len(vocab)+1, so an id equal to len(vocab)
    # is possible while the embedding table below has len(vocab) rows. It is
    # left unchanged here because changing it would remap every id the
    # restored checkpoint was trained with - confirm against the training code.
    word_ids = [mmh3.hash(word, seed=2018) % (len(vocab) + 1) for word in unique_tokens]
    # Right-pad with 0 up to seq_len.
    word_ids = np.pad(word_ids, (0, seq_len - len(word_ids)), 'constant', constant_values=(0))
    data_x.append(word_ids)

data_x = np.array(data_x, dtype=np.int32)
data_x.shape

A Jupyter Widget




(507783, 30)

## build model

In [10]:
# make DAG
# Start from an empty default graph and fix the graph-level random seed.
tf.reset_default_graph()
tf.set_random_seed(777)

# train Parameters
# seq_len = 35
output_dim = 4215 # training set #1 only (same number as the len(y_vocab) printed above)

epoch = 70
batch_size = 1024
vocabulary_size = len(vocab)  # number of rows in the embedding table
embedding_size = 256


# Placeholders: label matrix, scalar learning rate, scalar dropout keep probability.
Y = tf.placeholder(tf.int16, [None, output_dim], name="label")
lr = tf.placeholder(tf.float32, [], name='learning_rate')
keep_prob = tf.placeholder(tf.float32, [], name="keep_prob")

#  embedding
# X holds seq_len int32 token ids per example.
# NOTE(review): the ids are produced with modulus len(vocab)+1, so an id equal
# to vocabulary_size would index past the last row of this table - confirm.
X = tf.placeholder(tf.int32, [None, seq_len], name="word_tokens") 
word_embeddings = tf.get_variable("word_embeddings",
    [vocabulary_size, embedding_size], initializer=tf.contrib.layers.xavier_initializer())
embedded_word_ids = tf.nn.embedding_lookup(word_embeddings, X) # batch * seq * embedding

#  dropout layer
def _sequence_dropout(step_inputs, keep_prob):
    """Apply dropout separately to every tensor of a sequence.

    Args:
        step_inputs: iterable of input tensors, one per step; per the original
            note each one has shape [None, input_dim].
        keep_prob: keep probability passed straight to `tf.nn.dropout`.

    Returns:
        A list holding one dropout output per element of `step_inputs`,
        in the same order.
    """
    with tf.name_scope('sequence_dropout'):
        return [tf.nn.dropout(step, keep_prob) for step in step_inputs]

# Split the batch * seq * embedding tensor along axis 1 into a list with one
# tensor per sequence position, then apply dropout to each of them.
embedded_word_ids = tf.unstack(embedded_word_ids, axis=1)
step_inputs = _sequence_dropout(embedded_word_ids, keep_prob) # seq * batch * embedding


#  FCN layer
# Average over the sequence positions to get one vector per example.
doc_mean = tf.reduce_mean(step_inputs, axis=0) # batch * embedding (mean)
# hint = tf.placeholder(tf.float32, [None, 609], name='hint') # previous category
# bf_lenear = tf.concat([doc_mean, hint], axis=1) # batch * (embediing + hint)
# bf_lenear = tf.nn.dropout(bf_lenear, keep_prob)

# Y_pred = tf.contrib.layers.fully_connected(bf_lenear, output_dim, activation_fn=tf.nn.relu)  # We use the last cell's output
Y_pred = tf.contrib.layers.fully_connected(doc_mean, output_dim, activation_fn=None)  # linear projection of the mean embedding to output_dim logits (no activation)

# # image feature
# X = tf.placeholder(tf.float32, [None, len(data_x[0])], name="img_feat")
# X = tf.nn.dropout(X, keep_prob)
# Y_pred = tf.contrib.layers.fully_connected(X, output_dim,
#                                            activation_fn=tf.nn.relu, weights_initializer=tf.contrib.layers.xavier_initializer())  # We use the last cell's output

# optimize
# Mean softmax cross-entropy between the logits and the label matrix, minimized with Adam.
cost =tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits_v2(
    logits=Y_pred, labels=Y, name='cross_entropy'))
optimizer = tf.train.AdamOptimizer(lr)
train_step = optimizer.minimize(cost)

# prediction
# Predicted class = index of the largest logit; accuracy = fraction of rows
# where that index equals the argmax of the label row.
predicted = tf.argmax(Y_pred, 1)
accuracy = tf.reduce_mean(tf.cast(tf.equal(predicted, tf.argmax(Y, 1)), dtype=tf.float32))

# saver
# Map every global variable's op name to the variable and hand that map to the Saver.
# NOTE(review): this runs after optimizer.minimize(), so any variables the
# optimizer created are in the map as well and the checkpoint is presumably
# expected to contain them - confirm.
name_to_var_map = {var.op.name: var for var in tf.global_variables()}
saver = tf.train.Saver(name_to_var_map, name='my_saver')

In [11]:
# restore model
# If this cell already ran, close the previous session before opening a new one.
if 'sess' in globals():
    sess.close()
sess = tf.InteractiveSession()

# Load the saved variable values into the fresh session.
ckpt_path = './small_model/small'
saver.restore(sess, ckpt_path + '-785')

INFO:tensorflow:Restoring parameters from ./small_model/small-785


## Make .tsv file

In [12]:
from math import ceil

chunk_size = 10000
iter_num = ceil(len(data_x)/chunk_size)
cate_predicted = []

# Run the graph chunk by chunk with keep_prob = 1, map each predicted index
# back to its category key and turn the '>' separators into tabs.
for chunk_idx in range(iter_num):
    lo, hi = chunk_size * chunk_idx, chunk_size * (chunk_idx + 1)
    class_ids, = sess.run([predicted], feed_dict={X: data_x[lo:hi], keep_prob: 1})
    cate_predicted.extend(
        token_to_cate[class_id].replace('>', '\t') for class_id in class_ids
    )

# Exactly one prediction per pid is required before anything is written.
assert len(cate_predicted) == len(df['pid'])
# One line per product: pid, a tab, then the tab-separated category fields.
with open("baseline.predict.tsv", "w") as f:
    f.writelines('\t'.join(row) + '\n' for row in zip(df['pid'], cate_predicted))