In [1]:
import tensorflow as tf
import numpy as np
import pandas as pd

In [2]:
sess = tf.InteractiveSession()

In [3]:
masks = tf.placeholder(tf.bool, shape=[None, None], name='masks')

In [4]:
candidates_list = tf.placeholder(tf.int32, shape=[None], name='candidate_list')

In [5]:
logits = tf.constant(np.random.rand(10,3).astype(np.float32))

In [6]:
my_masks = tf.nn.embedding_lookup(masks, candidates_list)

In [7]:
minus_inf = tf.ones_like(logits)*(-np.inf)
logits2 = tf.where(my_masks, logits, minus_inf)

In [8]:
sess.run(logits)

array([[0.6151685 , 0.08352623, 0.5156483 ],
       [0.22853646, 0.16846274, 0.6718266 ],
       [0.9369456 , 0.5832673 , 0.6210752 ],
       [0.08969156, 0.8691597 , 0.5292604 ],
       [0.9495773 , 0.161454  , 0.37424776],
       [0.12067841, 0.738514  , 0.25930822],
       [0.22199138, 0.7458076 , 0.91431034],
       [0.9320803 , 0.74630034, 0.948757  ],
       [0.46516496, 0.08289133, 0.00563843],
       [0.7531522 , 0.7391109 , 0.09862112]], dtype=float32)

In [9]:
masks_val = np.array([[1, 0, 0], [0, 1, 0], [0, 0, 1], [1, 1, 0]], dtype=np.float32)

In [10]:
candidates_list_val = np.random.randint(4, size=10)

In [11]:
feed_dict = {masks: masks_val, candidates_list: candidates_list_val}

In [12]:
sess.run(logits2, feed_dict=feed_dict)

array([[0.6151685 ,       -inf,       -inf],
       [      -inf,       -inf, 0.6718266 ],
       [0.9369456 ,       -inf,       -inf],
       [0.08969156, 0.8691597 ,       -inf],
       [0.9495773 ,       -inf,       -inf],
       [0.12067841,       -inf,       -inf],
       [      -inf, 0.7458076 ,       -inf],
       [0.9320803 ,       -inf,       -inf],
       [      -inf,       -inf, 0.00563843],
       [      -inf, 0.7391109 ,       -inf]], dtype=float32)

In [13]:
y_hat = tf.argmax(logits2, axis=1)
sess.run(y_hat, feed_dict=feed_dict)

array([0, 2, 0, 1, 0, 0, 1, 0, 2, 1])

In [14]:
probs = tf.nn.softmax(logits2, axis=1)
sess.run(probs, feed_dict=feed_dict)

array([[1.        , 0.        , 0.        ],
       [0.        , 0.        , 1.        ],
       [1.        , 0.        , 0.        ],
       [0.31443453, 0.6855655 , 0.        ],
       [1.        , 0.        , 0.        ],
       [1.        , 0.        , 0.        ],
       [0.        , 1.        , 0.        ],
       [1.        , 0.        , 0.        ],
       [0.        , 0.        , 1.        ],
       [0.        , 1.        , 0.        ]], dtype=float32)

In [15]:
hdn_input_path = '../output/gigaword-hdn-dev.2018-05-18-f48a06c.pkl'
gigaword_input_path = '../output/gigaword-dev.2018-05-10-7d764e7.npz'

In [16]:
buffer = np.load(gigaword_input_path)['buffer']

In [17]:
hdn_df = pd.read_pickle(hdn_input_path)

In [18]:
hdn_df.head(5)

Unnamed: 0,candidates,hdn,sent_len,sent_start,sent_stop,word_index
0,1894,37,20,25,45,6
1,2715,37,20,25,45,15
2,1584,37,22,45,67,6
3,2135,37,22,45,67,20
4,1407,37,19,67,86,3


In [19]:
hdn_df['candidates'].values

array([1894, 2715, 1584, ..., 1107,  308,  675])

In [20]:
hdn_df['sent_len'].max()

279

In [21]:
for i in  hdn_df.head()['sent_len']: print(i)

20
20
22
22
19


In [22]:
import sys
if '..' not in sys.path: sys.path.append('..')

In [23]:
from model import HDNModel

In [24]:
import vectorize_gigaword
from configs import DefaultConfig

class MyConfig(DefaultConfig):
    hdn_vocab_path = '../output/hdn-vocab.2018-05-18-f48a06c.pkl'
    hdn_list_vocab_path = '../output/hdn-list-vocab.2018-05-18-f48a06c.pkl'
    emb_dims = 64
    hidden_size = 128
    num_senses = 16
    predict_batch_size = 128000
    train_batch_size = 20
        
config = MyConfig()

In [25]:
m = HDNModel(config)

In [26]:
word2id = np.load('../output/vocab.2018-05-10-7d764e7.pkl')

In [27]:
sess.run(tf.global_variables_initializer())

In [None]:
m.predict_proba(sess, (buffer, hdn_df.head(5)), word2id)

In [None]:
m.predict(sess, (buffer, hdn_df.head(5)), word2id)

In [None]:
m.config = config

In [None]:
m.train_epoch(sess, (buffer, hdn_df.sample(n=200)), word2id)

In [None]:
m.predict(sess, (buffer, hdn_df[hdn_df['hdn']!=37].head(5)), word2id)

In [None]:
buffer.dtype

In [None]:
hdn_df[hdn_df['sent_start'] == hdn_df['sent_stop']]

In [29]:
%timeit
batches = m._gen_batches((buffer, hdn_df.iloc[:300000]), batch_size=256000, word2id=word2id)

Preparing batches: 100%|██████████| 119/119 [00:31<00:00,  3.75batch/s]


In [None]:
# sess.close()