Permalink
Browse files

Use original word2vec format to serialize weight instead of pickle

  • Loading branch information...
unnonouno committed Nov 11, 2015
1 parent df28d8a commit 9864ca1caeec07cec2b6ff381106ae3051a9e84e
Showing with 20 additions and 9 deletions.
  1. +14 −4 examples/word2vec/search.py
  2. +6 −5 examples/word2vec/train_word2vec.py
@@ -1,15 +1,25 @@
#!/usr/bin/env python #!/usr/bin/env python
import numpy import numpy
import six import six
import six.moves.cPickle as pickle
n_result = 5 # number of search results to show n_result = 5 # number of search results to show
with open('model.pickle', 'rb') as f: with open('word2vec.model', 'r') as f:
model, index2word, word2index = pickle.load(f) ss = f.readline().split()
n_vocab, n_units = int(ss[0]), int(ss[1])
word2index = {}
index2word = {}
w = numpy.empty((n_vocab, n_units), dtype=numpy.float32)
for i, line in enumerate(f):
ss = line.split()
assert len(ss) == n_units + 1
word = ss[0]
word2index[word] = i
index2word[i] = word
w[i] = numpy.array([float(s) for s in ss[1:]], dtype=numpy.float32)
w = model.embed.W
s = numpy.sqrt((w * w).sum(1)) s = numpy.sqrt((w * w).sum(1))
w /= s.reshape((s.shape[0], 1)) # normalize w /= s.reshape((s.shape[0], 1)) # normalize
@@ -9,7 +9,6 @@
import time import time
import numpy as np import numpy as np
import six.moves.cPickle as pickle
import chainer import chainer
from chainer import cuda from chainer import cuda
@@ -192,7 +191,9 @@ def calculate_loss(model, dataset, offset):
print(accum_loss) print(accum_loss)
model.to_cpu() with open('word2vec.model', 'w') as f:
with open('model.pickle', 'wb') as f: f.write('%d %d\n' % (len(index2word), args.unit))
obj = (model, index2word, word2index) w = model.embed.W.data
pickle.dump(obj, f) for i in range(w.shape[0]):
v = ' '.join(['%f' % v for v in w[i]])
f.write('%s %s\n' % (index2word[i], v))

0 comments on commit 9864ca1

Please sign in to comment.