Skip to content
This repository was archived by the owner on Feb 22, 2020. It is now read-only.

Commit 1bbc435

Browse files
author
hanhxiao
committed
revert(service): revert encoder service
1 parent 35fa3ba commit 1bbc435

File tree

2 files changed

+25
-26
lines changed

2 files changed

+25
-26
lines changed

gnes/service/encoder.py

Lines changed: 23 additions & 24 deletions
Original file line number | Diff line number | Diff line change
@@ -16,7 +16,7 @@
1616

1717
from typing import List, Union
1818

19-
from .base import BaseService as BS, MessageHandler, ServiceError
19+
from .base import BaseService as BS, MessageHandler
2020
from ..proto import gnes_pb2, array2blob, blob2array
2121

2222

@@ -35,11 +35,13 @@ def embed_chunks_in_docs(self, docs: Union[List['gnes_pb2.Document'], 'gnes_pb2.
3535
docs = [docs]
3636

3737
contents = []
38-
ids = []
39-
embeds = None
38+
chunks = []
4039

4140
for d in docs:
42-
ids.append(len(d.chunks))
41+
if not d.chunks:
42+
self.logger.warning('document (doc_id=%s) contains no chunks!' % d.doc_id)
43+
continue
44+
4345
for c in d.chunks:
4446
if d.doc_type == gnes_pb2.Document.TEXT:
4547
contents.append(c.text)
@@ -48,34 +50,32 @@ def embed_chunks_in_docs(self, docs: Union[List['gnes_pb2.Document'], 'gnes_pb2.
4850
else:
4951
self.logger.warning(
5052
'chunk content is in type: %s, dont kow how to handle that, ignored' % c.WhichOneof('content'))
53+
chunks.append(c)
5154

52-
if do_encoding:
53-
embeds = self._model.encode(contents)
54-
if sum(ids) != embeds.shape[0]:
55-
raise ServiceError(
56-
'mismatched %d chunks and a %s shape embedding, '
57-
'the first dimension must be the same' % (sum(ids), embeds.shape))
58-
idx = 0
59-
for d in docs:
60-
for c in d.chunks:
55+
if do_encoding and contents:
56+
try:
57+
embeds = self._model.encode(contents)
58+
if len(chunks) != embeds.shape[0]:
59+
self.logger.error(
60+
'mismatched %d chunks and a %s shape embedding, '
61+
'the first dimension must be the same' % (len(chunks), embeds.shape))
62+
for idx, c in enumerate(chunks):
6163
c.embedding.CopyFrom(array2blob(embeds[idx]))
62-
idx += 1
64+
except Exception as ex:
65+
self.logger.error(ex, exc_info=True)
66+
self.logger.warning('encoder service throws an exception, '
67+
'the sequel pipeline may not work properly')
6368

64-
return contents, embeds
69+
return contents
6570

6671
@handler.register(gnes_pb2.Request.IndexRequest)
6772
def _handler_index(self, msg: 'gnes_pb2.Message'):
68-
_, embeds = self.embed_chunks_in_docs(msg.request.index.docs)
69-
idx = 0
70-
for d in msg.request.index.docs:
71-
for c in d.chunks:
72-
c.embedding.CopyFrom(array2blob(embeds[idx]))
73-
idx += 1
73+
self.embed_chunks_in_docs(msg.request.index.docs)
7474

7575
@handler.register(gnes_pb2.Request.TrainRequest)
7676
def _handler_train(self, msg: 'gnes_pb2.Message'):
7777
if msg.request.train.docs:
78-
contents, _ = self.embed_chunks_in_docs(msg.request.train.docs, do_encoding=False)
78+
contents = self.embed_chunks_in_docs(msg.request.train.docs, do_encoding=False)
7979
self.train_data.extend(contents)
8080
msg.response.train.status = gnes_pb2.Response.PENDING
8181
# raise BlockMessage
@@ -88,5 +88,4 @@ def _handler_train(self, msg: 'gnes_pb2.Message'):
8888

8989
@handler.register(gnes_pb2.Request.QueryRequest)
9090
def _handler_search(self, msg: 'gnes_pb2.Message'):
91-
_, embeds = self.embed_chunks_in_docs(msg.request.search.query, is_input_list=False)
92-
msg.request.search.query.chunk_embeddings.CopyFrom(array2blob(embeds))
91+
self.embed_chunks_in_docs(msg.request.search.query, is_input_list=False)

gnes/service/indexer.py

Lines changed: 2 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -53,10 +53,10 @@ def _handler_chunk_index(self, msg: 'gnes_pb2.Message'):
5353
offsets += [c.offset for c in d.chunks]
5454
weights += [c.weight for c in d.chunks]
5555

56-
# self.logger.info('%d %d %d %d' % (len(vecs), len(doc_ids), len(offsets), len(weights)))
57-
# self.logger.info(np.stack(vecs).shape)
5856
if vecs:
5957
self._model.add(list(zip(doc_ids, offsets)), np.stack(vecs), weights)
58+
else:
59+
self.logger.warning('chunks contain no embedded vectors, %the indexer will do nothing')
6060

6161
def _handler_doc_index(self, msg: 'gnes_pb2.Message'):
6262
self._model.add([d.doc_id for d in msg.request.index.docs],

0 commit comments

Comments
 (0)