Skip to content
Permalink
Browse files

fix(service): raise an exception when a document contains no chunks

  • Loading branch information...
hanxiao committed Aug 29, 2019
1 parent 31bffeb commit 4efea7263dc4558eb74dd2544715ba1fb0d5312d
Showing with 9 additions and 6 deletions.
  1. +1 −1 gnes/preprocessor/text/split.py
  2. +3 −0 gnes/service/encoder.py
  3. +5 −5 gnes/service/indexer.py
@@ -23,7 +23,7 @@

class SentSplitPreprocessor(BaseTextPreprocessor):
def __init__(self,
min_sent_len: int = 8,
min_sent_len: int = 1,
max_sent_len: int = 256,
deliminator: str = '.!?。!?',
is_json: bool = False,
@@ -38,6 +38,9 @@ def embed_chunks_in_docs(self, docs: Union[List['gnes_pb2.Document'], 'gnes_pb2.
embeds = None

for d in docs:
if not d.chunks:
raise ServiceError('document contains no chunks! doc: %s' % d)

for c in d.chunks:
chunks.append(c)
if d.doc_type == gnes_pb2.Document.TEXT:
@@ -46,11 +46,11 @@ def _handler_chunk_index(self, msg: 'gnes_pb2.Message'):
for d in msg.request.index.docs:
if not d.chunks:
raise ServiceError('document contains no chunks! doc: %s' % d)
else:
vecs += [blob2array(c.embedding) for c in d.chunks]
doc_ids += [d.doc_id] * len(d.chunks)
offsets += [c.offset for c in d.chunks]
weights += [c.weight for c in d.chunks]

vecs += [blob2array(c.embedding) for c in d.chunks]
doc_ids += [d.doc_id] * len(d.chunks)
offsets += [c.offset for c in d.chunks]
weights += [c.weight for c in d.chunks]

self._model.add(list(zip(doc_ids, offsets)), np.concatenate(vecs, 0), weights)

0 comments on commit 4efea72

Please sign in to comment.
You can’t perform that action at this time.