Skip to content
Permalink
Browse files

fix(preprocessor): fix SentSplitPreprocessor

  • Loading branch information...
hanxiao committed Aug 29, 2019
1 parent 522c5a4 commit 5828d20a3cadb9d0cbf640e231db90138ffe4e92
Showing with 17 additions and 6 deletions.
  1. +16 −5 gnes/preprocessor/text/split.py
  2. +1 −1 tests/test_preprocessor.py
@@ -15,23 +15,34 @@

import json
import re
import string

from ..base import BaseTextPreprocessor
from ...proto import gnes_pb2


class SentSplitPreprocessor(BaseTextPreprocessor):
# NOTE(review): this span comes from a rendered diff, so two signatures appear
# back to back. The single-line `def` directly below appears to be the
# pre-commit signature that the commit removes; the multi-line `def` after it
# is its replacement and owns the body that follows.
def __init__(self, max_sent_len: int = 256, *args, **kwargs):
# Constructor (post-commit form): records the splitter's configuration.
#   max_sent_len: stored on the instance; where it is consumed is not visible
#                 in this excerpt.
#   deliminator:  characters that end a sentence. apply() formats this string
#                 as-is into the pattern r'[^{0}]+[{0}]', i.e. inside a regex
#                 character class, so it is not escaped there.
#                 NOTE(review): characters that are special inside [...]
#                 (']', '^', '\', '-') would change the pattern -- confirm
#                 callers never pass them, or escape before formatting.
#   is_json:      when true, apply() parses the decoded raw bytes as JSON,
#                 takes the 'Content' entry as the text and stores the
#                 remaining entries, re-serialised, as meta_info; otherwise
#                 the decoded raw bytes are used directly as the text.
#   *args, **kwargs: passed through unchanged to the parent constructor.
def __init__(self, max_sent_len: int = 256,
deliminator: str = '.!?。!?',
is_json: bool= False,
*args, **kwargs):
super().__init__(*args, **kwargs)
self.max_sent_len = max_sent_len
self.deliminator = deliminator
self.is_json = is_json

def apply(self, doc: 'gnes_pb2.Document') -> None:
super().apply(doc)
d = json.loads(doc.raw_bytes.decode())
doc.raw_text = d.pop('Content')
doc.meta_info = json.dumps(d).encode()
d = doc.raw_bytes.decode()
if self.is_json:
d = json.loads(d)
doc.raw_text = d.pop('Content')
doc.meta_info = json.dumps(d).encode()
else:
doc.raw_text = d

ret = [(m.group(0), m.start(), m.end()) for m in re.finditer(r'[^.!?]+[.!?]', doc.raw_text)]
ret = [(m.group(0), m.start(), m.end()) for m in
re.finditer(r'[^{0}]+[{0}]'.format(self.deliminator), doc.raw_text)]
for ci, (r, s, e) in enumerate(ret):
f = ''.join(filter(lambda x: x in string.printable, r))
f = re.sub('\n+', ' ', f).strip()
@@ -49,7 +49,7 @@ def test_preprocessor_service_realdata(self):
for v in fp:
if v.strip():
d = msg.request.train.docs.add()
d.raw_text = v
d.raw_bytes = v.encode()
all_text += v
with PreprocessorService(args), ZmqClient(c_args) as client:
client.send_message(msg)

0 comments on commit 5828d20

Please sign in to comment.
You can’t perform that action at this time.