From 41197617c43ec7a4df4e6a053d844c38782a176c Mon Sep 17 00:00:00 2001
From: Gaurav
Date: Fri, 17 Jan 2020 22:49:20 +0530
Subject: [PATCH] fix en tokenization bug

---
 inltk/inltk.py     | 2 +-
 inltk/tokenizer.py | 5 ++++-
 setup.py           | 2 +-
 3 files changed, 6 insertions(+), 3 deletions(-)

diff --git a/inltk/inltk.py b/inltk/inltk.py
index 16837f0..2043443 100644
--- a/inltk/inltk.py
+++ b/inltk/inltk.py
@@ -143,7 +143,7 @@ def get_similar_sentences(sen: str, no_of_variations: int, language_code: str, d
     word_ids = [np.argpartition(-np.array(score), no_of_variations+1)[:no_of_variations+1] for score in scores]
     word_ids = [ids.tolist() for ids in word_ids]
     for i, ids in enumerate(word_ids):
-        word_ids[i].remove(token_ids[i])
+        word_ids[i] = [wid for wid in word_ids[i] if wid != token_ids[i]]
     # generating more variations than required so that we can then filter out the best ones
     buffer_multiplicity = 2
     new_sen_tokens = []
diff --git a/inltk/tokenizer.py b/inltk/tokenizer.py
index 31e848c..82844a3 100644
--- a/inltk/tokenizer.py
+++ b/inltk/tokenizer.py
@@ -35,7 +35,10 @@ def __init__(self, lang: str):
         self.tok = SpacyTokenizer(lang)
 
     def tokenizer(self, t: str) -> List[str]:
-        return self.tok.tokenizer(t)
+        tok = Tokenizer()
+        tokens = tok.process_text(t, self.tok)
+        tokens = [token for token in tokens if token not in defaults.text_spec_tok]
+        return tokens
 
     def numericalize(self, t: str):
         token_ids = self.tokenizer(t)
diff --git a/setup.py b/setup.py
index a5d7b90..e64cd67 100644
--- a/setup.py
+++ b/setup.py
@@ -5,7 +5,7 @@
 setuptools.setup(
     name="inltk",
-    version="0.8",
+    version="0.8.1",
     author="Gaurav",
     author_email="contactgauravforwork@gmail.com",
     description="Natural Language Toolkit for Indian Languages (iNLTK)",
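
Review note (not part of the patch): the `get_similar_sentences` hunk replaces `list.remove`, which raises `ValueError` when the element is absent and deletes only the first occurrence, with a comprehension that is a no-op when the original token id is missing from the top-k candidates. A minimal standalone sketch of the difference, with illustrative values:

```python
# Minimal sketch (not from the patch): why the comprehension is safer.
token_id = 42
candidate_ids = [7, 13, 99]  # argpartition may not return the original token id

# Old behaviour: raises ValueError because 42 is not in the list.
try:
    candidate_ids.remove(token_id)
except ValueError:
    print("remove() crashed: token id not among the top-k candidates")

# New behaviour: drops the id if present, leaves the list unchanged otherwise.
filtered = [wid for wid in candidate_ids if wid != token_id]
print(filtered)  # [7, 13, 99]
```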
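Review note (not part of the patch): the `tokenizer.py` hunk stops calling `SpacyTokenizer.tokenizer` directly and instead routes English text through fastai v1's `Tokenizer.process_text`, which applies fastai's pre/post-processing rules, then strips the special marker tokens those rules inject (`defaults.text_spec_tok`, e.g. `xxbos`, `xxmaj`). A standalone sketch of the filtering step; `SPEC_TOKS` is a hypothetical stand-in so the example runs without fastai installed:

```python
# SPEC_TOKS is a hypothetical stand-in for fastai v1's defaults.text_spec_tok.
SPEC_TOKS = ["xxunk", "xxpad", "xxbos", "xxfld", "xxmaj", "xxup", "xxrep", "xxwrep"]

def strip_special_tokens(tokens):
    """Drop fastai marker tokens so callers see only real word tokens."""
    return [t for t in tokens if t not in SPEC_TOKS]

# fastai's rules lowercase capitalised words and insert markers like 'xxmaj',
# so "The cat" tokenizes to something like ["xxbos", "xxmaj", "the", "cat"].
print(strip_special_tokens(["xxbos", "xxmaj", "the", "cat"]))  # ['the', 'cat']
```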