From 41197617c43ec7a4df4e6a053d844c38782a176c Mon Sep 17 00:00:00 2001
From: Gaurav
Date: Fri, 17 Jan 2020 22:49:20 +0530
Subject: [PATCH] fix en tokenization bug

---
 inltk/inltk.py     | 2 +-
 inltk/tokenizer.py | 5 ++++-
 setup.py           | 2 +-
 3 files changed, 6 insertions(+), 3 deletions(-)

diff --git a/inltk/inltk.py b/inltk/inltk.py
index 16837f0..2043443 100644
--- a/inltk/inltk.py
+++ b/inltk/inltk.py
@@ -143,7 +143,7 @@ def get_similar_sentences(sen: str, no_of_variations: int, language_code: str, d
     word_ids = [np.argpartition(-np.array(score), no_of_variations+1)[:no_of_variations+1] for score in scores]
     word_ids = [ids.tolist() for ids in word_ids]
     for i, ids in enumerate(word_ids):
-        word_ids[i].remove(token_ids[i])
+        word_ids[i] = [wid for wid in word_ids[i] if wid != token_ids[i]]
     # generating more variations than required so that we can then filter out the best ones
     buffer_multiplicity = 2
     new_sen_tokens = []
diff --git a/inltk/tokenizer.py b/inltk/tokenizer.py
index 31e848c..82844a3 100644
--- a/inltk/tokenizer.py
+++ b/inltk/tokenizer.py
@@ -35,7 +35,10 @@ def __init__(self, lang: str):
         self.tok = SpacyTokenizer(lang)
 
     def tokenizer(self, t: str) -> List[str]:
-        return self.tok.tokenizer(t)
+        tok = Tokenizer()
+        tokens = tok.process_text(t, self.tok)
+        tokens = [token for token in tokens if token not in defaults.text_spec_tok]
+        return tokens
 
     def numericalize(self, t: str):
         token_ids = self.tokenizer(t)
diff --git a/setup.py b/setup.py
index a5d7b90..e64cd67 100644
--- a/setup.py
+++ b/setup.py
@@ -5,7 +5,7 @@
 setuptools.setup(
     name="inltk",
-    version="0.8",
+    version="0.8.1",
     author="Gaurav",
     author_email="contactgauravforwork@gmail.com",
     description="Natural Language Toolkit for Indian Languages (iNLTK)",
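
Review note (not part of the patch): the `get_similar_sentences` hunk replaces `list.remove`, which raises `ValueError` when the element is absent and deletes only the first occurrence, with a comprehension that is a no-op when the original token id is missing from the top-k candidates. A minimal standalone sketch of the difference, with illustrative values:

```python
# Minimal sketch (not from the patch): why the comprehension is safer.
token_id = 42
candidate_ids = [7, 13, 99]  # argpartition may not return the original token id

# Old behaviour: raises ValueError because 42 is not in the list.
try:
    candidate_ids.remove(token_id)
except ValueError:
    print("remove() crashed: token id not among the top-k candidates")

# New behaviour: drops the id if present, leaves the list unchanged otherwise.
filtered = [wid for wid in candidate_ids if wid != token_id]
print(filtered)  # [7, 13, 99]
```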
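Review note (not part of the patch): the `tokenizer.py` hunk stops calling `SpacyTokenizer.tokenizer` directly and instead routes English text through fastai v1's `Tokenizer.process_text`, which applies fastai's pre/post-processing rules, then strips the special marker tokens those rules inject (`defaults.text_spec_tok`, e.g. `xxbos`, `xxmaj`). A standalone sketch of the filtering step; `SPEC_TOKS` is a hypothetical stand-in so the example runs without fastai installed:

```python
# SPEC_TOKS is a hypothetical stand-in for fastai v1's defaults.text_spec_tok.
SPEC_TOKS = ["xxunk", "xxpad", "xxbos", "xxfld", "xxmaj", "xxup", "xxrep", "xxwrep"]

def strip_special_tokens(tokens):
    """Drop fastai marker tokens so callers see only real word tokens."""
    return [t for t in tokens if t not in SPEC_TOKS]

# fastai's rules lowercase capitalised words and insert markers like 'xxmaj',
# so "The cat" tokenizes to something like ["xxbos", "xxmaj", "the", "cat"].
print(strip_special_tokens(["xxbos", "xxmaj", "the", "cat"]))  # ['the', 'cat']
```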