From f32d033579e60678c03915c372320fe035361132 Mon Sep 17 00:00:00 2001
From: Gaurav <giganticgemmic@gmail.com>
Date: Fri, 13 Dec 2019 21:09:51 +0530
Subject: [PATCH 1/2] add English support to inltk

---
 inltk/config.py    | 11 +++++++----
 inltk/inltk.py     |  8 +++++---
 inltk/tokenizer.py | 47 +++++++++++++++++++++++++++++++++++++++++++++-
 3 files changed, 58 insertions(+), 8 deletions(-)

diff --git a/inltk/config.py b/inltk/config.py
index c522ead..d6315d7 100644
--- a/inltk/config.py
+++ b/inltk/config.py
@@ -11,12 +11,13 @@ class LanguageCodes:
     sanskrit = 'sa'
     tamil = 'ta'
     urdu = 'ur'
+    english = 'en'
 
     def get_all_language_codes(self):
         return [self.bengali, self.gujarati, self.hindi,
                 self.kannada, self.malyalam, self.marathi,
                 self.nepali, self.odia, self.panjabi,
-                self.sanskrit, self.tamil, self.urdu]
+                self.sanskrit, self.tamil, self.urdu, self.english]
 
 
 class LMConfigs:
@@ -33,7 +34,8 @@ class LMConfigs:
         all_language_codes.panjabi: 'https://www.dropbox.com/s/ejiv5pdsi2mhhxa/export.pkl?raw=1',
         all_language_codes.sanskrit: 'https://www.dropbox.com/s/4ay1by5ryz6k39l/sanskrit_export.pkl?raw=1',
         all_language_codes.tamil: 'https://www.dropbox.com/s/88klv70zl82u39b/export.pkl?raw=1',
-        all_language_codes.urdu: 'https://www.dropbox.com/s/0ovetjk27np0fcz/urdu_export.pkl?raw=1'
+        all_language_codes.urdu: 'https://www.dropbox.com/s/0ovetjk27np0fcz/urdu_export.pkl?raw=1',
+        all_language_codes.english: 'https://www.dropbox.com/s/fnzfz23tukv3aku/export.pkl?raw=1'
     }
     tokenizer_model_file_url = {
         all_language_codes.bengali: 'https://www.dropbox.com/s/29h7vqme1kb8pmw/bengali_lm.model?raw=1',
@@ -47,7 +49,8 @@ class LMConfigs:
         all_language_codes.panjabi: 'https://www.dropbox.com/s/jxwr9ytn0zfzulc/panjabi_lm.model?raw=1',
         all_language_codes.sanskrit: 'https://www.dropbox.com/s/e13401nsekulq17/tokenizer.model?raw=1',
         all_language_codes.tamil: 'https://www.dropbox.com/s/jpg4kaqyfb71g1v/tokenizer.model?raw=1',
-        all_language_codes.urdu: 'https://www.dropbox.com/s/m5l1yy41ij6vwxa/urdu_lm.model?raw=1'
+        all_language_codes.urdu: 'https://www.dropbox.com/s/m5l1yy41ij6vwxa/urdu_lm.model?raw=1',
+        all_language_codes.english: 'https://www.dropbox.com/s/2u3greusrnyh7qy/vocab.pkl?raw=1'
     }
 
     def __init__(self, language_code: str):
@@ -58,7 +61,7 @@ def get_config(self):
             'lm_model_url': self.lm_model_file_url[self.language_code],
             'lm_model_file_name': 'export.pkl',
             'tokenizer_model_url': self.tokenizer_model_file_url[self.language_code],
-            'tokenizer_model_file_name': 'tokenizer.model'
+            'tokenizer_model_file_name': 'vocab.pkl' if self.language_code == LMConfigs.all_language_codes.english else 'tokenizer.model'
         }
 
 
diff --git a/inltk/inltk.py b/inltk/inltk.py
index bdd567c..56de145 100644
--- a/inltk/inltk.py
+++ b/inltk/inltk.py
@@ -43,7 +43,8 @@ def predict_next_words(input: str, n_words: int, language_code: str, randomness=
     learn = load_learner(path / 'models' / f'{language_code}')
     output = learn.predict(input, n_words, randomness)
     # UTF-8 encoding takes care of both LTR and RTL languages
-    output = input + (''.join(output.replace(input, '').split(' '))).replace('▁', ' ')
+    if language_code != LanguageCodes.english:
+        output = input + (''.join(output.replace(input, '').split(' '))).replace('▁', ' ')
     for special_str in tokenizer_special_cases:
         output = output.replace(special_str, '\n')
     return output
@@ -105,8 +106,9 @@ def get_sentence_encoding(input: str, language_code: str):
     defaults.device = torch.device('cpu')
     path = Path(__file__).parent
     learn = load_learner(path / 'models' / f'{language_code}')
-    m = learn.model
-    kk0 = m[0](Tensor([token_ids]).to(torch.int64))
+    awd_lstm = learn.model[0]
+    awd_lstm.reset()
+    kk0 = awd_lstm(Tensor([token_ids]).to(torch.int64))
     return np.array(kk0[0][-1][0][-1])
 
 
diff --git a/inltk/tokenizer.py b/inltk/tokenizer.py
index db9f773..31e848c 100644
--- a/inltk/tokenizer.py
+++ b/inltk/tokenizer.py
@@ -2,10 +2,56 @@
 import sentencepiece as spm
 from pathlib import Path
 
+from inltk.config import LanguageCodes
+
 path = Path(__file__).parent
 
 
 class LanguageTokenizer(BaseTokenizer):
+    def __init__(self, lang: str):  # picks the backend tokenizer once, by language code
+        self.lang = lang
+        self.base = EnglishTokenizer(lang) if lang == LanguageCodes.english else IndicTokenizer(lang)
+
+    def tokenizer(self, t: str) -> List[str]:  # forwarded unchanged to self.base
+        return self.base.tokenizer(t)
+
+    def numericalize(self, t: str) -> List[int]:  # forwarded unchanged to self.base
+        return self.base.numericalize(t)
+
+    def textify(self, ids: List[int]) -> str:  # forwarded unchanged to self.base
+        return self.base.textify(ids)
+
+    def remove_foreign_tokens(self, t: str):  # forwarded unchanged to self.base
+        return self.base.remove_foreign_tokens(t)
+
+
+# English is tokenized with spaCy and a pickled vocab; other languages use SentencePiece.
+class EnglishTokenizer(BaseTokenizer):
+    """Tokenizer for English built on ``SpacyTokenizer`` and a pickled ``Vocab``."""
+
+    def __init__(self, lang: str):
+        super().__init__(lang)
+        self.lang = lang
+        # NOTE(review): pickle.load trusts the file; only use a vocab.pkl from a trusted source.
+        with open(path / f'models/{lang}/vocab.pkl', 'rb') as vocab_file:
+            self.vocab = Vocab(pickle.load(vocab_file))
+        self.tok = SpacyTokenizer(lang)
+
+    def tokenizer(self, t: str) -> List[str]:
+        return self.tok.tokenizer(t)
+
+    def numericalize(self, t: str):
+        return self.vocab.numericalize(self.tokenizer(t))
+
+    def textify(self, ids: List[int]):
+        return self.vocab.textify(ids)
+
+    def remove_foreign_tokens(self, t: str):
+        # Map each id back to text on its own, giving one piece per id.
+        return [self.textify([token_id]) for token_id in self.numericalize(t)]
+
+
+class IndicTokenizer(BaseTokenizer):
     def __init__(self, lang: str):
         self.lang = lang
         self.sp = spm.SentencePieceProcessor()
@@ -27,7 +73,6 @@ def remove_foreign_tokens(self, t: str):
             local_pieces.append(self.sp.IdToPiece(i))
         return local_pieces
 
-
 class AllLanguageTokenizer(LanguageTokenizer):
     def __init__(self, lang: str):
         LanguageTokenizer.__init__(self, lang)

From 4f4bcb4975779614bc744754a699057b1b5ea56a Mon Sep 17 00:00:00 2001
From: Gaurav <giganticgemmic@gmail.com>
Date: Fri, 17 Jan 2020 09:39:22 +0530
Subject: [PATCH 2/2] add language identification support for English

---
 inltk/inltk.py | 4 +++-
 inltk/utils.py | 9 +++++++++
 2 files changed, 12 insertions(+), 1 deletion(-)

diff --git a/inltk/inltk.py b/inltk/inltk.py
index f2ba475..16837f0 100644
--- a/inltk/inltk.py
+++ b/inltk/inltk.py
@@ -8,7 +8,7 @@
 from inltk.download_assets import setup_language, verify_language, check_all_languages_identifying_model
 from inltk.tokenizer import LanguageTokenizer
 from inltk.const import tokenizer_special_cases
-from inltk.utils import cos_sim, reset_models
+from inltk.utils import cos_sim, reset_models, is_english
 
 lcodes = LanguageCodes()
 all_language_codes = lcodes.get_all_language_codes()
@@ -60,6 +60,8 @@ def tokenize(input: str, language_code: str):
 
 
 def identify_language(input: str):
+    if is_english(input):
+        return 'en'
     asyncio.set_event_loop(asyncio.new_event_loop())
     loop = asyncio.get_event_loop()
     tasks = [asyncio.ensure_future(check_all_languages_identifying_model())]
diff --git a/inltk/utils.py b/inltk/utils.py
index 1bce3fd..a64ce42 100644
--- a/inltk/utils.py
+++ b/inltk/utils.py
@@ -10,3 +10,12 @@ def reset_models(folder_name: str):
     path = Path(__file__).parent
     shutil.rmtree(path / 'models' / f'{folder_name}')
     return
+
+
+def is_english(s: str) -> bool:
+    """Return True if every character of ``s`` is ASCII (heuristic for English)."""
+    try:
+        s.encode('ascii')
+    except UnicodeError:  # non-ASCII text, including lone surrogates
+        return False
+    return True