From f32d033579e60678c03915c372320fe035361132 Mon Sep 17 00:00:00 2001 From: Gaurav Date: Fri, 13 Dec 2019 21:09:51 +0530 Subject: [PATCH 1/2] add english to inltk --- inltk/config.py | 11 +++++++---- inltk/inltk.py | 8 +++++--- inltk/tokenizer.py | 47 +++++++++++++++++++++++++++++++++++++++++++++- 3 files changed, 58 insertions(+), 8 deletions(-) diff --git a/inltk/config.py b/inltk/config.py index c522ead..d6315d7 100644 --- a/inltk/config.py +++ b/inltk/config.py @@ -11,12 +11,13 @@ class LanguageCodes: sanskrit = 'sa' tamil = 'ta' urdu = 'ur' + english = 'en' def get_all_language_codes(self): return [self.bengali, self.gujarati, self.hindi, self.kannada, self.malyalam, self.marathi, self.nepali, self.odia, self.panjabi, - self.sanskrit, self.tamil, self.urdu] + self.sanskrit, self.tamil, self.urdu, self.english] class LMConfigs: @@ -33,7 +34,8 @@ class LMConfigs: all_language_codes.panjabi: 'https://www.dropbox.com/s/ejiv5pdsi2mhhxa/export.pkl?raw=1', all_language_codes.sanskrit: 'https://www.dropbox.com/s/4ay1by5ryz6k39l/sanskrit_export.pkl?raw=1', all_language_codes.tamil: 'https://www.dropbox.com/s/88klv70zl82u39b/export.pkl?raw=1', - all_language_codes.urdu: 'https://www.dropbox.com/s/0ovetjk27np0fcz/urdu_export.pkl?raw=1' + all_language_codes.urdu: 'https://www.dropbox.com/s/0ovetjk27np0fcz/urdu_export.pkl?raw=1', + all_language_codes.english: 'https://www.dropbox.com/s/fnzfz23tukv3aku/export.pkl?raw=1' } tokenizer_model_file_url = { all_language_codes.bengali: 'https://www.dropbox.com/s/29h7vqme1kb8pmw/bengali_lm.model?raw=1', @@ -47,7 +49,8 @@ class LMConfigs: all_language_codes.panjabi: 'https://www.dropbox.com/s/jxwr9ytn0zfzulc/panjabi_lm.model?raw=1', all_language_codes.sanskrit: 'https://www.dropbox.com/s/e13401nsekulq17/tokenizer.model?raw=1', all_language_codes.tamil: 'https://www.dropbox.com/s/jpg4kaqyfb71g1v/tokenizer.model?raw=1', - all_language_codes.urdu: 'https://www.dropbox.com/s/m5l1yy41ij6vwxa/urdu_lm.model?raw=1' + all_language_codes.urdu: 'https://www.dropbox.com/s/m5l1yy41ij6vwxa/urdu_lm.model?raw=1', + all_language_codes.english: 'https://www.dropbox.com/s/2u3greusrnyh7qy/vocab.pkl?raw=1' } def __init__(self, language_code: str): @@ -58,7 +61,7 @@ def get_config(self): 'lm_model_url': self.lm_model_file_url[self.language_code], 'lm_model_file_name': 'export.pkl', 'tokenizer_model_url': self.tokenizer_model_file_url[self.language_code], - 'tokenizer_model_file_name': 'tokenizer.model' + 'tokenizer_model_file_name': 'vocab.pkl' if self.language_code == LMConfigs.all_language_codes.english else 'tokenizer.model' } diff --git a/inltk/inltk.py b/inltk/inltk.py index bdd567c..56de145 100644 --- a/inltk/inltk.py +++ b/inltk/inltk.py @@ -43,7 +43,8 @@ def predict_next_words(input: str, n_words: int, language_code: str, randomness= learn = load_learner(path / 'models' / f'{language_code}') output = learn.predict(input, n_words, randomness) # UTF-8 encoding takes care of both LTR and RTL languages - output = input + (''.join(output.replace(input, '').split(' '))).replace('▁', ' ') + if language_code != LanguageCodes.english: + output = input + (''.join(output.replace(input, '').split(' '))).replace('▁', ' ') for special_str in tokenizer_special_cases: output = output.replace(special_str, '\n') return output @@ -105,8 +106,9 @@ def get_sentence_encoding(input: str, language_code: str): defaults.device = torch.device('cpu') path = Path(__file__).parent learn = load_learner(path / 'models' / f'{language_code}') - m = learn.model - kk0 = m[0](Tensor([token_ids]).to(torch.int64)) + awd_lstm = learn.model[0] + awd_lstm.reset() + kk0 = awd_lstm(Tensor([token_ids]).to(torch.int64)) return np.array(kk0[0][-1][0][-1]) diff --git a/inltk/tokenizer.py b/inltk/tokenizer.py index db9f773..31e848c 100644 --- a/inltk/tokenizer.py +++ b/inltk/tokenizer.py @@ -2,10 +2,56 @@ import sentencepiece as spm from pathlib import Path +from inltk.config import LanguageCodes + path = Path(__file__).parent class LanguageTokenizer(BaseTokenizer): + def __init__(self, lang: str): + self.lang = lang + self.base = EnglishTokenizer(lang) if lang == LanguageCodes.english else IndicTokenizer(lang) + + def tokenizer(self, t: str) -> List[str]: + return self.base.tokenizer(t) + + def numericalize(self, t: str) -> List[int]: + return self.base.numericalize(t) + + def textify(self, ids: List[int]) -> str: + return self.base.textify(ids) + + def remove_foreign_tokens(self, t: str): + return self.base.remove_foreign_tokens(t) + + +# Because we're using spacy tokenizer for english and sentence-piece for other languages +class EnglishTokenizer(BaseTokenizer): + def __init__(self, lang: str): + super().__init__(lang) + self.lang = lang + with open(path / f'models/{lang}/vocab.pkl', 'rb') as f: + self.vocab = Vocab(pickle.load(f)) + self.tok = SpacyTokenizer(lang) + + def tokenizer(self, t: str) -> List[str]: + return self.tok.tokenizer(t) + + def numericalize(self, t: str): + token_ids = self.tokenizer(t) + return self.vocab.numericalize(token_ids) + + def textify(self, ids: List[int]): + return self.vocab.textify(ids) + + def remove_foreign_tokens(self, t: str): + local_pieces = [] + for i in self.numericalize(t): + local_pieces.append(self.textify([i])) + return local_pieces + + +class IndicTokenizer(BaseTokenizer): def __init__(self, lang: str): self.lang = lang self.sp = spm.SentencePieceProcessor() @@ -27,7 +73,6 @@ def remove_foreign_tokens(self, t: str): local_pieces.append(self.sp.IdToPiece(i)) return local_pieces - class AllLanguageTokenizer(LanguageTokenizer): def __init__(self, lang: str): LanguageTokenizer.__init__(self, lang) From 4f4bcb4975779614bc744754a699057b1b5ea56a Mon Sep 17 00:00:00 2001 From: Gaurav Date: Fri, 17 Jan 2020 09:39:22 +0530 Subject: [PATCH 2/2] add identify language support for en --- inltk/inltk.py | 4 +++- inltk/utils.py | 9 +++++++++ 2 files changed, 12 insertions(+), 1 deletion(-) diff --git a/inltk/inltk.py b/inltk/inltk.py index f2ba475..16837f0 100644 --- a/inltk/inltk.py +++ b/inltk/inltk.py @@ -8,7 +8,7 @@ from inltk.download_assets import setup_language, verify_language, check_all_languages_identifying_model from inltk.tokenizer import LanguageTokenizer from inltk.const import tokenizer_special_cases -from inltk.utils import cos_sim, reset_models +from inltk.utils import cos_sim, reset_models, is_english lcodes = LanguageCodes() all_language_codes = lcodes.get_all_language_codes() @@ -60,6 +60,8 @@ def tokenize(input: str, language_code: str): def identify_language(input: str): + if is_english(input): + return 'en' asyncio.set_event_loop(asyncio.new_event_loop()) loop = asyncio.get_event_loop() tasks = [asyncio.ensure_future(check_all_languages_identifying_model())] diff --git a/inltk/utils.py b/inltk/utils.py index 1bce3fd..a64ce42 100644 --- a/inltk/utils.py +++ b/inltk/utils.py @@ -10,3 +10,12 @@ def reset_models(folder_name: str): path = Path(__file__).parent shutil.rmtree(path / 'models' / f'{folder_name}') return + + +def is_english(s: str) -> bool: + try: + s.encode(encoding='utf-8').decode('ascii') + except UnicodeDecodeError: + return False + else: + return True