diff --git a/inltk/config.py b/inltk/config.py
index c522ead..d6315d7 100644
--- a/inltk/config.py
+++ b/inltk/config.py
@@ -11,12 +11,13 @@ class LanguageCodes:
     sanskrit = 'sa'
     tamil = 'ta'
     urdu = 'ur'
+    english = 'en'

     def get_all_language_codes(self):
         return [self.bengali, self.gujarati, self.hindi, self.kannada,
                 self.malyalam, self.marathi, self.nepali, self.odia, self.panjabi,
-                self.sanskrit, self.tamil, self.urdu]
+                self.sanskrit, self.tamil, self.urdu, self.english]


 class LMConfigs:
     all_language_codes = LanguageCodes()
@@ -33,7 +34,8 @@ class LMConfigs:
         all_language_codes.panjabi: 'https://www.dropbox.com/s/ejiv5pdsi2mhhxa/export.pkl?raw=1',
         all_language_codes.sanskrit: 'https://www.dropbox.com/s/4ay1by5ryz6k39l/sanskrit_export.pkl?raw=1',
         all_language_codes.tamil: 'https://www.dropbox.com/s/88klv70zl82u39b/export.pkl?raw=1',
-        all_language_codes.urdu: 'https://www.dropbox.com/s/0ovetjk27np0fcz/urdu_export.pkl?raw=1'
+        all_language_codes.urdu: 'https://www.dropbox.com/s/0ovetjk27np0fcz/urdu_export.pkl?raw=1',
+        all_language_codes.english: 'https://www.dropbox.com/s/fnzfz23tukv3aku/export.pkl?raw=1'
     }
     tokenizer_model_file_url = {
         all_language_codes.bengali: 'https://www.dropbox.com/s/29h7vqme1kb8pmw/bengali_lm.model?raw=1',
@@ -47,7 +49,8 @@ class LMConfigs:
         all_language_codes.panjabi: 'https://www.dropbox.com/s/jxwr9ytn0zfzulc/panjabi_lm.model?raw=1',
         all_language_codes.sanskrit: 'https://www.dropbox.com/s/e13401nsekulq17/tokenizer.model?raw=1',
         all_language_codes.tamil: 'https://www.dropbox.com/s/jpg4kaqyfb71g1v/tokenizer.model?raw=1',
-        all_language_codes.urdu: 'https://www.dropbox.com/s/m5l1yy41ij6vwxa/urdu_lm.model?raw=1'
+        all_language_codes.urdu: 'https://www.dropbox.com/s/m5l1yy41ij6vwxa/urdu_lm.model?raw=1',
+        all_language_codes.english: 'https://www.dropbox.com/s/2u3greusrnyh7qy/vocab.pkl?raw=1'
     }

     def __init__(self, language_code: str):
@@ -58,7 +61,7 @@ def get_config(self):
             'lm_model_url': self.lm_model_file_url[self.language_code],
             'lm_model_file_name': 'export.pkl',
             'tokenizer_model_url': self.tokenizer_model_file_url[self.language_code],
-            'tokenizer_model_file_name': 'tokenizer.model'
+            'tokenizer_model_file_name': 'vocab.pkl' if self.language_code == LMConfigs.all_language_codes.english else 'tokenizer.model'
         }


diff --git a/inltk/inltk.py b/inltk/inltk.py
index bb7955c..16837f0 100644
--- a/inltk/inltk.py
+++ b/inltk/inltk.py
@@ -8,7 +8,7 @@
 from inltk.download_assets import setup_language, verify_language, check_all_languages_identifying_model
 from inltk.tokenizer import LanguageTokenizer
 from inltk.const import tokenizer_special_cases
-from inltk.utils import cos_sim, reset_models
+from inltk.utils import cos_sim, reset_models, is_english

 lcodes = LanguageCodes()
 all_language_codes = lcodes.get_all_language_codes()
@@ -45,7 +45,8 @@ def predict_next_words(input: str, n_words: int, language_code: str, randomness=
     learn = load_learner(path / 'models' / f'{language_code}')
     output = learn.predict(input, n_words, randomness)
     # UTF-8 encoding takes care of both LTR and RTL languages
-    output = input + (''.join(output.replace(input, '').split(' '))).replace('▁', ' ')
+    if language_code != LanguageCodes.english:
+        output = input + (''.join(output.replace(input, '').split(' '))).replace('▁', ' ')
     for special_str in tokenizer_special_cases:
         output = output.replace(special_str, '\n')
     return output
@@ -59,6 +60,8 @@ def tokenize(input: str, language_code: str):


 def identify_language(input: str):
+    if is_english(input):
+        return 'en'
     asyncio.set_event_loop(asyncio.new_event_loop())
     loop = asyncio.get_event_loop()
     tasks = [asyncio.ensure_future(check_all_languages_identifying_model())]
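Why `predict_next_words` now skips the join/replace step for English: the SentencePiece models mark word boundaries with '▁', so their raw predictions have to be de-tokenized, whereas the spaCy-based English model already returns plain space-separated text. A minimal sketch of that de-tokenization step, using made-up prediction strings (`detokenize_sp` is our name for illustration, not part of the patch):

def detokenize_sp(input: str, output: str) -> str:
    # Same expression the patch guards with `language_code != LanguageCodes.english`:
    # strip the prompt from the prediction, drop the spaces between pieces,
    # then turn each '▁' boundary marker back into a real space.
    return input + (''.join(output.replace(input, '').split(' '))).replace('▁', ' ')

# Hypothetical SentencePiece-style prediction for the prompt 'aaj'
assert detokenize_sp('aaj', 'aaj ▁mausam ▁achha ▁hai') == 'aaj mausam achha hai'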
diff --git a/inltk/tokenizer.py b/inltk/tokenizer.py
index db9f773..31e848c 100644
--- a/inltk/tokenizer.py
+++ b/inltk/tokenizer.py
@@ -2,10 +2,56 @@
 import sentencepiece as spm
 from pathlib import Path

+from inltk.config import LanguageCodes
+
 path = Path(__file__).parent


 class LanguageTokenizer(BaseTokenizer):
+    def __init__(self, lang: str):
+        self.lang = lang
+        self.base = EnglishTokenizer(lang) if lang == LanguageCodes.english else IndicTokenizer(lang)
+
+    def tokenizer(self, t: str) -> List[str]:
+        return self.base.tokenizer(t)
+
+    def numericalize(self, t: str) -> List[int]:
+        return self.base.numericalize(t)
+
+    def textify(self, ids: List[int]) -> str:
+        return self.base.textify(ids)
+
+    def remove_foreign_tokens(self, t: str):
+        return self.base.remove_foreign_tokens(t)
+
+
+# English uses a spaCy tokenizer, while the other languages use SentencePiece
+class EnglishTokenizer(BaseTokenizer):
+    def __init__(self, lang: str):
+        super().__init__(lang)
+        self.lang = lang
+        with open(path / f'models/{lang}/vocab.pkl', 'rb') as f:
+            self.vocab = Vocab(pickle.load(f))
+        self.tok = SpacyTokenizer(lang)
+
+    def tokenizer(self, t: str) -> List[str]:
+        return self.tok.tokenizer(t)
+
+    def numericalize(self, t: str):
+        tokens = self.tokenizer(t)
+        return self.vocab.numericalize(tokens)
+
+    def textify(self, ids: List[int]):
+        return self.vocab.textify(ids)
+
+    def remove_foreign_tokens(self, t: str):
+        local_pieces = []
+        for i in self.numericalize(t):
+            local_pieces.append(self.textify([i]))
+        return local_pieces
+
+
+class IndicTokenizer(BaseTokenizer):
     def __init__(self, lang: str):
         self.lang = lang
         self.sp = spm.SentencePieceProcessor()
@@ -27,7 +73,6 @@ def remove_foreign_tokens(self, t: str):
             local_pieces.append(self.sp.IdToPiece(i))
         return local_pieces

-
 class AllLanguageTokenizer(LanguageTokenizer):
     def __init__(self, lang: str):
         LanguageTokenizer.__init__(self, lang)
diff --git a/inltk/utils.py b/inltk/utils.py
index 1bce3fd..a64ce42 100644
--- a/inltk/utils.py
+++ b/inltk/utils.py
@@ -10,3 +10,12 @@ def reset_models(folder_name: str):
     path = Path(__file__).parent
     shutil.rmtree(path / 'models' / f'{folder_name}')
     return
+
+
+def is_english(s: str) -> bool:
+    try:
+        s.encode(encoding='utf-8').decode('ascii')
+    except UnicodeDecodeError:
+        return False
+    else:
+        return True
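Taken together, the changes wire English into the existing public API. A quick usage sketch, assuming the package's existing entry points (setup, tokenize, identify_language in inltk.inltk) are otherwise unchanged; the outputs in comments are illustrative, not captured from a real run:

from inltk.inltk import setup, tokenize, identify_language

# Fetches export.pkl (learner) and vocab.pkl (tokenizer vocab) per LMConfigs
setup('en')

# identify_language now short-circuits through is_english, a pure-ASCII
# heuristic: any non-ASCII character (accents, emoji, etc.) falls through
# to the language-identification model instead of returning 'en'.
print(identify_language('this is english text'))  # -> 'en'

# tokenize with 'en' routes through EnglishTokenizer (spaCy) rather than
# the SentencePiece-backed IndicTokenizer.
print(tokenize('this is english text', 'en'))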