Add English support in iNLTK
Add English to iNLTK.
goru001 committed Jan 17, 2020
Merge commit d42dfa2 (2 parents: a6c8808 + 4f4bcb4)
Showing 4 changed files with 67 additions and 7 deletions.
11 changes: 7 additions & 4 deletions inltk/config.py
@@ -11,12 +11,13 @@ class LanguageCodes:
     sanskrit = 'sa'
     tamil = 'ta'
     urdu = 'ur'
+    english = 'en'

     def get_all_language_codes(self):
         return [self.bengali, self.gujarati, self.hindi,
                 self.kannada, self.malyalam, self.marathi,
                 self.nepali, self.odia, self.panjabi,
-                self.sanskrit, self.tamil, self.urdu]
+                self.sanskrit, self.tamil, self.urdu, self.english]


 class LMConfigs:
@@ -33,7 +34,8 @@ class LMConfigs:
         all_language_codes.panjabi: 'https://www.dropbox.com/s/ejiv5pdsi2mhhxa/export.pkl?raw=1',
         all_language_codes.sanskrit: 'https://www.dropbox.com/s/4ay1by5ryz6k39l/sanskrit_export.pkl?raw=1',
         all_language_codes.tamil: 'https://www.dropbox.com/s/88klv70zl82u39b/export.pkl?raw=1',
-        all_language_codes.urdu: 'https://www.dropbox.com/s/0ovetjk27np0fcz/urdu_export.pkl?raw=1'
+        all_language_codes.urdu: 'https://www.dropbox.com/s/0ovetjk27np0fcz/urdu_export.pkl?raw=1',
+        all_language_codes.english: 'https://www.dropbox.com/s/fnzfz23tukv3aku/export.pkl?raw=1'
     }
     tokenizer_model_file_url = {
         all_language_codes.bengali: 'https://www.dropbox.com/s/29h7vqme1kb8pmw/bengali_lm.model?raw=1',
@@ -47,7 +49,8 @@ class LMConfigs:
         all_language_codes.panjabi: 'https://www.dropbox.com/s/jxwr9ytn0zfzulc/panjabi_lm.model?raw=1',
         all_language_codes.sanskrit: 'https://www.dropbox.com/s/e13401nsekulq17/tokenizer.model?raw=1',
         all_language_codes.tamil: 'https://www.dropbox.com/s/jpg4kaqyfb71g1v/tokenizer.model?raw=1',
-        all_language_codes.urdu: 'https://www.dropbox.com/s/m5l1yy41ij6vwxa/urdu_lm.model?raw=1'
+        all_language_codes.urdu: 'https://www.dropbox.com/s/m5l1yy41ij6vwxa/urdu_lm.model?raw=1',
+        all_language_codes.english: 'https://www.dropbox.com/s/2u3greusrnyh7qy/vocab.pkl?raw=1'
     }

     def __init__(self, language_code: str):
@@ -58,7 +61,7 @@ def get_config(self):
             'lm_model_url': self.lm_model_file_url[self.language_code],
             'lm_model_file_name': 'export.pkl',
             'tokenizer_model_url': self.tokenizer_model_file_url[self.language_code],
-            'tokenizer_model_file_name': 'tokenizer.model'
+            'tokenizer_model_file_name': 'vocab.pkl' if self.language_code == LMConfigs.all_language_codes.english else 'tokenizer.model'
         }
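The effect of the config change is easiest to see by calling get_config for English versus an existing language. A minimal usage sketch follows, assuming LMConfigs is imported from inltk.config as above; the printed values are read off the dictionary literal, not captured output:

from inltk.config import LMConfigs

# English ships a pickled fastai vocabulary instead of a SentencePiece
# model, so its tokenizer artifact gets a different file name.
en_config = LMConfigs('en').get_config()
print(en_config['tokenizer_model_file_name'])  # 'vocab.pkl'

# Every other language keeps the SentencePiece artifact.
hi_config = LMConfigs('hi').get_config()
print(hi_config['tokenizer_model_file_name'])  # 'tokenizer.model'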
7 changes: 5 additions & 2 deletions inltk/inltk.py
@@ -8,7 +8,7 @@
 from inltk.download_assets import setup_language, verify_language, check_all_languages_identifying_model
 from inltk.tokenizer import LanguageTokenizer
 from inltk.const import tokenizer_special_cases
-from inltk.utils import cos_sim, reset_models
+from inltk.utils import cos_sim, reset_models, is_english

 lcodes = LanguageCodes()
 all_language_codes = lcodes.get_all_language_codes()
@@ -45,7 +45,8 @@ def predict_next_words(input: str, n_words: int, language_code: str, randomness=
     learn = load_learner(path / 'models' / f'{language_code}')
     output = learn.predict(input, n_words, randomness)
     # UTF-8 encoding takes care of both LTR and RTL languages
-    output = input + (''.join(output.replace(input, '').split(' '))).replace('▁', ' ')
+    if language_code != LanguageCodes.english:
+        output = input + (''.join(output.replace(input, '').split(' '))).replace('▁', ' ')
     for special_str in tokenizer_special_cases:
         output = output.replace(special_str, '\n')
     return output
@@ -59,6 +60,8 @@ def tokenize(input: str, language_code: str):


 def identify_language(input: str):
+    if is_english(input):
+        return 'en'
     asyncio.set_event_loop(asyncio.new_event_loop())
     loop = asyncio.get_event_loop()
     tasks = [asyncio.ensure_future(check_all_languages_identifying_model())]
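In predict_next_words, the join-and-replace line undoes SentencePiece tokenization: pieces come back space-separated, with '▁' marking real word boundaries. English output comes from a spacy-style pipeline that has no such markers, so the step is now skipped for 'en'. A standalone sketch of what that line does, with a hypothetical learner output:

# '▁' marks word boundaries in SentencePiece output; plain spaces separate pieces.
input_text = 'मैं स्कूल'
predicted = 'मैं स्कूल ▁जा ▁रहा'  # hypothetical output of learn.predict
output = input_text + (''.join(predicted.replace(input_text, '').split(' '))).replace('▁', ' ')
print(output)  # 'मैं स्कूल जा रहा'

identify_language gets a matching fast path: ASCII-only input is reported as 'en' before the language-identification model is even loaded.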
47 changes: 46 additions & 1 deletion inltk/tokenizer.py
@@ -2,10 +2,56 @@
 import sentencepiece as spm
 from pathlib import Path

+from inltk.config import LanguageCodes
+
 path = Path(__file__).parent


 class LanguageTokenizer(BaseTokenizer):
+    def __init__(self, lang: str):
+        self.lang = lang
+        self.base = EnglishTokenizer(lang) if lang == LanguageCodes.english else IndicTokenizer(lang)
+
+    def tokenizer(self, t: str) -> List[str]:
+        return self.base.tokenizer(t)
+
+    def numericalize(self, t: str) -> List[int]:
+        return self.base.numericalize(t)
+
+    def textify(self, ids: List[int]) -> str:
+        return self.base.textify(ids)
+
+    def remove_foreign_tokens(self, t: str):
+        return self.base.remove_foreign_tokens(t)
+
+
+# We use the spacy tokenizer for English and SentencePiece for the other languages
+class EnglishTokenizer(BaseTokenizer):
+    def __init__(self, lang: str):
+        super().__init__(lang)
+        self.lang = lang
+        with open(path / f'models/{lang}/vocab.pkl', 'rb') as f:
+            self.vocab = Vocab(pickle.load(f))
+        self.tok = SpacyTokenizer(lang)
+
+    def tokenizer(self, t: str) -> List[str]:
+        return self.tok.tokenizer(t)
+
+    def numericalize(self, t: str):
+        tokens = self.tokenizer(t)
+        return self.vocab.numericalize(tokens)
+
+    def textify(self, ids: List[int]):
+        return self.vocab.textify(ids)
+
+    def remove_foreign_tokens(self, t: str):
+        local_pieces = []
+        for i in self.numericalize(t):
+            local_pieces.append(self.textify([i]))
+        return local_pieces
+
+
+class IndicTokenizer(BaseTokenizer):
     def __init__(self, lang: str):
         self.lang = lang
         self.sp = spm.SentencePieceProcessor()
@@ -27,7 +73,6 @@ def remove_foreign_tokens(self, t: str):
             local_pieces.append(self.sp.IdToPiece(i))
         return local_pieces

-
 class AllLanguageTokenizer(LanguageTokenizer):
     def __init__(self, lang: str):
         LanguageTokenizer.__init__(self, lang)
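LanguageTokenizer is now a thin facade: it picks a backend once in __init__ and forwards the four BaseTokenizer methods to it, so callers never branch on language. A hedged usage sketch, assuming the English and Hindi models have already been downloaded with setup:

from inltk.tokenizer import LanguageTokenizer

en_tok = LanguageTokenizer('en')  # delegates to EnglishTokenizer (spacy + pickled vocab)
hi_tok = LanguageTokenizer('hi')  # delegates to IndicTokenizer (SentencePiece)
print(en_tok.tokenizer('This is a sample sentence'))

Because both backends expose the same interface, existing callers such as tokenize in inltk.py need no changes to support English.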
9 changes: 9 additions & 0 deletions inltk/utils.py
@@ -10,3 +10,12 @@ def reset_models(folder_name: str):
     path = Path(__file__).parent
     shutil.rmtree(path / 'models' / f'{folder_name}')
     return
+
+
+def is_english(s: str) -> bool:
+    try:
+        s.encode(encoding='utf-8').decode('ascii')
+    except UnicodeDecodeError:
+        return False
+    else:
+        return True

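is_english is a pure ASCII round-trip check: encoding to UTF-8 and decoding as ASCII fails exactly when the string contains a code point above 127. Note the trade-off this encodes: any ASCII-only string, English or not, takes the 'en' fast path in identify_language. A few illustrative calls:

from inltk.utils import is_english

print(is_english('hello world'))   # True  -- identify_language short-circuits to 'en'
print(is_english('नमस्ते दुनिया'))    # False -- falls through to the language-id model
print(is_english('bonjour'))       # True  -- ASCII is a proxy, not a real English check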