From 122263b74805e1b4ed3c73b1f07e01de904f3e84 Mon Sep 17 00:00:00 2001
From: Noam Bressler
Date: Tue, 9 May 2023 16:53:49 +0300
Subject: [PATCH] Improve lang detection (#2506)

* replace langdetect with a fasttext model
---
 .gitignore                              |  3 +
 deepchecks/nlp/utils/text_properties.py | 76 ++++++++++++++++++-------
 requirements/nlp-prop-requirements.txt  |  6 +-
 requirements/requirements.txt           |  1 +
 requirements/vision-requirements.txt    |  1 -
 spelling-allowlist.txt                  |  4 +-
 6 files changed, 64 insertions(+), 27 deletions(-)

diff --git a/.gitignore b/.gitignore
index f1539ced94..35c6ca16a0 100644
--- a/.gitignore
+++ b/.gitignore
@@ -133,3 +133,6 @@ benchmarks/results
 
 # ignore user_id
 .user_id
+
+# nlp models
+deepchecks/nlp/utils/.nlp-models
\ No newline at end of file
diff --git a/deepchecks/nlp/utils/text_properties.py b/deepchecks/nlp/utils/text_properties.py
index cb66d33083..bb86b35aa2 100644
--- a/deepchecks/nlp/utils/text_properties.py
+++ b/deepchecks/nlp/utils/text_properties.py
@@ -17,6 +17,7 @@
 
 import numpy as np
 import pandas as pd
+import requests
 import textblob
 from nltk import download as nltk_download
 
@@ -26,6 +27,7 @@
 
 
 MODELS_STORAGE = pathlib.Path(__file__).absolute().parent / '.nlp-models'
+FASTTEXT_LANG_MODEL = 'https://dl.fbaipublicfiles.com/fasttext/supervised-models/lid.176.bin'
 
 
 def _import_optional_property_dependency(
@@ -53,17 +55,8 @@ def _import_optional_property_dependency(
     return lib
 
 
-def get_transformer_model(
-        property_name: str,
-        model_name: str,
-        device: Optional[str] = None,
-        quantize_model: bool = False,
-        models_storage: Union[pathlib.Path, str, None] = None
-):
-    """Get the transformer model and decide if to use optimum.onnxruntime.
-
-    optimum.onnxruntime is used to optimize running times on CPU.
-    """
+def get_creat_model_storage(models_storage: Union[pathlib.Path, str, None] = None):
+    """Get the models storage directory and create it if needed."""
     if models_storage is None:
         models_storage = MODELS_STORAGE
     else:
@@ -78,6 +71,22 @@ def get_transformer_model(
     if not models_storage.is_dir():
         raise ValueError('"model_storage" expected to be a directory')
 
+    return models_storage
+
+
+def get_transformer_model(
+        property_name: str,
+        model_name: str,
+        device: Optional[str] = None,
+        quantize_model: bool = False,
+        models_storage: Union[pathlib.Path, str, None] = None
+):
+    """Get the transformer model and decide if to use optimum.onnxruntime.
+
+    optimum.onnxruntime is used to optimize running times on CPU.
+    """
+    models_storage = get_creat_model_storage(models_storage)
+
     if device not in (None, 'cpu'):
         transformers = _import_optional_property_dependency('transformers', property_name=property_name)
         # TODO: quantize if 'quantize_model' is True
@@ -192,18 +201,41 @@ def max_word_length(raw_text: Sequence[str]) -> List[int]:
     return [max([len(word) for word in text.split()]) for text in raw_text]
 
 
-def language(raw_text: Sequence[str]) -> List[str]:
+def language(raw_text: Sequence[str],
+             models_storage: Union[pathlib.Path, str, None] = None,
+             lang_certainty_threshold: float = 0.8
+             ) -> List[str]:
     """Return list of strings of language."""
-    langdetect = _import_optional_property_dependency(module='langdetect', property_name='language')
-    langdetect.DetectorFactory.seed = 42
+    fasttext = _import_optional_property_dependency(module='fasttext', property_name='language')
 
-    result = []
-    for text in raw_text:
-        try:
-            result.append(langdetect.detect(text))
-        except langdetect.lang_detect_exception.LangDetectException:
-            result.append(np.nan)
-    return result
+    model_name = FASTTEXT_LANG_MODEL.rsplit('/', maxsplit=1)[-1]
+
+    model_path = get_creat_model_storage(models_storage)
+    model_path = model_path / 'fasttext'
+    if not model_path.exists():
+        model_path.mkdir(parents=True)
+    model_path = model_path / model_name
+
+    # Save the model to a file
+    if not model_path.exists():
+        response = requests.get(FASTTEXT_LANG_MODEL)
+        with open(model_path, 'wb') as f:
+            f.write(response.content)
+
+    # This weird code is to suppress a warning from fasttext about a deprecated function
+    try:
+        fasttext.FastText.eprint = lambda *args, **kwargs: None
+        model = fasttext.load_model(model_path)
+    except Exception as exp:
+        raise exp
+
+    # Predictions are the first prediction (k=1), only if the probability is above the threshold
+    predictions = model.predict(list(raw_text), k=1, threshold=lang_certainty_threshold)
+
+    # x is empty for detection below threshold
+    language_codes = [x[0].replace('__label__', '') if x else np.nan for x in predictions[0]]
+
+    return language_codes
 
 
 def sentiment(raw_text: Sequence[str]) -> List[str]:
@@ -318,7 +350,7 @@ def unique_noun_count(raw_text: Sequence[str]) -> List[str]:
     {'name': 'Unique Noun Count', 'method': unique_noun_count, 'output_type': 'numeric'},
 )
 
-LONG_RUN_PROPERTIES = ['Toxicity', 'Fluency', 'Formality', 'Language', 'Unique Noun Count']
+LONG_RUN_PROPERTIES = ['Toxicity', 'Fluency', 'Formality', 'Unique Noun Count']
 ENGLISH_ONLY_PROPERTIES = ['Sentiment', 'Subjectivity', 'Toxicity', 'Fluency', 'Formality']
 
 LARGE_SAMPLE_SIZE = 10_000
diff --git a/requirements/nlp-prop-requirements.txt b/requirements/nlp-prop-requirements.txt
index aee027f381..df47ebf350 100644
--- a/requirements/nlp-prop-requirements.txt
+++ b/requirements/nlp-prop-requirements.txt
@@ -1,3 +1,3 @@
-transformers==4.27.4
-optimum[onnxruntime]==1.7.3
-langdetect==1.0.9
\ No newline at end of file
+transformers>=4.0.0
+optimum[onnxruntime]>=1.7.0
+fasttext>=0.8.0
diff --git a/requirements/requirements.txt b/requirements/requirements.txt
index f3e3003837..56d5f0dd6e 100644
--- a/requirements/requirements.txt
+++ b/requirements/requirements.txt
@@ -31,3 +31,4 @@ plotly>=5.13.1
 matplotlib>=3.3.4
 pyzmq<24.0.0
 beautifulsoup4>=4.11.1
+requests>=2.22.0
diff --git a/requirements/vision-requirements.txt b/requirements/vision-requirements.txt
index fed17437fa..499ebf2778 100644
--- a/requirements/vision-requirements.txt
+++ b/requirements/vision-requirements.txt
@@ -2,7 +2,6 @@ pytorch-ignite>=0.4.8
 opencv-python>=4.5.5.62
 albumentations>=1.1.0
 imgaug>=0.4.0
-requests>=2.22.0
 seaborn>=0.1.0
 imagehash>=4.0.0
 lxml>=4.0.0
\ No newline at end of file
diff --git a/spelling-allowlist.txt b/spelling-allowlist.txt
index 894b7cfdbb..46c46816f5 100644
--- a/spelling-allowlist.txt
+++ b/spelling-allowlist.txt
@@ -149,5 +149,7 @@ nltk
 Tokenize
 spacy
 tokenizers
+Uncomment
+fasttext
 misclassified
-Uncomment
\ No newline at end of file
+Uncomment
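
Usage sketch (not part of the patch): a minimal way to exercise the new fasttext-based language() property introduced above, assuming deepchecks is installed with the updated nlp-prop requirements so that fasttext is importable. The sample texts and the models_storage path are hypothetical; the lid.176.bin model is downloaded from FASTTEXT_LANG_MODEL on the first call and cached for later runs.

    import pathlib

    from deepchecks.nlp.utils.text_properties import language

    # Hypothetical sample texts; any sequence of strings works.
    texts = [
        'This is an English sentence.',
        'Ceci est une phrase en francais.',
    ]

    # models_storage is optional; when omitted the model is cached under
    # deepchecks/nlp/utils/.nlp-models (the directory added to .gitignore above).
    codes = language(
        texts,
        models_storage=pathlib.Path('./.nlp-models'),  # hypothetical cache directory
        lang_certainty_threshold=0.8,                  # predictions below this come back as NaN
    )
    print(codes)  # e.g. ['en', 'fr']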