Skip to content

Commit

Permalink
Improve lang detection (#2506)
Browse files Browse the repository at this point in the history
* replace langdetect with a fasttext model
  • Loading branch information
noamzbr committed May 9, 2023
1 parent 9d1261c commit 122263b
Show file tree
Hide file tree
Showing 6 changed files with 64 additions and 27 deletions.
3 changes: 3 additions & 0 deletions .gitignore
Expand Up @@ -133,3 +133,6 @@ benchmarks/results

# ignore user_id
.user_id

# nlp models
deepchecks/nlp/utils/.nlp-models
76 changes: 54 additions & 22 deletions deepchecks/nlp/utils/text_properties.py
Expand Up @@ -17,6 +17,7 @@

import numpy as np
import pandas as pd
import requests
import textblob
from nltk import download as nltk_download

Expand All @@ -26,6 +27,7 @@


MODELS_STORAGE = pathlib.Path(__file__).absolute().parent / '.nlp-models'
FASTTEXT_LANG_MODEL = 'https://dl.fbaipublicfiles.com/fasttext/supervised-models/lid.176.bin'


def _import_optional_property_dependency(
Expand Down Expand Up @@ -53,17 +55,8 @@ def _import_optional_property_dependency(
return lib


def get_transformer_model(
property_name: str,
model_name: str,
device: Optional[str] = None,
quantize_model: bool = False,
models_storage: Union[pathlib.Path, str, None] = None
):
"""Get the transformer model and decide if to use optimum.onnxruntime.
optimum.onnxruntime is used to optimize running times on CPU.
"""
def get_creat_model_storage(models_storage: Union[pathlib.Path, str, None] = None):
"""Get the models storage directory and create it if needed."""
if models_storage is None:
models_storage = MODELS_STORAGE
else:
Expand All @@ -78,6 +71,22 @@ def get_transformer_model(
if not models_storage.is_dir():
raise ValueError('"model_storage" expected to be a directory')

return models_storage


def get_transformer_model(
property_name: str,
model_name: str,
device: Optional[str] = None,
quantize_model: bool = False,
models_storage: Union[pathlib.Path, str, None] = None
):
"""Get the transformer model and decide if to use optimum.onnxruntime.
optimum.onnxruntime is used to optimize running times on CPU.
"""
models_storage = get_creat_model_storage(models_storage)

if device not in (None, 'cpu'):
transformers = _import_optional_property_dependency('transformers', property_name=property_name)
# TODO: quantize if 'quantize_model' is True
Expand Down Expand Up @@ -192,18 +201,41 @@ def max_word_length(raw_text: Sequence[str]) -> List[int]:
return [max([len(word) for word in text.split()]) for text in raw_text]


def language(
        raw_text: Sequence[str],
        models_storage: Union[pathlib.Path, str, None] = None,
        lang_certainty_threshold: float = 0.8
) -> List[str]:
    """Return the detected language code of each text sample.

    Uses the pretrained fasttext language-identification model, downloading
    and caching it on first use.

    Parameters
    ----------
    raw_text : Sequence[str]
        The text samples to analyze.
    models_storage : Union[pathlib.Path, str, None], default None
        Directory in which the fasttext model is cached. When None, the
        package-level models storage directory is used.
    lang_certainty_threshold : float, default 0.8
        Minimum prediction probability; samples whose best prediction falls
        below it are reported as np.nan.

    Returns
    -------
    List[str]
        An ISO 639-1 language code per sample, or np.nan when detection is
        below the certainty threshold.
    """
    fasttext = _import_optional_property_dependency(module='fasttext', property_name='language')

    model_name = FASTTEXT_LANG_MODEL.rsplit('/', maxsplit=1)[-1]
    model_dir = get_creat_model_storage(models_storage) / 'fasttext'
    model_dir.mkdir(parents=True, exist_ok=True)
    model_path = model_dir / model_name

    # Download the pretrained model once and cache it on disk for later calls.
    if not model_path.exists():
        response = requests.get(FASTTEXT_LANG_MODEL, timeout=60)
        # Fail loudly on a bad download instead of caching an error page as the model.
        response.raise_for_status()
        model_path.write_bytes(response.content)

    # Silence the deprecation warning fasttext prints on load.
    fasttext.FastText.eprint = lambda *args, **kwargs: None
    # fasttext.load_model expects a str path, not a pathlib.Path.
    model = fasttext.load_model(str(model_path))

    # fasttext raises ValueError on newline characters - it predicts one line per sample,
    # so flatten each text to a single line first.
    texts = [text.replace('\n', ' ') for text in raw_text]

    # Keep only the top prediction (k=1), and only when above the threshold.
    predictions = model.predict(texts, k=1, threshold=lang_certainty_threshold)

    # An empty label tuple means the best prediction was below the threshold.
    return [labels[0].replace('__label__', '') if labels else np.nan for labels in predictions[0]]


def sentiment(raw_text: Sequence[str]) -> List[str]:
Expand Down Expand Up @@ -318,7 +350,7 @@ def unique_noun_count(raw_text: Sequence[str]) -> List[str]:
{'name': 'Unique Noun Count', 'method': unique_noun_count, 'output_type': 'numeric'},
)

LONG_RUN_PROPERTIES = ['Toxicity', 'Fluency', 'Formality', 'Language', 'Unique Noun Count']
LONG_RUN_PROPERTIES = ['Toxicity', 'Fluency', 'Formality', 'Unique Noun Count']
ENGLISH_ONLY_PROPERTIES = ['Sentiment', 'Subjectivity', 'Toxicity', 'Fluency', 'Formality']
LARGE_SAMPLE_SIZE = 10_000

Expand Down
6 changes: 3 additions & 3 deletions requirements/nlp-prop-requirements.txt
@@ -1,3 +1,3 @@
transformers==4.27.4
optimum[onnxruntime]==1.7.3
langdetect==1.0.9
transformers>=4.0.0
optimum[onnxruntime]>=1.7.0
fasttext>=0.8.0
1 change: 1 addition & 0 deletions requirements/requirements.txt
Expand Up @@ -31,3 +31,4 @@ plotly>=5.13.1
matplotlib>=3.3.4
pyzmq<24.0.0
beautifulsoup4>=4.11.1
requests>=2.22.0
1 change: 0 additions & 1 deletion requirements/vision-requirements.txt
Expand Up @@ -2,7 +2,6 @@ pytorch-ignite>=0.4.8
opencv-python>=4.5.5.62
albumentations>=1.1.0
imgaug>=0.4.0
requests>=2.22.0
seaborn>=0.1.0
imagehash>=4.0.0
lxml>=4.0.0
4 changes: 3 additions & 1 deletion spelling-allowlist.txt
Expand Up @@ -149,5 +149,7 @@ nltk
Tokenize
spacy
tokenizers
Uncomment
fasttext
misclassified
Uncomment
Uncomment

0 comments on commit 122263b

Please sign in to comment.