From bd951479bc81a3fbfbaf43a7e4f1b804c9e5ce0b Mon Sep 17 00:00:00 2001
From: Shay Tsadok
Date: Tue, 25 Jul 2023 14:34:34 +0300
Subject: [PATCH] Add ability to cache text prop models (#2641)

* Add ability to cache text prop models
* isort fix
* adding test to model caching
* adding a comment
* adding license
* adding license to test
* exclude certifi.2023.7.22
* disabling the test for now
* add docstrings
* docstring changes
---
 .github/workflows/build.yml                  |   2 +-
 deepchecks/nlp/utils/text_properties.py      | 206 ++-------------
 .../nlp/utils/text_properties_models.py      | 237 ++++++++++++++++++
 tests/nlp/utils/test_properties.py           |  13 +-
 4 files changed, 261 insertions(+), 197 deletions(-)
 create mode 100644 deepchecks/nlp/utils/text_properties_models.py

diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml
index e9b6df7b6a..59dd48ff7f 100644
--- a/.github/workflows/build.yml
+++ b/.github/workflows/build.yml
@@ -128,7 +128,7 @@ jobs:
         with:
           requirements: 'requirements-all.txt'
           fail: 'Copyleft,Other,Error'
-          exclude: '(pyzmq.*23\.2\.1|debugpy.*1\.6\.7|certifi.*2023\.5\.7|tqdm.*4\.65\.0|webencodings.*0\.5\.1|torch.*1\.10\.2.*|torchvision.*0\.11\.3.*|terminado.*0\.15\.0.*|urllib3.*1\.26\.11.*|imageio.*2\.20\.0.*|jsonschema.*4\.8\.0.*|qudida.*0\.0\.4)'
+          exclude: '(pyzmq.*23\.2\.1|debugpy.*1\.6\.7|certifi.*2023\.7\.22|tqdm.*4\.65\.0|webencodings.*0\.5\.1|torch.*1\.10\.2.*|torchvision.*0\.11\.3.*|terminado.*0\.15\.0.*|urllib3.*1\.26\.11.*|imageio.*2\.20\.0.*|jsonschema.*4\.8\.0.*|qudida.*0\.0\.4)'
           # pyzmq is Revised BSD https://github.com/zeromq/pyzmq/blob/main/examples/LICENSE
           # debugpy is MIT https://github.com/microsoft/debugpy/blob/main/LICENSE
           # certifi is MPL-2.0 https://github.com/certifi/python-certifi/blob/master/LICENSE
diff --git a/deepchecks/nlp/utils/text_properties.py b/deepchecks/nlp/utils/text_properties.py
index 98d514723a..52f7708345 100644
--- a/deepchecks/nlp/utils/text_properties.py
+++ b/deepchecks/nlp/utils/text_properties.py
@@ -10,7 +10,6 @@
 #
 """Module containing the text properties for the NLP module."""
 import gc
-import importlib
 import pathlib
 import re
 import string
@@ -20,7 +19,6 @@
 import numpy as np
 import pandas as pd
-import requests
 import textblob
 from nltk import corpus
 from nltk import download as nltk_download
@@ -30,6 +28,7 @@
 from deepchecks.core.errors import DeepchecksValueError
 from deepchecks.nlp.utils.text import cut_string, hash_text, normalize_text, remove_punctuation
+from deepchecks.nlp.utils.text_properties_models import get_cmudict_dict, get_fasttext_model, get_transformer_pipeline
 from deepchecks.utils.function import run_available_kwargs
 from deepchecks.utils.strings import SPECIAL_CHARACTERS, format_list
@@ -37,8 +36,6 @@
 from deepchecks.utils.validation import is_sequence_not_str

-MODELS_STORAGE = pathlib.Path(__file__).absolute().parent / '.nlp-models'
-FASTTEXT_LANG_MODEL = 'https://dl.fbaipublicfiles.com/fasttext/supervised-models/lid.176.bin'
 DEFAULT_SENTENCE_SAMPLE_SIZE = 300
 MAX_CHARS = 512  # Bert accepts max of 512 tokens, so without counting tokens we go for the lower bound.
 # all SPECIAL_CHARACTERS - all string.punctuation except for <>@[]^_`{|}~ - all whitespace
@@ -103,159 +100,12 @@ def _sample_for_property(text: str, mode: str = 'words', limit: int = 10000, ret
     return ' '.join(all_units) if not return_as_list else list(all_units)


-def _import_optional_property_dependency(
-        module: str,
-        property_name: str,
-        package_name: Optional[str] = None,
-        error_template: Optional[str] = None
-):
-    try:
-        lib = importlib.import_module(module)
-    except ImportError as error:
-        package_name = package_name or module.split('.', maxsplit=1)[0]
-        error_template = error_template or (
-            'property {property_name} requires the {package_name} python package. '
-            'To get it, run:\n'
-            '>> pip install {package_name}\n\n'
-            'You may install dependencies for all text properties by running:\n'
-            '>> pip install deepchecks[nlp-properties]\n'
-        )
-        raise ImportError(error_template.format(
-            property_name=property_name,
-            package_name=package_name
-        )) from error
-    else:
-        return lib
-
-
 def _warn_if_missing_nltk_dependencies(dependency: str, property_name: str):
     """Warn if NLTK dependency is missing."""
     warnings.warn(f'NLTK {dependency} not found, {property_name} cannot be calculated.'
                   ' Please check your internet connection.', UserWarning)

-
-def get_create_model_storage(models_storage: Union[pathlib.Path, str, None] = None):
-    """Get the models storage directory and create it if needed."""
-    if models_storage is None:
-        models_storage = MODELS_STORAGE
-    else:
-        if isinstance(models_storage, str):
-            models_storage = pathlib.Path(models_storage)
-        if not isinstance(models_storage, pathlib.Path):
-            raise ValueError(
-                f'Unexpected type of the "models_storage" parameter - {type(models_storage)}'
-            )
-    if not models_storage.exists():
-        models_storage.mkdir(parents=True)
-    if not models_storage.is_dir():
-        raise ValueError('"model_storage" expected to be a directory')
-
-    return models_storage
-
-
-def get_transformer_model(
-        property_name: str,
-        model_name: str,
-        device: Optional[str] = None,
-        quantize_model: bool = False,
-        models_storage: Union[pathlib.Path, str, None] = None
-):
-    """Get the transformer model and decide if to use optimum.onnxruntime.
-
-    optimum.onnxruntime is used to optimize running times on CPU.
-    """
-    models_storage = get_create_model_storage(models_storage)
-
-    if device not in (None, 'cpu'):
-        transformers = _import_optional_property_dependency('transformers', property_name=property_name)
-        # TODO: quantize if 'quantize_model' is True
-        return transformers.AutoModelForSequenceClassification.from_pretrained(
-            model_name,
-            cache_dir=models_storage,
-            device_map=device
-        )
-
-    onnx = _import_optional_property_dependency(
-        'optimum.onnxruntime',
-        property_name=property_name,
-        error_template=(
-            f'The device was set to {device} while computing the {property_name} property,'
-            'in which case deepchecks resorts to accelerating the inference by using optimum,'
-            'bit it is not installed. Either:\n'
-            '\t- Set the device according to your hardware;\n'
-            '\t- Install optimum by running "pip install optimum";\n'
-            '\t- Install all dependencies needed for text properties by running '
-            '"pip install deepchecks[nlp-properties]";\n'
-        )
-    )
-
-    if quantize_model is False:
-        model_path = models_storage / 'onnx' / model_name
-
-        if model_path.exists():
-            return onnx.ORTModelForSequenceClassification.from_pretrained(model_path).to(device or -1)
-
-        model = onnx.ORTModelForSequenceClassification.from_pretrained(
-            model_name,
-            export=True,
-            cache_dir=models_storage,
-        ).to(device or -1)
-        # NOTE:
-        # 'optimum', after exporting/converting a model to the ONNX format,
-        # does not store it onto disk we need to save it now to not reconvert
-        # it each time
-        model.save_pretrained(model_path)
-        return model
-
-    model_path = models_storage / 'onnx' / 'quantized' / model_name
-
-    if model_path.exists():
-        return onnx.ORTModelForSequenceClassification.from_pretrained(model_path).to(device or -1)
-
-    not_quantized_model = get_transformer_model(
-        property_name,
-        model_name,
-        device,
-        quantize_model=False,
-        models_storage=models_storage
-    )
-
-    quantizer = onnx.ORTQuantizer.from_pretrained(not_quantized_model).to(device or -1)
-
-    quantizer.quantize(
-        save_dir=model_path,
-        # TODO: make it possible to provide a config as a parameter
-        quantization_config=onnx.configuration.AutoQuantizationConfig.avx512_vnni(
-            is_static=False,
-            per_channel=False
-        )
-    )
-    return onnx.ORTModelForSequenceClassification.from_pretrained(model_path).to(device or -1)
-
-
-def get_transformer_pipeline(
-        property_name: str,
-        model_name: str,
-        device: Optional[str] = None,
-        models_storage: Union[pathlib.Path, str, None] = None
-):
-    """Return a transformers pipeline for the given model name."""
-    transformers = _import_optional_property_dependency('transformers', property_name=property_name)
-    tokenizer = transformers.AutoTokenizer.from_pretrained(model_name, device_map=device)
-    model = get_transformer_model(
-        property_name=property_name,
-        model_name=model_name,
-        device=device,
-        models_storage=models_storage
-    )
-    return transformers.pipeline(
-        'text-classification',
-        model=model,
-        tokenizer=tokenizer,
-        device=device
-    )
-
-
 def text_length(text: str) -> int:
     """Return text length."""
     return len(text)
@@ -283,36 +133,6 @@ def max_word_length(text: str) -> int:
     return max(len(w) for w in words) if words else 0


-def _get_fasttext_model(models_storage: Union[pathlib.Path, str, None] = None):
-    """Return fasttext model."""
-    fasttext = _import_optional_property_dependency(module='fasttext', property_name='language')
-
-    model_name = FASTTEXT_LANG_MODEL.rsplit('/', maxsplit=1)[-1]
-    model_path = get_create_model_storage(models_storage)
-    model_path = model_path / 'fasttext'
-
-    if not model_path.exists():
-        model_path.mkdir(parents=True)
-
-    model_path = model_path / model_name
-
-    # Save the model to a file
-    if not model_path.exists():
-        response = requests.get(FASTTEXT_LANG_MODEL, timeout=240)
-        if response.status_code != 200:
-            raise RuntimeError('Failed to donwload fasttext model')
-        model_path.write_bytes(response.content)
-
-    # This weird code is to suppress a warning from fasttext about a deprecated function
-    try:
-        fasttext.FastText.eprint = lambda *args, **kwargs: None
-        fasttext_model = fasttext.load_model(str(model_path))
-    except Exception as exp:
-        raise exp
-
-    return fasttext_model
-
-
 def language(
     text: str,
     lang_certainty_threshold: float = 0.8,
@@ -324,7 +144,7 @@ def language(
     # Load the model if it wasn't received as a parameter. This is done to avoid loading the model
     # each time the function is called.
     if fasttext_model is None:
-        fasttext_model = _get_fasttext_model()
+        fasttext_model = get_fasttext_model()

     # Predictions are the first prediction (k=1), only if the probability is above the threshold
     prediction = fasttext_model.predict(text.replace('\n', ' '), k=1, threshold=lang_certainty_threshold)[0]
@@ -830,7 +650,8 @@ def calculate_builtin_properties(
     ignore_non_english_samples_for_english_properties: bool = True,
     device: Optional[str] = None,
     models_storage: Union[pathlib.Path, str, None] = None,
-    batch_size: Optional[int] = 16
+    batch_size: Optional[int] = 16,
+    cache_models: bool = False
 ) -> Tuple[Dict[str, List[float]], Dict[str, str]]:
     """Calculate properties on provided text samples.
@@ -875,6 +696,8 @@ def calculate_builtin_properties(
         Also, if a folder already contains relevant resources they are not re-downloaded.
     batch_size : int, default 8
         The batch size.
+    cache_models : bool, default False
+        Whether to cache the models used by this function, to save load time on the next execution.

     Returns
     -------
@@ -902,7 +725,7 @@ def calculate_builtin_properties(

     # Prepare kwargs for properties that require outside resources:
     if 'fasttext_model' not in kwargs:
-        kwargs['fasttext_model'] = _get_fasttext_model(models_storage=models_storage)
+        kwargs['fasttext_model'] = get_fasttext_model(models_storage=models_storage, use_cache=cache_models)

     if 'cmudict_dict' not in kwargs:
         properties_requiring_cmudict = list(set(CMUDICT_PROPERTIES) & set(text_properties_names))
@@ -911,20 +734,22 @@ def calculate_builtin_properties(
                 _warn_if_missing_nltk_dependencies('cmudict', format_list(properties_requiring_cmudict))
                 for prop in properties_requiring_cmudict:
                     calculated_properties[prop] = [np.nan] * len(raw_text)
-            cmudict_dict = corpus.cmudict.dict()
-            kwargs['cmudict_dict'] = cmudict_dict
+            kwargs['cmudict_dict'] = get_cmudict_dict(use_cache=cache_models)

     if 'Toxicity' in text_properties_names and 'toxicity_classifier' not in kwargs:
         kwargs['toxicity_classifier'] = get_transformer_pipeline(
-            property_name='toxicity', model_name=TOXICITY_MODEL_NAME, device=device, models_storage=models_storage)
+            property_name='toxicity', model_name=TOXICITY_MODEL_NAME, device=device,
+            models_storage=models_storage, use_cache=cache_models)

     if 'Formality' in text_properties_names and 'formality_classifier' not in kwargs:
         kwargs['formality_classifier'] = get_transformer_pipeline(
-            property_name='formality', model_name=FORMALITY_MODEL_NAME, device=device, models_storage=models_storage)
+            property_name='formality', model_name=FORMALITY_MODEL_NAME, device=device,
+            models_storage=models_storage, use_cache=cache_models)

     if 'Fluency' in text_properties_names and 'fluency_classifier' not in kwargs:
         kwargs['fluency_classifier'] = get_transformer_pipeline(
-            property_name='fluency', model_name=FLUENCY_MODEL_NAME, device=device, models_storage=models_storage)
+            property_name='fluency', model_name=FLUENCY_MODEL_NAME, device=device,
+            models_storage=models_storage, use_cache=cache_models)

     is_language_property_requested = 'Language' in [prop['name'] for prop in text_properties]
     # Remove language property from the list of properties to calculate as it will be calculated separately:
@@ -994,7 +819,8 @@ def calculate_builtin_properties(
             sentences_cache.clear()

     # Clean all remaining RAM:
-    gc.collect()
+    if not cache_models:
+        gc.collect()

     if not calculated_properties:
         raise RuntimeError('Failed to calculate any of the properties.')
diff --git a/deepchecks/nlp/utils/text_properties_models.py b/deepchecks/nlp/utils/text_properties_models.py
new file mode 100644
index 0000000000..994dbd1935
--- /dev/null
+++ b/deepchecks/nlp/utils/text_properties_models.py
@@ -0,0 +1,237 @@
+# ----------------------------------------------------------------------------
+# Copyright (C) 2021-2023 Deepchecks (https://www.deepchecks.com)
+#
+# This file is part of Deepchecks.
+# Deepchecks is distributed under the terms of the GNU Affero General
+# Public License (version 3 or later).
+# You should have received a copy of the GNU Affero General Public License
+# along with Deepchecks. If not, see <http://www.gnu.org/licenses/>.
+# ----------------------------------------------------------------------------
+#
+"""Module containing the text properties models for the NLP module."""
+import importlib
+import pathlib
+from functools import lru_cache
+from typing import Optional, Union
+
+import requests
+from nltk import corpus
+
+MODELS_STORAGE = pathlib.Path(__file__).absolute().parent / '.nlp-models'
+
+
+def get_create_model_storage(models_storage: Union[pathlib.Path, str, None] = None):
+    """Get the models storage directory and create it if needed."""
+    if models_storage is None:
+        models_storage = MODELS_STORAGE
+    else:
+        if isinstance(models_storage, str):
+            models_storage = pathlib.Path(models_storage)
+        if not isinstance(models_storage, pathlib.Path):
+            raise ValueError(
+                f'Unexpected type of the "models_storage" parameter - {type(models_storage)}'
+            )
+    if not models_storage.exists():
+        models_storage.mkdir(parents=True)
+    if not models_storage.is_dir():
+        raise ValueError('"models_storage" expected to be a directory')
+
+    return models_storage
+
+
+def _get_transformer_model(
+        property_name: str,
+        model_name: str,
+        device: Optional[str] = None,
+        quantize_model: bool = False,
+        models_storage: Union[pathlib.Path, str, None] = None
+):
+    """Get the transformer model and decide whether to use optimum.onnxruntime.
+
+    optimum.onnxruntime is used to optimize running times on CPU.
+    """
+    models_storage = get_create_model_storage(models_storage)
+
+    if device not in (None, 'cpu'):
+        transformers = import_optional_property_dependency('transformers', property_name=property_name)
+        # TODO: quantize if 'quantize_model' is True
+        return transformers.AutoModelForSequenceClassification.from_pretrained(
+            model_name,
+            cache_dir=models_storage,
+            device_map=device
+        )
+
+    onnx = import_optional_property_dependency(
+        'optimum.onnxruntime',
+        property_name=property_name,
+        error_template=(
+            f'The device was set to {device} while computing the {property_name} property,'
+            'in which case deepchecks resorts to accelerating the inference by using optimum,'
+            'but it is not installed. Either:\n'
+            '\t- Set the device according to your hardware;\n'
+            '\t- Install optimum by running "pip install optimum";\n'
+            '\t- Install all dependencies needed for text properties by running '
+            '"pip install deepchecks[nlp-properties]";\n'
+        )
+    )
+
+    if quantize_model is False:
+        model_path = models_storage / 'onnx' / model_name
+
+        if model_path.exists():
+            return onnx.ORTModelForSequenceClassification.from_pretrained(model_path).to(device or -1)
+
+        model = onnx.ORTModelForSequenceClassification.from_pretrained(
+            model_name,
+            export=True,
+            cache_dir=models_storage,
+        ).to(device or -1)
+        # NOTE:
+        # 'optimum', after exporting/converting a model to the ONNX format,
+        # does not store it on disk, so we need to save it now to avoid
+        # reconverting it each time
+        model.save_pretrained(model_path)
+        return model
+
+    model_path = models_storage / 'onnx' / 'quantized' / model_name
+
+    if model_path.exists():
+        return onnx.ORTModelForSequenceClassification.from_pretrained(model_path).to(device or -1)
+
+    not_quantized_model = _get_transformer_model(
+        property_name,
+        model_name,
+        device,
+        quantize_model=False,
+        models_storage=models_storage
+    )
+
+    quantizer = onnx.ORTQuantizer.from_pretrained(not_quantized_model).to(device or -1)
+
+    quantizer.quantize(
+        save_dir=model_path,
+        # TODO: make it possible to provide a config as a parameter
+        quantization_config=onnx.configuration.AutoQuantizationConfig.avx512_vnni(
+            is_static=False,
+            per_channel=False
+        )
+    )
+    return onnx.ORTModelForSequenceClassification.from_pretrained(model_path).to(device or -1)
+
+
+def import_optional_property_dependency(
+        module: str,
+        property_name: str,
+        package_name: Optional[str] = None,
+        error_template: Optional[str] = None
+):
+    """Import an optional module at runtime."""
+    try:
+        lib = importlib.import_module(module)
+    except ImportError as error:
+        package_name = package_name or module.split('.', maxsplit=1)[0]
+        error_template = error_template or (
+            'property {property_name} requires the {package_name} python package. '
+            'To get it, run:\n'
+            '>> pip install {package_name}\n\n'
+            'You may install dependencies for all text properties by running:\n'
+            '>> pip install deepchecks[nlp-properties]\n'
+        )
+        raise ImportError(error_template.format(
+            property_name=property_name,
+            package_name=package_name
+        )) from error
+    else:
+        return lib
+
+
+def get_transformer_pipeline(
+        property_name: str,
+        model_name: str,
+        device: Optional[str] = None,
+        models_storage: Union[pathlib.Path, str, None] = None,
+        use_cache=False
+):
+    """Return a transformers pipeline for the given model name."""
+    if use_cache:
+        return _get_transformer_pipeline(property_name, model_name, device, models_storage)
+    # __wrapped__ is simply the function without decoration, in our case, without caching
+    return _get_transformer_pipeline.__wrapped__(property_name, model_name, device, models_storage)
+
+
+@lru_cache(maxsize=5)
+def _get_transformer_pipeline(
+        property_name: str,
+        model_name: str,
+        device: Optional[str] = None,
+        models_storage: Union[pathlib.Path, str, None] = None
+):
+    """Return a transformers pipeline for the given model name."""
+    transformers = import_optional_property_dependency('transformers', property_name=property_name)
+    tokenizer = transformers.AutoTokenizer.from_pretrained(model_name, device_map=device)
+    model = _get_transformer_model(
+        property_name=property_name,
+        model_name=model_name,
+        device=device,
+        models_storage=models_storage
+    )
+    return transformers.pipeline(
+        'text-classification',
+        model=model,
+        tokenizer=tokenizer,
+        device=device
+    )
+
+
+def get_cmudict_dict(use_cache=False):
+    """Return the CMU Pronouncing Dictionary corpus as a dict."""
+    if use_cache:
+        return _get_cmudict_dict()
+    return _get_cmudict_dict.__wrapped__()
+
+
+@lru_cache(maxsize=1)
+def _get_cmudict_dict():
+    cmudict_dict = corpus.cmudict.dict()
+    return cmudict_dict
+
+
+FASTTEXT_LANG_MODEL = 'https://dl.fbaipublicfiles.com/fasttext/supervised-models/lid.176.bin'
+
+
+def get_fasttext_model(models_storage: Union[pathlib.Path, str, None] = None, use_cache=False):
+    """Return fasttext model."""
+    if use_cache:
+        return _get_fasttext_model(models_storage)
+    return _get_fasttext_model.__wrapped__(models_storage)
+
+
+@lru_cache(maxsize=1)
+def _get_fasttext_model(models_storage: Union[pathlib.Path, str, None] = None):
+    """Return fasttext model."""
+    fasttext = import_optional_property_dependency(module='fasttext', property_name='language')
+
+    model_name = FASTTEXT_LANG_MODEL.rsplit('/', maxsplit=1)[-1]
+    model_path = get_create_model_storage(models_storage)
+    model_path = model_path / 'fasttext'
+
+    if not model_path.exists():
+        model_path.mkdir(parents=True)
+
+    model_path = model_path / model_name
+
+    # Save the model to a file
+    if not model_path.exists():
+        response = requests.get(FASTTEXT_LANG_MODEL, timeout=240)
+        if response.status_code != 200:
+            raise RuntimeError('Failed to download fasttext model')
+        model_path.write_bytes(response.content)
+
+    # This weird code is to suppress a warning from fasttext about a deprecated function
+    try:
+        fasttext.FastText.eprint = lambda *args, **kwargs: None
+        fasttext_model = fasttext.load_model(str(model_path))
+    except Exception as exp:
+        raise exp
+
+    return fasttext_model
diff --git a/tests/nlp/utils/test_properties.py b/tests/nlp/utils/test_properties.py
index af4f61c1da..29057de6e8 100644
--- a/tests/nlp/utils/test_properties.py
+++ b/tests/nlp/utils/test_properties.py
@@ -18,8 +18,9 @@
 from hamcrest import *

 from deepchecks.core.errors import DeepchecksValueError
-from deepchecks.nlp.utils.text_properties import (MODELS_STORAGE, _sample_for_property, calculate_builtin_properties,
-                                                  get_transformer_model, is_english)
+from deepchecks.nlp.utils.text_properties import (_sample_for_property, calculate_builtin_properties,
+                                                  is_english)
+from deepchecks.nlp.utils.text_properties_models import MODELS_STORAGE, _get_transformer_model


 def mock_fn(*args, **kwargs):  # pylint: disable=unused-argument
@@ -419,7 +420,7 @@ def test_properties_models_download():
     model_download_time = timeit.timeit(
         stmt='fn()',
         number=1,
-        globals={'fn': lambda: get_transformer_model(
+        globals={'fn': lambda: _get_transformer_model(
             property_name='',
             model_name=model_name
         )}
@@ -431,7 +432,7 @@
     assert onnx_model_path.exists() and onnx_model_path.is_dir()

     # Act
-    get_transformer_model(property_name='', model_name=model_name, quantize_model=True)
+    _get_transformer_model(property_name='', model_name=model_name, quantize_model=True)

     # Assert
     assert quantized_model_path.exists() and quantized_model_path.is_dir()
@@ -440,7 +441,7 @@
     model_creation_time = timeit.timeit(
         stmt='fn()',
         number=1,
-        globals={'fn': lambda: get_transformer_model(
+        globals={'fn': lambda: _get_transformer_model(
             property_name='',
             model_name=model_name,
             quantize_model=True
@@ -462,7 +463,7 @@ def test_properties_models_download_into_provided_directory():
     onnx_model_path = MODELS_STORAGE / 'onnx' / model_name

     # Act
-    get_transformer_model(property_name='', model_name=model_name, models_storage=directory)
+    _get_transformer_model(property_name='', model_name=model_name, models_storage=directory)

     # Assert
     assert MODELS_STORAGE.exists() and MODELS_STORAGE.is_dir()
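
Below is a minimal usage sketch of the new cache_models flag introduced by this patch. The sample texts are illustrative only, and the snippet assumes the optional nlp-properties dependencies (fasttext, transformers/optimum, nltk) are installed; the function and parameter names come from the patched calculate_builtin_properties signature above.

    from deepchecks.nlp.utils.text_properties import calculate_builtin_properties

    texts = ['This is a short sample sentence.', 'Another illustrative example text.']

    # First call: loads the fasttext, cmudict and transformer models and, because
    # cache_models=True, keeps them in the lru_cache inside text_properties_models.
    properties, property_types = calculate_builtin_properties(texts, cache_models=True)

    # Second call in the same process: the cached models are reused instead of being
    # reloaded, and the final gc.collect() is skipped so the cache stays warm.
    properties, property_types = calculate_builtin_properties(texts, cache_models=True)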