diff --git a/code-env/python/spec/requirements.txt b/code-env/python/spec/requirements.txt index 28e84b1..a160eb4 100644 --- a/code-env/python/spec/requirements.txt +++ b/code-env/python/spec/requirements.txt @@ -3,6 +3,8 @@ flask>=2.0,<2.1 gensim==3.8.0 numpy==1.19.5 spacy[ja]==3.3.0 +tokenizers==0.10.3; python_version == '3.6' +sudachipy==0.6.0; python_version == '3.6' tqdm==4.50.0 https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.3.0/en_core_web_sm-3.3.0.tar.gz # https://github.com/explosion/spacy-models/releases/download/es_core_news_sm-3.3.0/es_core_news_sm-3.3.0.tar.gz diff --git a/custom-recipes/named-entity-recognition-extract/recipe.py b/custom-recipes/named-entity-recognition-extract/recipe.py index 0ed3f87..43acb16 100644 --- a/custom-recipes/named-entity-recognition-extract/recipe.py +++ b/custom-recipes/named-entity-recognition-extract/recipe.py @@ -1,4 +1,6 @@ # -*- coding: utf-8 -*- +import multiprocessing + import dataiku from dataiku.customrecipe import get_input_names_for_role, get_output_names_for_role, get_recipe_config @@ -57,7 +59,11 @@ def compute_entities_df(df): out_df = df.merge(out_df, left_index=True, right_index=True) return out_df +if ner_model == "spacy": + chunksize = 200 * multiprocessing.cpu_count() +else: + chunksize = 100 process_dataset_chunks( - input_dataset=input_dataset, output_dataset=output_dataset, func=compute_entities_df, chunksize=100 + input_dataset=input_dataset, output_dataset=output_dataset, func=compute_entities_df, chunksize=chunksize ) diff --git a/python-lib/ner_utils_spacy.py b/python-lib/ner_utils_spacy.py index 6bbd619..df66247 100644 --- a/python-lib/ner_utils_spacy.py +++ b/python-lib/ner_utils_spacy.py @@ -22,7 +22,7 @@ def get_spacy_model(language: str): raise ValueError(f"The language {language} is not available. \ You can add the language & corresponding model name by editing the code.") try: - nlp = spacy.load(language_model) + nlp = spacy.load(language_model, exclude=["tok2vec", "tagger", "parser", "attribute_ruler", "lemmatizer"]) except OSError: # Raising ValueError instead of OSError so it shows up at the top of the log raise ValueError(f"Could not find spaCy model for the language {language}. \ @@ -32,8 +32,7 @@ def get_spacy_model(language: str): def extract_entities(text_column, format: bool, language: str): # Tag sentences nlp = get_spacy_model(language=language) - docs = nlp.pipe(text_column.values) - + docs = nlp.pipe(text_column.values, n_process=-1, batch_size=100) # Extract entities rows = [] for doc in docs: