From 951ee70d474c7caa96fce0b640268c57126098f6 Mon Sep 17 00:00:00 2001 From: admin Date: Tue, 21 Jun 2022 09:04:23 +0200 Subject: [PATCH 1/5] Edited files --- code-env/python/spec/requirements.txt | 1 + 1 file changed, 1 insertion(+) diff --git a/code-env/python/spec/requirements.txt b/code-env/python/spec/requirements.txt index 28e84b1..302e167 100644 --- a/code-env/python/spec/requirements.txt +++ b/code-env/python/spec/requirements.txt @@ -3,6 +3,7 @@ flask>=2.0,<2.1 gensim==3.8.0 numpy==1.19.5 spacy[ja]==3.3.0 +tokenizers==0.10.3; python_version == '3.6' tqdm==4.50.0 https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.3.0/en_core_web_sm-3.3.0.tar.gz # https://github.com/explosion/spacy-models/releases/download/es_core_news_sm-3.3.0/es_core_news_sm-3.3.0.tar.gz From b07f4ed99a7a6f1625c2f9458257a2c25a965152 Mon Sep 17 00:00:00 2001 From: admin Date: Tue, 21 Jun 2022 09:17:31 +0200 Subject: [PATCH 2/5] Edited file 'code-env/python/spec/requirements.txt' --- code-env/python/spec/requirements.txt | 1 + 1 file changed, 1 insertion(+) diff --git a/code-env/python/spec/requirements.txt b/code-env/python/spec/requirements.txt index 302e167..51e9830 100644 --- a/code-env/python/spec/requirements.txt +++ b/code-env/python/spec/requirements.txt @@ -4,6 +4,7 @@ gensim==3.8.0 numpy==1.19.5 spacy[ja]==3.3.0 tokenizers==0.10.3; python_version == '3.6' +sudachipy==0.5.4; python_version == '3.6' tqdm==4.50.0 https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.3.0/en_core_web_sm-3.3.0.tar.gz # https://github.com/explosion/spacy-models/releases/download/es_core_news_sm-3.3.0/es_core_news_sm-3.3.0.tar.gz From 9fe034f7e2ccf4c3f687f090f3f9a57be103a004 Mon Sep 17 00:00:00 2001 From: Nicolas Dalsass Date: Tue, 21 Jun 2022 10:57:08 +0200 Subject: [PATCH 3/5] Spacy optimizations - Disable unused pipeline algos - divides recipe time roughly by two - Allow multi-cpu processing - on an 8-core machine, divides recipe time roughly by 3 Overall, 
6x improvement for spacy recipe ! --- custom-recipes/named-entity-recognition-extract/recipe.py | 8 +++++++- python-lib/ner_utils_spacy.py | 3 +-- 2 files changed, 8 insertions(+), 3 deletions(-) diff --git a/custom-recipes/named-entity-recognition-extract/recipe.py b/custom-recipes/named-entity-recognition-extract/recipe.py index 0ed3f87..43acb16 100644 --- a/custom-recipes/named-entity-recognition-extract/recipe.py +++ b/custom-recipes/named-entity-recognition-extract/recipe.py @@ -1,4 +1,6 @@ # -*- coding: utf-8 -*- +import multiprocessing + import dataiku from dataiku.customrecipe import get_input_names_for_role, get_output_names_for_role, get_recipe_config @@ -57,7 +59,11 @@ def compute_entities_df(df): out_df = df.merge(out_df, left_index=True, right_index=True) return out_df +if ner_model == "spacy": + chunksize = 200 * multiprocessing.cpu_count() +else: + chunksize = 100 process_dataset_chunks( - input_dataset=input_dataset, output_dataset=output_dataset, func=compute_entities_df, chunksize=100 + input_dataset=input_dataset, output_dataset=output_dataset, func=compute_entities_df, chunksize=chunksize ) diff --git a/python-lib/ner_utils_spacy.py b/python-lib/ner_utils_spacy.py index 6bbd619..c8d4e98 100644 --- a/python-lib/ner_utils_spacy.py +++ b/python-lib/ner_utils_spacy.py @@ -32,8 +32,7 @@ def get_spacy_model(language: str): def extract_entities(text_column, format: bool, language: str): # Tag sentences nlp = get_spacy_model(language=language) - docs = nlp.pipe(text_column.values) - + docs = nlp.pipe(text_column.values, n_process=-1, disable=["tok2vec", "tagger", "parser", "attribute_ruler", "lemmatizer"], batch_size=100) # Extract entities rows = [] for doc in docs: From bce1bba7d645bc261c2cd0eac70a92174272c166 Mon Sep 17 00:00:00 2001 From: Nicolas Dalsass Date: Wed, 6 Jul 2022 16:36:53 +0200 Subject: [PATCH 4/5] Better sudachipy version --- code-env/python/spec/requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git 
a/code-env/python/spec/requirements.txt b/code-env/python/spec/requirements.txt index 51e9830..a160eb4 100644 --- a/code-env/python/spec/requirements.txt +++ b/code-env/python/spec/requirements.txt @@ -4,7 +4,7 @@ gensim==3.8.0 numpy==1.19.5 spacy[ja]==3.3.0 tokenizers==0.10.3; python_version == '3.6' -sudachipy==0.5.4; python_version == '3.6' +sudachipy==0.6.0; python_version == '3.6' tqdm==4.50.0 https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.3.0/en_core_web_sm-3.3.0.tar.gz # https://github.com/explosion/spacy-models/releases/download/es_core_news_sm-3.3.0/es_core_news_sm-3.3.0.tar.gz From e6a0cdd6e59af4a28c19feb0e5294fb0898bfdae Mon Sep 17 00:00:00 2001 From: Nicolas Dalsass Date: Tue, 26 Jul 2022 09:00:51 +0200 Subject: [PATCH 5/5] Post review --- python-lib/ner_utils_spacy.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/python-lib/ner_utils_spacy.py b/python-lib/ner_utils_spacy.py index c8d4e98..df66247 100644 --- a/python-lib/ner_utils_spacy.py +++ b/python-lib/ner_utils_spacy.py @@ -22,7 +22,7 @@ def get_spacy_model(language: str): raise ValueError(f"The language {language} is not available. \ You can add the language & corresponding model name by editing the code.") try: - nlp = spacy.load(language_model) + nlp = spacy.load(language_model, exclude=["tok2vec", "tagger", "parser", "attribute_ruler", "lemmatizer"]) except OSError: # Raising ValueError instead of OSError so it shows up at the top of the log raise ValueError(f"Could not find spaCy model for the language {language}. \ @@ -32,7 +32,7 @@ def get_spacy_model(language: str): def extract_entities(text_column, format: bool, language: str): # Tag sentences nlp = get_spacy_model(language=language) - docs = nlp.pipe(text_column.values, n_process=-1, disable=["tok2vec", "tagger", "parser", "attribute_ruler", "lemmatizer"], batch_size=100) + docs = nlp.pipe(text_column.values, n_process=-1, batch_size=100) # Extract entities rows = [] for doc in docs: