dataiku · nicolasdalsass · Jul 27, 2022 · Jun 21, 2022 · Jun 21, 2022 · Jun 21, 2022
diff --git a/code-env/python/spec/requirements.txt b/code-env/python/spec/requirements.txt
@@ -3,6 +3,8 @@ flask>=2.0,<2.1
 gensim==3.8.0
 numpy==1.19.5
 spacy[ja]==3.3.0
+tokenizers==0.10.3; python_version == '3.6'
+sudachipy==0.6.0; python_version == '3.6'
 tqdm==4.50.0
 https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.3.0/en_core_web_sm-3.3.0.tar.gz
 # https://github.com/explosion/spacy-models/releases/download/es_core_news_sm-3.3.0/es_core_news_sm-3.3.0.tar.gz

diff --git a/custom-recipes/named-entity-recognition-extract/recipe.py b/custom-recipes/named-entity-recognition-extract/recipe.py
@@ -1,4 +1,6 @@
 # -*- coding: utf-8 -*-
+import multiprocessing
+
 import dataiku
 from dataiku.customrecipe import get_input_names_for_role, get_output_names_for_role, get_recipe_config
 
@@ -57,7 +59,11 @@ def compute_entities_df(df):
     out_df = df.merge(out_df, left_index=True, right_index=True)
     return out_df
 
+# Scale the chunk size with the CPU count for spaCy; every other model uses 100.
+chunksize = 100
+if ner_model == "spacy":
+    chunksize = 200 * multiprocessing.cpu_count()
 
 process_dataset_chunks(
-    input_dataset=input_dataset, output_dataset=output_dataset, func=compute_entities_df, chunksize=100
+    input_dataset=input_dataset, output_dataset=output_dataset, func=compute_entities_df, chunksize=chunksize
 )
diff --git a/python-lib/ner_utils_spacy.py b/python-lib/ner_utils_spacy.py
@@ -22,7 +22,7 @@ def get_spacy_model(language: str):
         raise ValueError(f"The language {language} is not available. \
                         You can add the language & corresponding model name by editing the code.")
     try:
-        nlp = spacy.load(language_model)
+        nlp = spacy.load(language_model, exclude=["tok2vec", "tagger", "parser", "attribute_ruler", "lemmatizer"])
     except OSError:
         # Raising ValueError instead of OSError so it shows up at the top of the log
         raise ValueError(f"Could not find spaCy model for the language {language}. \
@@ -32,8 +32,7 @@ def get_spacy_model(language: str):
 def extract_entities(text_column, format: bool, language: str):
     # Tag sentences
     nlp = get_spacy_model(language=language)
-    docs = nlp.pipe(text_column.values)
-
+    docs = nlp.pipe(text_column.values, n_process=-1, batch_size=100)
     # Extract entities
     rows = []
     for doc in docs: