From 951ee70d474c7caa96fce0b640268c57126098f6 Mon Sep 17 00:00:00 2001
From: admin <admin@localhost>
Date: Tue, 21 Jun 2022 09:04:23 +0200
Subject: [PATCH 1/5] Edited files

---
 code-env/python/spec/requirements.txt | 1 +
 1 file changed, 1 insertion(+)

diff --git a/code-env/python/spec/requirements.txt b/code-env/python/spec/requirements.txt
index 28e84b1..302e167 100644
--- a/code-env/python/spec/requirements.txt
+++ b/code-env/python/spec/requirements.txt
@@ -3,6 +3,7 @@ flask>=2.0,<2.1
 gensim==3.8.0
 numpy==1.19.5
 spacy[ja]==3.3.0
+tokenizers==0.10.3; python_version == '3.6'
 tqdm==4.50.0
 https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.3.0/en_core_web_sm-3.3.0.tar.gz
 # https://github.com/explosion/spacy-models/releases/download/es_core_news_sm-3.3.0/es_core_news_sm-3.3.0.tar.gz

From b07f4ed99a7a6f1625c2f9458257a2c25a965152 Mon Sep 17 00:00:00 2001
From: admin <admin@localhost>
Date: Tue, 21 Jun 2022 09:17:31 +0200
Subject: [PATCH 2/5] Edited file 'code-env/python/spec/requirements.txt'

---
 code-env/python/spec/requirements.txt | 1 +
 1 file changed, 1 insertion(+)

diff --git a/code-env/python/spec/requirements.txt b/code-env/python/spec/requirements.txt
index 302e167..51e9830 100644
--- a/code-env/python/spec/requirements.txt
+++ b/code-env/python/spec/requirements.txt
@@ -4,6 +4,7 @@ gensim==3.8.0
 numpy==1.19.5
 spacy[ja]==3.3.0
 tokenizers==0.10.3; python_version == '3.6'
+sudachipy==0.5.4; python_version == '3.6'
 tqdm==4.50.0
 https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.3.0/en_core_web_sm-3.3.0.tar.gz
 # https://github.com/explosion/spacy-models/releases/download/es_core_news_sm-3.3.0/es_core_news_sm-3.3.0.tar.gz

From 9fe034f7e2ccf4c3f687f090f3f9a57be103a004 Mon Sep 17 00:00:00 2001
From: Nicolas Dalsass <nicolas.dalsass@dataiku.com>
Date: Tue, 21 Jun 2022 10:57:08 +0200
Subject: [PATCH 3/5] Spacy optimizations

- Disable unused pipeline components - divides recipe time roughly by two
- Allow multi-CPU processing - on an 8-core machine, divides recipe time roughly by 3

Overall, roughly a 6x improvement for the spaCy recipe!
---
 custom-recipes/named-entity-recognition-extract/recipe.py | 8 +++++++-
 python-lib/ner_utils_spacy.py                             | 3 +--
 2 files changed, 8 insertions(+), 3 deletions(-)

diff --git a/custom-recipes/named-entity-recognition-extract/recipe.py b/custom-recipes/named-entity-recognition-extract/recipe.py
index 0ed3f87..43acb16 100644
--- a/custom-recipes/named-entity-recognition-extract/recipe.py
+++ b/custom-recipes/named-entity-recognition-extract/recipe.py
@@ -1,4 +1,6 @@
 # -*- coding: utf-8 -*-
+import multiprocessing
+
 import dataiku
 from dataiku.customrecipe import get_input_names_for_role, get_output_names_for_role, get_recipe_config
 
@@ -57,7 +59,11 @@ def compute_entities_df(df):
     out_df = df.merge(out_df, left_index=True, right_index=True)
     return out_df
 
+if ner_model == "spacy":
+    chunksize = 200 * multiprocessing.cpu_count()
+else:
+    chunksize = 100
 
 process_dataset_chunks(
-    input_dataset=input_dataset, output_dataset=output_dataset, func=compute_entities_df, chunksize=100
+    input_dataset=input_dataset, output_dataset=output_dataset, func=compute_entities_df, chunksize=chunksize
 )
diff --git a/python-lib/ner_utils_spacy.py b/python-lib/ner_utils_spacy.py
index 6bbd619..c8d4e98 100644
--- a/python-lib/ner_utils_spacy.py
+++ b/python-lib/ner_utils_spacy.py
@@ -32,8 +32,7 @@ def get_spacy_model(language: str):
 def extract_entities(text_column, format: bool, language: str):
     # Tag sentences
     nlp = get_spacy_model(language=language)
-    docs = nlp.pipe(text_column.values)
-
+    docs = nlp.pipe(text_column.values, n_process=-1, disable=["tok2vec", "tagger", "parser", "attribute_ruler", "lemmatizer"], batch_size=100)
     # Extract entities
     rows = []
     for doc in docs:

From bce1bba7d645bc261c2cd0eac70a92174272c166 Mon Sep 17 00:00:00 2001
From: Nicolas Dalsass <nicolasdalsass@users.noreply.github.com>
Date: Wed, 6 Jul 2022 16:36:53 +0200
Subject: [PATCH 4/5] Better sudachipy version

---
 code-env/python/spec/requirements.txt | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/code-env/python/spec/requirements.txt b/code-env/python/spec/requirements.txt
index 51e9830..a160eb4 100644
--- a/code-env/python/spec/requirements.txt
+++ b/code-env/python/spec/requirements.txt
@@ -4,7 +4,7 @@ gensim==3.8.0
 numpy==1.19.5
 spacy[ja]==3.3.0
 tokenizers==0.10.3; python_version == '3.6'
-sudachipy==0.5.4; python_version == '3.6'
+sudachipy==0.6.0; python_version == '3.6'
 tqdm==4.50.0
 https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.3.0/en_core_web_sm-3.3.0.tar.gz
 # https://github.com/explosion/spacy-models/releases/download/es_core_news_sm-3.3.0/es_core_news_sm-3.3.0.tar.gz

From e6a0cdd6e59af4a28c19feb0e5294fb0898bfdae Mon Sep 17 00:00:00 2001
From: Nicolas Dalsass <nicolas.dalsass@dataiku.com>
Date: Tue, 26 Jul 2022 09:00:51 +0200
Subject: [PATCH 5/5] Post review

---
 python-lib/ner_utils_spacy.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/python-lib/ner_utils_spacy.py b/python-lib/ner_utils_spacy.py
index c8d4e98..df66247 100644
--- a/python-lib/ner_utils_spacy.py
+++ b/python-lib/ner_utils_spacy.py
@@ -22,7 +22,7 @@ def get_spacy_model(language: str):
         raise ValueError(f"The language {language} is not available. \
                         You can add the language & corresponding model name by editing the code.")
     try:
-        nlp = spacy.load(language_model)
+        nlp = spacy.load(language_model, exclude=["tok2vec", "tagger", "parser", "attribute_ruler", "lemmatizer"])
     except OSError:
         # Raising ValueError instead of OSError so it shows up at the top of the log
         raise ValueError(f"Could not find spaCy model for the language {language}. \
@@ -32,7 +32,7 @@ def get_spacy_model(language: str):
 def extract_entities(text_column, format: bool, language: str):
     # Tag sentences
     nlp = get_spacy_model(language=language)
-    docs = nlp.pipe(text_column.values, n_process=-1, disable=["tok2vec", "tagger", "parser", "attribute_ruler", "lemmatizer"], batch_size=100)
+    docs = nlp.pipe(text_column.values, n_process=-1, batch_size=100)
     # Extract entities
     rows = []
     for doc in docs: