Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions code-env/python/spec/requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,8 @@ flask>=2.0,<2.1
gensim==3.8.0
numpy==1.19.5
spacy[ja]==3.3.0
+ tokenizers==0.10.3; python_version == '3.6'
+ sudachipy==0.6.0; python_version == '3.6'
tqdm==4.50.0
https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.3.0/en_core_web_sm-3.3.0.tar.gz
# https://github.com/explosion/spacy-models/releases/download/es_core_news_sm-3.3.0/es_core_news_sm-3.3.0.tar.gz
Expand Down
8 changes: 7 additions & 1 deletion custom-recipes/named-entity-recognition-extract/recipe.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,6 @@
# -*- coding: utf-8 -*-
+ import multiprocessing
+
import dataiku
from dataiku.customrecipe import get_input_names_for_role, get_output_names_for_role, get_recipe_config

Expand Down Expand Up @@ -57,7 +59,11 @@ def compute_entities_df(df):
out_df = df.merge(out_df, left_index=True, right_index=True)
return out_df

+ if ner_model == "spacy":
+ chunksize = 200 * multiprocessing.cpu_count()
+ else:
+ chunksize = 100

process_dataset_chunks(
- input_dataset=input_dataset, output_dataset=output_dataset, func=compute_entities_df, chunksize=100
+ input_dataset=input_dataset, output_dataset=output_dataset, func=compute_entities_df, chunksize=chunksize
)
5 changes: 2 additions & 3 deletions python-lib/ner_utils_spacy.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,7 @@ def get_spacy_model(language: str):
raise ValueError(f"The language {language} is not available. \
You can add the language & corresponding model name by editing the code.")
try:
- nlp = spacy.load(language_model)
+ nlp = spacy.load(language_model, exclude=["tok2vec", "tagger", "parser", "attribute_ruler", "lemmatizer"])
except OSError:
# Raising ValueError instead of OSError so it shows up at the top of the log
raise ValueError(f"Could not find spaCy model for the language {language}. \
Expand All @@ -32,8 +32,7 @@ def get_spacy_model(language: str):
def extract_entities(text_column, format: bool, language: str):
# Tag sentences
nlp = get_spacy_model(language=language)
- docs = nlp.pipe(text_column.values)
-
+ docs = nlp.pipe(text_column.values, n_process=-1, batch_size=100)
# Extract entities
rows = []
for doc in docs:
Expand Down