explosion · rmitsch · Aug 25, 2023 · Jun 15, 2023 · Jun 15, 2023 · Jun 20, 2023
diff --git a/spacy_llm/tasks/__init__.py b/spacy_llm/tasks/__init__.py
@@ -1,5 +1,5 @@
 from .lemma import LemmaTask, make_lemma_task
-from .ner import NERTask, make_ner_task, make_ner_task_v2
+from .ner import NERTask, make_ner_task_v3
 from .noop import NoopTask, make_noop_task
 from .rel import RELTask, make_rel_task
 from .sentiment import SentimentTask, make_sentiment_task
@@ -9,8 +9,7 @@
 
 __all__ = [
     "make_lemma_task",
-    "make_ner_task",
-    "make_ner_task_v2",
+    "make_ner_task_v3",
     "make_noop_task",
     "make_rel_task",
     "make_sentiment_task",

diff --git a/spacy_llm/tasks/ner.py b/spacy_llm/tasks/ner.py
@@ -1,5 +1,6 @@
+import warnings
 from collections import defaultdict
-from typing import Any, Callable, Dict, Iterable, List, Optional, Union
+from typing import Any, Callable, Dict, Iterable, List, Optional, Tuple, Union
 
 from spacy.language import Language
 from spacy.scorer import get_ner_prf
@@ -11,69 +12,31 @@
 from ..registry import registry
 from ..ty import ExamplesConfigType
 from ..util import split_labels
-from .span import SpanExample, SpanTask
+from .span import SpanExample, SpanReason, SpanTask
 from .templates import read_template
 
 _DEFAULT_NER_TEMPLATE_V1 = read_template("ner")
 _DEFAULT_NER_TEMPLATE_V2 = read_template("ner.v2")
+_DEFAULT_NER_TEMPLATE_V3 = read_template("ner.v3")
 
 
-@registry.llm_tasks("spacy.NER.v1")
-def make_ner_task(
-    labels: str = "",
-    examples: Optional[Callable[[], Iterable[Any]]] = None,
-    normalizer: Optional[Callable[[str], str]] = None,
-    alignment_mode: Literal["strict", "contract", "expand"] = "contract",
-    case_sensitive_matching: bool = False,
-    single_match: bool = False,
-):
-    """NER.v1 task factory.
-
-    labels (str): Comma-separated list of labels to pass to the template.
-        Leave empty to populate it at initialization time (only if examples are provided).
-    template (str): Prompt template passed to the model.
-    label_definitions (Optional[Dict[str, str]]): Map of label -> description
-        of the label to help the language model output the entities wanted.
-        It is usually easier to provide these definitions rather than
-        full examples, although both can be provided.
-    examples (Optional[Callable[[], Iterable[Any]]]): Optional callable that
-        reads a file containing task examples for few-shot learning. If None is
-        passed, then zero-shot learning will be used.
-    normalizer (Optional[Callable[[str], str]]): optional normalizer function.
-    alignment_mode (str): "strict", "contract" or "expand".
-    case_sensitive: Whether to search without case sensitivity.
-    single_match (bool): If False, allow one substring to match multiple times in
-        the text. If True, returns the first hit.
-    """
-    labels_list = split_labels(labels)
-
-    span_examples = (
-        [SpanExample(**eg) for eg in examples()] if callable(examples) else examples
-    )
-    return NERTask(
-        labels=labels_list,
-        template=_DEFAULT_NER_TEMPLATE_V1,
-        prompt_examples=span_examples,
-        normalizer=normalizer,
-        alignment_mode=alignment_mode,
-        case_sensitive_matching=case_sensitive_matching,
-        single_match=single_match,
-    )
-
-
-@registry.llm_tasks("spacy.NER.v2")
-def make_ner_task_v2(
+@registry.llm_tasks("spacy.NER.v3")
+def make_ner_task_v3(
+    examples: ExamplesConfigType,
     labels: Union[List[str], str] = [],
-    template: str = _DEFAULT_NER_TEMPLATE_V2,
+    template: str = _DEFAULT_NER_TEMPLATE_V3,
+    description: Optional[str] = None,
     label_definitions: Optional[Dict[str, str]] = None,
-    examples: ExamplesConfigType = None,
     normalizer: Optional[Callable[[str], str]] = None,
     alignment_mode: Literal["strict", "contract", "expand"] = "contract",
     case_sensitive_matching: bool = False,
     single_match: bool = False,
 ):
-    """NER.v2 task factory.
+    """NER.v3 task factory.
 
+    examples (ExamplesConfigType): Few-shot examples, or a callable returning
+        them; each one is unpacked into a SpanExample. Required for NER.v3:
+        passing None raises, since the examples are iterated unconditionally.
     labels (Union[str, List[str]]): List of labels to pass to the template,
         either an actual list or a comma-separated string.
         Leave empty to populate it at initialization time (only if examples are provided).
@@ -82,9 +45,6 @@ def make_ner_task_v2(
         of the label to help the language model output the entities wanted.
         It is usually easier to provide these definitions rather than
         full examples, although both can be provided.
-    examples (Optional[Callable[[], Iterable[Any]]]): Optional callable that
-        reads a file containing task examples for few-shot learning. If None is
-        passed, then zero-shot learning will be used.
     normalizer (Optional[Callable[[str], str]]): optional normalizer function.
     alignment_mode (str): "strict", "contract" or "expand".
     case_sensitive: Whether to search without case sensitivity.
@@ -93,11 +53,16 @@ def make_ner_task_v2(
     """
     labels_list = split_labels(labels)
     raw_examples = examples() if callable(examples) else examples
-    span_examples = [SpanExample(**eg) for eg in raw_examples] if raw_examples else None
+    span_examples = [SpanExample(**eg) for eg in raw_examples]
+    if not description:
+        description = (
+            f"Entities must take one of these labels: {', '.join(labels_list)}."
+        )
 
     return NERTask(
         labels=labels_list,
         template=template,
+        description=description,
         label_definitions=label_definitions,
         prompt_examples=span_examples,
         normalizer=normalizer,
@@ -110,36 +75,20 @@ def make_ner_task_v2(
 class NERTask(SpanTask):
     def __init__(
         self,
-        labels: List[str] = [],
-        template: str = _DEFAULT_NER_TEMPLATE_V2,
-        label_definitions: Optional[Dict[str, str]] = None,
+        labels: List[str],
+        template: str,
+        description: Optional[str] = None,
         prompt_examples: Optional[List[SpanExample]] = None,
+        label_definitions: Optional[Dict[str, str]] = None,
         normalizer: Optional[Callable[[str], str]] = None,
         alignment_mode: Literal["strict", "contract", "expand"] = "contract",
         case_sensitive_matching: bool = False,
         single_match: bool = False,
     ):
-        """Default NER task.
-
-        labels (List[str]): List of labels to pass to the template.
-            Leave empty to populate it at initialization time (only if examples are provided).
-        template (str): Prompt template passed to the model.
-        label_definitions (Optional[Dict[str, str]]): Map of label -> description
-            of the label to help the language model output the entities wanted.
-            It is usually easier to provide these definitions rather than
-            full examples, although both can be provided.
-        examples (Optional[Callable[[], Iterable[Any]]]): Optional callable that
-            reads a file containing task examples for few-shot learning. If None is
-            passed, then zero-shot learning will be used.
-        normalizer (Optional[Callable[[str], str]]): optional normalizer function.
-        alignment_mode (str): "strict", "contract" or "expand".
-        case_sensitive: Whether to search without case sensitivity.
-        single_match (bool): If False, allow one substring to match multiple times in
-            the text. If True, returns the first hit.
-        """
-        super().__init__(
+        super(NERTask, self).__init__(
             labels=labels,
             template=template,
+            description=description,
             label_definitions=label_definitions,
             prompt_examples=prompt_examples,
             normalizer=normalizer,
@@ -153,7 +102,6 @@ def initialize(
         get_examples: Callable[[], Iterable["Example"]],
         nlp: Language,
         labels: List[str] = [],
-        n_prompt_examples: int = 0,
         **kwargs: Any,
     ) -> None:
         """Initialize the NER task, by auto-discovering labels.
@@ -168,26 +116,22 @@ def initialize(
             for initialization.
         nlp (Language): Language instance.
         labels (List[str]): Optional list of labels.
-        n_prompt_examples (int): How many prompt examples to infer from the Example objects.
-            0 by default. Takes all examples if set to -1.
         """
+
+        examples = get_examples()
+
         if not labels:
             labels = list(self._label_dict.values())
-        infer_labels = not labels
 
-        if infer_labels:
-            labels = []
+        if not labels:
+            label_set = set()
 
-        for eg in get_examples():
-            if infer_labels:
+            for eg in examples:
                 for ent in eg.reference.ents:
-                    labels.append(ent.label_)
-            if n_prompt_examples < 0 or len(self._prompt_examples) < n_prompt_examples:
-                self._prompt_examples.append(self._create_prompt_example(eg))
+                    label_set.add(ent.label_)
+            labels = list(label_set)
 
-        self._label_dict = {
-            self._normalizer(label): label for label in sorted(set(labels))
-        }
+        self._label_dict = {self._normalizer(label): label for label in labels}
 
     def assign_spans(
         self,
@@ -202,11 +146,3 @@ def scorer(
         examples: Iterable[Example],
     ) -> Dict[str, Any]:
         return get_ner_prf(examples)
-
-    def _create_prompt_example(self, example: Example) -> SpanExample:
-        """Create an NER prompt example from a spaCy example."""
-        entities = defaultdict(list)
-        for ent in example.reference.ents:
-            entities[ent.label_].append(ent.text)
-
-        return SpanExample(text=example.reference.text, entities=entities)