PromptNER based Chain-of-Thought prompting for span tasks #180

Merged
merged 108 commits into from
Aug 25, 2023
Changes from 15 commits
Commits
108 commits
a175d36
initial POC for Chain of Thought NER task
Jun 15, 2023
820337b
ruff fix
Jun 15, 2023
ea261aa
Merge branch 'main' of ssh://github.com/explosion/spacy-llm into kab/…
Jun 20, 2023
6fa0b6a
Merge branch 'develop' of ssh://github.com/explosion/spacy-llm into k…
Jul 4, 2023
4c562d9
update template
vinbo8 Jul 5, 2023
54a9eae
consolidate approach to work with main SpanTask
Jul 5, 2023
8737f4a
Merge branch 'develop' of ssh://github.com/explosion/spacy-llm into k…
Jul 5, 2023
7a1fbdb
fix tests around label consistency checks
Jul 5, 2023
2921ca9
Merge branch 'main' into kab/cot-ner
vinbo8 Jul 6, 2023
2a0b1c8
merge kab/cot-ner-integrate
vinbo8 Jul 6, 2023
4772cb2
fix edge cases
vinbo8 Jul 6, 2023
d95eaac
merge develop
vinbo8 Jul 6, 2023
57a71b3
update label consistency checks
vinbo8 Jul 6, 2023
6335b3b
move label consistency checks
vinbo8 Jul 6, 2023
c565256
handle labels in span.py
vinbo8 Jul 6, 2023
54f28d2
cleanup older NER
vinbo8 Jul 17, 2023
aa25bba
fixes
vinbo8 Jul 17, 2023
f52feb5
cleanup
vinbo8 Jul 17, 2023
19d55cc
update NER template with label_definitions + initial description, fix…
Jul 21, 2023
6edea90
properly parametrize response parsing for SpanReason test
Jul 21, 2023
9f08287
start to parametrize NER tests properly with new v3 template
Jul 21, 2023
e4f8195
fix docstring
Jul 21, 2023
cb64f6a
rm single_match since it's always true now
Jul 21, 2023
a8c88bf
rm single_match since it's always true now
Jul 21, 2023
3e34add
fix typing of description and default properly for SpanCatTask
Jul 21, 2023
c40b897
fix test
Jul 21, 2023
fc3455c
fix NER tests
Jul 24, 2023
36579f4
fix more ner tests + add initial test for SpanReason.from_str
Jul 24, 2023
e0f2ef0
fix ner to_disk test
Jul 25, 2023
5a4b681
enable adding ner prompt examples from initialize and fix ner_init test
Jul 25, 2023
18e5e10
test fixes
Jul 25, 2023
eddac8c
add yaml/jsonl version of ner examples. Fix inconsistent labels tests
Jul 25, 2023
1299787
use yaml/jsonl versions of ner examples
Jul 25, 2023
b0a076d
actually check scoring with real LLM call
Jul 25, 2023
b432a50
rename format_response to extract_span_reasons
Jul 25, 2023
ee64727
move Self to compat types
Jul 25, 2023
01e1de9
fix test for serde
Jul 26, 2023
8c31b05
Self only in 3.10+
Jul 27, 2023
6235091
Self only in 3.11+ actually
Jul 27, 2023
66c7377
ner test fixes
Jul 27, 2023
8e5952c
convert spancat to new span task format
Jul 27, 2023
3fa3fbe
add better doc for SpanReason.to_str
Jul 27, 2023
7beaf93
fixing tests for spancat
Jul 27, 2023
0e162d6
adjust span matching by adding a setting
Jul 31, 2023
6894a80
support conditional allow_overlap like standard spancat
Jul 31, 2023
bb261bd
remove dict | operator that only works in python3.9 +
Jul 31, 2023
dfa96c0
disable test for now so CI passes
Jul 31, 2023
1700849
revert spanreason start_char
Jul 31, 2023
ae3a545
fix spancat template rendering for allow_overlap
Jul 31, 2023
850e8ee
clean up tests for init with spacy examples
Jul 31, 2023
7242d6e
fix spancat test?
Jul 31, 2023
150d21e
make spancat scoring use external model, not weird dummy data
Jul 31, 2023
abd52b8
run case sensitive matching then fallback to case insensitive if the …
Jul 31, 2023
bde5c8d
rm prev_span reference in parsing
Jul 31, 2023
c86b0e9
separate span parsing for a single doc into its own function
Jul 31, 2023
91bffaf
fix typing on the regression test
Jul 31, 2023
d02b425
add description field to cfg_keys so it gets serialized
Jul 31, 2023
17a2a9a
add old spancat/ner versions to tasks.legacy module
Aug 1, 2023
16f5d04
add deprecation warnings + test deprecation warnings
Aug 1, 2023
7cab77f
update usage examples
Aug 1, 2023
b1a4adf
update examples and readme
Aug 1, 2023
8b40100
Merge branch 'main' of ssh://github.com/explosion/spacy-llm into kab/…
Aug 1, 2023
574c9fe
Merge branch 'kab/cot-ner' of ssh://github.com/explosion/spacy-llm in…
Aug 1, 2023
3ab56e5
fix usage_examples
Aug 1, 2023
46802bb
Merge branch 'kab/cot-ner' of ssh://github.com/explosion/spacy-llm in…
Aug 1, 2023
629dc3d
Merge pull request #239 from explosion/kab/cot-ner-legacy
Aug 2, 2023
12093bf
Merge branch 'develop' of ssh://github.com/explosion/spacy-llm into k…
Aug 2, 2023
f824a8e
Merge branch 'kab/cot-ner' of ssh://github.com/explosion/spacy-llm in…
Aug 2, 2023
09533e0
rename warning to LLMW001, fix usage_example + readme tests
Aug 2, 2023
a6888d4
remove separate case sensitive match step before doing case insensiti…
Aug 2, 2023
d43c840
fix regression test to have 3 ents
Aug 2, 2023
b42e849
fix incremental parsing
Aug 2, 2023
47c8a75
rm extra docstring stuff
Aug 2, 2023
edbc17d
rm extra test
Aug 3, 2023
990edf3
consolidate new spans template and ensure valid labels appear in the …
Aug 4, 2023
f77b0cd
make prompt_examples required since it's required in confection facto…
Aug 4, 2023
e87f1a4
fix template rendering tests with new optional description and defaul…
Aug 4, 2023
03ab495
Merge branch 'develop' of ssh://github.com/explosion/spacy-llm into k…
Aug 8, 2023
d1b1410
Remove extra deprecation warning
Aug 8, 2023
208dbd6
Update usage_examples/ner_v3_openai/README.md
Aug 8, 2023
20be25a
Sync with new task structure.
rmitsch Aug 21, 2023
e6f63f5
Fix 3.6 Protocol import.
rmitsch Aug 21, 2023
af04a26
Update ignored warnings.
rmitsch Aug 21, 2023
8a95115
Fix filterwarnings.
rmitsch Aug 21, 2023
80fc6c8
Update filterwarnings.
rmitsch Aug 21, 2023
74a8486
Renamed examples to prompt_examples.
rmitsch Aug 21, 2023
61a6ab7
Add default example if none are provided to COT NER/SpanCat tasks (#270)
Aug 24, 2023
fa4e879
Test Pydantic Mac OS Py 3.8 issue.
rmitsch Aug 24, 2023
5903a03
Incorporate feedback. Readd Pydantic REL example workaround.
rmitsch Aug 24, 2023
69f5111
Readd NER Dolly usage example, removed TextCat Dolly one.
rmitsch Aug 24, 2023
4706aa7
Update NER Dolly usage example to use NER.v3.
rmitsch Aug 24, 2023
134da76
Readd NER Dolly test. Revert to NER.v2. Refactor span extraction for …
rmitsch Aug 24, 2023
7d27d0e
Fix span reason extraction.
rmitsch Aug 24, 2023
2979ffc
Add working Paris-Paris-Paris example.
rmitsch Aug 24, 2023
398652a
Uncomment example for NER prediction test.
rmitsch Aug 24, 2023
b4ad4d8
Fix NER prediction test.
rmitsch Aug 24, 2023
66c4fae
remove errors class entirely
svlandeg Aug 24, 2023
88a2482
Merge branch 'kab/cot-ner' of github.com:explosion/spacy-llm into kab…
rmitsch Aug 25, 2023
2664a47
Update .github/workflows/test.yml
rmitsch Aug 25, 2023
ee7fcdd
Update spacy_llm/tests/tasks/test_ner.py
rmitsch Aug 25, 2023
b64bf90
Remove overlap part in NER template.
rmitsch Aug 25, 2023
f895343
Merge branch 'kab/cot-ner' of github.com:explosion/spacy-llm into kab…
rmitsch Aug 25, 2023
c763689
Remove overlap path in NER and SpanCat templates.
rmitsch Aug 25, 2023
d2f2a07
Update spacy_llm/tasks/spancat/registry.py
rmitsch Aug 25, 2023
48bbd3c
Update spacy_llm/tasks/ner/registry.py
rmitsch Aug 25, 2023
68c0a55
Merge branch 'develop' into kab/cot-ner
rmitsch Aug 25, 2023
1c269e0
Changed SpanCat prompt intro.
rmitsch Aug 25, 2023
3a31530
Add docstring info for description.
rmitsch Aug 25, 2023
156 changes: 152 additions & 4 deletions spacy_llm/tasks/ner.py
@@ -1,5 +1,6 @@
import warnings
from collections import defaultdict
from typing import Any, Callable, Dict, Iterable, List, Optional, Union
from typing import Any, Callable, Dict, Iterable, List, Optional, Tuple, Union

from spacy.language import Language
from spacy.scorer import get_ner_prf
@@ -9,13 +10,14 @@

from ..compat import Literal
from ..registry import registry
from ..ty import ExamplesConfigType
from ..ty import COTExamplesConfigType, ExamplesConfigType
from ..util import split_labels
from .span import SpanExample, SpanTask
from .span import COTSpanExample, SpanExample, SpanReason, SpanTask
from .templates import read_template

_DEFAULT_NER_TEMPLATE_V1 = read_template("ner")
_DEFAULT_NER_TEMPLATE_V2 = read_template("ner.v2")
_DEFAULT_NER_TEMPLATE_V3 = read_template("ner.v3")


@registry.llm_tasks("spacy.NER.v1")
@@ -107,7 +109,55 @@ def make_ner_task_v2(
)


class NERTask(SpanTask):
@registry.llm_tasks("spacy.NER.v3")
def make_ner_task_v3(
examples: COTExamplesConfigType,
description: str,
labels: Union[List[str], str] = [],
template: str = _DEFAULT_NER_TEMPLATE_V3,
label_definitions: Optional[Dict[str, str]] = None,
normalizer: Optional[Callable[[str], str]] = None,
alignment_mode: Literal["strict", "contract", "expand"] = "contract",
case_sensitive_matching: bool = False,
single_match: bool = False,
):
"""NER.v3 task factory.

examples (Union[Callable[[], Iterable[COTS]]]): Optional callable that
reads a file containing task examples for few-shot learning. If None is
passed, then zero-shot learning will be used.
labels (Union[str, List[str]]): List of labels to pass to the template,
either an actual list or a comma-separated string.
Leave empty to populate it at initialization time (only if examples are provided).
template (str): Prompt template passed to the model.
label_definitions (Optional[Dict[str, str]]): Map of label -> description
of the label to help the language model output the entities wanted.
It is usually easier to provide these definitions rather than
full examples, although both can be provided.
normalizer (Optional[Callable[[str], str]]): optional normalizer function.
alignment_mode (str): "strict", "contract" or "expand".
case_sensitive_matching (bool): Whether to search with case sensitivity.
single_match (bool): If False, allow one substring to match multiple times in
the text. If True, returns the first hit.
"""
labels_list = split_labels(labels)
raw_examples = examples() if callable(examples) else examples
span_examples = [COTSpanExample(**eg) for eg in raw_examples]

return COTNERTask(
labels=labels_list,
template=template,
description=description,
label_definitions=label_definitions,
prompt_examples=span_examples,
normalizer=normalizer,
alignment_mode=alignment_mode,
case_sensitive_matching=case_sensitive_matching,
single_match=single_match,
)


class NERTask(SpanTask[SpanExample]):
def __init__(
self,
labels: List[str] = [],
@@ -203,10 +253,108 @@ def scorer(
) -> Dict[str, Any]:
return get_ner_prf(examples)

@property
def _Example(self) -> type[SpanExample]:
return SpanExample

def _create_prompt_example(self, example: Example) -> SpanExample:
"""Create an NER prompt example from a spaCy example."""
entities = defaultdict(list)
for ent in example.reference.ents:
entities[ent.label_].append(ent.text)

return SpanExample(text=example.reference.text, entities=entities)


class COTNERTask(SpanTask[COTSpanExample]):
def __init__(
self,
labels: List[str],
template: str,
description: Optional[str] = None,
prompt_examples: Optional[List[COTSpanExample]] = None,
label_definitions: Optional[Dict[str, str]] = None,
normalizer: Optional[Callable[[str], str]] = None,
alignment_mode: Literal["strict", "contract", "expand"] = "contract",
case_sensitive_matching: bool = False,
single_match: bool = False,
):
super().__init__(
labels=labels,
template=template,
description=description,
label_definitions=label_definitions,
prompt_examples=prompt_examples,
normalizer=normalizer,
alignment_mode=alignment_mode,
case_sensitive_matching=case_sensitive_matching,
single_match=single_match,
)

def initialize(
self,
get_examples: Callable[[], Iterable["Example"]],
nlp: Language,
labels: List[str] = [],
**kwargs: Any,
) -> None:
"""Initialize the NER task by auto-discovering labels.

Labels can be set, in order of precedence, through:

- the `[initialize]` section of the pipeline configuration
- the `labels` argument supplied to the task factory
- the labels found in the examples

get_examples (Callable[[], Iterable["Example"]]): Callable that provides examples
for initialization.
nlp (Language): Language instance.
labels (List[str]): Optional list of labels.
"""

examples = get_examples()

if not labels:
labels = list(self._label_dict.values())

if not labels:
label_set = set()

for eg in examples:
for ent in eg.reference.ents:
label_set.add(ent.label_)
labels = list(label_set)

self._label_dict = {self._normalizer(label): label for label in labels}

def _format_response(self, response: str) -> Iterable[Tuple[str, Iterable[str]]]:
"""Parse raw string response into a structured format"""
output: Dict[str, List[str]] = defaultdict(list)
assert self._normalizer is not None
for line in response.strip().split("\n"):
entity = SpanReason.from_str(line)
if entity:
norm_label = self._normalizer(entity.label)
if norm_label not in self._label_dict:
continue
label = self._label_dict[norm_label]
output[label].append(entity.text)
return output.items()

def assign_spans(
self,
doc: Doc,
spans: List[Span],
) -> None:
"""Assign spans to the document."""
doc.set_ents(filter_spans(spans))

def scorer(
self,
examples: Iterable[Example],
) -> Dict[str, Any]:
return get_ner_prf(examples)

@property
def _Example(self) -> type[COTSpanExample]:
return COTSpanExample
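
The `_format_response` method above is the heart of the new parsing path: each response line is stripped of its enumeration prefix, split on `|`, and grouped under the canonical label. A dependency-free sketch of that logic (simplified names; not the actual spacy-llm code, which delegates the line parsing to `SpanReason.from_str`):

```python
from collections import defaultdict

def parse_cot_response(response, label_dict, normalizer=str.lower):
    """Group entity texts by canonical label, dropping undeclared labels."""
    output = defaultdict(list)
    for line in response.strip().split("\n"):
        # Strip the leading "1." enumeration, then split on the separator.
        clean = line.strip()
        if "." in clean:
            clean = clean.split(".", maxsplit=1)[1]
        parts = [p.strip() for p in clean.split("|")]
        if len(parts) != 4:
            continue  # malformed line: skip rather than fail
        text, is_entity, label, _reason = parts
        if is_entity.lower() != "true":
            continue  # the model decided this candidate is not an entity
        norm = normalizer(label)
        if norm not in label_dict:
            continue  # label was never declared for this task
        output[label_dict[norm]].append(text)
    return dict(output)

response = """\
1. Paris | True | LOC | is the name of a city
2. talked | False | ==NONE== | is a verb, not an entity"""
print(parse_cot_response(response, {"loc": "LOC"}))
# → {'LOC': ['Paris']}
```

Note how the `is_entity` flag lets the chain-of-thought prompt keep rejected candidates in the transcript without polluting the extracted spans.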
106 changes: 82 additions & 24 deletions spacy_llm/tasks/span.py
@@ -1,5 +1,6 @@
import typing
import warnings
from typing import Callable, Dict, Iterable, List, Optional, Tuple, Type
from typing import Callable, Dict, Iterable, List, Optional, Tuple, TypeVar, Type

import jinja2
from pydantic import BaseModel
Expand All @@ -11,20 +12,56 @@
from .util.serialization import SerializableTask


class SpanReason(BaseModel):
text: str
is_entity: bool
label: str
reason: str

@classmethod
def from_str(cls, s: str, sep: str = "|"):
clean_str = s.strip()
if "." in clean_str:
clean_str = clean_str.split(".", maxsplit=1)[1]
components = [c.strip() for c in clean_str.split(sep)]
if len(components) == 4:
return cls(
text=components[0],
is_entity=components[1].lower() == "true",
label=components[2],
reason=components[3],
)

def __str__(self) -> str:
return self.to_str()

def to_str(self) -> str:
return f"{self.text} | {self.is_entity} | {self.label} | {self.reason}"


class SpanExample(BaseModel):
text: str
entities: Dict[str, List[str]]


class SpanTask(SerializableTask[SpanExample]):
class COTSpanExample(BaseModel):
text: str
entities: List[SpanReason]


_PromptExampleT = TypeVar("_PromptExampleT", SpanExample, COTSpanExample)


class SpanTask(SerializableTask[_PromptExampleT]):
"""Base class for Span-related tasks, eg NER and SpanCat."""

def __init__(
self,
labels: List[str],
template: str,
label_definitions: Optional[Dict[str, str]] = {},
prompt_examples: Optional[List[SpanExample]] = None,
description: Optional[str] = None,
label_definitions: Optional[Dict[str, str]] = None,
prompt_examples: Optional[List[_PromptExampleT]] = None,
normalizer: Optional[Callable[[str], str]] = None,
alignment_mode: Literal[
"strict", "contract", "expand" # noqa: F821
@@ -37,6 +74,7 @@ def __init__(
self._normalizer(label): label for label in sorted(set(labels))
}
self._template = template
self._description = description
self._label_definitions = label_definitions
self._prompt_examples = prompt_examples or []
self._validate_alignment(alignment_mode)
@@ -47,16 +85,33 @@ def __init__(
if self._prompt_examples:
self._prompt_examples = self._check_label_consistency()

def _check_label_consistency(self) -> List[SpanExample]:
@property
def labels(self) -> Tuple[str, ...]:
return tuple(self._label_dict.values())

@property
def prompt_template(self) -> str:
return self._template

def _check_label_consistency(self) -> List[_PromptExampleT]:
"""Checks consistency of labels between examples and defined labels. Emits warning on inconsistency.
RETURNS (List[_PromptExampleT]): List of examples with valid labels.
"""
assert self._prompt_examples
example_labels = {
self._normalizer(key): key
for example in self._prompt_examples
for key in example.entities
}
if isinstance(self._prompt_examples[0], SpanExample):
example_labels = {
self._normalizer(key): key
for example in self._prompt_examples
for key in example.entities
}
else:
example_labels = {
self._normalizer(key.label): key.label
for example in self._prompt_examples
for key in example.entities
if key.is_entity
}

unspecified_labels = {
example_labels[key]
for key in (set(example_labels.keys()) - set(self._label_dict.keys()))
@@ -70,36 +125,39 @@ def _check_label_consistency(
)

# Return examples without non-declared labels. If an example only has undeclared labels, it is discarded.
return [
example
for example in [
SpanExample(
examples = []
for example in self._prompt_examples:
if isinstance(self._prompt_examples[0], SpanExample):
span_example = SpanExample(
text=example.text,
entities={
label: entities
for label, entities in example.entities.items()
if self._normalizer(label) in self._label_dict
},
)
for example in self._prompt_examples
]
if len(example.entities)
]
else:
span_example = COTSpanExample(
text=example.text,
entities=[
entity
for entity in example.entities
if self._normalizer(entity.label) in self._label_dict
],
)

@property
def labels(self) -> Tuple[str, ...]:
return tuple(self._label_dict.values())
if len(span_example.entities):
examples.append(span_example)

@property
def prompt_template(self) -> str:
return self._template
return examples

def generate_prompts(self, docs: Iterable[Doc]) -> Iterable[str]:
environment = jinja2.Environment()
_template = environment.from_string(self._template)
for doc in docs:
prompt = _template.render(
text=doc.text,
description=self._description,
labels=list(self._label_dict.values()),
label_definitions=self._label_definitions,
examples=self._prompt_examples,
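
The `SpanReason` model added in span.py round-trips between the pipe-separated line format used in prompts and a structured record. A dependency-free sketch of that round trip (a dataclass stands in for the pydantic `BaseModel`, so validation is omitted):

```python
from dataclasses import dataclass
from typing import Optional

@dataclass
class SpanReasonSketch:
    # Mirrors the fields of the pydantic SpanReason model.
    text: str
    is_entity: bool
    label: str
    reason: str

    @classmethod
    def from_str(cls, s: str, sep: str = "|") -> Optional["SpanReasonSketch"]:
        clean = s.strip()
        if "." in clean:
            # Drop the "3." enumeration prefix the LLM emits per line.
            clean = clean.split(".", maxsplit=1)[1]
        parts = [c.strip() for c in clean.split(sep)]
        if len(parts) == 4:
            return cls(parts[0], parts[1].lower() == "true", parts[2], parts[3])
        return None  # malformed lines are skipped by the caller

    def to_str(self) -> str:
        return f"{self.text} | {self.is_entity} | {self.label} | {self.reason}"

span = SpanReasonSketch.from_str("1. Loch Ness | True | LOC | is a lake in Scotland")
print(span.to_str())
# → Loch Ness | True | LOC | is a lake in Scotland
```

Returning `None` for malformed lines (rather than raising) is what lets `_format_response` tolerate the occasional off-format line from the LLM.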
4 changes: 4 additions & 0 deletions spacy_llm/tasks/spancat.py
@@ -218,6 +218,10 @@ def _cfg_keys(self) -> List[str]:
"_single_match",
]

@property
def _Example(self) -> type[SpanExample]:
return SpanExample

def _create_prompt_example(self, example: Example) -> SpanExample:
"""Create a spancat prompt example from a spaCy example."""
entities = defaultdict(list)
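
The `_create_prompt_example` hook added here converts a gold-annotated spaCy `Example` into the label-to-texts mapping that `SpanExample` expects. A minimal sketch of that grouping step, with a plain list of `(text, label)` pairs standing in for `example.reference.ents`:

```python
from collections import defaultdict

def create_prompt_example(text, ents):
    """ents: list of (entity_text, label) pairs from the reference doc."""
    entities = defaultdict(list)
    for ent_text, label in ents:
        # Group surface forms under their label, preserving document order.
        entities[label].append(ent_text)
    return {"text": text, "entities": dict(entities)}

eg = create_prompt_example(
    "Sriracha sauce goes really well with hoisin",
    [("Sriracha sauce", "INGREDIENT"), ("hoisin", "INGREDIENT")],
)
print(eg["entities"])
# → {'INGREDIENT': ['Sriracha sauce', 'hoisin']}
```

This is what makes `initialize`-time few-shot bootstrapping (commit 5a4b681) possible: any corpus of annotated `Example`s can be folded into prompt examples.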
19 changes: 19 additions & 0 deletions spacy_llm/tasks/templates/ner.v3.jinja
@@ -0,0 +1,19 @@
{{ description }}
{# whitespace #}
{# whitespace #}
Q: Given the paragraph below, identify a list of possible entities, and for each entry explain why it is or is not an entity:
{# whitespace #}
{# whitespace #}
{%- for example in examples -%}
Paragraph: {{ example.text }}
Answer:
{# whitespace #}
{%- for span in example.entities -%}
{{ loop.index }}. {{ span.to_str() }}
{# whitespace #}
{%- endfor -%}
{# whitespace #}
{# whitespace #}
{%- endfor -%}
Paragraph: {{ text }}
Answer:
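
The `ner.v3.jinja` template above alternates few-shot paragraphs with enumerated `SpanReason` lines before appending the target text. A rough plain-Python rendering of the same layout (no jinja2; the structure is approximated from the template, and the description and example data are placeholders):

```python
def render_ner_v3_prompt(description, examples, text):
    """examples: list of (paragraph, [span_reason_str, ...]) pairs."""
    lines = [description, ""]
    lines.append(
        "Q: Given the paragraph below, identify a list of possible entities, "
        "and for each entry explain why it is or is not an entity:"
    )
    lines.append("")
    for paragraph, spans in examples:
        lines.append(f"Paragraph: {paragraph}")
        lines.append("Answer:")
        # Each span line is enumerated, matching {{ loop.index }} in the template.
        for i, span in enumerate(spans, start=1):
            lines.append(f"{i}. {span}")
        lines.append("")
    # The target paragraph comes last, with a trailing "Answer:" for the LLM.
    lines.append(f"Paragraph: {text}")
    lines.append("Answer:")
    return "\n".join(lines)

prompt = render_ner_v3_prompt(
    "Entities are dishes, ingredients, and cooking equipment.",
    [("I like kimchi.", ["kimchi | True | INGREDIENT | is a pickled dish"])],
    "Sriracha goes well with hoisin.",
)
print(prompt)
```

Ending the prompt on `Answer:` is what steers the model into emitting the same enumerated pipe-separated lines the few-shot examples demonstrate, which `SpanReason.from_str` then parses back out.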