Add scorer option to return per-component scores (#12540)

* Add scorer option to return per-component scores

  Add `per_component` option to `Language.evaluate` and `Scorer.score` to return scores keyed by `tokenizer` (hard-coded) or by component name.

  Add option to `evaluate` CLI to score by component. Per-component scores can only be saved to JSON.

* Update help text and messages
explosion · May 12, 2023 · 3637148 · 3637148
1 parent 88680a6
commit 3637148
Show file tree

Hide file tree

Showing 8 changed files with 111 additions and 67 deletions.
diff --git a/spacy/cli/evaluate.py b/spacy/cli/evaluate.py
@@ -27,6 +27,7 @@ def evaluate_cli(
     gold_preproc: bool = Opt(False, "--gold-preproc", "-G", help="Use gold preprocessing"),
     displacy_path: Optional[Path] = Opt(None, "--displacy-path", "-dp", help="Directory to output rendered parses as HTML", exists=True, file_okay=False),
     displacy_limit: int = Opt(25, "--displacy-limit", "-dl", help="Limit of parses to render as HTML"),
+    per_component: bool = Opt(False, "--per-component", "-P", help="Return scores per component, only applicable when an output JSON file is specified."),
     # fmt: on
 ):
     """
@@ -50,6 +51,7 @@ def evaluate_cli(
         gold_preproc=gold_preproc,
         displacy_path=displacy_path,
         displacy_limit=displacy_limit,
+        per_component=per_component,
         silent=False,
     )
 
@@ -64,6 +66,7 @@ def evaluate(
     displacy_limit: int = 25,
     silent: bool = True,
     spans_key: str = "sc",
+    per_component: bool = False,
 ) -> Dict[str, Any]:
     msg = Printer(no_print=silent, pretty=not silent)
     fix_random_seed()
@@ -78,44 +81,53 @@ def evaluate(
     corpus = Corpus(data_path, gold_preproc=gold_preproc)
     nlp = util.load_model(model)
     dev_dataset = list(corpus(nlp))
-    scores = nlp.evaluate(dev_dataset)
-    metrics = {
-        "TOK": "token_acc",
-        "TAG": "tag_acc",
-        "POS": "pos_acc",
-        "MORPH": "morph_acc",
-        "LEMMA": "lemma_acc",
-        "UAS": "dep_uas",
-        "LAS": "dep_las",
-        "NER P": "ents_p",
-        "NER R": "ents_r",
-        "NER F": "ents_f",
-        "TEXTCAT": "cats_score",
-        "SENT P": "sents_p",
-        "SENT R": "sents_r",
-        "SENT F": "sents_f",
-        "SPAN P": f"spans_{spans_key}_p",
-        "SPAN R": f"spans_{spans_key}_r",
-        "SPAN F": f"spans_{spans_key}_f",
-        "SPEED": "speed",
-    }
-    results = {}
-    data = {}
-    for metric, key in metrics.items():
-        if key in scores:
-            if key == "cats_score":
-                metric = metric + " (" + scores.get("cats_score_desc", "unk") + ")"
-            if isinstance(scores[key], (int, float)):
-                if key == "speed":
-                    results[metric] = f"{scores[key]:.0f}"
+    scores = nlp.evaluate(dev_dataset, per_component=per_component)
+    if per_component:
+        data = scores
+        if output is None:
+            msg.warn(
+                "The per-component option is enabled but there is no output JSON file provided to save the scores to."
+            )
+        else:
+            msg.info("Per-component scores will be saved to output JSON file.")
+    else:
+        metrics = {
+            "TOK": "token_acc",
+            "TAG": "tag_acc",
+            "POS": "pos_acc",
+            "MORPH": "morph_acc",
+            "LEMMA": "lemma_acc",
+            "UAS": "dep_uas",
+            "LAS": "dep_las",
+            "NER P": "ents_p",
+            "NER R": "ents_r",
+            "NER F": "ents_f",
+            "TEXTCAT": "cats_score",
+            "SENT P": "sents_p",
+            "SENT R": "sents_r",
+            "SENT F": "sents_f",
+            "SPAN P": f"spans_{spans_key}_p",
+            "SPAN R": f"spans_{spans_key}_r",
+            "SPAN F": f"spans_{spans_key}_f",
+            "SPEED": "speed",
+        }
+        results = {}
+        data = {}
+        for metric, key in metrics.items():
+            if key in scores:
+                if key == "cats_score":
+                    metric = metric + " (" + scores.get("cats_score_desc", "unk") + ")"
+                if isinstance(scores[key], (int, float)):
+                    if key == "speed":
+                        results[metric] = f"{scores[key]:.0f}"
+                    else:
+                        results[metric] = f"{scores[key]*100:.2f}"
                 else:
-                    results[metric] = f"{scores[key]*100:.2f}"
-            else:
-                results[metric] = "-"
-            data[re.sub(r"[\s/]", "_", key.lower())] = scores[key]
+                    results[metric] = "-"
+                data[re.sub(r"[\s/]", "_", key.lower())] = scores[key]
 
-    msg.table(results, title="Results")
-    data = handle_scores_per_type(scores, data, spans_key=spans_key, silent=silent)
+        msg.table(results, title="Results")
+        data = handle_scores_per_type(scores, data, spans_key=spans_key, silent=silent)
 
     if displacy_path:
         factory_names = [nlp.get_pipe_meta(pipe).factory for pipe in nlp.pipe_names]

diff --git a/spacy/language.py b/spacy/language.py
@@ -1372,6 +1372,7 @@ def evaluate(
         scorer: Optional[Scorer] = None,
         component_cfg: Optional[Dict[str, Dict[str, Any]]] = None,
         scorer_cfg: Optional[Dict[str, Any]] = None,
+        per_component: bool = False,
     ) -> Dict[str, Any]:
         """Evaluate a model's pipeline components.
 
@@ -1383,6 +1384,8 @@ def evaluate(
             arguments for specific components.
         scorer_cfg (dict): An optional dictionary with extra keyword arguments
             for the scorer.
+        per_component (bool): Whether to return the scores keyed by component
+            name. Defaults to False.
 
         RETURNS (Dict[str, Any]): A dictionary of evaluation scores.
 
@@ -1415,7 +1418,7 @@ def evaluate(
         for eg, doc in zip(examples, docs):
             eg.predicted = doc
         end_time = timer()
-        results = scorer.score(examples)
+        results = scorer.score(examples, per_component=per_component)
         n_words = sum(len(eg.predicted) for eg in examples)
         results["speed"] = n_words / (end_time - start_time)
         return results

diff --git a/spacy/scorer.py b/spacy/scorer.py
@@ -121,20 +121,30 @@ def __init__(
                 nlp.add_pipe(pipe)
             self.nlp = nlp
 
-    def score(self, examples: Iterable[Example]) -> Dict[str, Any]:
+    def score(
+        self, examples: Iterable[Example], *, per_component: bool = False
+    ) -> Dict[str, Any]:
         """Evaluate a list of Examples.
 
         examples (Iterable[Example]): The predicted annotations + correct annotations.
+        per_component (bool): Whether to return the scores keyed by component
+            name. Defaults to False.
         RETURNS (Dict): A dictionary of scores.
 
         DOCS: https://spacy.io/api/scorer#score
         """
         scores = {}
         if hasattr(self.nlp.tokenizer, "score"):
-            scores.update(self.nlp.tokenizer.score(examples, **self.cfg))  # type: ignore
+            if per_component:
+                scores["tokenizer"] = self.nlp.tokenizer.score(examples, **self.cfg)
+            else:
+                scores.update(self.nlp.tokenizer.score(examples, **self.cfg))  # type: ignore
         for name, component in self.nlp.pipeline:
             if hasattr(component, "score"):
-                scores.update(component.score(examples, **self.cfg))
+                if per_component:
+                    scores[name] = component.score(examples, **self.cfg)
+                else:
+                    scores.update(component.score(examples, **self.cfg))
         return scores
 
     @staticmethod

diff --git a/spacy/tests/test_scorer.py b/spacy/tests/test_scorer.py
@@ -115,6 +115,14 @@ def test_tokenization(sented_doc):
     assert scores["token_r"] == approx(0.33333333)
     assert scores["token_f"] == 0.4
 
+    # per-component scoring
+    scorer = Scorer()
+    scores = scorer.score([example], per_component=True)
+    assert scores["tokenizer"]["token_acc"] == 0.5
+    assert scores["tokenizer"]["token_p"] == 0.5
+    assert scores["tokenizer"]["token_r"] == approx(0.33333333)
+    assert scores["tokenizer"]["token_f"] == 0.4
+
 
 def test_sents(sented_doc):
     scorer = Scorer()
@@ -278,6 +286,13 @@ def test_tag_score(tagged_doc):
     assert results["morph_per_feat"]["Poss"]["f"] == 0.0
     assert results["morph_per_feat"]["Number"]["f"] == approx(0.72727272)
 
+    # per-component scoring
+    scorer = Scorer()
+    results = scorer.score([example], per_component=True)
+    assert results["tagger"]["tag_acc"] == 0.9
+    assert results["morphologizer"]["pos_acc"] == 0.9
+    assert results["morphologizer"]["morph_acc"] == approx(0.8)
+
 
 def test_partial_annotation(en_tokenizer):
     pred_doc = en_tokenizer("a b c d e")

diff --git a/website/docs/api/cli.mdx b/website/docs/api/cli.mdx
@@ -1163,18 +1163,19 @@ skew. To render a sample of dependency parses in a HTML file using the
 $ python -m spacy benchmark accuracy [model] [data_path] [--output] [--code] [--gold-preproc] [--gpu-id] [--displacy-path] [--displacy-limit] [--per-component]
 ```
 
-| Name                                      | Description                                                                                                                                                                          |
-| ----------------------------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ |
-| `model`                                   | Pipeline to evaluate. Can be a package or a path to a data directory. ~~str (positional)~~                                                                                           |
-| `data_path`                               | Location of evaluation data in spaCy's [binary format](/api/data-formats#training). ~~Path (positional)~~                                                                            |
-| `--output`, `-o`                          | Output JSON file for metrics. If not set, no metrics will be exported. ~~Optional[Path] \(option)~~                                                                                  |
-| `--code`, `-c` <Tag variant="new">3</Tag> | Path to Python file with additional code to be imported. Allows [registering custom functions](/usage/training#custom-functions) for new architectures. ~~Optional[Path] \(option)~~ |
-| `--gold-preproc`, `-G`                    | Use gold preprocessing. ~~bool (flag)~~                                                                                                                                              |
-| `--gpu-id`, `-g`                          | GPU to use, if any. Defaults to `-1` for CPU. ~~int (option)~~                                                                                                                       |
-| `--displacy-path`, `-dp`                  | Directory to output rendered parses as HTML. If not set, no visualizations will be generated. ~~Optional[Path] \(option)~~                                                           |
-| `--displacy-limit`, `-dl`                 | Number of parses to generate per file. Defaults to `25`. Keep in mind that a significantly higher number might cause the `.html` files to render slowly. ~~int (option)~~            |
-| `--help`, `-h`                            | Show help message and available arguments. ~~bool (flag)~~                                                                                                                           |
-| **CREATES**                               | Training results and optional metrics and visualizations.                                                                                                                            |
+| Name                                                 | Description                                                                                                                                                                          |
+| ---------------------------------------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ |
+| `model`                                              | Pipeline to evaluate. Can be a package or a path to a data directory. ~~str (positional)~~                                                                                           |
+| `data_path`                                          | Location of evaluation data in spaCy's [binary format](/api/data-formats#training). ~~Path (positional)~~                                                                            |
+| `--output`, `-o`                                     | Output JSON file for metrics. If not set, no metrics will be exported. ~~Optional[Path] \(option)~~                                                                                  |
+| `--code`, `-c` <Tag variant="new">3</Tag>            | Path to Python file with additional code to be imported. Allows [registering custom functions](/usage/training#custom-functions) for new architectures. ~~Optional[Path] \(option)~~ |
+| `--gold-preproc`, `-G`                               | Use gold preprocessing. ~~bool (flag)~~                                                                                                                                              |
+| `--gpu-id`, `-g`                                     | GPU to use, if any. Defaults to `-1` for CPU. ~~int (option)~~                                                                                                                       |
+| `--displacy-path`, `-dp`                             | Directory to output rendered parses as HTML. If not set, no visualizations will be generated. ~~Optional[Path] \(option)~~                                                           |
+| `--displacy-limit`, `-dl`                            | Number of parses to generate per file. Defaults to `25`. Keep in mind that a significantly higher number might cause the `.html` files to render slowly. ~~int (option)~~            |
+| `--per-component`, `-P` <Tag variant="new">3.6</Tag> | Whether to return the scores keyed by component name. Only applicable when `--output` is specified. Defaults to `False`. ~~bool (flag)~~                                             |
+| `--help`, `-h`                                       | Show help message and available arguments. ~~bool (flag)~~                                                                                                                           |
+| **CREATES**                                          | Training results and optional metrics and visualizations.                                                                                                                            |
 
 ### speed {id="benchmark-speed", version="3.5", tag="command"}
 

diff --git a/website/docs/api/language.mdx b/website/docs/api/language.mdx
@@ -382,15 +382,16 @@ objects instead of tuples of `Doc` and `GoldParse` objects.
 > print(scores)
 > ```
 
-| Name            | Description                                                                                                                                    |
-| --------------- | ---------------------------------------------------------------------------------------------------------------------------------------------- |
-| `examples`      | A batch of [`Example`](/api/example) objects to learn from. ~~Iterable[Example]~~                                                              |
-| _keyword-only_  |                                                                                                                                                |
-| `batch_size`    | The batch size to use. ~~Optional[int]~~                                                                                                       |
-| `scorer`        | Optional [`Scorer`](/api/scorer) to use. If not passed in, a new one will be created. ~~Optional[Scorer]~~                                     |
-| `component_cfg` | Optional dictionary of keyword arguments for components, keyed by component names. Defaults to `None`. ~~Optional[Dict[str, Dict[str, Any]]]~~ |
-| `scorer_cfg`    | Optional dictionary of keyword arguments for the `Scorer`. Defaults to `None`. ~~Optional[Dict[str, Any]]~~                                    |
-| **RETURNS**     | A dictionary of evaluation scores. ~~Dict[str, Union[float, Dict[str, float]]]~~                                                               |
+| Name                                         | Description                                                                                                                                    |
+| -------------------------------------------- | ---------------------------------------------------------------------------------------------------------------------------------------------- |
+| `examples`                                   | A batch of [`Example`](/api/example) objects to learn from. ~~Iterable[Example]~~                                                              |
+| _keyword-only_                               |                                                                                                                                                |
+| `batch_size`                                 | The batch size to use. ~~Optional[int]~~                                                                                                       |
+| `scorer`                                     | Optional [`Scorer`](/api/scorer) to use. If not passed in, a new one will be created. ~~Optional[Scorer]~~                                     |
+| `component_cfg`                              | Optional dictionary of keyword arguments for components, keyed by component names. Defaults to `None`. ~~Optional[Dict[str, Dict[str, Any]]]~~ |
+| `scorer_cfg`                                 | Optional dictionary of keyword arguments for the `Scorer`. Defaults to `None`. ~~Optional[Dict[str, Any]]~~                                    |
+| `per_component` <Tag variant="new">3.6</Tag> | Whether to return the scores keyed by component name. Defaults to `False`. ~~bool~~                                                            |
+| **RETURNS**                                  | A dictionary of evaluation scores. ~~Dict[str, Union[float, Dict[str, float]]]~~                                                               |
 
 ## Language.use_params {id="use_params",tag="contextmanager, method"}