Tidy up and fix tests
ines committed Oct 2, 2019
1 parent 9cc840a commit 3f24424
Showing 7 changed files with 65 additions and 96 deletions.
4 changes: 4 additions & 0 deletions tests/.flake8
@@ -2,3 +2,7 @@
ignore = E203, E266, E501, W503
max-line-length = 80
select = B,C,E,F,W,T4,B9
exclude =
.env,
.git,
__pycache__,
32 changes: 12 additions & 20 deletions tests/lang/de/test_parser.py
@@ -1,41 +1,33 @@
# coding: utf-8
from __future__ import unicode_literals

import os
import pytest
# from spacy.tokens import Doc
from spacy.gold import GoldCorpus
from spacy import util
from pathlib import Path

# from ...util import apply_transition_sequence

TEST_FILES_DIR = os.path.join(
os.path.dirname(os.path.realpath(__file__)),
'test_files',
)
TEST_FILES_DIR = Path(__file__).parent / "test_files"
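
For reference: the pathlib form above resolves to the same directory as the os.path construction it replaces. A minimal equivalence sketch, assuming (as in these tests) a test_files/ directory next to the module:

import os
from pathlib import Path

# Both forms point at the sibling test_files/ directory; resolve() makes
# the pathlib result absolute with symlinks resolved, matching realpath().
old = os.path.join(os.path.dirname(os.path.realpath(__file__)), "test_files")
new = Path(__file__).parent / "test_files"
assert Path(old) == new.resolve()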


@pytest.mark.xfail
@pytest.mark.parametrize(
"test_file",
[("de_pud-ud-test.stts.json"),]
)
@pytest.mark.parametrize("test_file", [("de_pud-ud-test.stts.json")])
def test_de_parser_depset(NLP, test_file):
"""Check that no tags outside the tagset are used."""
gold_deps = set(["ROOT", "ac", "adc", "ag", "ams", "app", "avc", "cc", "cd", "cj", "cm", "cp", "cvc", "da", "dm", "ep", "ju", "mnr", "mo", "ng", "nk", "nmc", "oa", "oc", "og", "op", "par", "pd", "pg", "ph", "pm", "pnc", "punct", "rc", "re", "rs", "sb", "sbp", "sp", "svp", "uc", "vo"])

data_path = os.path.join(TEST_FILES_DIR, test_file)
data_path = util.ensure_path(data_path)
# fmt: off
gold_deps = set(["ROOT", "ac", "adc", "ag", "ams", "app", "avc", "cc", "cd",
"cj", "cm", "cp", "cvc", "da", "dm", "ep", "ju", "mnr",
"mo", "ng", "nk", "nmc", "oa", "oc", "og", "op", "par",
"pd", "pg", "ph", "pm", "pnc", "punct", "rc", "re", "rs",
"sb", "sbp", "sp", "svp", "uc", "vo"])
# fmt: on
data_path = TEST_FILES_DIR / test_file
if not data_path.exists():
raise FileNotFoundError("Test corpus not found", data_path)
corpus = GoldCorpus(data_path, data_path)
dev_docs = list(corpus.dev_docs(NLP, gold_preproc=False))

pred_deps = set()
parser = NLP.get_pipe('parser')

parser = NLP.get_pipe("parser")
for doc, _ in dev_docs:
parser(doc)
pred_deps = pred_deps.union(set([t.dep_ for t in doc]))

assert len(pred_deps - gold_deps) == 0
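
The depset check above boils down to a set-difference assertion. A self-contained sketch of the idiom, with toy label sets standing in for real parser output:

# Toy sets for illustration; the real test collects pred_deps from the parser.
gold = {"ROOT", "sb", "oa", "punct"}
pred = {"ROOT", "sb", "mystery"}
unexpected = pred - gold
assert not unexpected, "labels outside the tagset: {}".format(unexpected)

Asserting on the difference itself, rather than its length, also puts the offending labels into the failure message.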
56 changes: 22 additions & 34 deletions tests/lang/de/test_tagger.py
@@ -1,25 +1,20 @@
# coding: utf-8
from __future__ import unicode_literals

import os
import pytest
from spacy.tokens import Doc
from spacy.compat import unicode_
from spacy.parts_of_speech import SPACE
from spacy.gold import GoldCorpus
from spacy import util
from pathlib import Path

# from spacy.lemmatizer import lemmatize

TEST_FILES_DIR = os.path.join(
os.path.dirname(os.path.realpath(__file__)),
'test_files',
)
TEST_FILES_DIR = Path(__file__).parent / "test_files"


@pytest.fixture
def lemmatizer(NLP):
return NLP.Defaults.create_lemmatizer()
return NLP.vocab.morphology.lemmatizer


def test_de_tagger_tag_names(NLP):
@@ -39,47 +34,41 @@ def test_de_tagger_example(NLP):

# This threshold is artificially low due to problems with spacy 2.1. (#3830)
@pytest.mark.parametrize(
"test_file,accuracy_threshold",
[("de_pud-ud-test.stts.json", 93)]
"test_file,accuracy_threshold", [("de_pud-ud-test.stts.json", 93)]
)
def test_de_tagger_corpus(NLP, test_file, accuracy_threshold):
data_path = os.path.join(TEST_FILES_DIR, test_file)
data_path = util.ensure_path(data_path)
data_path = TEST_FILES_DIR / test_file
if not data_path.exists():
raise FileNotFoundError("Test corpus not found", data_path)
corpus = GoldCorpus(data_path, data_path)
dev_docs = list(corpus.dev_docs(NLP, gold_preproc=False))
scorer = NLP.evaluate(dev_docs)

assert scorer.tags_acc > accuracy_threshold


@pytest.mark.parametrize(
"test_file",
["de_pud-ud-test.stts.json"]
)
@pytest.mark.parametrize("test_file", ["de_pud-ud-test.stts.json"])
def test_de_tagger_tagset(NLP, test_file):
"""Check that no tags outside the tagset are used."""
gold_tags = {"$(", "$,", "$.", "ADJA", "ADJD", "ADV", "APPO", "APPR", "APPRART", "APZR", "ART", "CARD", "FM", "ITJ",
"KOKOM", "KON", "KOUI", "KOUS", "NE", "NN", "NNE", "PDAT", "PDS", "PIAT", "PIS", "PPER", "PPOSAT",
"PPOSS", "PRELAT", "PRELS", "PRF", "PROAV", "PTKA", "PTKANT", "PTKNEG", "PTKVZ", "PTKZU", "PWAT",
"PWAV", "PWS", "TRUNC", "VAFIN", "VAIMP", "VAINF", "VAPP", "VMFIN", "VMINF", "VMPP", "VVFIN", "VVIMP",
"VVINF", "VVIZU", "VVPP", "XY"}

data_path = os.path.join(TEST_FILES_DIR, test_file)
data_path = util.ensure_path(data_path)
# fmt: off
gold_tags = {"$(", "$,", "$.", "ADJA", "ADJD", "ADV", "APPO", "APPR",
"APPRART", "APZR", "ART", "CARD", "FM", "ITJ", "KOKOM", "KON",
"KOUI", "KOUS", "NE", "NN", "NNE", "PDAT", "PDS", "PIAT",
"PIS", "PPER", "PPOSAT", "PPOSS", "PRELAT", "PRELS", "PRF",
"PROAV", "PTKA", "PTKANT", "PTKNEG", "PTKVZ", "PTKZU", "PWAT",
"PWAV", "PWS", "TRUNC", "VAFIN", "VAIMP", "VAINF", "VAPP",
"VMFIN", "VMINF", "VMPP", "VVFIN", "VVIMP", "VVINF", "VVIZU",
"VVPP", "XY"}
# fmt: on
data_path = TEST_FILES_DIR / test_file
if not data_path.exists():
raise FileNotFoundError("Test corpus not found", data_path)
corpus = GoldCorpus(data_path, data_path)
dev_docs = list(corpus.dev_docs(NLP, gold_preproc=False))

pred_tags = set()
tagger = NLP.get_pipe('tagger')

tagger = NLP.get_pipe("tagger")
for doc, _ in dev_docs:
tagger(doc)
pred_tags = pred_tags.union(set([t.tag_ for t in doc]))

assert len(pred_tags - gold_tags) == 0


@@ -115,6 +104,7 @@ def test_de_tagger_return_char(NLP):
@pytest.mark.xfail
@pytest.mark.parametrize(
"text,pos,tags",
# fmt: off
[
('"Altbau-Wohnung".',
["PUNCT", "NOUN", "PUNCT", "PUNCT"],
@@ -129,6 +119,7 @@ def test_de_tagger_return_char(NLP):
["NOUN", "PUNCT", "ADJ", "PUNCT", "ADJ", "PUNCT", "PUNCT", "ADV", "ADJ", "PUNCT", "PUNCT"],
["NN", "$.", "ADJD", "$,", "ADJD", "$.", "$(", "ADV", "ADJD", "$.", "$("]),
],
# fmt: on
)
def test_de_tagger_punctuation(NLP, text, pos, tags):
"""Ensure punctuation is tagged correctly"""
@@ -167,7 +158,7 @@ def test_de_tagger_lemma_punct(lemmatizer):
# Both inconsistent and weird:
#
# mir -> sich / Mir -> ich
#
#
# While a token-based lookup lemmatizer is never going to work that well for
# German, it looks like a lot of weirdness and inconsistencies were introduced
# in https://github.com/michmech/lemmatization-lists/ . (The original resources
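
The inconsistency described in this comment is exactly what a plain dict lookup produces. A minimal sketch of a token-based lookup lemmatizer, with invented table entries, showing how exact-match, context-free lookups let "mir" and "Mir" diverge:

# Entries invented for illustration; the real table in lemmatization-lists
# is keyed on the exact surface form, with no case normalisation.
LOOKUP = {"mir": "sich", "Mir": "ich"}

def lookup_lemma(token_text):
    # Context-free: one lemma per surface form, falling back to the form itself.
    return LOOKUP.get(token_text, token_text)

assert lookup_lemma("mir") == "sich"
assert lookup_lemma("Mir") == "ich"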
@@ -188,10 +179,7 @@ def test_de_tagger_lemma_issue686(NLP, text):

@pytest.mark.xfail
@pytest.mark.parametrize(
"text1,text2",
[
("Dort gibt's einen B盲cker", "Dort gibt es einen B盲cker"),
],
"text1,text2", [("Dort gibt's einen B盲cker", "Dort gibt es einen B盲cker")]
)
def test_de_tagger_lemma_issue717(NLP, text1, text2):
"""Test that contractions are assigned the correct lemma."""
3 changes: 1 addition & 2 deletions tests/lang/en/test_ner.py
@@ -1,8 +1,7 @@
# coding: utf-8
from __future__ import unicode_literals
import pytest

# from spacy.tokens import Doc
import pytest


def test_en_ner_example(NLP):
41 changes: 17 additions & 24 deletions tests/lang/en/test_parser.py
@@ -1,67 +1,60 @@
# coding: utf-8
from __future__ import unicode_literals

import os
import pytest
# from spacy.tokens import Doc
from spacy.gold import GoldCorpus
from spacy import util
from pathlib import Path

# from ...util import apply_transition_sequence

TEST_FILES_DIR = os.path.join(
os.path.dirname(os.path.realpath(__file__)),
'test_files',
)
TEST_FILES_DIR = Path(__file__).parent / "test_files"


def test_en_parser_example(NLP):
doc = NLP("Apple is looking at buying U.K. startup")
deps = ["nsubj", "aux", "ROOT", "prep", "pcomp", "compound", "dobj"]
for token, expected_dep in zip(doc, deps):
assert token.dep_ == expected_dep


@pytest.mark.parametrize(
"test_file,uas_threshold,las_threshold",
[("masc-penn-treebank-sample.json", 82, 78)],
)
def test_en_parser_corpus(NLP, test_file, uas_threshold, las_threshold):
data_path = os.path.join(TEST_FILES_DIR, test_file)
data_path = util.ensure_path(data_path)
data_path = TEST_FILES_DIR / test_file
if not data_path.exists():
raise FileNotFoundError("Test corpus not found", data_path)
corpus = GoldCorpus(data_path, data_path)
dev_docs = list(corpus.dev_docs(NLP, gold_preproc=False))
scorer = NLP.evaluate(dev_docs)

assert scorer.uas > uas_threshold
assert scorer.las > las_threshold


@pytest.mark.parametrize(
"test_file",
[
("en_pud-ud-test.json"),
("masc-penn-treebank-sample.json"),
]
"test_file", [("en_pud-ud-test.json"), ("masc-penn-treebank-sample.json")]
)
def test_en_parser_depset(NLP, test_file):
"""Check that no tags outside the tagset are produced."""
gold_deps = set(["ROOT", "acl", "acomp", "advcl", "advmod", "agent", "amod", "appos", "attr", "aux", "auxpass", "case", "cc", "ccomp", "compound", "conj", "csubj", "csubjpass", "dative", "dep", "det", "dobj", "expl", "intj", "mark", "meta", "neg", "nmod", "npadvmod", "nsubj", "nsubjpass", "nummod", "oprd", "parataxis", "pcomp", "pobj", "poss", "preconj", "predet", "prep", "prt", "punct", "quantmod", "relcl", "root", "xcomp"])

data_path = os.path.join(TEST_FILES_DIR, test_file)
data_path = util.ensure_path(data_path)
# fmt: off
gold_deps = set(["ROOT", "acl", "acomp", "advcl", "advmod", "agent", "amod",
"appos", "attr", "aux", "auxpass", "case", "cc", "ccomp",
"compound", "conj", "csubj", "csubjpass", "dative", "dep",
"det", "dobj", "expl", "intj", "mark", "meta", "neg",
"nmod", "npadvmod", "nsubj", "nsubjpass", "nummod", "oprd",
"parataxis", "pcomp", "pobj", "poss", "preconj", "predet",
"prep", "prt", "punct", "quantmod", "relcl", "root", "xcomp"])
# fmt: on
data_path = TEST_FILES_DIR / test_file
if not data_path.exists():
raise FileNotFoundError("Test corpus not found", data_path)
corpus = GoldCorpus(data_path, data_path)
dev_docs = list(corpus.dev_docs(NLP, gold_preproc=False))

pred_deps = set()
parser = NLP.get_pipe('parser')

parser = NLP.get_pipe("parser")
for doc, _ in dev_docs:
parser(doc)
pred_deps = pred_deps.union(set([t.dep_ for t in doc]))

assert len(pred_deps - gold_deps) == 0


23 changes: 8 additions & 15 deletions tests/lang/en/test_tagger.py
@@ -1,25 +1,20 @@
# coding: utf-8
from __future__ import unicode_literals

import os
import pytest
from spacy.tokens import Doc
from spacy.compat import unicode_
from spacy.parts_of_speech import SPACE
from spacy.gold import GoldCorpus
from spacy import util
from pathlib import Path

# from spacy.lemmatizer import lemmatize

TEST_FILES_DIR = os.path.join(
os.path.dirname(os.path.realpath(__file__)),
'test_files',
)
TEST_FILES_DIR = Path(__file__).parent / "test_files"


@pytest.fixture
def lemmatizer(NLP):
lookups = NLP.Defaults.create_lookups(NLP)
return NLP.Defaults.create_lemmatizer(lookups=lookups)
return NLP.vocab.morphology.lemmatizer
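
A quick usage sketch for what this fixture returns, assuming the spaCy 2.x lemmatizer, which is callable with a string and a coarse POS tag and returns a list of candidate lemmas:

# Hypothetical example values; the English lemmatizer strips the plural "s".
lemmatizer = NLP.vocab.morphology.lemmatizer
assert lemmatizer("dogs", "NOUN") == ["dog"]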


def test_en_tagger_tag_names(NLP):
@@ -39,16 +34,13 @@ def test_en_tagger_example(NLP):
for token, expected_tag in zip(doc, tags):
assert token.tag_ == expected_tag


@pytest.mark.parametrize(
"test_file,accuracy_threshold",
[
("en_pud-ud-test.json", 94),
("masc-penn-treebank-sample.json", 89)
],
[("en_pud-ud-test.json", 94), ("masc-penn-treebank-sample.json", 89)],
)
def test_en_tagger_corpus(NLP, test_file, accuracy_threshold):
data_path = os.path.join(TEST_FILES_DIR, test_file)
data_path = util.ensure_path(data_path)
data_path = TEST_FILES_DIR / test_file
if not data_path.exists():
raise FileNotFoundError("Test corpus not found", data_path)
corpus = GoldCorpus(data_path, data_path)
@@ -57,6 +49,7 @@ def test_en_tagger_corpus(NLP, test_file, accuracy_threshold):

assert scorer.tags_acc > accuracy_threshold


def test_en_tagger_spaces(NLP):
"""Ensure spaces are assigned the POS tag SPACE"""
doc = NLP("Some\nspaces are\tnecessary.")
2 changes: 1 addition & 1 deletion tests/test_common.py
@@ -94,4 +94,4 @@ def iter_doc(doc):


def test_common_issue1919(nlp):
opt = nlp.begin_training()
nlp.begin_training()
