Tidy up and fix tests
ines committed Oct 2, 2019
1 parent 9cc840a commit 3f24424
Showing 7 changed files with 65 additions and 96 deletions.
4 changes: 4 additions & 0 deletions tests/.flake8
@@ -2,3 +2,7 @@
ignore = E203, E266, E501, W503
max-line-length = 80
select = B,C,E,F,W,T4,B9
exclude =
.env,
.git,
__pycache__,
32 changes: 12 additions & 20 deletions tests/lang/de/test_parser.py
@@ -1,41 +1,33 @@
# coding: utf-8
from __future__ import unicode_literals

import os
import pytest
# from spacy.tokens import Doc
from spacy.gold import GoldCorpus
from spacy import util
from pathlib import Path

# from ...util import apply_transition_sequence

TEST_FILES_DIR = os.path.join(
os.path.dirname(os.path.realpath(__file__)),
'test_files',
)
TEST_FILES_DIR = Path(__file__).parent / "test_files"
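
For reference: the pathlib form above resolves to the same directory as the os.path construction it replaces. A minimal equivalence sketch, assuming (as in these tests) a test_files/ directory next to the module:

import os
from pathlib import Path

# Both forms point at the sibling test_files/ directory; resolve() makes
# the pathlib result absolute with symlinks resolved, matching realpath().
old = os.path.join(os.path.dirname(os.path.realpath(__file__)), "test_files")
new = Path(__file__).parent / "test_files"
assert Path(old) == new.resolve()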


@pytest.mark.xfail
@pytest.mark.parametrize(
"test_file",
[("de_pud-ud-test.stts.json"),]
)
@pytest.mark.parametrize("test_file", [("de_pud-ud-test.stts.json")])
def test_de_parser_depset(NLP, test_file):
"""Check that no tags outside the tagset are used."""
gold_deps = set(["ROOT", "ac", "adc", "ag", "ams", "app", "avc", "cc", "cd", "cj", "cm", "cp", "cvc", "da", "dm", "ep", "ju", "mnr", "mo", "ng", "nk", "nmc", "oa", "oc", "og", "op", "par", "pd", "pg", "ph", "pm", "pnc", "punct", "rc", "re", "rs", "sb", "sbp", "sp", "svp", "uc", "vo"])

data_path = os.path.join(TEST_FILES_DIR, test_file)
data_path = util.ensure_path(data_path)
# fmt: off
gold_deps = set(["ROOT", "ac", "adc", "ag", "ams", "app", "avc", "cc", "cd",
"cj", "cm", "cp", "cvc", "da", "dm", "ep", "ju", "mnr",
"mo", "ng", "nk", "nmc", "oa", "oc", "og", "op", "par",
"pd", "pg", "ph", "pm", "pnc", "punct", "rc", "re", "rs",
"sb", "sbp", "sp", "svp", "uc", "vo"])
# fmt: on
data_path = TEST_FILES_DIR / test_file
if not data_path.exists():
raise FileNotFoundError("Test corpus not found", data_path)
corpus = GoldCorpus(data_path, data_path)
dev_docs = list(corpus.dev_docs(NLP, gold_preproc=False))

pred_deps = set()
parser = NLP.get_pipe('parser')

parser = NLP.get_pipe("parser")
for doc, _ in dev_docs:
parser(doc)
pred_deps = pred_deps.union(set([t.dep_ for t in doc]))

assert len(pred_deps - gold_deps) == 0
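
The depset check above boils down to a set-difference assertion. A self-contained sketch of the idiom, with toy label sets standing in for real parser output:

# Toy sets for illustration; the real test collects pred_deps from the parser.
gold = {"ROOT", "sb", "oa", "punct"}
pred = {"ROOT", "sb", "mystery"}
unexpected = pred - gold
assert not unexpected, "labels outside the tagset: {}".format(unexpected)

Asserting on the difference itself, rather than its length, also puts the offending labels into the failure message.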
56 changes: 22 additions & 34 deletions tests/lang/de/test_tagger.py
@@ -1,25 +1,20 @@
# coding: utf-8
from __future__ import unicode_literals

import os
import pytest
from spacy.tokens import Doc
from spacy.compat import unicode_
from spacy.parts_of_speech import SPACE
from spacy.gold import GoldCorpus
from spacy import util
from pathlib import Path

# from spacy.lemmatizer import lemmatize

TEST_FILES_DIR = os.path.join(
os.path.dirname(os.path.realpath(__file__)),
'test_files',
)
TEST_FILES_DIR = Path(__file__).parent / "test_files"


@pytest.fixture
def lemmatizer(NLP):
return NLP.Defaults.create_lemmatizer()
return NLP.vocab.morphology.lemmatizer


def test_de_tagger_tag_names(NLP):
@@ -39,47 +34,41 @@ def test_de_tagger_example(NLP):

# This threshold is artificially low due to problems with spacy 2.1. (#3830)
@pytest.mark.parametrize(
"test_file,accuracy_threshold",
[("de_pud-ud-test.stts.json", 93)]
"test_file,accuracy_threshold", [("de_pud-ud-test.stts.json", 93)]
)
def test_de_tagger_corpus(NLP, test_file, accuracy_threshold):
data_path = os.path.join(TEST_FILES_DIR, test_file)
data_path = util.ensure_path(data_path)
data_path = TEST_FILES_DIR / test_file
if not data_path.exists():
raise FileNotFoundError("Test corpus not found", data_path)
corpus = GoldCorpus(data_path, data_path)
dev_docs = list(corpus.dev_docs(NLP, gold_preproc=False))
scorer = NLP.evaluate(dev_docs)

assert scorer.tags_acc > accuracy_threshold


@pytest.mark.parametrize(
"test_file",
["de_pud-ud-test.stts.json"]
)
@pytest.mark.parametrize("test_file", ["de_pud-ud-test.stts.json"])
def test_de_tagger_tagset(NLP, test_file):
"""Check that no tags outside the tagset are used."""
gold_tags = {"$(", "$,", "$.", "ADJA", "ADJD", "ADV", "APPO", "APPR", "APPRART", "APZR", "ART", "CARD", "FM", "ITJ",
"KOKOM", "KON", "KOUI", "KOUS", "NE", "NN", "NNE", "PDAT", "PDS", "PIAT", "PIS", "PPER", "PPOSAT",
"PPOSS", "PRELAT", "PRELS", "PRF", "PROAV", "PTKA", "PTKANT", "PTKNEG", "PTKVZ", "PTKZU", "PWAT",
"PWAV", "PWS", "TRUNC", "VAFIN", "VAIMP", "VAINF", "VAPP", "VMFIN", "VMINF", "VMPP", "VVFIN", "VVIMP",
"VVINF", "VVIZU", "VVPP", "XY"}

data_path = os.path.join(TEST_FILES_DIR, test_file)
data_path = util.ensure_path(data_path)
# fmt: off
gold_tags = {"$(", "$,", "$.", "ADJA", "ADJD", "ADV", "APPO", "APPR",
"APPRART", "APZR", "ART", "CARD", "FM", "ITJ", "KOKOM", "KON",
"KOUI", "KOUS", "NE", "NN", "NNE", "PDAT", "PDS", "PIAT",
"PIS", "PPER", "PPOSAT", "PPOSS", "PRELAT", "PRELS", "PRF",
"PROAV", "PTKA", "PTKANT", "PTKNEG", "PTKVZ", "PTKZU", "PWAT",
"PWAV", "PWS", "TRUNC", "VAFIN", "VAIMP", "VAINF", "VAPP",
"VMFIN", "VMINF", "VMPP", "VVFIN", "VVIMP", "VVINF", "VVIZU",
"VVPP", "XY"}
# fmt: on
data_path = TEST_FILES_DIR / test_file
if not data_path.exists():
raise FileNotFoundError("Test corpus not found", data_path)
corpus = GoldCorpus(data_path, data_path)
dev_docs = list(corpus.dev_docs(NLP, gold_preproc=False))

pred_tags = set()
tagger = NLP.get_pipe('tagger')

tagger = NLP.get_pipe("tagger")
for doc, _ in dev_docs:
tagger(doc)
pred_tags = pred_tags.union(set([t.tag_ for t in doc]))

assert len(pred_tags - gold_tags) == 0


@@ -115,6 +104,7 @@ def test_de_tagger_return_char(NLP):
@pytest.mark.xfail
@pytest.mark.parametrize(
"text,pos,tags",
# fmt: off
[
('"Altbau-Wohnung".',
["PUNCT", "NOUN", "PUNCT", "PUNCT"],
@@ -129,6 +119,7 @@ def test_de_tagger_return_char(NLP):
["NOUN", "PUNCT", "ADJ", "PUNCT", "ADJ", "PUNCT", "PUNCT", "ADV", "ADJ", "PUNCT", "PUNCT"],
["NN", "$.", "ADJD", "$,", "ADJD", "$.", "$(", "ADV", "ADJD", "$.", "$("]),
],
# fmt: on
)
def test_de_tagger_punctuation(NLP, text, pos, tags):
"""Ensure punctuation is tagged correctly"""
@@ -167,7 +158,7 @@ def test_de_tagger_lemma_punct(lemmatizer):
# Both inconsistent and weird:
#
# mir -> sich / Mir -> ich
#
#
# While a token-based lookup lemmatizer is never going to work that well for
# German, it looks like a lot of weirdness and inconsistencies were introduced
# in https://github.com/michmech/lemmatization-lists/ . (The original resources
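
The inconsistency described in this comment is exactly what a plain dict lookup produces. A minimal sketch of a token-based lookup lemmatizer, with invented table entries, showing how exact-match, context-free lookups let "mir" and "Mir" diverge:

# Entries invented for illustration; the real table in lemmatization-lists
# is keyed on the exact surface form, with no case normalisation.
LOOKUP = {"mir": "sich", "Mir": "ich"}

def lookup_lemma(token_text):
    # Context-free: one lemma per surface form, falling back to the form itself.
    return LOOKUP.get(token_text, token_text)

assert lookup_lemma("mir") == "sich"
assert lookup_lemma("Mir") == "ich"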
@@ -188,10 +179,7 @@ def test_de_tagger_lemma_issue686(NLP, text):

@pytest.mark.xfail
@pytest.mark.parametrize(
"text1,text2",
[
("Dort gibt's einen B盲cker", "Dort gibt es einen B盲cker"),
],
"text1,text2", [("Dort gibt's einen B盲cker", "Dort gibt es einen B盲cker")]
)
def test_de_tagger_lemma_issue717(NLP, text1, text2):
"""Test that contractions are assigned the correct lemma."""
3 changes: 1 addition & 2 deletions tests/lang/en/test_ner.py
@@ -1,8 +1,7 @@
# coding: utf-8
from __future__ import unicode_literals
import pytest

# from spacy.tokens import Doc
import pytest


def test_en_ner_example(NLP):
41 changes: 17 additions & 24 deletions tests/lang/en/test_parser.py
@@ -1,67 +1,60 @@
# coding: utf-8
from __future__ import unicode_literals

import os
import pytest
# from spacy.tokens import Doc
from spacy.gold import GoldCorpus
from spacy import util
from pathlib import Path

# from ...util import apply_transition_sequence

TEST_FILES_DIR = os.path.join(
os.path.dirname(os.path.realpath(__file__)),
'test_files',
)
TEST_FILES_DIR = Path(__file__).parent / "test_files"


def test_en_parser_example(NLP):
doc = NLP("Apple is looking at buying U.K. startup")
deps = ["nsubj", "aux", "ROOT", "prep", "pcomp", "compound", "dobj"]
for token, expected_dep in zip(doc, deps):
assert token.dep_ == expected_dep


@pytest.mark.parametrize(
"test_file,uas_threshold,las_threshold",
[("masc-penn-treebank-sample.json", 82, 78)],
)
def test_en_parser_corpus(NLP, test_file, uas_threshold, las_threshold):
data_path = os.path.join(TEST_FILES_DIR, test_file)
data_path = util.ensure_path(data_path)
data_path = TEST_FILES_DIR / test_file
if not data_path.exists():
raise FileNotFoundError("Test corpus not found", data_path)
corpus = GoldCorpus(data_path, data_path)
dev_docs = list(corpus.dev_docs(NLP, gold_preproc=False))
scorer = NLP.evaluate(dev_docs)

assert scorer.uas > uas_threshold
assert scorer.las > las_threshold


@pytest.mark.parametrize(
"test_file",
[
("en_pud-ud-test.json"),
("masc-penn-treebank-sample.json"),
]
"test_file", [("en_pud-ud-test.json"), ("masc-penn-treebank-sample.json")]
)
def test_en_parser_depset(NLP, test_file):
"""Check that no tags outside the tagset are produced."""
gold_deps = set(["ROOT", "acl", "acomp", "advcl", "advmod", "agent", "amod", "appos", "attr", "aux", "auxpass", "case", "cc", "ccomp", "compound", "conj", "csubj", "csubjpass", "dative", "dep", "det", "dobj", "expl", "intj", "mark", "meta", "neg", "nmod", "npadvmod", "nsubj", "nsubjpass", "nummod", "oprd", "parataxis", "pcomp", "pobj", "poss", "preconj", "predet", "prep", "prt", "punct", "quantmod", "relcl", "root", "xcomp"])

data_path = os.path.join(TEST_FILES_DIR, test_file)
data_path = util.ensure_path(data_path)
# fmt: off
gold_deps = set(["ROOT", "acl", "acomp", "advcl", "advmod", "agent", "amod",
"appos", "attr", "aux", "auxpass", "case", "cc", "ccomp",
"compound", "conj", "csubj", "csubjpass", "dative", "dep",
"det", "dobj", "expl", "intj", "mark", "meta", "neg",
"nmod", "npadvmod", "nsubj", "nsubjpass", "nummod", "oprd",
"parataxis", "pcomp", "pobj", "poss", "preconj", "predet",
"prep", "prt", "punct", "quantmod", "relcl", "root", "xcomp"])
# fmt: on
data_path = TEST_FILES_DIR / test_file
if not data_path.exists():
raise FileNotFoundError("Test corpus not found", data_path)
corpus = GoldCorpus(data_path, data_path)
dev_docs = list(corpus.dev_docs(NLP, gold_preproc=False))

pred_deps = set()
parser = NLP.get_pipe('parser')

parser = NLP.get_pipe("parser")
for doc, _ in dev_docs:
parser(doc)
pred_deps = pred_deps.union(set([t.dep_ for t in doc]))

assert len(pred_deps - gold_deps) == 0


23 changes: 8 additions & 15 deletions tests/lang/en/test_tagger.py
@@ -1,25 +1,20 @@
# coding: utf-8
from __future__ import unicode_literals

import os
import pytest
from spacy.tokens import Doc
from spacy.compat import unicode_
from spacy.parts_of_speech import SPACE
from spacy.gold import GoldCorpus
from spacy import util
from pathlib import Path

# from spacy.lemmatizer import lemmatize

TEST_FILES_DIR = os.path.join(
os.path.dirname(os.path.realpath(__file__)),
'test_files',
)
TEST_FILES_DIR = Path(__file__).parent / "test_files"


@pytest.fixture
def lemmatizer(NLP):
lookups = NLP.Defaults.create_lookups(NLP)
return NLP.Defaults.create_lemmatizer(lookups=lookups)
return NLP.vocab.morphology.lemmatizer
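
A quick usage sketch for what this fixture returns, assuming the spaCy 2.x lemmatizer, which is callable with a string and a coarse POS tag and returns a list of candidate lemmas:

# Hypothetical example values; the English lemmatizer strips the plural "s".
lemmatizer = NLP.vocab.morphology.lemmatizer
assert lemmatizer("dogs", "NOUN") == ["dog"]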


def test_en_tagger_tag_names(NLP):
@@ -39,16 +34,13 @@ def test_en_tagger_example(NLP):
for token, expected_tag in zip(doc, tags):
assert token.tag_ == expected_tag


@pytest.mark.parametrize(
"test_file,accuracy_threshold",
[
("en_pud-ud-test.json", 94),
("masc-penn-treebank-sample.json", 89)
],
[("en_pud-ud-test.json", 94), ("masc-penn-treebank-sample.json", 89)],
)
def test_en_tagger_corpus(NLP, test_file, accuracy_threshold):
data_path = os.path.join(TEST_FILES_DIR, test_file)
data_path = util.ensure_path(data_path)
data_path = TEST_FILES_DIR / test_file
if not data_path.exists():
raise FileNotFoundError("Test corpus not found", data_path)
corpus = GoldCorpus(data_path, data_path)
@@ -57,6 +49,7 @@ def test_en_tagger_corpus(NLP, test_file, accuracy_threshold):

assert scorer.tags_acc > accuracy_threshold


def test_en_tagger_spaces(NLP):
"""Ensure spaces are assigned the POS tag SPACE"""
doc = NLP("Some\nspaces are\tnecessary.")
2 changes: 1 addition & 1 deletion tests/test_common.py
@@ -94,4 +94,4 @@ def iter_doc(doc):


def test_common_issue1919(nlp):
opt = nlp.begin_training()
nlp.begin_training()
