Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Added parameter to allow negative examples in ClassificationCorpus #2233

Merged
merged 5 commits into from
Apr 19, 2021
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
14 changes: 11 additions & 3 deletions flair/datasets/document_classification.py
Original file line number Diff line number Diff line change
Expand Up @@ -37,6 +37,7 @@ def __init__(
memory_mode: str = "partial",
label_name_map: Dict[str, str] = None,
skip_labels: List[str] = None,
allow_examples_without_labels=False,
encoding: str = 'utf-8',
):
"""
Expand All @@ -55,6 +56,7 @@ def __init__(
if full corpus and all embeddings fit into memory for speedups during training. Otherwise use 'partial' and if
even this is too much for your memory, use 'disk'.
:param label_name_map: Optionally map label names to different schema.
:param allow_examples_without_labels: set to True to allow Sentences without label in the corpus.
:param encoding: Default is 'utf-8' but some datasets are in 'latin-1'
:return: a Corpus with annotated train, dev and test data
"""
Expand All @@ -73,6 +75,7 @@ def __init__(
memory_mode=memory_mode,
label_name_map=label_name_map,
skip_labels=skip_labels,
allow_examples_without_labels=allow_examples_without_labels,
encoding=encoding,
)

Expand All @@ -87,6 +90,7 @@ def __init__(
memory_mode=memory_mode,
label_name_map=label_name_map,
skip_labels=skip_labels,
allow_examples_without_labels=allow_examples_without_labels,
encoding=encoding,
) if test_file is not None else None

Expand All @@ -101,6 +105,7 @@ def __init__(
memory_mode=memory_mode,
label_name_map=label_name_map,
skip_labels=skip_labels,
allow_examples_without_labels=allow_examples_without_labels,
encoding=encoding,
) if dev_file is not None else None

Expand All @@ -125,6 +130,7 @@ def __init__(
memory_mode: str = "partial",
label_name_map: Dict[str, str] = None,
skip_labels: List[str] = None,
allow_examples_without_labels=False,
encoding: str = 'utf-8',
):
"""
Expand All @@ -143,6 +149,7 @@ def __init__(
if full corpus and all embeddings fit into memory for speedups during training. Otherwise use 'partial' and if
even this is too much for your memory, use 'disk'.
:param label_name_map: Optionally map label names to different schema.
:param allow_examples_without_labels: set to True to allow Sentences without label in the Dataset.
:param encoding: Default is 'utf-8' but some datasets are in 'latin-1'
:return: list of sentences
"""
Expand All @@ -169,14 +176,15 @@ def __init__(
self.truncate_to_max_tokens = truncate_to_max_tokens
self.filter_if_longer_than = filter_if_longer_than
self.label_name_map = label_name_map
self.allow_examples_without_labels = allow_examples_without_labels

self.path_to_file = path_to_file

with open(str(path_to_file), encoding=encoding) as f:
line = f.readline()
position = 0
while line:
if "__label__" not in line or (" " not in line and "\t" not in line):
if ("__label__" not in line and not allow_examples_without_labels) or (" " not in line and "\t" not in line):
position = f.tell()
line = f.readline()
continue
Expand Down Expand Up @@ -219,7 +227,7 @@ def __init__(
text = line[l_len:].strip()

# if so, add to indices
if text and label:
if text and (label or allow_examples_without_labels):

if self.memory_mode == 'partial':
self.lines.append(line)
Expand Down Expand Up @@ -257,7 +265,7 @@ def _parse_line_to_sentence(
if self.truncate_to_max_chars > 0:
text = text[: self.truncate_to_max_chars]

if text and labels:
if text and (labels or self.allow_examples_without_labels):
sentence = Sentence(text, use_tokenizer=tokenizer)

for label in labels:
Expand Down
5 changes: 5 additions & 0 deletions tests/resources/tasks/multi_class_negative_examples/dev.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
__label__apple apple
__label__tv tv
__label__guitar guitar
__label__apple __label__tv apple tv
dev example without labels
6 changes: 6 additions & 0 deletions tests/resources/tasks/multi_class_negative_examples/test.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
__label__guitar guitar
__label__apple apple
__label__tv tv
__label__apple __label__tv apple tv
__label__apple __label__guitar apple tv
test example without labels
8 changes: 8 additions & 0 deletions tests/resources/tasks/multi_class_negative_examples/train.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
__label__tv tv
__label__apple __label__tv apple tv
__label__apple apple
__label__tv tv
__label__apple __label__tv apple tv
__label__guitar guitar
__label__guitar guitar
train example without labels
16 changes: 16 additions & 0 deletions tests/test_data.py
Original file line number Diff line number Diff line change
Expand Up @@ -704,6 +704,22 @@ def test_tagged_corpus_downsample():
assert 3 == len(corpus.train)


def test_classification_corpus_multi_labels_without_negative_examples(tasks_base_path):
    """With the flag off, the unlabeled line in each fixture split is dropped."""
    data_folder = tasks_base_path / "multi_class_negative_examples"
    dataset = flair.datasets.ClassificationCorpus(
        data_folder,
        allow_examples_without_labels=False,
    )

    # Each fixture file ends with one line that carries no __label__ prefix,
    # so every split is expected to be one shorter than its line count.
    for split_name, expected_size in (("train", 7), ("dev", 4), ("test", 5)):
        assert len(getattr(dataset, split_name)) == expected_size


def test_classification_corpus_multi_labels_with_negative_examples(tasks_base_path):
    """With the flag on, the unlabeled line in each fixture split is kept."""
    data_folder = tasks_base_path / "multi_class_negative_examples"
    dataset = flair.datasets.ClassificationCorpus(
        data_folder,
        allow_examples_without_labels=True,
    )

    # Every line of each fixture file should become an example, including
    # the trailing one that has no __label__ prefix.
    for split_name, expected_size in (("train", 8), ("dev", 5), ("test", 6)):
        assert len(getattr(dataset, split_name)) == expected_size


def test_spans():
sentence = Sentence("Zalando Research is located in Berlin .")

Expand Down