Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add support for HIPE 2022 #2675

Merged
merged 6 commits into from
Mar 15, 2022
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions flair/datasets/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -184,6 +184,7 @@
NER_GERMAN_GERMEVAL,
NER_GERMAN_LEGAL,
NER_GERMAN_POLITICS,
NER_HIPE_2022,
NER_HUNGARIAN,
NER_ICELANDIC,
NER_JAPANESE,
Expand Down Expand Up @@ -446,6 +447,7 @@
"NER_GERMAN_GERMEVAL",
"NER_GERMAN_LEGAL",
"NER_GERMAN_POLITICS",
"NER_HIPE_2022",
"NER_HUNGARIAN",
"NER_ICELANDIC",
"NER_JAPANESE",
Expand Down
135 changes: 135 additions & 0 deletions flair/datasets/sequence_labeling.py
Original file line number Diff line number Diff line change
Expand Up @@ -4126,3 +4126,138 @@ def __init__(
comment_symbol="#",
**corpusargs,
)


class NER_HIPE_2022(ColumnCorpus):
    """Corpus wrapper for the CLEF-HIPE 2022 shared task NER datasets."""

    def __init__(
        self,
        dataset_name: str,
        language: str,
        base_path: Union[str, Path, None] = None,
        tag_to_bioes: str = "ner",
        in_memory: bool = True,
        version: str = "v1.0",
        dev_split_name: str = "dev",
        add_document_separator: bool = False,
        sample_missing_splits: bool = False,
        **corpusargs,
    ):
        """
        Initialize the CLEF-HIPE 2022 NER dataset. The first time this constructor is called, the
        specified dataset (for the given language) is downloaded automatically.
        :param dataset_name: Supported datasets are: ajmc, hipe2020, letemps, newseye, sonar and topres19th.
        :param language: Language for a supported dataset.
        :param base_path: Default is None, meaning that the corpus gets auto-downloaded and loaded. You can
            override this to point to a different folder, but typically this should not be necessary.
        :param tag_to_bioes: The dataset is automatically transformed into BIOES format (internally).
        :param in_memory: If True, keeps the dataset in memory, giving speedups in training.
        :param version: Version of the CLEF-HIPE dataset. Currently only v1.0 is supported and available.
        :param dev_split_name: Defines the default name of the development split ("dev" by default). Only the
            NewsEye dataset currently has two development splits: dev and dev2.
        :param add_document_separator: If True, a special document separator token is introduced. This is
            highly recommended when using the FLERT approach.
        :param sample_missing_splits: If True, data is automatically sampled when certain data splits are None.
        """
        if not base_path:
            base_path = flair.cache_root / "datasets"
        else:
            base_path = Path(base_path)

        # Mapping of dataset version -> dataset name -> language -> available splits.
        # Only these combinations can be downloaded; anything else raises a KeyError below.
        hipe_available_splits = {
            "v1.0": {
                "ajmc": {"de": ["sample"], "en": ["sample"]},
                "hipe2020": {"de": ["train", "dev"], "en": ["dev"], "fr": ["train", "dev"]},
                "letemps": {"fr": ["train", "dev"]},
                "newseye": {
                    "de": ["train", "dev", "dev2"],
                    "fi": ["train", "dev", "dev2"],
                    "fr": ["train", "dev", "dev2"],
                    "sv": ["train", "dev", "dev2"],
                },
                "sonar": {"de": ["dev"]},
                "topres19th": {"en": ["train", "dev"]},
            }
        }

        # Token-level marker that ends a sentence in the original HIPE TSV files.
        eos_marker = "EndOfSentence"
        # Comment prefix that marks the start of a new document in the original files.
        document_separator = "# hipe2022:document_id"

        # The AJMC sample splits use a different document marker.
        if f"{version}/{dataset_name}" == "v1.0/ajmc":
            document_separator = "# hipe2022:original_source"

        columns = {0: "text", 1: "ner"}

        dataset_base = self.__class__.__name__.lower()
        data_folder = base_path / dataset_base / dataset_name / language

        data_url = f"https://github.com/hipe-eval/HIPE-2022-data/raw/main/data/{version}/{dataset_name}/{language}"

        dataset_splits = hipe_available_splits[version][dataset_name][language]

        # Download all available original TSV files for the requested dataset/language.
        for split in dataset_splits:
            cached_path(
                f"{data_url}/HIPE-2022-{version}-{dataset_name}-{split}-{language}.tsv", data_folder / "original"
            )

        train_file = "train.txt" if "train" in dataset_splits else None
        dev_file = f"{dev_split_name}.txt" if "sample" not in dataset_splits else "sample.txt"
        test_file = "test.txt" if "test" in dataset_splits else None

        new_data_folder = data_folder

        if add_document_separator:
            # NOTE(review): folder name intentionally keeps the "seperator" misspelling for
            # backward compatibility with previously cached datasets — do not "fix" it.
            new_data_folder = new_data_folder / "with_doc_seperator"
            new_data_folder.mkdir(parents=True, exist_ok=True)

        # A dev-like split always exists, so its presence tells us whether the
        # preprocessed files were already generated on a previous run.
        dev_path = new_data_folder / dev_file

        if not dev_path.exists():
            for split in dataset_splits:
                original_filename = f"HIPE-2022-{version}-{dataset_name}-{split}-{language}.tsv"
                self.__prepare_corpus(
                    data_folder / "original" / original_filename,
                    new_data_folder / f"{split}.txt",
                    eos_marker,
                    document_separator,
                    add_document_separator,
                )

        super().__init__(
            new_data_folder,
            columns,
            train_file=train_file,
            dev_file=dev_file,
            test_file=test_file,
            tag_to_bioes=tag_to_bioes,
            in_memory=in_memory,
            document_separator_token="-DOCSTART-",
            skip_first_line=True,
            comment_symbol="# ",
            sample_missing_splits=sample_missing_splits,
            **corpusargs,
        )

    @staticmethod
    def __prepare_corpus(
        file_in: Path, file_out: Path, eos_marker: str, document_separator: str, add_document_separator: bool
    ):
        """
        Convert an original HIPE 2022 TSV file into the column format expected by ColumnCorpus.

        :param file_in: Path to the original HIPE TSV file.
        :param file_out: Path of the converted output file.
        :param eos_marker: Marker string that indicates the end of a sentence.
        :param document_separator: Comment prefix that indicates the start of a new document.
        :param add_document_separator: If True, emit a "-DOCSTART-" line at each document boundary.
        """
        # HIPE TSV files are UTF-8; be explicit so this works regardless of the platform's
        # default encoding.
        with open(file_in, "rt", encoding="utf-8") as f_p:
            lines = f_p.readlines()

        with open(file_out, "wt", encoding="utf-8") as f_out:
            # Write the header and force a blank line after it (missing in the originals).
            f_out.write(lines[0] + "\n")

            for line in lines[1:]:
                line = line.rstrip()

                # Emit a "real" document marker at each document boundary.
                if add_document_separator and line.startswith(document_separator):
                    f_out.write("-DOCSTART- O\n\n")

                f_out.write(line + "\n")

                # Sentence boundary: emit an empty line to separate sentences.
                if eos_marker in line:
                    f_out.write("\n")
91 changes: 91 additions & 0 deletions tests/test_datasets.py
Original file line number Diff line number Diff line change
Expand Up @@ -364,6 +364,97 @@ def test_load_universal_dependencies_conllu_corpus(tasks_base_path):
_assert_universal_dependencies_conllu_dataset(corpus.train)


def test_hipe_2022_corpus(tasks_base_path):
    """
    This test covers the complete v1.0 version of HIPE 2022,
    including the variant with document separators.
    """

    # Expected sentence and document counts per dataset/language/split.
    # We have manually checked that these numbers are correct:
    hipe_stats = {
        "v1.0": {
            "ajmc": {"de": {"sample": {"sents": 119, "docs": 8}}, "en": {"sample": {"sents": 83, "docs": 5}}},
            "hipe2020": {
                "de": {
                    "train": {"sents": 3470 + 2, "docs": 103},  # 2 sentences with missing EOS marker
                    "dev": {
                        "sents": 1202,
                        "docs": 33,
                    },
                },
                "en": {"dev": {"sents": 1045, "docs": 80}},
                "fr": {"train": {"sents": 5743, "docs": 158}, "dev": {"sents": 1244, "docs": 43}},
            },
            "letemps": {"fr": {"train": {"sents": 14051, "docs": 414}, "dev": {"sents": 1341, "docs": 51}}},
            "newseye": {
                # +1 offset, because of missing EOS marker at EOD
                "de": {
                    "train": {"sents": 23646 + 1, "docs": 11},
                    "dev": {"sents": 1110 + 1, "docs": 12},
                    "dev2": {"sents": 1541 + 1, "docs": 12},
                },
                "fi": {
                    "train": {"sents": 1141 + 1, "docs": 24},
                    "dev": {"sents": 140 + 1, "docs": 24},
                    "dev2": {"sents": 104 + 1, "docs": 21},
                },
                "fr": {
                    "train": {"sents": 7106 + 1, "docs": 35},
                    "dev": {"sents": 662 + 1, "docs": 35},
                    "dev2": {"sents": 1016 + 1, "docs": 35},
                },
                "sv": {
                    "train": {"sents": 1063 + 1, "docs": 21},
                    "dev": {"sents": 126 + 1, "docs": 21},
                    "dev2": {"sents": 136 + 1, "docs": 21},
                },
            },
            "sonar": {"de": {"dev": {"sents": 1603 + 10, "docs": 10}}},  # 10 sentences with missing EOS marker
            "topres19th": {"en": {"train": {"sents": 5874, "docs": 309}, "dev": {"sents": 646, "docs": 34}}},
        }
    }

    def check_hipe_2022(dataset_version="v1.0", add_document_separator=True):
        for dataset_name, languages in hipe_stats[dataset_version].items():
            for language, splits in languages.items():
                corpus = flair.datasets.NER_HIPE_2022(
                    dataset_name=dataset_name,
                    language=language,
                    dev_split_name="dev",
                    add_document_separator=add_document_separator,
                )

                for split_name, stats in splits.items():
                    split_description = f"{dataset_name}/{language}@{split_name}"

                    # Each document separator adds one extra "sentence", so the expected
                    # count is sents + docs when separators are enabled.
                    total_sentences = sum(stats.values()) if add_document_separator else stats["sents"]

                    if split_name == "train":
                        assert (
                            len(corpus.train) == total_sentences
                        ), f"Sentence count mismatch for {split_description}: {len(corpus.train)} vs. {total_sentences}"
                    elif split_name in ["dev", "sample"]:
                        assert (
                            len(corpus.dev) == total_sentences
                        ), f"Sentence count mismatch for {split_description}: {len(corpus.dev)} vs. {total_sentences}"
                    elif split_name == "dev2":
                        # Re-load with dev2 as the development split. Use a separate variable
                        # so `corpus` is not clobbered for any splits checked afterwards.
                        dev2_corpus = flair.datasets.NER_HIPE_2022(
                            dataset_name=dataset_name,
                            language=language,
                            dev_split_name="dev2",
                            add_document_separator=add_document_separator,
                        )

                        assert (
                            len(dev2_corpus.dev) == total_sentences
                        ), f"Sentence count mismatch for {split_description}: {len(dev2_corpus.dev)} vs. {total_sentences}"

    check_hipe_2022(add_document_separator=True)
    check_hipe_2022(add_document_separator=False)


def test_multi_file_jsonl_corpus_should_use_label_type(tasks_base_path):
corpus = MultiFileJsonlCorpus(
train_files=[tasks_base_path / "jsonl/train.jsonl"],
Expand Down