diff --git a/flair/datasets/__init__.py b/flair/datasets/__init__.py
index a1f13feee..17172e121 100755
--- a/flair/datasets/__init__.py
+++ b/flair/datasets/__init__.py
@@ -184,6 +184,7 @@
     NER_GERMAN_GERMEVAL,
     NER_GERMAN_LEGAL,
     NER_GERMAN_POLITICS,
+    NER_HIPE_2022,
     NER_HUNGARIAN,
     NER_ICELANDIC,
     NER_JAPANESE,
@@ -446,6 +447,7 @@
     "NER_GERMAN_GERMEVAL",
     "NER_GERMAN_LEGAL",
     "NER_GERMAN_POLITICS",
+    "NER_HIPE_2022",
    "NER_HUNGARIAN",
     "NER_ICELANDIC",
     "NER_JAPANESE",
diff --git a/flair/datasets/sequence_labeling.py b/flair/datasets/sequence_labeling.py
index 20373f9cc..18bd46be9 100644
--- a/flair/datasets/sequence_labeling.py
+++ b/flair/datasets/sequence_labeling.py
@@ -4126,3 +4126,138 @@ def __init__(
             comment_symbol="#",
             **corpusargs,
         )
+
+
+class NER_HIPE_2022(ColumnCorpus):
+    def __init__(
+        self,
+        dataset_name: str,
+        language: str,
+        base_path: Union[str, Path] = None,
+        tag_to_bioes: str = "ner",
+        in_memory: bool = True,
+        version: str = "v1.0",
+        dev_split_name="dev",
+        add_document_separator=False,
+        sample_missing_splits=False,
+        **corpusargs,
+    ):
+        """
+        Initialize the CLEF-HIPE 2022 NER dataset. The first time you call this constructor it will automatically
+        download the specified dataset (by giving a language).
+        :dataset_name: Supported datasets are: ajmc, hipe2020, letemps, newseye, sonar and topres19th.
+        :language: Language for a supported dataset.
+        :base_path: Default is None, meaning that corpus gets auto-downloaded and loaded. You can override this
+        to point to a different folder but typically this should not be necessary.
+        :tag_to_bioes: Dataset will automatically be transformed into BIOES format (internally).
+        :in_memory: If True, keeps dataset in memory giving speedups in training.
+        :version: Version of CLEF-HIPE dataset. Currently only v1.0 is supported and available.
+        :dev_split_name: Defines default name of development split (dev by default). Only the NewsEye dataset has
+        currently two development splits: dev and dev2.
+        :add_document_separator: If True, a special document separator will be introduced. This is highly
+        recommended when using our FLERT approach.
+        :sample_missing_splits: If True, data is automatically sampled when certain data splits are None.
+        """
+        if not base_path:
+            base_path = flair.cache_root / "datasets"
+        else:
+            base_path = Path(base_path)
+
+        # Dataset split mapping
+        hipe_available_splits = {
+            "v1.0": {
+                "ajmc": {"de": ["sample"], "en": ["sample"]},
+                "hipe2020": {"de": ["train", "dev"], "en": ["dev"], "fr": ["train", "dev"]},
+                "letemps": {"fr": ["train", "dev"]},
+                "newseye": {
+                    "de": ["train", "dev", "dev2"],
+                    "fi": ["train", "dev", "dev2"],
+                    "fr": ["train", "dev", "dev2"],
+                    "sv": ["train", "dev", "dev2"],
+                },
+                "sonar": {"de": ["dev"]},
+                "topres19th": {"en": ["train", "dev"]},
+            }
+        }
+
+        eos_marker = "EndOfSentence"
+        document_separator = "# hipe2022:document_id"
+
+        # Special document marker for sample splits in AJMC dataset
+        if f"{version}/{dataset_name}" == "v1.0/ajmc":
+            document_separator = "# hipe2022:original_source"
+
+        columns = {0: "text", 1: "ner"}
+
+        dataset_base = self.__class__.__name__.lower()
+        data_folder = base_path / dataset_base / dataset_name / language
+
+        data_url = f"https://github.com/hipe-eval/HIPE-2022-data/raw/main/data/{version}/{dataset_name}/{language}"
+
+        dataset_splits = hipe_available_splits[version][dataset_name][language]
+
+        for split in dataset_splits:
+            cached_path(
+                f"{data_url}/HIPE-2022-{version}-{dataset_name}-{split}-{language}.tsv", data_folder / "original"
+            )
+
+        train_file = "train.txt" if "train" in dataset_splits else None
+        dev_file = f"{dev_split_name}.txt" if "sample" not in dataset_splits else "sample.txt"
+        test_file = "test.txt" if "test" in dataset_splits else None
+
+        new_data_folder = data_folder
+
+        if add_document_separator:
+            new_data_folder = new_data_folder / "with_doc_seperator"
+            new_data_folder.mkdir(parents=True, exist_ok=True)
+
+        dev_path = new_data_folder / dev_file
+
+        if not dev_path.exists():
+            for split in dataset_splits:
+                original_filename = f"HIPE-2022-{version}-{dataset_name}-{split}-{language}.tsv"
+                self.__prepare_corpus(
+                    data_folder / "original" / original_filename,
+                    new_data_folder / f"{split}.txt",
+                    eos_marker,
+                    document_separator,
+                    add_document_separator,
+                )
+
+        super(NER_HIPE_2022, self).__init__(
+            new_data_folder,
+            columns,
+            train_file=train_file,
+            dev_file=dev_file,
+            test_file=test_file,
+            tag_to_bioes=tag_to_bioes,
+            in_memory=in_memory,
+            document_separator_token="-DOCSTART-",
+            skip_first_line=True,
+            comment_symbol="# ",
+            sample_missing_splits=sample_missing_splits,
+            **corpusargs,
+        )
+
+    @staticmethod
+    def __prepare_corpus(
+        file_in: Path, file_out: Path, eos_marker: str, document_separator: str, add_document_separator: bool
+    ):
+        with open(file_in, "rt") as f_p:
+            lines = f_p.readlines()
+
+        with open(file_out, "wt") as f_out:
+            # Add missing newline after header
+            f_out.write(lines[0] + "\n")
+
+            for line in lines[1:]:
+                line = line.rstrip()
+
+                # Add "real" document marker
+                if add_document_separator and line.startswith(document_separator):
+                    f_out.write("-DOCSTART- O\n\n")
+
+                f_out.write(line + "\n")
+
+                if eos_marker in line:
+                    f_out.write("\n")
diff --git a/tests/test_datasets.py b/tests/test_datasets.py
index f312185be..789b070db 100644
--- a/tests/test_datasets.py
+++ b/tests/test_datasets.py
@@ -364,6 +364,97 @@ def test_load_universal_dependencies_conllu_corpus(tasks_base_path):
     _assert_universal_dependencies_conllu_dataset(corpus.train)
 
 
+def test_hipe_2022_corpus(tasks_base_path):
+    """
+    This test covers the complete v1.0 version of the HIPE 2022,
+    including the version with document separator.
+    """
+
+    # We have manually checked, that these numbers are correct:
+    hipe_stats = {
+        "v1.0": {
+            "ajmc": {"de": {"sample": {"sents": 119, "docs": 8}}, "en": {"sample": {"sents": 83, "docs": 5}}},
+            "hipe2020": {
+                "de": {
+                    "train": {"sents": 3470 + 2, "docs": 103},  # 2 sentences with missing EOS marker
+                    "dev": {
+                        "sents": 1202,
+                        "docs": 33,
+                    },
+                },
+                "en": {"dev": {"sents": 1045, "docs": 80}},
+                "fr": {"train": {"sents": 5743, "docs": 158}, "dev": {"sents": 1244, "docs": 43}},
+            },
+            "letemps": {"fr": {"train": {"sents": 14051, "docs": 414}, "dev": {"sents": 1341, "docs": 51}}},
+            "newseye": {
+                # +1 offset, because of missing EOS marker at EOD
+                "de": {
+                    "train": {"sents": 23646 + 1, "docs": 11},
+                    "dev": {"sents": 1110 + 1, "docs": 12},
+                    "dev2": {"sents": 1541 + 1, "docs": 12},
+                },
+                "fi": {
+                    "train": {"sents": 1141 + 1, "docs": 24},
+                    "dev": {"sents": 140 + 1, "docs": 24},
+                    "dev2": {"sents": 104 + 1, "docs": 21},
+                },
+                "fr": {
+                    "train": {"sents": 7106 + 1, "docs": 35},
+                    "dev": {"sents": 662 + 1, "docs": 35},
+                    "dev2": {"sents": 1016 + 1, "docs": 35},
+                },
+                "sv": {
+                    "train": {"sents": 1063 + 1, "docs": 21},
+                    "dev": {"sents": 126 + 1, "docs": 21},
+                    "dev2": {"sents": 136 + 1, "docs": 21},
+                },
+            },
+            "sonar": {"de": {"dev": {"sents": 1603 + 10, "docs": 10}}},  # 10 sentences with missing EOS marker
+            "topres19th": {"en": {"train": {"sents": 5874, "docs": 309}, "dev": {"sents": 646, "docs": 34}}},
+        }
+    }
+
+    def test_hipe_2022(dataset_version="v1.0", add_document_separator=True):
+        for dataset_name, languages in hipe_stats[dataset_version].items():
+            for language in languages:
+                splits = languages[language]
+
+                corpus = flair.datasets.NER_HIPE_2022(
+                    dataset_name=dataset_name,
+                    language=language,
+                    dev_split_name="dev",
+                    add_document_separator=add_document_separator,
+                )
+
+                for split_name, stats in splits.items():
+                    split_description = f"{dataset_name}/{language}@{split_name}"
+
+                    total_sentences = sum(stats.values()) if add_document_separator else stats["sents"]
+
+                    if split_name == "train":
+                        assert (
+                            len(corpus.train) == total_sentences
+                        ), f"Sentence count mismatch for {split_description}: {len(corpus.train)} vs. {total_sentences}"
+                    elif split_name in ["dev", "sample"]:
+                        assert (
+                            len(corpus.dev) == total_sentences
+                        ), f"Sentence count mismatch for {split_description}: {len(corpus.dev)} vs. {total_sentences}"
+                    elif split_name == "dev2":
+                        corpus = flair.datasets.NER_HIPE_2022(
+                            dataset_name=dataset_name,
+                            language=language,
+                            dev_split_name="dev2",
+                            add_document_separator=add_document_separator,
+                        )
+
+                        assert (
+                            len(corpus.dev) == total_sentences
+                        ), f"Sentence count mismatch for {split_description}: {len(corpus.dev)} vs. {total_sentences}"
+
+    test_hipe_2022(add_document_separator=True)
+    test_hipe_2022(add_document_separator=False)
+
+
 def test_multi_file_jsonl_corpus_should_use_label_type(tasks_base_path):
     corpus = MultiFileJsonlCorpus(
         train_files=[tasks_base_path / "jsonl/train.jsonl"],