Skip to content

Commit

Permalink
Merge pull request #3349 from flairNLP/feature-jsonldataset-metadata
Browse files Browse the repository at this point in the history
Feature jsonldataset metadata
  • Loading branch information
alanakbik committed Oct 24, 2023
2 parents a0e5444 + a5b2c36 commit 423bbf8
Show file tree
Hide file tree
Showing 3 changed files with 47 additions and 7 deletions.
38 changes: 34 additions & 4 deletions flair/datasets/sequence_labeling.py
Expand Up @@ -48,6 +48,7 @@ def __init__(
encoding: str = "utf-8",
text_column_name: str = "data",
label_column_name: str = "label",
metadata_column_name: str = "metadata",
label_type: str = "ner",
**corpusargs,
) -> None:
Expand All @@ -62,6 +63,7 @@ def __init__(
:param dev_files: the name of the dev files, if empty, dev data is sampled from train
:param text_column_name: Name of the text column inside the jsonl files.
:param label_column_name: Name of the label column inside the jsonl files.
:param metadata_column_name: Name of the metadata column inside the jsonl files.
:raises RuntimeError: If no paths are given
"""
Expand All @@ -72,6 +74,7 @@ def __init__(
train_file,
text_column_name=text_column_name,
label_column_name=label_column_name,
metadata_column_name=metadata_column_name,
label_type=label_type,
encoding=encoding,
)
Expand All @@ -90,6 +93,7 @@ def __init__(
test_file,
text_column_name=text_column_name,
label_column_name=label_column_name,
metadata_column_name=metadata_column_name,
label_type=label_type,
)
for test_file in test_files
Expand All @@ -107,6 +111,7 @@ def __init__(
dev_file,
text_column_name=text_column_name,
label_column_name=label_column_name,
metadata_column_name=metadata_column_name,
label_type=label_type,
)
for dev_file in dev_files
Expand All @@ -128,6 +133,7 @@ def __init__(
encoding: str = "utf-8",
text_column_name: str = "data",
label_column_name: str = "label",
metadata_column_name: str = "metadata",
label_type: str = "ner",
autofind_splits: bool = True,
name: Optional[str] = None,
Expand All @@ -141,6 +147,7 @@ def __init__(
:param dev_file: the name of the dev file, if None, dev data is sampled from train
:param text_column_name: Name of the text column inside the JSONL file.
:param label_column_name: Name of the label column inside the JSONL file.
:param metadata_column_name: Name of the metadata column inside the JSONL file.
:param autofind_splits: Whether train, test and dev file should be determined automatically
:param name: name of the Corpus see flair.data.Corpus
"""
Expand All @@ -154,6 +161,7 @@ def __init__(
test_files=[test_file] if test_file else [],
text_column_name=text_column_name,
label_column_name=label_column_name,
metadata_column_name=metadata_column_name,
label_type=label_type,
name=name if data_folder is None else str(data_folder),
encoding=encoding,
Expand All @@ -168,21 +176,32 @@ def __init__(
encoding: str = "utf-8",
text_column_name: str = "data",
label_column_name: str = "label",
metadata_column_name: str = "metadata",
label_type: str = "ner",
) -> None:
"""Instantiates a JsonlDataset and converts all annotated char spans to token tags using the IOB scheme.
The expected file format is:
{ "<text_column_name>": "<text>", "label_column_name": [[<start_char_index>, <end_char_index>, <label>],...] }
:param path_to_jsonl_file: File to read
:param text_column_name: Name of the text column
:param label_column_name: Name of the label column
.. code-block:: json
{
"<text_column_name>": "<text>",
"<label_column_name>": [[<start_char_index>, <end_char_index>, <label>],...],
"<metadata_column_name>": [[<metadata_key>, <metadata_value>],...]
}
Args:
path_to_jsonl_file: File to read
text_column_name: Name of the text column
label_column_name: Name of the label column
metadata_column_name: Name of the metadata column
"""
path_to_json_file = Path(path_to_jsonl_file)

self.text_column_name = text_column_name
self.label_column_name = label_column_name
self.metadata_column_name = metadata_column_name
self.label_type = label_type
self.path_to_json_file = path_to_json_file

Expand All @@ -192,9 +211,11 @@ def __init__(
current_line = json.loads(line)
raw_text = current_line[text_column_name]
current_labels = current_line[label_column_name]
current_metadatas = current_line.get(self.metadata_column_name, [])
current_sentence = Sentence(raw_text)

self._add_labels_to_sentence(raw_text, current_sentence, current_labels)
self._add_metadatas_to_sentence(current_sentence, current_metadatas)

self.sentences.append(current_sentence)

Expand Down Expand Up @@ -248,6 +269,15 @@ def _add_label_to_sentence(self, text: str, sentence: Sentence, start: int, end:

sentence[start_idx : end_idx + 1].add_label(self.label_type, label)

def _add_metadatas_to_sentence(self, sentence: Sentence, metadatas: List[Tuple[str, str]]):
    """Attach every (key, value) metadata pair in *metadatas* to *sentence*.

    Each entry is expected to be a two-element sequence; delegation to
    :meth:`_add_metadata_to_sentence` stores the pair on the sentence.
    """
    for pair in metadatas:
        self._add_metadata_to_sentence(sentence, pair[0], pair[1])

@staticmethod
def _add_metadata_to_sentence(sentence: Sentence, metadata_key: str, metadata_value: str) -> None:
    """Store a single metadata key/value pair on *sentence*.

    Thin wrapper around ``Sentence.add_metadata``; kept as a separate
    hook so subclasses can customize how one pair is attached.
    """
    sentence.add_metadata(metadata_key, metadata_value)

def is_in_memory(self) -> bool:
    """Return ``True``: every JSONL dataset is fully materialized in memory.

    All sentences are built eagerly when the dataset is constructed, so
    there is no lazy/disk-backed mode to report.
    """
    return True
Expand Down
6 changes: 3 additions & 3 deletions tests/resources/tasks/jsonl/testa.jsonl
@@ -1,3 +1,3 @@
{"id": 101319, "data": "This is New Berlin", "label": [[8, 18, "LOC"]]}
{"id": 101320, "data": "EU rejects German call to boycott British lamb .", "label": [[0, 2, "ORG"], [11, 17, "MISC"], [34, 46, "MISC"]]}
{"id": 101321, "data": "Peter Blackburn", "label": [[0, 15, "PER"]]}
{"id": 101319, "data": "This is New Berlin", "label": [[8, 18, "LOC"]], "metadata": [["from", 123]]}
{"id": 101320, "data": "EU rejects German call to boycott British lamb .", "label": [[0, 2, "ORG"], [11, 17, "MISC"], [34, 46, "MISC"]], "metadata": [["from", 124]]}
{"id": 101321, "data": "Peter Blackburn", "label": [[0, 15, "PER"]], "metadata": [["from", 125]]}
10 changes: 10 additions & 0 deletions tests/test_datasets.py
Expand Up @@ -923,6 +923,16 @@ def test_jsonl_corpus_loads_spans(tasks_base_path):
assert len(example.get_spans("ner")) > 0


def test_jsonl_corpus_loads_metadata(tasks_base_path):
    """Tests reading a JsonlDataset containing metadata."""
    dataset = JsonlDataset(tasks_base_path / "jsonl" / "testa.jsonl")

    assert len(dataset.sentences) == 3
    # Each fixture line carries a [["from", <int>]] metadata entry.
    for sentence, expected_value in zip(dataset.sentences, [123, 124, 125]):
        assert sentence.get_metadata("from") == expected_value


def test_ontonotes_download():
from urllib.parse import urlparse

Expand Down

0 comments on commit 423bbf8

Please sign in to comment.