Skip to content

Commit

Permalink
Merge pull request #3349 from flairNLP/feature-jsonldataset-metadata
Browse files Browse the repository at this point in the history
Feature jsonldataset metadata
  • Loading branch information
alanakbik committed Oct 24, 2023
2 parents a0e5444 + a5b2c36 commit 423bbf8
Show file tree
Hide file tree
Showing 3 changed files with 47 additions and 7 deletions.
38 changes: 34 additions & 4 deletions flair/datasets/sequence_labeling.py
Expand Up @@ -48,6 +48,7 @@ def __init__(
encoding: str = "utf-8",
text_column_name: str = "data",
label_column_name: str = "label",
metadata_column_name: str = "metadata",
label_type: str = "ner",
**corpusargs,
) -> None:
Expand All @@ -62,6 +63,7 @@ def __init__(
:param dev_files: the name of the dev files, if empty, dev data is sampled from train
:param text_column_name: Name of the text column inside the jsonl files.
:param label_column_name: Name of the label column inside the jsonl files.
:param metadata_column_name: Name of the metadata column inside the jsonl files.
:raises RuntimeError: If no paths are given
"""
Expand All @@ -72,6 +74,7 @@ def __init__(
train_file,
text_column_name=text_column_name,
label_column_name=label_column_name,
metadata_column_name=metadata_column_name,
label_type=label_type,
encoding=encoding,
)
Expand All @@ -90,6 +93,7 @@ def __init__(
test_file,
text_column_name=text_column_name,
label_column_name=label_column_name,
metadata_column_name=metadata_column_name,
label_type=label_type,
)
for test_file in test_files
Expand All @@ -107,6 +111,7 @@ def __init__(
dev_file,
text_column_name=text_column_name,
label_column_name=label_column_name,
metadata_column_name=metadata_column_name,
label_type=label_type,
)
for dev_file in dev_files
Expand All @@ -128,6 +133,7 @@ def __init__(
encoding: str = "utf-8",
text_column_name: str = "data",
label_column_name: str = "label",
metadata_column_name: str = "metadata",
label_type: str = "ner",
autofind_splits: bool = True,
name: Optional[str] = None,
Expand All @@ -141,6 +147,7 @@ def __init__(
:param dev_file: the name of the dev file, if None, dev data is sampled from train
:param text_column_name: Name of the text column inside the JSONL file.
:param label_column_name: Name of the label column inside the JSONL file.
:param metadata_column_name: Name of the metadata column inside the JSONL file.
:param autofind_splits: Whether train, test and dev file should be determined automatically
:param name: name of the Corpus see flair.data.Corpus
"""
Expand All @@ -154,6 +161,7 @@ def __init__(
test_files=[test_file] if test_file else [],
text_column_name=text_column_name,
label_column_name=label_column_name,
metadata_column_name=metadata_column_name,
label_type=label_type,
name=name if data_folder is None else str(data_folder),
encoding=encoding,
Expand All @@ -168,21 +176,32 @@ def __init__(
encoding: str = "utf-8",
text_column_name: str = "data",
label_column_name: str = "label",
metadata_column_name: str = "metadata",
label_type: str = "ner",
) -> None:
"""Instantiates a JsonlDataset and converts all annotated char spans to token tags using the IOB scheme.
The expected file format is:
{ "<text_column_name>": "<text>", "label_column_name": [[<start_char_index>, <end_char_index>, <label>],...] }
:param path_to_jsonl_file: File to read
:param text_column_name: Name of the text column
:param label_column_name: Name of the label column
.. code-block:: json
{
"<text_column_name>": "<text>",
"<label_column_name>": [[<start_char_index>, <end_char_index>, <label>],...],
"<metadata_column_name>": [[<metadata_key>, <metadata_value>],...]
}
Args:
path_to_jsonl_file: File to read
text_column_name: Name of the text column
label_column_name: Name of the label column
metadata_column_name: Name of the metadata column
"""
path_to_json_file = Path(path_to_jsonl_file)

self.text_column_name = text_column_name
self.label_column_name = label_column_name
self.metadata_column_name = metadata_column_name
self.label_type = label_type
self.path_to_json_file = path_to_json_file

Expand All @@ -192,9 +211,11 @@ def __init__(
current_line = json.loads(line)
raw_text = current_line[text_column_name]
current_labels = current_line[label_column_name]
current_metadatas = current_line.get(self.metadata_column_name, [])
current_sentence = Sentence(raw_text)

self._add_labels_to_sentence(raw_text, current_sentence, current_labels)
self._add_metadatas_to_sentence(current_sentence, current_metadatas)

self.sentences.append(current_sentence)

Expand Down Expand Up @@ -248,6 +269,15 @@ def _add_label_to_sentence(self, text: str, sentence: Sentence, start: int, end:

sentence[start_idx : end_idx + 1].add_label(self.label_type, label)

def _add_metadatas_to_sentence(self, sentence: Sentence, metadatas: List[Tuple[str, str]]):
    """Attach every (key, value) metadata pair in *metadatas* to *sentence*.

    Each entry is expected to be a two-element sequence; delegation to
    :meth:`_add_metadata_to_sentence` stores the pair on the sentence.
    """
    for pair in metadatas:
        self._add_metadata_to_sentence(sentence, pair[0], pair[1])

@staticmethod
def _add_metadata_to_sentence(sentence: Sentence, metadata_key: str, metadata_value: str) -> None:
    """Store a single metadata key/value pair on *sentence*.

    Thin wrapper around ``Sentence.add_metadata``; kept as a separate
    hook so subclasses can customize how one pair is attached.
    """
    sentence.add_metadata(metadata_key, metadata_value)

def is_in_memory(self) -> bool:
    """Return ``True``: every JSONL dataset is fully materialized in memory.

    All sentences are built eagerly when the dataset is constructed, so
    there is no lazy/disk-backed mode to report.
    """
    return True
Expand Down
6 changes: 3 additions & 3 deletions tests/resources/tasks/jsonl/testa.jsonl
@@ -1,3 +1,3 @@
{"id": 101319, "data": "This is New Berlin", "label": [[8, 18, "LOC"]]}
{"id": 101320, "data": "EU rejects German call to boycott British lamb .", "label": [[0, 2, "ORG"], [11, 17, "MISC"], [34, 46, "MISC"]]}
{"id": 101321, "data": "Peter Blackburn", "label": [[0, 15, "PER"]]}
{"id": 101319, "data": "This is New Berlin", "label": [[8, 18, "LOC"]], "metadata": [["from", 123]]}
{"id": 101320, "data": "EU rejects German call to boycott British lamb .", "label": [[0, 2, "ORG"], [11, 17, "MISC"], [34, 46, "MISC"]], "metadata": [["from", 124]]}
{"id": 101321, "data": "Peter Blackburn", "label": [[0, 15, "PER"]], "metadata": [["from", 125]]}
10 changes: 10 additions & 0 deletions tests/test_datasets.py
Expand Up @@ -923,6 +923,16 @@ def test_jsonl_corpus_loads_spans(tasks_base_path):
assert len(example.get_spans("ner")) > 0


def test_jsonl_corpus_loads_metadata(tasks_base_path):
    """Tests reading a JsonlDataset containing metadata."""
    dataset = JsonlDataset(tasks_base_path / "jsonl" / "testa.jsonl")

    assert len(dataset.sentences) == 3
    # Each fixture line carries a [["from", <int>]] metadata entry.
    for sentence, expected_value in zip(dataset.sentences, [123, 124, 125]):
        assert sentence.get_metadata("from") == expected_value


def test_ontonotes_download():
from urllib.parse import urlparse

Expand Down

0 comments on commit 423bbf8

Please sign in to comment.