In [34]:
import json

from typing import Iterator

from datasets import load_dataset

In [35]:
def read_jsonl(file_path: str) -> Iterator[dict]:
    """Lazily read a JSON Lines file, yielding one parsed object per line.

    Args:
        file_path: Path to a UTF-8 encoded file holding one JSON document per line.

    Yields:
        The value decoded by ``json.loads`` for each non-blank line, in file order.

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    with open(file_path, "r", encoding="utf-8") as f:
        for line in f:
            # Skip blank / whitespace-only lines (e.g. a trailing empty line)
            # instead of crashing in json.loads on an empty document.
            if not line.strip():
                continue
            yield json.loads(line)


def write_jsonl(file_path: str, data: Iterator[dict]):
    """Serialize every record of ``data`` as one JSON line in ``file_path``.

    The target file is opened in write mode (existing content is replaced) with
    UTF-8 encoding. Non-ASCII characters are written as-is rather than escaped,
    and each record is terminated by a newline.
    """
    with open(file_path, "w", encoding="utf-8") as out:
        for record in data:
            serialized = json.dumps(record, ensure_ascii=False)
            out.write(f"{serialized}\n")

In [36]:
# Load the dataset identified by "iluvvatar/RuNNE" via `datasets.load_dataset`;
# later cells read the "text" and "entities" columns of its "train" and "test" splits.
dataset = load_dataset("iluvvatar/RuNNE")

In [37]:
# Concatenate the train and test splits into plain Python lists.
# list(...) makes this independent of what `dataset[split][column]` returns:
# depending on the installed `datasets` version it may be a lazy column object
# rather than a list (NOTE(review): confirm for your version), and both the `+`
# here and the `.append` calls in a later cell need real lists.
dataset_text = list(dataset["train"]["text"]) + list(dataset["test"]["text"])
dataset_entities = list(dataset["train"]["entities"]) + list(dataset["test"]["entities"])

# Process dataset_entities
# Each entity is a whitespace-separated string of exactly three fields,
# "<begin> <end> <label>"; convert it to [int(begin), int(end), label].
# A string with a different number of fields raises ValueError on unpacking.
processed_dataset_entities = [
    [
        [int(beg), int(end), label]
        for beg, end, label in (ner.split() for ner in entities)
    ]
    for entities in dataset_entities
]

dataset_entities = processed_dataset_entities

In [38]:
# Merge with train dataset
# Samples from train_sm.jsonl are appended only when their first
# TEXT_HEAD_LENGTH characters do not match the head of any text already loaded
# (a cheap prefix-based duplicate check).
TEXT_HEAD_LENGTH = 16

dataset_text_heads = [text[:TEXT_HEAD_LENGTH] for text in dataset_text]
# Set copy for O(1) membership tests instead of scanning the list per sample.
# Assumes sample["sentences"] is a string (hashable slice), like the texts above.
known_text_heads = set(dataset_text_heads)

for sample in read_jsonl("train_sm.jsonl"):
    # The set is intentionally not updated inside the loop, so the check is
    # only against the texts present before merging (same as a list snapshot).
    if sample["sentences"][:TEXT_HEAD_LENGTH] not in known_text_heads:
        dataset_text.append(sample["sentences"])
        dataset_entities.append(sample["ners"])

In [39]:
# Pair each text with its entity list (positionally, via zip) and wrap the pair
# in a record with the keys "sentences" and "ners".
processed_data = [
    {"sentences": text, "ners": entities}
    for text, entities in zip(dataset_text, dataset_entities)
]

In [40]:
# Write the merged records to train.jsonl, one JSON object per line
# (an existing file at that path is overwritten).
write_jsonl("train.jsonl", processed_data)