In [11]:
from datasets import load_dataset
import json

# Load the EU-ADR dataset
# https://huggingface.co/datasets/bigbio/euadr
dataset = load_dataset("bigbio/euadr", "euadr_bigbio_kb")

# Inspect dataset
print(dataset)

# Take one training sample
train0 = dataset["train"][0]



DatasetDict({
    train: Dataset({
        features: ['id', 'document_id', 'passages', 'entities', 'events', 'coreferences', 'relations'],
        num_rows: 300
    })
})


In [12]:
# -------------------------------
# Utility functions
# -------------------------------

def print_tree(data, prefix=""):
    """
    Pretty-print nested dict/list structure using ASCII branches.
    - Key and value on same line.
    - Lists show first element in detail, then a placeholder like "... (N more)".
    - Strings shown in quotes.
    """
    if isinstance(data, dict):
        keys = list(data.keys())
        for i, k in enumerate(keys):
            connector = "└── " if i == len(keys) - 1 else "├── "
            v = data[k]
            if isinstance(v, (dict, list)):
                print(f"{prefix}{connector}{k}")
                next_prefix = prefix + ("    " if i == len(keys) - 1 else "│   ")
                print_tree(v, next_prefix)
            else:
                val = f'"{v}"' if isinstance(v, str) else str(v)
                if len(val) > 60:
                    val = val[:57] + "..."
                print(f"{prefix}{connector}{k}: {val}")

    elif isinstance(data, list):
        n = len(data)
        if n == 0:
            print(f"{prefix}└── []")
        else:
            # Show first element
            first_connector = "└── " if n == 1 else "├── "
            print(f"{prefix}{first_connector}[0]")
            next_prefix = prefix + ("    " if n == 1 else "│   ")
            print_tree(data[0], next_prefix)

            # If more elements exist, show summary
            if n > 1:
                remaining = n - 1
                print(f"{prefix}└── ... ({remaining} more)")

    else:
        val = f'"{data}"' if isinstance(data, str) else str(data)
        if len(val) > 60:
            val = val[:57] + "..."
        print(f"{prefix}└── {val}")


def parse_to_json(data):
    """Convert any Python object (dict, list, etc.) into a formatted JSON string."""
    return json.dumps(data, indent=2, ensure_ascii=False)


# -------------------------------
# Load dataset and apply functions
# -------------------------------

train0 = dataset["train"][1]
print("\n--- Tree Structure ---")
print_tree(train0)

# print("\n--- JSON Representation ---")
# print(parse_to_json(train0))



--- Tree Structure ---
├── id: "4"
├── document_id: "4"
├── passages
│   ├── [0]
│   │   ├── id: "5"
│   │   ├── type: "title"
│   │   ├── text
│   │   │   └── [0]
│   │   │       └── "Prevention of early postoperative seizures in patients w...
│   │   └── offsets
│   │       └── [0]
│   │           ├── [0]
│   │           │   └── 0
│   │           └── ... (1 more)
│   └── ... (1 more)
├── entities
│   ├── [0]
│   │   ├── id: "7"
│   │   ├── type: "Diseases & Disorders"
│   │   ├── text
│   │   │   └── [0]
│   │   │       └── "early postoperative seizures"
│   │   ├── offsets
│   │   │   └── [0]
│   │   │       ├── [0]
│   │   │       │   └── 14
│   │   │       └── ... (1 more)
│   │   └── normalized
│   │       └── []
│   └── ... (40 more)
├── events
│   └── []
├── coreferences
│   └── []
└── relations
    ├── [0]
    │   ├── id: "30"
    │   ├── type: "SA"
    │   ├── arg1_id: "28"
    │   ├── arg2_id: "29"
    │   └── normalized
    │       └── []
    └── ... (9 more)


In [13]:
import re

def sentence_extract(data, symbol="<\entity><\entity>"):
    passages = data["passages"]
    entities = data["entities"]
    passage_text = "".join([p["text"][0] for p in passages])
    results = []
    sentences = re.split(r'(?<=[.!?])\s+', passage_text)
    for e in entities:
        start, end = e["offsets"][0]
        entity_text = e["text"][0]
        for s in sentences:
            if passage_text.find(s) <= start < passage_text.find(s) + len(s):
                replaced = s.replace(entity_text, symbol)
                results.append({
                    "id": e["id"],
                    "Sentence": replaced,
                    "Label": e["type"]
                })
                break
    return results

extracted = sentence_extract(train0)
for item in extracted:
    print(item)

{'id': '7', 'Sentence': 'Prevention of <\\entity><\\entity> in patients with primary brain tumors: preliminary experience with oxcarbazepine.Early postoperative seizures are defined as those that appear within the first week after surgery and are a well-known and feared complication in patients with supratentorial brain tumors.', 'Label': 'Diseases & Disorders'}
{'id': '8', 'Sentence': 'Prevention of early postoperative seizures in patients with <\\entity><\\entity>: preliminary experience with oxcarbazepine.Early postoperative seizures are defined as those that appear within the first week after surgery and are a well-known and feared complication in patients with supratentorial brain tumors.', 'Label': 'Diseases & Disorders'}
{'id': '9', 'Sentence': 'Prevention of early postoperative seizures in patients with primary brain tumors: preliminary experience with <\\entity><\\entity>.Early postoperative seizures are defined as those that appear within the first week after surgery and are 

  def sentence_extract(data, symbol="<\entity><\entity>"):


In [14]:
import json

def process_dataset(dataset, output_path="../datasets/euadr.json"):
    all_results = []
    for item in dataset:
        results = sentence_extract(item)
        all_results.extend(results)
    with open(output_path, "w", encoding="utf-8") as f:
        json.dump(all_results, f, ensure_ascii=False, indent=2)

process_dataset(dataset["train"])