In [1]:
from datasets import load_dataset
import json

# GAD dataset, loaded with the "gad_blurb_bigbio_text" configuration.
# Dataset card: https://huggingface.co/datasets/bigbio/gad
dataset = load_dataset("bigbio/gad", "gad_blurb_bigbio_text")

# Inspect dataset: prints the available splits with their features and row counts.
print(dataset)

# Take one training sample (index 0).
# NOTE: `train0` is rebound to a different sample (index 45) in a later cell.
train0 = dataset["train"][0]


DatasetDict({
    train: Dataset({
        features: ['id', 'document_id', 'text', 'labels'],
        num_rows: 4261
    })
    validation: Dataset({
        features: ['id', 'document_id', 'text', 'labels'],
        num_rows: 535
    })
    test: Dataset({
        features: ['id', 'document_id', 'text', 'labels'],
        num_rows: 534
    })
})


In [5]:
# -------------------------------
# Utility functions
# -------------------------------

def _leaf_text(value):
    """Render a scalar for display: strings are quoted, text over 60 chars is cut to 57 + '...'."""
    rendered = f'"{value}"' if isinstance(value, str) else str(value)
    if len(rendered) <= 60:
        return rendered
    return rendered[:57] + "..."


def print_tree(data, prefix=""):
    """
    Print a nested dict/list structure as a tree drawn with box-drawing branches.

    - Scalar values are printed on the same line as their key.
    - For a list, only the first element is expanded; the rest are
      summarised as "... (N more)". An empty list prints "[]".
    - Strings are wrapped in double quotes; rendered values longer than
      60 characters are truncated with a trailing "...".

    Args:
        data: The object to print (dict, list, or any scalar).
        prefix: Text placed before every printed line; recursion uses it
            to carry the indentation of the enclosing levels.
    """
    tee, elbow = "├── ", "└── "
    pipe_pad, blank_pad = "│   ", "    "

    # Scalars end the recursion with a single terminal branch.
    if not isinstance(data, (dict, list)):
        print(f"{prefix}{elbow}{_leaf_text(data)}")
        return

    if isinstance(data, dict):
        final = len(data) - 1
        for idx, (key, value) in enumerate(data.items()):
            is_last = idx == final
            branch = elbow if is_last else tee
            if isinstance(value, (dict, list)):
                # Containers get their own line, then their content one level deeper.
                print(f"{prefix}{branch}{key}")
                print_tree(value, prefix + (blank_pad if is_last else pipe_pad))
            else:
                print(f"{prefix}{branch}{key}: {_leaf_text(value)}")
        return

    # From here on `data` is a list.
    if not data:
        print(f"{prefix}{elbow}[]")
        return

    only_one = len(data) == 1
    print(f"{prefix}{elbow if only_one else tee}[0]")
    print_tree(data[0], prefix + (blank_pad if only_one else pipe_pad))
    if not only_one:
        # Remaining elements are summarised rather than expanded.
        print(f"{prefix}{elbow}... ({len(data) - 1} more)")


def parse_to_json(data):
    """
    Serialise a JSON-compatible Python object to a pretty-printed string.

    Uses a two-space indent and keeps non-ASCII characters as-is instead
    of escaping them.
    """
    dump_options = {"indent": 2, "ensure_ascii": False}
    return json.dumps(data, **dump_options)


# -------------------------------
# Apply the utility functions to one sample
# -------------------------------
# Relies on `dataset` from the first cell; nothing is loaded here.

# Rebinds `train0` (previously the sample at index 0) to the training sample at
# index 45. The later `sentence_extract(train0)` call uses this binding.
train0 = dataset["train"][45]
print("\n--- Tree Structure ---")
print_tree(train0)

print("\n--- JSON Representation ---")
print(parse_to_json(train0))



--- Tree Structure ---
├── id: "45"
├── document_id: "45"
├── text: "The interaction of @GENE$ and ADD3 gene variants in huma...
└── labels
    └── [0]
        └── "0"

--- JSON Representation ---
{
  "id": "45",
  "document_id": "45",
  "text": "The interaction of @GENE$ and ADD3 gene variants in humans is statistically associated with variation in @DISEASE$ pressure, suggesting the presence of epistatic effects among these loci.",
  "labels": [
    "0"
  ]
}


In [None]:
import re

def sentence_extract(data, symbol="<\\entity>", replace=False):
    """
    Rewrite the ``@NAME$`` entity markers in one sample's text.

    Args:
        data: Mapping with a required "text" key and optional "id" and
            "labels" keys.
        symbol: Tag used to mark entities.
        replace: If True, each ``@NAME$`` marker is replaced by ``symbol``
            alone. If False, the marker becomes ``symbol + NAME + symbol``.

    Returns:
        A one-element list ``[{"id", "Sentence", "Label"}]`` where "Label"
        is the first entry of ``data["labels"]`` (or None when labels are
        missing or empty), or an empty list when the text contains no marker.

    Raises:
        KeyError: If ``data`` has no "text" key.
    """
    def _mark(match):
        # A callable replacement is inserted verbatim, so backslashes in
        # `symbol` (the default contains one) are never read as regex escapes.
        if replace:
            return symbol
        return f"{symbol}{match.group(1)}{symbol}"

    # Substitute in one left-to-right pass. Replacing marker by marker with
    # str.replace re-scans the text for every match and can rewrite the
    # inside of a later marker that happens to contain an earlier one.
    new_s, n_markers = re.subn(r"@(.+?)\$", _mark, data["text"])
    if n_markers == 0:
        return []

    labels = data.get("labels")
    return [{
        "id": data.get("id"),
        "Sentence": new_s,
        "Label": labels[0] if labels else None,
    }]

# Label 1 means the marked gene and disease have a positive relationship;
# label 0 means no relationship.

# Demo on the sample currently bound to `train0` (set in an earlier cell).
sentence_extract(train0)

[{'id': '45',
  'Sentence': 'The interaction of <\\entity>GENE<\\entity> and ADD3 gene variants in humans is statistically associated with variation in <\\entity>DISEASE<\\entity> pressure, suggesting the presence of epistatic effects among these loci.',
  'Label': '0'}]

In [14]:
import json

def process_dataset(dataset, output_path="../datasets/gad.json"):
    """
    Run `sentence_extract` over every item and write the results as JSON.

    Items for which `sentence_extract` returns an empty list contribute
    nothing to the output.

    Args:
        dataset: Iterable of samples accepted by `sentence_extract`.
        output_path: Destination file. It is overwritten if it exists, and
            its parent directory is created when missing.

    Returns:
        None. The result is only written to `output_path`.
    """
    import os  # local import so this cell does not depend on an import elsewhere

    all_results = [row for item in dataset for row in sentence_extract(item)]

    # The default path points into a sibling directory that may not exist yet;
    # open(..., "w") would raise FileNotFoundError in that case.
    parent_dir = os.path.dirname(output_path)
    if parent_dir:
        os.makedirs(parent_dir, exist_ok=True)

    with open(output_path, "w", encoding="utf-8") as f:
        json.dump(all_results, f, ensure_ascii=False, indent=2)

# Export only the training split, to the default output path "../datasets/gad.json".
process_dataset(dataset["train"])