# Prepare Combined NER Dataset

This notebook will:

1. Load all your raw JSON files (with `doc` & `entities`) from `data/raw/train/` (relative to the notebook's working directory).  
2. Create a Hugging Face `Dataset` from them.  
3. Load the OntoNotes 5.0 **train** split.  
4. Concatenate the two into one large `Dataset`.  
5. Save it to `data/processed/` (relative to the notebook's working directory) so your training notebook can load it.


In [35]:
import os
import json
from datasets import Dataset, load_dataset, concatenate_datasets
from transformers import AutoTokenizer


In [3]:
# 1-2. Read every *.json file in the raw folder and build a Dataset from them.
raw_folder = "data/raw/train/"
all_examples = []

# os.listdir() returns names in arbitrary order; sorting keeps the row order
# of `existing_ds` stable between runs and machines.
for fname in sorted(os.listdir(raw_folder)):
    if not fname.endswith(".json"):
        continue
    path = os.path.join(raw_folder, fname)
    with open(path, encoding="utf-8") as f:
        data = json.load(f)
    # A file may hold either a list of examples or one example object.
    if isinstance(data, list):
        all_examples.extend(data)
    else:
        all_examples.append(data)

existing_ds = Dataset.from_list(all_examples)
print(f"Loaded {len(existing_ds)} examples from raw JSONs")
existing_ds.features


Loaded 51 examples from raw JSONs


{'domain': Value(dtype='string', id=None),
 'title': Value(dtype='string', id=None),
 'doc': Value(dtype='string', id=None),
 'entities': [{'id': Value(dtype='int64', id=None),
   'mentions': Sequence(feature=Value(dtype='string', id=None), length=-1, id=None),
   'type': Value(dtype='string', id=None)}],
 'triples': [{'head': Value(dtype='string', id=None),
   'relation': Value(dtype='string', id=None),
   'tail': Value(dtype='string', id=None)}],
 'label_set': Sequence(feature=Value(dtype='string', id=None), length=-1, id=None),
 'entity_label_set': Sequence(feature=Value(dtype='string', id=None), length=-1, id=None)}

In [14]:
# 3. Load OntoNotes 5.0 from the "tner/ontonotes5" dataset.
# Only the first 10 rows of the train split are requested here (a small
# sample); each row carries a 'tokens' and a 'tags' column.
onto_split = "train[:10]"
onto_train = load_dataset("tner/ontonotes5", split=onto_split)
print(onto_train)
print(f"Columns: {onto_train.column_names}")


Dataset({
    features: ['tokens', 'tags'],
    num_rows: 10
})
Columns: ['tokens', 'tags']


In [41]:

# Eyeball each sampled row: for the tags and then the tokens, print the
# length followed by the values, so the two lengths can be compared.
for row in onto_train:
    for column in ("tags", "tokens"):
        values = row[column]
        print(len(values))
        print(values)

9
[0, 0, 0, 0, 0, 0, 0, 0, 0]
9
['People', 'start', 'their', 'own', 'businesses', 'for', 'many', 'reasons', '.']
16
[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0]
16
['But', 'a', 'chance', 'to', 'fill', 'out', 'sales', '-', 'tax', 'records', 'is', 'rarely', 'one', 'of', 'them', '.']
9
[0, 0, 0, 0, 0, 0, 0, 0, 0]
9
['Red', 'tape', 'is', 'the', 'bugaboo', 'of', 'small', 'business', '.']
42
[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
42
['Ironically', ',', 'the', 'person', 'who', 'wants', 'to', 'run', 'his', 'or', 'her', 'own', 'business', 'is', 'probably', 'the', 'active', ',', 'results', '-', 'oriented', 'sort', 'most', 'likely', 'to', 'hate', 'meeting', 'the', 'rules', 'and', 'record', '-', 'keeping', 'demands', 'of', 'federal', ',', 'state', 'and', 'local', 'regulators', '.']
25
[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
25
['Yet', 'every', 'business', 'owner', 'has', 

In [36]:
# Tokenizer used by later cells to turn token lists back into strings.
checkpoint = "bert-base-cased"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

In [46]:
# 2) Define the original label2id and build id2label + label_list
# NOTE(review): assumed to match the integer tag ids used by "tner/ontonotes5";
# verify against the dataset's published label file.
label2id = {
    "O": 0,
    "B-CARDINAL": 1,
    "B-DATE": 2,
    "I-DATE": 3,
    "B-PERSON": 4,
    "I-PERSON": 5,
    "B-NORP": 6,
    "B-GPE": 7,
    "I-GPE": 8,
    "B-LAW": 9,
    "I-LAW": 10,
    "B-ORG": 11,
    "I-ORG": 12,
    "B-PERCENT": 13,
    "I-PERCENT": 14,
    "B-ORDINAL": 15,
    "B-MONEY": 16,
    "I-MONEY": 17,
    "B-WORK_OF_ART": 18,
    "I-WORK_OF_ART": 19,
    "B-FAC": 20,
    "B-TIME": 21,
    "I-CARDINAL": 22,
    "B-LOC": 23,
    "B-QUANTITY": 24,
    "I-QUANTITY": 25,
    "I-NORP": 26,
    "I-LOC": 27,
    "B-PRODUCT": 28,
    "I-TIME": 29,
    "B-EVENT": 30,
    "I-EVENT": 31,
    "I-FAC": 32,
    "B-LANGUAGE": 33,
    "I-PRODUCT": 34,
    "I-ORDINAL": 35,
    "I-LANGUAGE": 36
}
# invert
id2label   = {v:k for k,v in label2id.items()}
label_list = [id2label[i] for i in range(len(id2label))]


def _bio_to_entities(tokens, tags, detokenize):
    """Merge BIO-tagged tokens into entity records.

    Args:
        tokens: list of token strings for one example.
        tags: list of BIO label strings, parallel to ``tokens``.
        detokenize: callable turning a list of tokens into a single string;
            used to build each mention's text.

    Returns:
        A list of dicts with keys ``"id"`` (0-based position of the entity
        within this example), ``"type"`` (label without the ``B-`` prefix)
        and ``"mentions"`` (a one-element list holding the mention text).
        An ``I-`` tag that does not continue a preceding ``B-`` span of the
        same type is skipped.
    """
    entities = []
    start = 0
    while start < len(tags):
        if not tags[start].startswith("B-"):
            start += 1
            continue
        etype = tags[start][2:]
        # Extend the span over every immediately following I-<same type> tag.
        end = start + 1
        while end < len(tags) and tags[end] == f"I-{etype}":
            end += 1
        entities.append({
            "id": len(entities),
            "type":   etype,
            "mentions": [detokenize(tokens[start:end])]
        })
        start = end
    return entities


# 3) Convert each OntoNotes example into your JSON schema
# Character offsets are intentionally not computed: offsets obtained by
# re-tokenizing `doc` are indexed by tokenizer pieces, not by OntoNotes
# tokens, so looking them up with a token index is misaligned, and no
# output field depends on them.
output = []
for ex in onto_train:
    tokens = ex["tokens"]
    tags   = [ label_list[tag_id] for tag_id in ex["tags"] ]

    # rebuild the document string from its tokens
    doc = tokenizer.convert_tokens_to_string(tokens)
    entities = _bio_to_entities(tokens, tags, tokenizer.convert_tokens_to_string)

    output.append({"doc": doc, "entities": entities})

# 4) Write out the JSON
out_dir = "data/processed"
# Make sure the target folder exists so a fresh checkout can run this cell.
os.makedirs(out_dir, exist_ok=True)
out_file = os.path.join(out_dir, "onto_sample_10.json")
with open(out_file, "w", encoding="utf-8") as f:
    json.dump(output, f, ensure_ascii=False, indent=2)

print(f"Wrote {len(output)} examples to {out_file}")


Wrote 10 examples to data/processed/onto_sample_10.json


In [6]:
# 4. Stack the raw-JSON dataset and the OntoNotes rows into one Dataset.
# NOTE(review): the two inputs have different columns, so the result carries
# the union of both column sets — confirm the training notebook expects that.
parts = [existing_ds, onto_train]
combined = concatenate_datasets(parts)
print("Combined dataset size: {} examples".format(len(combined)))
print(f"Combined columns: {combined.column_names}")


Combined dataset size: 59975 examples
Combined columns: ['domain', 'title', 'doc', 'entities', 'triples', 'label_set', 'entity_label_set', 'tokens', 'tags']


In [8]:
# 5. Persist the combined dataset so another notebook can load it from disk.
output_path = "data/processed/"
combined.save_to_disk(output_path)
print("Saved combined dataset to " + output_path)


Saving the dataset (1/1 shards): 100%|██████████| 59975/59975 [00:00<00:00, 309313.69 examples/s]

Saved combined dataset to data/processed/



