# Prepare Combined NER Dataset

This notebook will:

1. Load all your raw JSON files (with `doc` & `entities`) from `data/raw/train/`.  
2. Create a Hugging Face `Dataset` from them.  
3. Load the OntoNotes 5.0 **train** split.  
4. Concatenate the two into one large `Dataset`.  
5. Save it to `data/processed/hf_datasets/` so your training notebook can load it.


In [None]:
import os
import json
from datasets import Dataset, load_dataset, concatenate_datasets
from transformers import AutoTokenizer
import matplotlib.pyplot as plt
from collections import Counter

In [None]:
# 1-2. Read every raw JSON file and build a Hugging Face Dataset from it.
raw_folder = "data/raw/train/"
all_examples = []

# os.listdir() returns names in arbitrary order; sorting keeps the row order
# of the resulting dataset reproducible across runs and machines.
for fname in sorted(os.listdir(raw_folder)):
    if not fname.endswith(".json"):
        continue
    path = os.path.join(raw_folder, fname)
    with open(path, encoding="utf-8") as f:
        data = json.load(f)
    # A file may hold either many examples (JSON array) or a single one.
    if isinstance(data, list):
        all_examples.extend(data)
    else:
        all_examples.append(data)

existing_ds = Dataset.from_list(all_examples)
print(f"Loaded {len(existing_ds)} examples from raw JSONs")
# Bare expression: displayed as the cell output when run in a notebook.
existing_ds.features


In [None]:
# 3. Load OntoNotes 5.0 train split (the "tner/ontonotes5" dataset)
# (this contains 'tokens' and 'tags' columns; later cells read both)
onto_train = load_dataset("tner/ontonotes5", split="train")
# Show the dataset summary, then the column names on their own line.
print(onto_train)
print("Columns:", onto_train.column_names)


In [None]:
# Tokenizer for the "bert-base-cased" checkpoint; a later cell calls its
# convert_tokens_to_string() to join word tokens back into text.
tokenizer = AutoTokenizer.from_pretrained("bert-base-cased")

In [None]:
# Tag vocabulary, written out in id order: the position of each name in this
# list is the integer id that the lookup tables below assign to it.
label_list = [
    "O",              # 0
    "B-CARDINAL",     # 1
    "B-DATE",         # 2
    "I-DATE",         # 3
    "B-PERSON",       # 4
    "I-PERSON",       # 5
    "B-NORP",         # 6
    "B-GPE",          # 7
    "I-GPE",          # 8
    "B-LAW",          # 9
    "I-LAW",          # 10
    "B-ORG",          # 11
    "I-ORG",          # 12
    "B-PERCENT",      # 13
    "I-PERCENT",      # 14
    "B-ORDINAL",      # 15
    "B-MONEY",        # 16
    "I-MONEY",        # 17
    "B-WORK_OF_ART",  # 18
    "I-WORK_OF_ART",  # 19
    "B-FAC",          # 20
    "B-TIME",         # 21
    "I-CARDINAL",     # 22
    "B-LOC",          # 23
    "B-QUANTITY",     # 24
    "I-QUANTITY",     # 25
    "I-NORP",         # 26
    "I-LOC",          # 27
    "B-PRODUCT",      # 28
    "I-TIME",         # 29
    "B-EVENT",        # 30
    "I-EVENT",        # 31
    "I-FAC",          # 32
    "B-LANGUAGE",     # 33
    "I-PRODUCT",      # 34
    "I-ORDINAL",      # 35
    "I-LANGUAGE",     # 36
]

# Forward (name -> id) and reverse (id -> name) lookups derived from the list.
label2id = {name: idx for idx, name in enumerate(label_list)}
id2label = {idx: name for name, idx in label2id.items()}

# Convert each OntoNotes example into the raw-JSON schema
# ({"title", "doc", "entities"}).


def _merge_bio_spans(tokens, tags, to_string):
    """Collapse BIO tags into one entity record per B-/I- run.

    Args:
        tokens: Word tokens of one example.
        tags: BIO label names, one per token (e.g. "B-ORG", "I-ORG", "O").
        to_string: Callable that joins a list of tokens into mention text.

    Returns:
        A list of {"id", "type", "mentions"} dicts. Ids count up from 0 in
        order of appearance; "mentions" holds the single surface string.
    """
    entities = []
    i = 0
    while i < len(tags):
        if not tags[i].startswith("B-"):
            # "O", or an "I-" tag with no opening "B-": not a span start.
            i += 1
            continue
        etype = tags[i][2:]
        # Extend the span over the following I- tags of the same type.
        j = i + 1
        while j < len(tags) and tags[j] == f"I-{etype}":
            j += 1
        entities.append({
            "id": len(entities),
            "type": etype,
            "mentions": [to_string(tokens[i:j])],
        })
        i = j
    return entities


output = []
for ex in onto_train:
    tokens = ex["tokens"]
    # Integer tag ids -> label names.
    tags = [label_list[tag_id] for tag_id in ex["tags"]]

    # No character offsets are computed: the output schema only stores the
    # mention text, and tokenizer offsets are per word-piece, so they would
    # not line up with the word-level tags anyway.
    doc = tokenizer.convert_tokens_to_string(tokens)

    output.append(
        {
            "title": "Onto dataset",
            "doc": doc,
            "entities": _merge_bio_spans(
                tokens, tags, tokenizer.convert_tokens_to_string
            ),
        }
    )

# Write the converted OntoNotes examples to disk as a single JSON array.
out_dir = "data/processed"
# open(..., "w") does not create missing parent folders, so make sure the
# target folder exists first (no-op when it is already there).
os.makedirs(out_dir, exist_ok=True)
# NOTE(review): the file name says "sample_10", but every converted example
# is written - confirm the name is still what downstream code expects.
out_file = os.path.join(out_dir, "onto_sample_10.json")
with open(out_file, "w", encoding="utf-8") as f:
    json.dump(output, f, ensure_ascii=False, indent=2)

print(f"Wrote {len(output)} examples to {out_file}")


In [None]:
# 4. Concatenate your data + OntoNotes
# NOTE(review): `onto_train` is the raw OntoNotes split ('tokens'/'tags'),
# not the converted `output` list, so the two inputs presumably do not share
# a schema - confirm this is intended and that the installed `datasets`
# version accepts concatenating datasets with different columns.
combined_ds = concatenate_datasets([existing_ds, onto_train])
print(f"Combined dataset size: {len(combined_ds)} examples")
print("Combined columns:", combined_ds.column_names)


In [None]:
# 5. Save to disk
# Written with Dataset.save_to_disk to `output_path`, the location the
# training notebook is meant to load the combined dataset from.
output_path = "data/processed/hf_datasets/"
combined_ds.save_to_disk(output_path)
print(f"Saved combined dataset to {output_path}")


In [None]:
# Tally how often each entity type occurs across the combined dataset.
entity_type_counts = Counter()

for record in combined_ds:
    # OntoNotes-style rows: integer BIO tags. The loaded split names this
    # column 'tags'; 'labels' is still accepted as a fallback. Values are
    # checked (not just key presence) because a row may carry a column that
    # is simply empty (None) for it.
    tag_ids = record.get('tags')
    if tag_ids is None:
        tag_ids = record.get('labels')
    if tag_ids is not None:
        for tag_id in tag_ids:
            tag = id2label[tag_id]
            # Count each entity once, at its B- (beginning) tag.
            if tag.startswith('B-'):
                entity_type_counts[tag[2:]] += 1

    # Raw-JSON-style rows: a list of entity dicts with a 'type' field.
    entities = record.get('entities')
    if entities is not None:
        for entity in entities:
            entity_type = entity.get('type', '')
            if entity_type:
                entity_type_counts[entity_type] += 1

# Rank entity types by count, highest first (ties keep first-seen order).
ranked = entity_type_counts.most_common()

if not ranked:
    # With an empty tally there is nothing to unpack into types/counts,
    # so skip plotting instead of raising.
    print("No entities found - nothing to plot.")
else:
    types, counts = zip(*ranked)

    # Create bar plot
    plt.figure(figsize=(15, 8))
    bars = plt.bar(types, counts)
    plt.xticks(rotation=45, ha='right')
    plt.title('Frequency of Entity Types in Combined Dataset')
    plt.xlabel('Entity Type')
    plt.ylabel('Count')

    # Add value labels on top of each bar
    for bar in bars:
        height = bar.get_height()
        plt.text(bar.get_x() + bar.get_width()/2., height,
                 f'{int(height)}',
                 ha='center', va='bottom')

    # Adjust layout to prevent label cutoff
    plt.tight_layout()
    plt.show()