### Go to the Main Directory

In [None]:
# Change the kernel's working directory to the parent of the current one.
# Presumably this is so the relative paths used by later cells resolve from
# the main directory -- confirm against the repository layout.
# NOTE(review): not idempotent -- each re-run of this cell moves up one more
# level, so restart the kernel before running the notebook again.
%cd ..

### Importing Necessary Libraries

In [1]:
import os 
from datasets import load_from_disk
from dotenv import load_dotenv
from huggingface_hub import login

# Load environment variables via python-dotenv (typically from a `.env` file);
# the return value is bound to `_` because it is not needed.
_ = load_dotenv()

# Read the Hugging Face access token from the environment and authenticate.
# Tokens are created at: https://huggingface.co/settings/tokens
HF_TOKEN = os.environ.get("HF_TOKEN")
login(token=HF_TOKEN)

### Dataset Inspection

In [2]:
# Load the dataset previously saved to disk; the path is relative, so it is
# resolved against the kernel's current working directory.
ee_ner_dataset = load_from_disk("electrical_engineering_ner_dataset")
# Print the summary: each split with its feature names and row count.
print(ee_ner_dataset)

DatasetDict({
    train: Dataset({
        features: ['text', 'tokens', 'ner_tags'],
        num_rows: 12076
    })
    validation: Dataset({
        features: ['text', 'tokens', 'ner_tags'],
        num_rows: 1509
    })
    test: Dataset({
        features: ['text', 'tokens', 'ner_tags'],
        num_rows: 1510
    })
})


In [3]:
# Display the (rows, columns) shape of every split.
ee_ner_dataset.shape

{'train': (12076, 3), 'validation': (1509, 3), 'test': (1510, 3)}

In [4]:
# Inspect the first training example: the raw text, its tokens, and the
# integer NER tag ids aligned with those tokens.
ee_ner_dataset["train"][0]

{'text': 'Using a Multimeter, the technician measured the 10 kΩ resistance of a Copper wire in the circuit.',
 'tokens': ['Using',
  'a',
  'Multimeter',
  ',',
  'the',
  'technician',
  'measured',
  'the',
  '10',
  'kΩ',
  'resistance',
  'of',
  'a',
  'Copper',
  'wire',
  'in',
  'the',
  'circuit',
  '.'],
 'ner_tags': [0, 0, 7, 0, 0, 0, 0, 0, 3, 4, 0, 0, 0, 5, 0, 0, 0, 0, 0]}

In [5]:
# Show the feature definition of `ner_tags`, including the label names that
# the integer tag ids map to.
ee_ner_dataset["train"].features["ner_tags"]

Sequence(feature=ClassLabel(names=['O', 'B-COMPONENT', 'I-COMPONENT', 'B-DESIGN_PARAM', 'I-DESIGN_PARAM', 'B-MATERIAL', 'I-MATERIAL', 'B-EQUIPMENT', 'I-EQUIPMENT', 'B-TECHNOLOGY', 'I-TECHNOLOGY', 'B-SOFTWARE', 'I-SOFTWARE', 'B-STANDARD', 'I-STANDARD', 'B-VENDOR', 'I-VENDOR', 'B-PRODUCT', 'I-PRODUCT'], id=None), length=-1, id=None)

In [6]:
# print() is used so the multi-line description renders with real line breaks
# instead of an escaped string repr.
print(ee_ner_dataset["train"].description)

This dataset contains annotated sentences from the electrical engineering domain.
It includes annotations for components, parameters, materials, equipment, technologies,
software, standards, vendors, and products using BIO tagging scheme.

Tags:
- B/I-COMPONENT: Electronic parts (resistors, capacitors, etc.)
- B/I-DESIGN_PARAM: Measurements and parameters
- B/I-MATERIAL: Materials used in electronics
- B/I-EQUIPMENT: Test and measurement equipment
- B/I-TECHNOLOGY: Systems and architectures
- B/I-SOFTWARE: Software tools
- B/I-STANDARD: Protocols and standards
- B/I-VENDOR: Manufacturer names
- B/I-PRODUCT: Product names
- O: Non-entity tokens

Note: This dataset is created using gpt-4o-mini and is only for research and education purposes. For production, it is highly recommended to use human-annotated data.



### Uploading Dataset to HuggingFace Hub

In [7]:
# Publish all splits of the dataset to the Hugging Face Hub.
#   repo_id: target repository in "<namespace>/<name>" form, e.g. "johndoe/electrical-ner".
#   private: False publishes a public repository; set to True for a private one.
# The call is left as the last expression so the returned commit info is displayed.
ee_ner_dataset.push_to_hub(repo_id="disham993/ElectricalNER", private=False)

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/13 [00:00<?, ?ba/s]

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/2 [00:00<?, ?ba/s]

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/2 [00:00<?, ?ba/s]

CommitInfo(commit_url='https://huggingface.co/datasets/disham993/ElectricalNER/commit/d41536bde2ffe5b7bf559f15f6a743aac6e6c421', commit_message='Upload dataset', commit_description='', oid='d41536bde2ffe5b7bf559f15f6a743aac6e6c421', pr_url=None, repo_url=RepoUrl('https://huggingface.co/datasets/disham993/ElectricalNER', endpoint='https://huggingface.co', repo_type='dataset', repo_id='disham993/ElectricalNER'), pr_revision=None, pr_num=None)