### Importing Necessary Libraries

In [1]:
import ast
import glob
import json
import os

import pandas as pd

### Loading the generated CSV files and combining into a single DataFrame

In [2]:
def combine_csv_files_concise(directory='./ner_datasets'):
    """Load every raw NER batch CSV in ``directory`` into one DataFrame.

    Files matching ``ner_dataset_raw_batch_*.csv`` are read in sorted
    filename order and concatenated with a fresh 0..n-1 index.

    Args:
        directory: Folder that contains the batch CSV files.

    Returns:
        pandas.DataFrame holding the rows of all matching files.

    Raises:
        ValueError: If no matching file exists in ``directory``.
    """
    pattern = os.path.join(directory, 'ner_dataset_raw_batch_*.csv')
    # glob returns paths in arbitrary, filesystem-dependent order. Sorting keeps
    # the row order stable, which the seeded shuffle and splits later in the
    # notebook rely on to be reproducible.
    files = sorted(glob.glob(pattern))
    if not files:
        # Without this guard pd.concat fails with "No objects to concatenate",
        # which does not tell the reader which path was searched.
        raise ValueError(f"No files matching {pattern!r} were found")
    return pd.concat([pd.read_csv(f) for f in files], ignore_index=True)

# Load all batch files from the default './ner_datasets' directory into one frame.
df = combine_csv_files_concise()

In [3]:
# Display the combined DataFrame.
df

Unnamed: 0,text,tokens,ner_tags
0,"To enhance the performance of the circuit, the...","['To', 'enhance', 'the', 'performance', 'of', ...","['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', ..."
1,"Using a Tektronix oscilloscope, the engineer m...","['Using', 'a', 'Tektronix', 'oscilloscope', ',...","['O', 'O', 'B-VENDOR', 'B-EQUIPMENT', 'O', 'O'..."
2,"While debugging the circuit, the engineer used...","['While', 'debugging', 'the', 'circuit', ',', ...","['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', ..."
3,To verify the functionality of the Arduino boa...,"['To', 'verify', 'the', 'functionality', 'of',...","['O', 'O', 'O', 'O', 'O', 'O', 'B-PRODUCT', 'O..."
4,"Using a spectrum analyzer, the technician test...","['Using', 'a', 'spectrum', 'analyzer', ',', 't...","['O', 'O', 'B-EQUIPMENT', 'I-EQUIPMENT', 'O', ..."
...,...,...,...
15090,"To improve the circuit's reliability, we utili...","['To', 'improve', 'the', ""circuit's"", 'reliabi...","['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', ..."
15091,While testing the circuit with a 1000Ω resisto...,"['While', 'testing', 'the', 'circuit', 'with',...","['O', 'O', 'O', 'O', 'O', 'O', 'B-DESIGN_PARAM..."
15092,"Using a Tektronix oscilloscope, the engineer m...","['Using', 'a', 'Tektronix', 'oscilloscope', ',...","['O', 'O', 'B-VENDOR', 'B-EQUIPMENT', 'O', 'O'..."
15093,"Using an Oscilloscope, she measured the 10V ou...","['Using', 'an', 'Oscilloscope', ',', 'she', 'm...","['O', 'O', 'B-EQUIPMENT', 'O', 'O', 'O', 'O', ..."


In [4]:
# Count rows whose 'text' repeats an earlier row.
# NOTE(review): no later visible cell drops these rows, so the same sentence may
# end up in more than one split - confirm this is intended.
df['text'].duplicated().sum()

233

In [5]:
# Preview the first three rows.
df.head(n=3)

Unnamed: 0,text,tokens,ner_tags
0,"To enhance the performance of the circuit, the...","['To', 'enhance', 'the', 'performance', 'of', ...","['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', ..."
1,"Using a Tektronix oscilloscope, the engineer m...","['Using', 'a', 'Tektronix', 'oscilloscope', ',...","['O', 'O', 'B-VENDOR', 'B-EQUIPMENT', 'O', 'O'..."
2,"While debugging the circuit, the engineer used...","['While', 'debugging', 'the', 'circuit', ',', ...","['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', ..."


### Converting the DataFrame to HuggingFace Dataset

In [6]:
# BIO label -> integer id table. A label's position in the list is its id:
# "O" is 0, followed by a B-/I- pair for each entity type.
label2id = {
    label: idx
    for idx, label in enumerate([
        "O",
        "B-COMPONENT", "I-COMPONENT",
        "B-DESIGN_PARAM", "I-DESIGN_PARAM",
        "B-MATERIAL", "I-MATERIAL",
        "B-EQUIPMENT", "I-EQUIPMENT",
        "B-TECHNOLOGY", "I-TECHNOLOGY",
        "B-SOFTWARE", "I-SOFTWARE",
        "B-STANDARD", "I-STANDARD",
        "B-VENDOR", "I-VENDOR",
        "B-PRODUCT", "I-PRODUCT",
    ])
}


def convert_tags_to_ids(tags):
    """Translate a sequence of tag strings into their integer ids.

    Any tag that is not a key of ``label2id`` is mapped to 0, the id of "O".
    """
    ids = []
    for tag in tags:
        ids.append(label2id.get(tag, 0))
    return ids

# Parse the stringified tag lists read back from CSV. ast.literal_eval only
# accepts Python literals, so file contents can never run as code (unlike the
# eval() call it replaces).
tag_lists = df['ner_tags'].apply(ast.literal_eval)

# convert_tags_to_ids maps unknown tags to 0 ("O") without complaint; report
# them here so mislabeled tags are visible instead of quietly becoming "O".
unknown_tags = {tag for tags in tag_lists for tag in tags if tag not in label2id}
if unknown_tags:
    print("Warning: tags not in label2id were mapped to 0:", sorted(unknown_tags, key=repr))

# Replace the string tags with their numeric ids in a single assignment, so a
# failure above leaves df['ner_tags'] untouched.
df['ner_tags'] = tag_lists.apply(convert_tags_to_ids)

# Print unique tags to verify conversion
unique_tags = {tag for tags in df['ner_tags'] for tag in tags}
print("Unique numeric tags in dataset:", sorted(unique_tags))

Unique numeric tags in dataset: [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18]


Shuffling is critical for the quality, robustness, and generalizability of models trained on the dataset. The combined DataFrame still follows the order of the original batch files, so the rows are shuffled with a fixed `random_state` to remove that ordering bias reproducibly and to prepare the dataset for realistic evaluation and deployment scenarios.

In [7]:
# Shuffle all rows with a fixed seed and renumber the index from 0.
# `df` is rebound, so re-running this cell shuffles the already-shuffled frame.
df = df.sample(frac=1, random_state=42).reset_index(drop=True)

In [8]:
# Display the shuffled frame; 'ner_tags' now holds numeric ids.
df

Unnamed: 0,text,tokens,ner_tags
0,"Using a spectrum analyzer, we measured the fre...","['Using', 'a', 'spectrum', 'analyzer', ',', 'w...","[0, 0, 7, 8, 0, 0, 0, 0, 3, 0, 0, 0, 0, 0, 0, ..."
1,"To analyze the circuit's performance, the engi...","['To', 'analyze', 'the', ""circuit's"", 'perform...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 15, 7, 0, 0, 0,..."
2,The team tested the circuit board using a Tekt...,"['The', 'team', 'tested', 'the', 'circuit', 'b...","[0, 0, 0, 0, 0, 0, 0, 0, 15, 7, 0, 0, 0, 3, 0,..."
3,"During the testing phase, the team utilized a ...","['During', 'the', 'testing', 'phase', ',', 'th...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 7, 0, 0, 0, 3, ..."
4,"To enhance the efficiency of the circuit, we a...","['To', 'enhance', 'the', 'efficiency', 'of', '...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 1, 0, 0, ..."
...,...,...,...
15090,"During the installation of the FPGA module, we...","['During', 'the', 'installation', 'of', 'the',...","[0, 0, 0, 0, 0, 9, 0, 0, 0, 0, 0, 3, 0, 0, 0, ..."
15091,"To improve the circuit performance, we analyze...","['To', 'improve', 'the', 'circuit', 'performan...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 0, 0, 5, 0, 0, ..."
15092,"To ensure optimal performance, the circuit des...","['To', 'ensure', 'optimal', 'performance', 'th...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 4, 1, 0, 0, 0, ..."
15093,"During the troubleshooting phase, I used a Mul...","['During', 'the', 'troubleshooting', 'phase', ...","[0, 0, 0, 0, 0, 0, 0, 0, 7, 0, 0, 0, 3, 0, 0, ..."


### Defining Dataset Features and Train/Validation/Test Splits

In [9]:
from datasets import Dataset, DatasetDict, Features, ClassLabel, Sequence, Value, DatasetInfo

# Label names indexed by class id. Every entry needs its own trailing comma:
# two adjacent string literals are silently concatenated by Python, which
# would shrink the list and shift every later id.
label_names = [
    "O",
    "B-COMPONENT", "I-COMPONENT",
    "B-DESIGN_PARAM", "I-DESIGN_PARAM",
    "B-MATERIAL", "I-MATERIAL",
    "B-EQUIPMENT", "I-EQUIPMENT",
    "B-TECHNOLOGY", "I-TECHNOLOGY",
    "B-SOFTWARE", "I-SOFTWARE",
    "B-STANDARD", "I-STANDARD",
    "B-VENDOR", "I-VENDOR",
    "B-PRODUCT", "I-PRODUCT",
]
# The ids stored in df['ner_tags'] were produced from label2id, so the names
# declared here must list the same labels in the same id order.
assert label_names == sorted(label2id, key=label2id.get), \
    "label_names is out of sync with label2id"

# Define features
features = Features({
    'text': Value('string'),
    'tokens': Sequence(Value('string')),
    'ner_tags': Sequence(ClassLabel(num_classes=len(label_names), names=label_names))
})

# Add info about the dataset
description = """This dataset contains annotated sentences from the electrical engineering domain.
It includes annotations for components, parameters, materials, equipment, technologies,
software, standards, vendors, and products using BIO tagging scheme.

Tags:
- B/I-COMPONENT: Electronic parts (resistors, capacitors, etc.)
- B/I-DESIGN_PARAM: Measurements and parameters
- B/I-MATERIAL: Materials used in electronics
- B/I-EQUIPMENT: Test and measurement equipment
- B/I-TECHNOLOGY: Systems and architectures
- B/I-SOFTWARE: Software tools
- B/I-STANDARD: Protocols and standards
- B/I-VENDOR: Manufacturer names
- B/I-PRODUCT: Product names
- O: Non-entity tokens

Note: This dataset is created using gpt-4o-mini and is only for research and education purposes. For production, it is highly recommended to use human-annotated data.
"""

# Create a DatasetInfo object; it carries both the description and the features
# into the Dataset built below.
my_dataset_info = DatasetInfo(
    description=description,
    dataset_name="Electrical Engineering Named Entity Recognition Dataset",
    features=features,
)

# Create dataset with features. The 'tokens' column is still the string form
# read from CSV; ast.literal_eval parses it without executing file contents.
dataset = Dataset.from_dict(
    {
        'text': df['text'].tolist(),
        'tokens': df['tokens'].apply(ast.literal_eval).tolist(),
        'ner_tags': df['ner_tags'].tolist()
    },
    info=my_dataset_info
)

# Create splits: hold out 20% of the rows, then halve the held-out part into
# validation and test (80/10/10 overall), with fixed seeds.
dataset = dataset.train_test_split(test_size=0.2, seed=42)
test_valid = dataset['test'].train_test_split(test_size=0.5, seed=42)

# Create final dataset dictionary
hf_dataset = DatasetDict({
    'train': dataset['train'],
    'validation': test_valid['train'],
    'test': test_valid['test']
})

# Print to verify
print("Features:", hf_dataset['train'].features)

Features: {'text': Value(dtype='string', id=None), 'tokens': Sequence(feature=Value(dtype='string', id=None), length=-1, id=None), 'ner_tags': Sequence(feature=ClassLabel(names=['O', 'B-COMPONENT', 'I-COMPONENT', 'B-DESIGN_PARAM', 'I-DESIGN_PARAM', 'B-MATERIAL', 'I-MATERIAL', 'B-EQUIPMENT', 'I-EQUIPMENT', 'B-TECHNOLOGY', 'I-TECHNOLOGY', 'B-SOFTWARE', 'I-SOFTWARE', 'B-STANDARD', 'I-STANDARD', 'B-VENDOR', 'I-VENDOR', 'B-PRODUCT', 'I-PRODUCT'], id=None), length=-1, id=None)}


In [10]:
# Inspect the first training example.
hf_dataset["train"][0]

{'text': 'Using a Multimeter, the technician measured the 10 kΩ resistance of a Copper wire in the circuit.',
 'tokens': ['Using',
  'a',
  'Multimeter',
  ',',
  'the',
  'technician',
  'measured',
  'the',
  '10',
  'kΩ',
  'resistance',
  'of',
  'a',
  'Copper',
  'wire',
  'in',
  'the',
  'circuit',
  '.'],
 'ner_tags': [0, 0, 7, 0, 0, 0, 0, 0, 3, 4, 0, 0, 0, 5, 0, 0, 0, 0, 0]}

In [11]:
# Inspect the feature definition attached to the 'ner_tags' column.
hf_dataset["train"].features["ner_tags"]

Sequence(feature=ClassLabel(names=['O', 'B-COMPONENT', 'I-COMPONENT', 'B-DESIGN_PARAM', 'I-DESIGN_PARAM', 'B-MATERIAL', 'I-MATERIAL', 'B-EQUIPMENT', 'I-EQUIPMENT', 'B-TECHNOLOGY', 'I-TECHNOLOGY', 'B-SOFTWARE', 'I-SOFTWARE', 'B-STANDARD', 'I-STANDARD', 'B-VENDOR', 'I-VENDOR', 'B-PRODUCT', 'I-PRODUCT'], id=None), length=-1, id=None)

In [12]:
# Print the description stored with the training split.
print(hf_dataset['train'].description)

This dataset contains annotated sentences from the electrical engineering domain.
It includes annotations for components, parameters, materials, equipment, technologies,
software, standards, vendors, and products using BIO tagging scheme.

Tags:
- B/I-COMPONENT: Electronic parts (resistors, capacitors, etc.)
- B/I-DESIGN_PARAM: Measurements and parameters
- B/I-MATERIAL: Materials used in electronics
- B/I-EQUIPMENT: Test and measurement equipment
- B/I-TECHNOLOGY: Systems and architectures
- B/I-SOFTWARE: Software tools
- B/I-STANDARD: Protocols and standards
- B/I-VENDOR: Manufacturer names
- B/I-PRODUCT: Product names
- O: Non-entity tokens

Note: This dataset is created using gpt-4o-mini and is only for research and education purposes. For production, it is highly recommended to use human-annotated data.



In [13]:
# Show the splits with their columns and row counts.
hf_dataset

DatasetDict({
    train: Dataset({
        features: ['text', 'tokens', 'ner_tags'],
        num_rows: 12076
    })
    validation: Dataset({
        features: ['text', 'tokens', 'ner_tags'],
        num_rows: 1509
    })
    test: Dataset({
        features: ['text', 'tokens', 'ner_tags'],
        num_rows: 1510
    })
})

### Saving the Dataset

In [14]:
# Persist hf_dataset (train, validation and test splits) under the relative
# path "electrical_engineering_ner_dataset".
hf_dataset.save_to_disk("electrical_engineering_ner_dataset")

Saving the dataset (0/1 shards):   0%|          | 0/12076 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/1509 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/1510 [00:00<?, ? examples/s]