# Data Tokenization

### Changing to the main directory

In [None]:
# Step one directory up from the current working directory.
# NOTE(review): this cell is not idempotent — each re-run moves one level
# further up, so execute it exactly once per kernel session.
%cd ..

### Importing Necessary Libraries

In [2]:
import os 
from transformers import BertTokenizerFast
from datasets import load_dataset

from utilities import MODEL_ID, DATASET_ID, OUTPUT_DATASET_PATH

# Expose only the GPU with index 1 to CUDA-aware libraries used from this kernel.
os.environ.update(CUDA_VISIBLE_DEVICES="1")

### Loading the Dataset

Loading the electrical NER dataset, which contains text, tokens and corresponding entity tags.

In [3]:
# Load all splits of the dataset identified by DATASET_ID.
# NOTE(review): trust_remote_code=True permits code shipped with the dataset
# repository to execute locally — keep it only for sources you trust.
electrical_ner_dataset = load_dataset(DATASET_ID, trust_remote_code=True)
print(electrical_ner_dataset)

DatasetDict({
    train: Dataset({
        features: ['text', 'tokens', 'ner_tags'],
        num_rows: 12076
    })
    validation: Dataset({
        features: ['text', 'tokens', 'ner_tags'],
        num_rows: 1509
    })
    test: Dataset({
        features: ['text', 'tokens', 'ner_tags'],
        num_rows: 1510
    })
})


In [4]:
# (num_rows, num_columns) for each split.
electrical_ner_dataset.shape

{'train': (12076, 3), 'validation': (1509, 3), 'test': (1510, 3)}

In [None]:
# Peek at the first training record: the raw text, its word-level tokens,
# and one integer NER tag per token.
electrical_ner_dataset["train"][0]

{'text': 'Using a Multimeter, the technician measured the 10 kΩ resistance of a Copper wire in the circuit.',
 'tokens': ['Using',
  'a',
  'Multimeter',
  ',',
  'the',
  'technician',
  'measured',
  'the',
  '10',
  'kΩ',
  'resistance',
  'of',
  'a',
  'Copper',
  'wire',
  'in',
  'the',
  'circuit',
  '.'],
 'ner_tags': [0, 0, 7, 0, 0, 0, 0, 0, 3, 4, 0, 0, 0, 5, 0, 0, 0, 0, 0]}

In [6]:
# Show the ClassLabel feature that maps integer tag ids to their tag names.
print(electrical_ner_dataset["train"].features["ner_tags"])

Sequence(feature=ClassLabel(names=['O', 'B-COMPONENT', 'I-COMPONENT', 'B-DESIGN_PARAM', 'I-DESIGN_PARAM', 'B-MATERIAL', 'I-MATERIAL', 'B-EQUIPMENT', 'I-EQUIPMENT', 'B-TECHNOLOGY', 'I-TECHNOLOGY', 'B-SOFTWARE', 'I-SOFTWARE', 'B-STANDARD', 'I-STANDARD', 'B-VENDOR', 'I-VENDOR', 'B-PRODUCT', 'I-PRODUCT'], id=None), length=-1, id=None)


In [7]:
# Print the description metadata attached to the training split.
print(electrical_ner_dataset['train'].description)




### Tokenize and align labels

In [8]:
# Load the tokenizer for MODEL_ID. The alignment function defined in the next
# code cell reads this global and relies on `word_ids()` of the encoding it returns.
tokenizer = BertTokenizerFast.from_pretrained(MODEL_ID)

Function to tokenize text and align NER labels with tokenized subwords.

In [9]:
def tokenize_and_align_labels(examples, label_all_tokens=True):
    """
    Tokenize a batch of pre-split sentences and expand the word-level NER tags
    so that there is exactly one label per produced token.

    Uses the module-level `tokenizer`.

    Parameters:
    examples (dict): A batch in dict-of-lists layout.
                     - "tokens": one list of words per sentence.
                     - "ner_tags": one list of integer tags per sentence,
                       parallel to "tokens".

    label_all_tokens (bool): How to label the second and later sub-tokens of a word.
                             If True (default), they receive the same tag id as the
                             word itself (so a B- tag id is repeated on every piece).
                             If False, they receive -100.

    Returns:
    tokenized_inputs: The tokenizer output for the batch with an added "labels"
                      entry holding one aligned label list per sentence.
    """
    encodings = tokenizer(examples["tokens"], truncation=True, is_split_into_words=True)

    aligned_labels = []
    for batch_idx, word_tags in enumerate(examples["ner_tags"]):
        token_labels = []
        last_word_id = None
        # word_ids() gives, for every token of this sentence, the index of the
        # source word it was produced from (None for tokens with no source word).
        for word_id in encodings.word_ids(batch_index=batch_idx):
            if word_id is None:
                # Special tokens (e.g. [CLS]) have no source word: label them
                # -100 so that they are ignored in the loss function.
                token_labels.append(-100)
            elif word_id == last_word_id and not label_all_tokens:
                # Continuation piece of the previous word, and only the first
                # piece of each word is supposed to carry a label.
                token_labels.append(-100)
            else:
                # Either the first piece of a new word, or a continuation piece
                # while label_all_tokens is on: use the word's own tag.
                token_labels.append(word_tags[word_id])
            last_word_id = word_id
        aligned_labels.append(token_labels)

    encodings["labels"] = aligned_labels
    return encodings

Displaying tokens and their corresponding aligned NER labels after tokenization.

In [10]:
# Sanity-check the alignment on a single example. The 4:5 slice (rather than
# plain index 4) keeps the batched dict-of-lists layout the function expects.
q = tokenize_and_align_labels(examples=electrical_ner_dataset['train'][4:5])
print(q)

{'input_ids': [[101, 2000, 11598, 1996, 8122, 1997, 1996, 2373, 22686, 1010, 2057, 7528, 1037, 12247, 7077, 2478, 2019, 6728, 1011, 23713, 1998, 7594, 1996, 6434, 2783, 2012, 23842, 1012, 102]], 'token_type_ids': [[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]], 'attention_mask': [[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]], 'labels': [[-100, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 2, 0, 0, 0, 0, 3, 0, 3, 0, -100]]}


In [11]:
# Turn the ids of the sample sentence back into token strings, then list each
# token (padded with underscores to 40 columns) next to its aligned label.
sample_tokens = tokenizer.convert_ids_to_tokens(q["input_ids"][0])
sample_labels = q["labels"][0]
for token, label in zip(sample_tokens, sample_labels):
    print(f"{token:_<40} {label}")

[CLS]___________________________________ -100
to______________________________________ 0
enhance_________________________________ 0
the_____________________________________ 0
efficiency______________________________ 0
of______________________________________ 0
the_____________________________________ 0
power___________________________________ 0
amplifier_______________________________ 1
,_______________________________________ 0
we______________________________________ 0
implemented_____________________________ 0
a_______________________________________ 0
feedback________________________________ 0
loop____________________________________ 0
using___________________________________ 0
an______________________________________ 0
op______________________________________ 1
-_______________________________________ 0
amp_____________________________________ 2
and_____________________________________ 0
measured________________________________ 0
the_____________________________________ 0
output__

Applying the tokenize_and_align_labels function to the entire dataset for consistent processing.

In [None]:
# Apply the alignment to every split. With batched=True the function receives
# a batch of examples per call; label_all_tokens keeps its default (True).
tokenized_datasets = electrical_ner_dataset.map(tokenize_and_align_labels, batched=True)

### Saving the Tokenized Dataset

Saving the tokenized dataset to disk for reuse in model training and evaluation.

In [None]:
# Write the tokenized splits to OUTPUT_DATASET_PATH.
tokenized_datasets.save_to_disk(OUTPUT_DATASET_PATH)