# Data Preprocessing and Tokenization

In [1]:
import json
from nlpinitiative.config import PROCESSED_DATA_DIR
from nlpinitiative.data_preparation.data_process import DataProcessor
from nlpinitiative.data_preparation.data_management import DataManager

[32m2025-04-11 19:13:07.423[0m | [1mINFO    [0m | [36mnlpinitiative.config[0m:[36m<module>[0m:[36m18[0m - [1mPROJ_ROOT path is: C:\Users\Daniel\Desktop\GitHub\NLPinitiative[0m
[32m2025-04-11 19:13:07.424[0m | [1mINFO    [0m | [36mnlpinitiative.config[0m:[36m<module>[0m:[36m25[0m - [1mLoading pyproject.toml...[0m
[32m2025-04-11 19:13:07.425[0m | [32m[1mSUCCESS [0m | [36mnlpinitiative.config[0m:[36m<module>[0m:[36m61[0m - [32m[1mpyproject.toml loaded successfully.[0m


## DataProcessor Creation

In [2]:
# Create the DataProcessor (no arguments passed). The cells below reuse `dp` for
# loading, splitting, metadata extraction, tokenizer creation, and encoding.
dp = DataProcessor()

## Dataset Loading

### Loading Dataset CSV Files as DatasetDict Objects 
Below, we load the master dataset CSV file as a `DatasetDict` with train and test splits. This data is used to train both the binary classification model (whether a text is discriminatory or not) and the multilabel regression model (which types of discrimination are present).

In [3]:
# Load the master CSV, passing PROCESSED_DATA_DIR as its location; the result is a
# DatasetDict with 'train' and 'test' splits (see the output below).
dataset = dp.dataset_from_file('NLPinitiative_Master_Dataset.csv', PROCESSED_DATA_DIR)
# Bare expression on the last line so the notebook displays the dataset summary.
dataset

DatasetDict({
    train: Dataset({
        features: ['TEXT', 'DISCRIMINATORY', 'GENDER', 'RACE', 'SEXUALITY', 'DISABILITY', 'RELIGION', 'UNSPECIFIED'],
        num_rows: 7352
    })
    test: Dataset({
        features: ['TEXT', 'DISCRIMINATORY', 'GENDER', 'RACE', 'SEXUALITY', 'DISABILITY', 'RELIGION', 'UNSPECIFIED'],
        num_rows: 3152
    })
})

### Splitting the Dataset into Binary and Multilabel Training/Testing Splits

In [4]:
# Split the combined dataset into two datasets, unpacked as (binary, multilabel).
# NOTE(review): presumably the binary dataset keeps TEXT plus a single 'label' column
# and the multilabel dataset keeps TEXT plus a 'labels' column — confirm against
# DataProcessor.bin_ml_dataset_split.
bin_ds, ml_ds = dp.bin_ml_dataset_split(dataset)

Map:   0%|          | 0/7352 [00:00<?, ? examples/s]

Map:   0%|          | 0/3152 [00:00<?, ? examples/s]

### Metadata Extraction (label array and dictionaries for mapping labels to indices and indices to labels)

In [5]:
# Extract the metadata for each dataset, binary first and then multilabel. As the output
# below shows, each result holds the label names plus label<->index lookup dictionaries.
bin_ds_metadata, ml_ds_metadata = map(dp.get_dataset_metadata, (bin_ds, ml_ds))

# Pretty-print both metadata dictionaries. JSON object keys are always strings, so any
# integer keys (e.g. in idx2lbl) appear quoted in this output.
for metadata in (bin_ds_metadata, ml_ds_metadata):
    print(json.dumps(metadata, indent=4))

{
    "labels": [
        "label"
    ],
    "lbl2idx": {
        "label": 0
    },
    "idx2lbl": {
        "0": "label"
    }
}
{
    "labels": [
        "labels"
    ],
    "lbl2idx": {
        "labels": 0
    },
    "idx2lbl": {
        "0": "labels"
    }
}


### Initializing Tokenizer Objects (BERT Tokenizers By Default)

In [6]:
# Obtain the tokenizer from the DataProcessor (called with no arguments, i.e. its default).
tokenizer = dp.get_tokenizer()

# Round-trip a sample sentence: text -> token ids -> text.
sample_text = "Testing tokenizer for encoding"
encoded_text = tokenizer.encode(sample_text)
decoded_text = tokenizer.decode(encoded_text)

# Show the token ids first, then the decoded string, one per line.
print(encoded_text, decoded_text, sep="\n")

[101, 5604, 19204, 17629, 2005, 17181, 102]
[CLS] testing tokenizer for encoding [SEP]


### Encoding (Preprocessing/Tokenizing) the Dataset

In [7]:
# Encode both datasets (binary first, then multilabel), passing each dataset's own
# label names together with the shared tokenizer.
bin_ecd_ds = dp.preprocess(
    bin_ds,
    bin_ds_metadata['labels'],
    tokenizer,
)
ml_ecd_ds = dp.preprocess(
    ml_ds,
    ml_ds_metadata['labels'],
    tokenizer,
)

# Pull the first training example out of each encoded dataset.
bin_ds_ex, ml_ds_ex = bin_ecd_ds['train'][0], ml_ecd_ds['train'][0]

# Print the binary example followed by the multilabel example.
for example in (bin_ds_ex, ml_ds_ex):
    print(example)

Map:   0%|          | 0/7352 [00:00<?, ? examples/s]

Map:   0%|          | 0/3152 [00:00<?, ? examples/s]

Map:   0%|          | 0/7352 [00:00<?, ? examples/s]

Map:   0%|          | 0/3152 [00:00<?, ? examples/s]

{'TEXT': 'Immigrants take our jobs', 'label': tensor(1), 'input_ids': tensor([ 101, 7489, 2202, 2256, 5841,  102,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0]), 'token_type_ids': tensor([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0

## Data Preprocessing/Tokenization Using the DataManager Class

In [8]:
# A single DataManager call takes the CSV file name and its directory and returns two
# data objects, unpacked as (binary, multilabel).
# NOTE(review): presumably this performs the load -> split -> encode steps shown in the
# earlier cells internally — confirm in DataManager.prepare_and_preprocess_dataset.
dm = DataManager()
bin_data_obj, ml_data_obj = dm.prepare_and_preprocess_dataset('NLPinitiative_Master_Dataset.csv', PROCESSED_DATA_DIR)

# How attributes are accessed on a returned data object. Only the binary object is
# printed here; ml_data_obj is not inspected in this cell.
# The first two prints show the dataset before and after encoding; the remaining ones
# show the label names, the two label/index lookup dictionaries, and the tokenizer.
print(bin_data_obj.raw_dataset)
print(bin_data_obj.encoded_dataset)
print(bin_data_obj.labels)
print(bin_data_obj.idx2lbl)
print(bin_data_obj.lbl2idx)
print(bin_data_obj.tokenizer)

Map:   0%|          | 0/7352 [00:00<?, ? examples/s]

Map:   0%|          | 0/3152 [00:00<?, ? examples/s]

Map:   0%|          | 0/7352 [00:00<?, ? examples/s]

Map:   0%|          | 0/3152 [00:00<?, ? examples/s]

Map:   0%|          | 0/7352 [00:00<?, ? examples/s]

Map:   0%|          | 0/3152 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['TEXT', 'label'],
        num_rows: 7352
    })
    test: Dataset({
        features: ['TEXT', 'label'],
        num_rows: 3152
    })
})
DatasetDict({
    train: Dataset({
        features: ['TEXT', 'label', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 7352
    })
    test: Dataset({
        features: ['TEXT', 'label', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 3152
    })
})
['label']
{0: 'label'}
{'label': 0}
BertTokenizerFast(name_or_path='bert-base-uncased', vocab_size=30522, model_max_length=512, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'unk_token': '[UNK]', 'sep_token': '[SEP]', 'pad_token': '[PAD]', 'cls_token': '[CLS]', 'mask_token': '[MASK]'}, clean_up_tokenization_spaces=False, added_tokens_decoder={
	0: AddedToken("[PAD]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	100: AddedToken("[UNK]", rstrip=Fals