In [1]:
import json
from nlpinitiative.config import PROCESSED_DATA_DIR
from nlpinitiative.data_preparation.data_process import DataProcessor
from nlpinitiative.data_preparation.data_management import DataManager

[32m2025-03-24 18:48:35.055[0m | [1mINFO    [0m | [36mnlpinitiative.config[0m:[36m<module>[0m:[36m11[0m - [1mPROJ_ROOT path is: C:\Users\Daniel\Desktop\GitHub\NLPinitiative[0m


# DataProcessor Creation:

In [2]:
# Single DataProcessor instance shared by the cells below, which call its
# dataset_from_file, bin_ml_dataset_split, get_dataset_metadata, get_tokenizer
# and preprocess methods.
dp = DataProcessor()

# Dataset loading:

### Loading a CSV file as Dataset objects:
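The cell below loads the master dataset from a CSV file into a `DatasetDict` with train and test splits. This single dataset holds the data used to train both the binary classification model (whether a text is discriminatory or not) and the multilabel regression model (which types of discrimination are present).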

In [3]:
# Load the master CSV; the file name and its containing directory are passed
# as separate arguments.
dataset = dp.dataset_from_file('NLPinitiative_Master_Dataset.csv', PROCESSED_DATA_DIR)
# Bare trailing expression so the notebook displays the object's repr
# (a DatasetDict with train/test splits in the recorded output).
dataset

Generating train split: 0 examples [00:00, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['TEXT', 'DISCRIMINATORY', 'GENDER', 'RACE', 'SEXUALITY', 'DISABILITY', 'RELIGION', 'UNSPECIFIED'],
        num_rows: 534
    })
    test: Dataset({
        features: ['TEXT', 'DISCRIMINATORY', 'GENDER', 'RACE', 'SEXUALITY', 'DISABILITY', 'RELIGION', 'UNSPECIFIED'],
        num_rows: 230
    })
})

### Extracting separate datasets for binary classification and multilabel regression:

In [5]:
# Derive two task-specific datasets from the combined one. The result is
# unpacked positionally: binary-classification dataset first, multilabel
# dataset second.
bin_ds, ml_ds = dp.bin_ml_dataset_split(dataset)

Map:   0%|          | 0/534 [00:00<?, ? examples/s]

Map:   0%|          | 0/230 [00:00<?, ? examples/s]

### Extracting labels and initializing dicts for converting from labels to ids and ids to labels:

In [6]:
# Label metadata for each task: the label names plus the label<->index
# lookup tables ("labels", "lbl2idx", "idx2lbl" in the recorded output).
bin_ds_metadata, ml_ds_metadata = map(dp.get_dataset_metadata, (bin_ds, ml_ds))

# Pretty-print both dicts, binary first. NOTE(review): json.dumps always
# renders dict keys as strings, so the "idx2lbl" keys appear quoted here even
# if they are ints in memory -- confirm against get_dataset_metadata.
for _meta in (bin_ds_metadata, ml_ds_metadata):
    print(json.dumps(_meta, indent=4))

{
    "labels": [
        "label"
    ],
    "lbl2idx": {
        "label": 0
    },
    "idx2lbl": {
        "0": "label"
    }
}
{
    "labels": [
        "labels"
    ],
    "lbl2idx": {
        "labels": 0
    },
    "idx2lbl": {
        "0": "labels"
    }
}


### Initialization of a tokenizer (BERT by default):

In [7]:
# Tokenizer supplied by the DataProcessor (called here with no arguments).
tokenizer = dp.get_tokenizer()

# Round-trip a sample sentence: text -> token ids -> text.
encoded_text = tokenizer.encode("Testing tokenizer for encoding")
decoded_text = tokenizer.decode(encoded_text)

# Token ids on the first line, decoded string on the second.
print(encoded_text, decoded_text, sep="\n")

[101, 5604, 19204, 17629, 2005, 17181, 102]
[CLS] testing tokenizer for encoding [SEP]


### Encode (preprocess) the dataset:

In [8]:
# Encode both datasets with the shared tokenizer, passing each one its own
# label list. Evaluated left to right: binary first, then multilabel.
bin_ecd_ds, ml_ecd_ds = (
    dp.preprocess(bin_ds, bin_ds_metadata['labels'], tokenizer),
    dp.preprocess(ml_ds, ml_ds_metadata['labels'], tokenizer),
)

# First training example of each encoded dataset, kept for inspection.
bin_ds_ex, ml_ds_ex = bin_ecd_ds['train'][0], ml_ecd_ds['train'][0]

# Binary example, then the multilabel one, each on its own line.
print(bin_ds_ex, ml_ds_ex, sep="\n")

Map:   0%|          | 0/534 [00:00<?, ? examples/s]

Map:   0%|          | 0/230 [00:00<?, ? examples/s]

Map:   0%|          | 0/534 [00:00<?, ? examples/s]

Map:   0%|          | 0/230 [00:00<?, ? examples/s]

{'TEXT': 'The male gene is stronger, as it should be of course. Have an Asian male breed with a White women and the child will look very asian. Reverse it and the child will look mush more whiter', 'label': tensor(1), 'input_ids': tensor([  101,  1996,  3287,  4962,  2003,  6428,  1010,  2004,  2009,  2323,
         2022,  1997,  2607,  1012,  2031,  2019,  4004,  3287,  8843,  2007,
         1037,  2317,  2308,  1998,  1996,  2775,  2097,  2298,  2200,  4004,
         1012,  7901,  2009,  1998,  1996,  2775,  2097,  2298, 14163,  4095,
         2062,  2317,  2099,   102,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,    

# Complete Data Preprocessing using DataManager:

In [None]:
# One call to the DataManager replaces the step-by-step cells above; it hands
# back one data object per task (binary first, multilabel second).
dm = DataManager()
bin_data_obj, ml_data_obj = dm.prepare_and_preprocess_dataset(
    'NLPinitiative_Master_Dataset.csv',
    PROCESSED_DATA_DIR,
)

# How attributes are accessed: plain dot access on the data object. Each value
# is printed on its own line, in the order listed.
print(
    bin_data_obj.raw_dataset,
    bin_data_obj.encoded_dataset,
    bin_data_obj.labels,
    bin_data_obj.idx2lbl,
    bin_data_obj.lbl2idx,
    bin_data_obj.tokenizer,
    sep="\n",
)

Map:   0%|          | 0/534 [00:00<?, ? examples/s]

Map:   0%|          | 0/230 [00:00<?, ? examples/s]

Map:   0%|          | 0/534 [00:00<?, ? examples/s]

Map:   0%|          | 0/230 [00:00<?, ? examples/s]

Map:   0%|          | 0/534 [00:00<?, ? examples/s]

Map:   0%|          | 0/230 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['TEXT', 'label'],
        num_rows: 534
    })
    test: Dataset({
        features: ['TEXT', 'label'],
        num_rows: 230
    })
})
DatasetDict({
    train: Dataset({
        features: ['TEXT', 'label', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 534
    })
    test: Dataset({
        features: ['TEXT', 'label', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 230
    })
})
['label']
{0: 'label'}
{'label': 0}
BertTokenizerFast(name_or_path='bert-base-uncased', vocab_size=30522, model_max_length=512, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'unk_token': '[UNK]', 'sep_token': '[SEP]', 'pad_token': '[PAD]', 'cls_token': '[CLS]', 'mask_token': '[MASK]'}, clean_up_tokenization_spaces=False, added_tokens_decoder={
	0: AddedToken("[PAD]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	100: AddedToken("[UNK]", rstrip=False, l