In [None]:
from nlpinitiative.data_preparation import data_import, data_preparation, dataset_normalizer
from nlpinitiative.modeling import train

[32m2025-02-19 10:00:37.391[0m | [1mINFO    [0m | [36mnlpinitiative.config[0m:[36m<module>[0m:[36m11[0m - [1mPROJ_ROOT path is: C:\Users\Daniel\Desktop\GitHub\NLPinitiative[0m


# Data Preparation:
Before the dataset can be used to train the models, we must first prepare the data: load it as a dataset object, tokenize the text, and restructure the records into a format that can be passed to a model.

### Loading csv as Dataset objects:
Below, we load the dataset that will be used to train both the binary classification model (whether a text is discriminatory or not) and the multilabel regression model (the types of discrimination).

In [2]:
# Load the converted ETHOS CSV. The repr displayed by this cell shows the
# result is a DatasetDict with `train` and `test` splits that share the same
# columns (TEXT, DISCRIMINATORY and the per-category columns).
# NOTE(review): only a bare file name is passed; presumably the helper resolves
# it against a project data directory -- confirm.
dataset = data_preparation.get_dataset_from_file("ETHOS_dataset_converted.csv")
dataset

DatasetDict({
    train: Dataset({
        features: ['TEXT', 'DISCRIMINATORY', 'GENDER', 'RACE', 'SEXUALITY', 'DISABILITY', 'RELIGION', 'UNSPECIFIED'],
        num_rows: 534
    })
    test: Dataset({
        features: ['TEXT', 'DISCRIMINATORY', 'GENDER', 'RACE', 'SEXUALITY', 'DISABILITY', 'RELIGION', 'UNSPECIFIED'],
        num_rows: 230
    })
})

### Extracting separate datasets for binary classification and multilabel regression:

In [12]:
# Derive one dataset per task from the combined dataset: `bin_ds` is used for
# the binary classifier and `ml_ds` for the multilabel regression model in the
# cells that follow.
bin_ds, ml_ds = data_preparation.separate_datasets(dataset)

### Extracting labels and initializing dicts for converting from labels to ids and ids to labels:

In [4]:
# Label names plus label->id / id->label lookups for the combined dataset.
# NOTE(review): these three names are not referenced by any later visible
# cell -- confirm whether this call is still needed.
labels, lbl2idx, idx2lbl = data_preparation.get_labels_and_dicts(dataset)

# The same metadata for each task-specific dataset.
bin_lbls, bin_lbl2id, bin_id2lbl = data_preparation.get_labels_and_dicts(bin_ds)
ml_lbls, ml_lbl2id, ml_id2lbl = data_preparation.get_labels_and_dicts(ml_ds)

# One output line per task: label list, label->id dict, id->label dict.
for label_info in (
    (bin_lbls, bin_lbl2id, bin_id2lbl),
    (ml_lbls, ml_lbl2id, ml_id2lbl),
):
    print(*label_info)

['label'] {'label': 0} {0: 'label'}
['labels'] {'labels': 0} {0: 'labels'}


### Initialization of a tokenizer (using pre-trained BERT tokenizer)

In [5]:
tokenizer = data_preparation.get_tokenizer()

# Sanity check: encode a sample sentence to token ids and decode it back.
# The decoded text shows the [CLS]/[SEP] markers the tokenizer adds.
encoded_text = tokenizer.encode("Testing tokenizer for encoding")
decoded_text = tokenizer.decode(encoded_text)
print(encoded_text, decoded_text, sep="\n")

[101, 5604, 19204, 17629, 2005, 17181, 102]
[CLS] testing tokenizer for encoding [SEP]


### Encode (preprocess) the dataset:

In [6]:
# Tokenize/encode both task datasets with the shared tokenizer.
bin_ecd_ds = data_preparation.preprocess_dataset(bin_ds, bin_lbls, tokenizer)
ml_ecd_ds = data_preparation.preprocess_dataset(ml_ds, ml_lbls, tokenizer)

# Take the first training example of each encoded dataset for inspection.
bin_ds_ex = bin_ecd_ds['train'][0]
ml_ds_ex = ml_ecd_ds['train'][0]

# NOTE(review): this prints the full padded tensors of each example, which
# produces a very long output; consider showing only keys/shapes instead.
print(bin_ds_ex, ml_ds_ex, sep="\n")

Map:   0%|          | 0/534 [00:00<?, ? examples/s]

Map:   0%|          | 0/230 [00:00<?, ? examples/s]

Map:   0%|          | 0/534 [00:00<?, ? examples/s]

Map:   0%|          | 0/230 [00:00<?, ? examples/s]

{'TEXT': 'The men are turtles...idiots even in uk… get lost', 'label': tensor(1), 'input_ids': tensor([  101,  1996,  2273,  2024, 16489,  1012,  1012,  1012, 28781,  2130,
         1999,  2866,  1529,  2131,  2439,   102,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     

# Loading/Creating model objects:

In [7]:
# Create one model per task. The warnings emitted by this cell show that both
# are BertForSequenceClassification models loaded from `bert-base-uncased`
# with a newly initialised classifier head (weight and bias).
# NOTE(review): no random seed is set in the visible cells before these heads
# are initialised, so results may differ between runs unless the helpers seed
# internally -- confirm.
bin_class_model = train.get_bin_model()
ml_regress_model = train.get_ml_model()
# Display the binary model's architecture.
bin_class_model

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e

# Initializing training arguments:

In [8]:
# Build the training-argument objects for each task; the values themselves
# are defined inside the `train` module helpers.
bin_targs = train.bin_train_args()
ml_targs = train.ml_regr_train_args()
# Display the binary task's TrainingArguments for inspection.
bin_targs

TrainingArguments(
_n_gpu=0,
accelerator_config={'split_batches': False, 'dispatch_batches': None, 'even_batches': True, 'use_seedable_sampler': True, 'non_blocking': False, 'gradient_accumulation_kwargs': None, 'use_configured_state': False},
adafactor=False,
adam_beta1=0.9,
adam_beta2=0.999,
adam_epsilon=1e-08,
auto_find_batch_size=False,
average_tokens_across_devices=False,
batch_eval_metrics=False,
bf16=False,
bf16_full_eval=False,
data_seed=None,
dataloader_drop_last=False,
dataloader_num_workers=0,
dataloader_persistent_workers=False,
dataloader_pin_memory=True,
dataloader_prefetch_factor=None,
ddp_backend=None,
ddp_broadcast_buffers=None,
ddp_bucket_cap_mb=None,
ddp_find_unused_parameters=None,
ddp_timeout=1800,
debug=[],
deepspeed=None,
disable_tqdm=False,
dispatch_batches=None,
do_eval=True,
do_predict=False,
do_train=False,
eval_accumulation_steps=None,
eval_delay=0,
eval_do_concat_batches=True,
eval_on_start=False,
eval_steps=None,
eval_strategy=IntervalStrategy.EPOCH,
eval_

# Initialize trainer objects with metrics:

In [9]:
# Trainer for the binary classification task: trains on the encoded `train`
# split, evaluates on the encoded `test` split, and reports the metrics
# computed by `train.compute_bin_metrics`.
# `train.Trainer` is the transformers Trainer class (see the repr displayed by
# this cell).
bin_trainer = train.Trainer(
    model=bin_class_model,
    args=bin_targs,
    train_dataset=bin_ecd_ds['train'],
    eval_dataset=bin_ecd_ds['test'],
    compute_metrics=train.compute_bin_metrics
)

# Trainer for the multilabel regression task: same wiring, but uses the
# project's RegressionTrainer and the regression metrics function.
ml_trainer = train.RegressionTrainer(
    model=ml_regress_model,
    args=ml_targs,
    train_dataset=ml_ecd_ds['train'],
    eval_dataset=ml_ecd_ds['test'],
    compute_metrics=train.compute_reg_metrics
)

bin_trainer

<transformers.trainer.Trainer at 0x1dc20e9bcb0>

# Training:

### Binary Model:

In [10]:
# Fine-tune the binary classifier, then evaluate it on the `test` split and
# display the resulting metrics dict.
# NOTE(review): the recorded AUROC is ~0.08-0.11 while accuracy is ~0.78-0.83.
# An AUROC far below 0.5 alongside good accuracy usually means the scores fed
# to the AUROC/AUPRC computation are inverted or taken from the wrong class
# column -- verify `train.compute_bin_metrics`.
# NOTE(review): the recorded eval_loss after training equals the epoch-2
# validation loss rather than epoch 3, which suggests the best checkpoint is
# reloaded at the end of training -- confirm in the training arguments.
bin_trainer.train()
bin_eval_res = bin_trainer.evaluate()
bin_eval_res

Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1,Auprc,Auroc
1,0.6831,0.544973,0.778261,0.912088,0.65873,0.764977,0.363239,0.10989
2,0.4616,0.395738,0.834783,0.805556,0.920635,0.859259,0.356707,0.075855
3,0.239,0.402777,0.821739,0.845528,0.825397,0.835341,0.357046,0.079747


{'eval_loss': 0.39573773741722107,
 'eval_accuracy': 0.8347826086956521,
 'eval_precision': 0.8055555555555556,
 'eval_recall': 0.9206349206349206,
 'eval_f1': 0.8592592592592593,
 'eval_auprc': 0.35670719602278034,
 'eval_auroc': 0.07585470085470086,
 'eval_runtime': 7.2852,
 'eval_samples_per_second': 31.571,
 'eval_steps_per_second': 3.981,
 'epoch': 3.0}

### Multilabel Regression Model:

In [11]:
# Fine-tune the multilabel regression model, then evaluate it on the `test`
# split and display the resulting metrics dict (means plus per-category lists).
# NOTE(review): in the recorded results the sixth category has R2 = 0.0 and
# Pearson = nan in every epoch, which typically happens when the target is
# constant in the evaluation split -- verify the data for that category.
# NOTE(review): the recorded mean R2 matches an average over all six values
# (including that 0.0), whereas the mean Pearson matches an average over the
# five non-nan values; confirm this asymmetry in `train.compute_reg_metrics`
# is intended.
ml_trainer.train()
ml_eval_res = ml_trainer.evaluate()
ml_eval_res

Epoch,Training Loss,Validation Loss,Mean Rmse,Mean Mae,Mean R2,Mean Pearson,Rmse Per Cat,Mae Per Cat,R2 Per Cat,Pearson Per Cat
1,0.0594,0.062077,0.230201,0.136467,0.065365,0.342573,"[0.2695828080177307, 0.34012848138809204, 0.23656611144542694, 0.24253638088703156, 0.2614489197731018, 0.030942006036639214]","[0.17662456631660461, 0.23207135498523712, 0.15391364693641663, 0.10029660165309906, 0.12967240810394287, 0.0262217465788126]","[0.037137389183044434, 0.11744129657745361, 0.02754795551300049, 0.04310035705566406, 0.1669614315032959, 0.0]","[0.20412179827690125, 0.4112914502620697, 0.18502606451511383, 0.32629096508026123, 0.5861361622810364, nan]"
2,0.0488,0.046188,0.199006,0.131362,0.255497,0.571225,"[0.2513921856880188, 0.2970634400844574, 0.21992449462413788, 0.21736283600330353, 0.16885676980018616, 0.039435241371393204]","[0.1812049001455307, 0.209065243601799, 0.14852143824100494, 0.11788974702358246, 0.10069142282009125, 0.030798928812146187]","[0.16269546747207642, 0.326781690120697, 0.15955305099487305, 0.23143047094345093, 0.6525211334228516, 0.0]","[0.47394859790802, 0.5869887471199036, 0.44627127051353455, 0.5293975472450256, 0.8195171356201172, nan]"
3,0.0488,0.042602,0.192427,0.127315,0.295907,0.616658,"[0.23789304494857788, 0.27766743302345276, 0.21523459255695343, 0.2107299119234085, 0.17168664932250977, 0.04135145619511604]","[0.1617133468389511, 0.20031441748142242, 0.13768887519836426, 0.1344813108444214, 0.0976925641298294, 0.03199771046638489]","[0.25020354986190796, 0.41182374954223633, 0.1950160264968872, 0.2776212692260742, 0.6407766938209534, 0.0]","[0.5384443998336792, 0.6424445509910583, 0.5048655867576599, 0.5616500973701477, 0.8358848094940186, nan]"


{'eval_loss': 0.04260192811489105,
 'eval_mean_rmse': 0.19242717325687408,
 'eval_mean_mae': 0.1273147016763687,
 'eval_mean_r2': 0.2959068715572357,
 'eval_mean_pearson': 0.6166578888893127,
 'eval_rmse_per_cat': [0.23789304494857788,
  0.27766743302345276,
  0.21523459255695343,
  0.2107299119234085,
  0.17168664932250977,
  0.04135145619511604],
 'eval_mae_per_cat': [0.1617133468389511,
  0.20031441748142242,
  0.13768887519836426,
  0.1344813108444214,
  0.0976925641298294,
  0.03199771046638489],
 'eval_r2_per_cat': [0.25020354986190796,
  0.41182374954223633,
  0.1950160264968872,
  0.2776212692260742,
  0.6407766938209534,
  0.0],
 'eval_pearson_per_cat': [0.5384443998336792,
  0.6424445509910583,
  0.5048655867576599,
  0.5616500973701477,
  0.8358848094940186,
  nan],
 'eval_runtime': 7.374,
 'eval_samples_per_second': 31.191,
 'eval_steps_per_second': 3.933,
 'epoch': 3.0}