In [1]:
from pathlib import Path

from nlpinitiative.data_preparation import data_import, data_preparation, dataset_normalizer
from nlpinitiative.modeling import train
from nlpinitiative.config import (
    EXTERNAL_DATA_DIR,
    CONV_SCHEMA_DIR,
    MODELS_DIR
)

[32m2025-02-15 12:46:48.865[0m | [1mINFO    [0m | [36mnlpinitiative.config[0m:[36m<module>[0m:[36m11[0m - [1mPROJ_ROOT path is: C:\Users\Daniel\Desktop\GitHub\NLPinitiative[0m


# Importing third-party datasets for use in NLP model training:

For the purposes of this project, we are going to rely on third-party datasets to make up for a lack of personally procured data. As such, we have implemented some functionality to make this easier for future developers/data analysts.

### Importing datasets from a local source (on your local system):
For the purposes of our applications, we will consider the "raw" datasets to be personally produced datasets rather than those that have already been created ("external"). As such, importing from a local source will by default store the datasets within the data/raw directory. If the data to be imported locally is a third-party dataset, the user can change the 'tp_src' value to True, where the data will be stored within the data/external directory.

In [6]:
# Build the path from the current user's home directory instead of hard-coding one
# developer's absolute path, so the cell runs on any machine that has the file in
# its Downloads folder. as_posix() keeps the forward-slash string form that the
# original call passed to the importer.
local_dataset_path = (Path.home() / "Downloads" / "dataset.csv").as_posix()

# tp_src=False marks the file as self-produced ("raw") data rather than third-party data.
local_import_example_df = data_import.import_from_local_source(local_dataset_path, tp_src=False)
local_import_example_df

dataset
[32m2025-02-14 15:41:02.815[0m | [32m[1mSUCCESS [0m | [36mnlpinitiative.data_preparation.data_import[0m:[36mimport_from_local_source[0m:[36m89[0m - [32m[1mData from file, C:/Users/Daniel/Downloads/dataset.csv, imported[0m


Unnamed: 0,reply_id,hateful_tweet_id,counter_hate_id,Q1,Q2,Q3,Q4
0,1364504065565093894,1364444425192005639,1364503631160954881,1.0,,,1.0
1,1507516585485148164,1507475083954036739,1507490000958820354,1.0,,,0.0
2,976720451148943360,959685036311064576,976558261498515456,1.0,,,0.0
3,976567337078882309,959685036311064576,976558261498515456,1.0,,,0.0
4,1359340305494011907,1359330367157764098,1359339633352478728,1.0,,,1.0
...,...,...,...,...,...,...,...
2616,473289864176078848,473289562903830528,473289697540993024,0.0,0.0,1.0,
2617,475688162468319232,475558857700171776,475565607841054721,1.0,,,0.0
2618,403826060719964160,403818138963173376,403825878603694080,0.0,1.0,0.0,
2619,850094530061901824,850036460111769600,850088912441790468,0.0,0.0,1.0,


### Demonstration of importing from remote/external source:
This function facilitates importing data from a given URL (primarily remote repositories like GitHub).

In [5]:
# GitHub "blob" page of the ETHOS binary dataset; the importer's log output shows
# this URL being reformatted to the raw-content URL before download.
ethos_binary_url = "https://github.com/intelligence-csd-auth-gr/Ethos-Hate-Speech-Dataset/blob/master/ethos/ethos_data/Ethos_Dataset_Binary.csv"

remote_import_example_df = data_import.import_from_ext_source(ethos_binary_url)
remote_import_example_df

[32m2025-02-14 15:40:44.073[0m | [1mINFO    [0m | [36mnlpinitiative.data_preparation.data_import[0m:[36mformat_url[0m:[36m45[0m - [1mSource url identified as GitHub URL, https://github.com/intelligence-csd-auth-gr/Ethos-Hate-Speech-Dataset/blob/master/ethos/ethos_data/Ethos_Dataset_Binary.csv[0m
[32m2025-02-14 15:40:44.073[0m | [1mINFO    [0m | [36mnlpinitiative.data_preparation.data_import[0m:[36mformat_url[0m:[36m47[0m - [1mURL Formatted, https://raw.githubusercontent.com/intelligence-csd-auth-gr/Ethos-Hate-Speech-Dataset/refs/heads/master/ethos/ethos_data/Ethos_Dataset_Binary.csv[0m
[32m2025-02-14 15:40:44.073[0m | [1mINFO    [0m | [36mnlpinitiative.data_preparation.data_import[0m:[36mimport_from_ext_source[0m:[36m101[0m - [1mFile type identified, '.csv'[0m
[32m2025-02-14 15:40:44.385[0m | [32m[1mSUCCESS [0m | [36mnlpinitiative.data_preparation.data_import[0m:[36mimport_from_ext_source[0m:[36m114[0m - [32m[1mSuccessfully imported the 

Unnamed: 0,comment,isHate
0,You should know women's sports are a joke,1.0
1,You look like Sloth with deeper Down’s syndrome,1.0
2,You look like Russian and speak like Indian. B...,1.0
3,"Women deserve to be abused, I guess.",1.0
4,Women are made for making babies and cooking d...,1.0
...,...,...
993,From the midnight sun where the hot springs blow,0.0
994,Don't say I'm not your type,0.0
995,And therefore never send to know for whom the...,0.0
996,And I can't stand another day,0.0


# Conversion of third-party datasets:
Since we are making use of third-party datasets, we need a means of converting the original dataset schema to a format that will utilize our labeling scheme. As such, we have implemented some functionality to facilitate this process.

### Normalizing third-party datasets to a standard format for our applications:
This function takes one or more datasets and converts them to our master schema. All datasets passed in a single call should maintain the same general structure, since they are merged prior to normalization.

In [4]:
# File names of the two ETHOS CSVs previously imported into the external data directory.
ethos_file_names = (
    "intelligence-csd-auth-gr_Ethos-Hate-Speech-Dataset_Ethos_Dataset_Binary.csv",
    "intelligence-csd-auth-gr_Ethos-Hate-Speech-Dataset_Ethos_Dataset_Multi_Label.csv",
)
srcs = [EXTERNAL_DATA_DIR / file_name for file_name in ethos_file_names]

# JSON mapping used to translate the ETHOS columns into our labeling scheme.
conv = CONV_SCHEMA_DIR / "ethos_schema_mapping.json"

normalized_dataset = dataset_normalizer.convert_to_master_schema(
    srcs,
    conv,
    'ETHOS_dataset_converted',
)
normalized_dataset

Unnamed: 0,TEXT,DISCRIMINATORY,GENDER,RACE,SEXUALITY,DISABILITY,RELIGION,UNSPECIFIED
0,You should know women's sports are a joke,1,1.000000,0.000000,0.0,0.0,0.0,0.0
1,You look like Sloth with deeper Down’s syndrome,1,0.000000,0.000000,0.0,1.0,0.0,0.0
2,You look like Russian and speak like Indian. B...,1,0.142857,0.857143,0.0,0.0,0.0,0.0
3,"Women deserve to be abused, I guess.",1,1.000000,0.000000,0.0,0.0,0.0,0.0
4,Women are made for making babies and cooking d...,1,1.000000,0.000000,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...
993,From the midnight sun where the hot springs blow,0,0.000000,0.000000,0.0,0.0,0.0,0.0
994,Don't say I'm not your type,0,0.000000,0.000000,0.0,0.0,0.0,0.0
995,And therefore never send to know for whom the...,0,0.000000,0.000000,0.0,0.0,0.0,0.0
996,And I can't stand another day,0,0.000000,0.000000,0.0,0.0,0.0,0.0


### Storing the normalized dataset:

In [None]:
# Persist the normalized ETHOS frame under the name 'ETHOS_dataset_converted'.
# NOTE(review): the storage location and file format are decided inside
# store_normalized_dataset — presumably this produces the
# "ETHOS_dataset_converted.csv" file loaded in the data-preparation step; confirm.
dataset_normalizer.store_normalized_dataset(normalized_dataset, 'ETHOS_dataset_converted')

# Data Preparation:
Prior to being able to use the dataset in training the model, we must first prepare the data by converting it into a dataset, and tokenizing the textual data (in addition to restructuring the data to a format that can be passed into a model).

### Loading csv as Dataset objects:
Seen below, we are loading the datasets that will be used to train the binary classification model (if discriminatory or not) and the multilabel regression model (the types of discrimination).

In [2]:
# Load the converted ETHOS CSV; the displayed result is a DatasetDict with
# separate train and test splits.
# NOTE(review): how the split is made (ratio, shuffling, seed) is internal to
# get_dataset_from_file — confirm it is reproducible across runs.
dataset = data_preparation.get_dataset_from_file("ETHOS_dataset_converted.csv")
dataset

DatasetDict({
    train: Dataset({
        features: ['TEXT', 'DISCRIMINATORY', 'GENDER', 'RACE', 'SEXUALITY', 'DISABILITY', 'RELIGION', 'UNSPECIFIED'],
        num_rows: 534
    })
    test: Dataset({
        features: ['TEXT', 'DISCRIMINATORY', 'GENDER', 'RACE', 'SEXUALITY', 'DISABILITY', 'RELIGION', 'UNSPECIFIED'],
        num_rows: 230
    })
})

### Extracting separate datasets for binary classification and multilabel regression:

In [3]:
# Split the combined dataset into two task-specific views. As the displayed
# features show, bin_ds keeps TEXT + DISCRIMINATORY (binary task) and ml_ds keeps
# TEXT + the six category columns (multilabel task).
bin_ds, ml_ds = data_preparation.separate_datasets(dataset)
bin_ds, ml_ds

(DatasetDict({
     train: Dataset({
         features: ['TEXT', 'DISCRIMINATORY'],
         num_rows: 534
     })
     test: Dataset({
         features: ['TEXT', 'DISCRIMINATORY'],
         num_rows: 230
     })
 }),
 DatasetDict({
     train: Dataset({
         features: ['TEXT', 'GENDER', 'RACE', 'SEXUALITY', 'DISABILITY', 'RELIGION', 'UNSPECIFIED'],
         num_rows: 534
     })
     test: Dataset({
         features: ['TEXT', 'GENDER', 'RACE', 'SEXUALITY', 'DISABILITY', 'RELIGION', 'UNSPECIFIED'],
         num_rows: 230
     })
 }))

### Extracting labels and initializing dicts for converting from labels to ids and ids to labels:

In [5]:
# Label names and label<->id lookups for the combined dataset.
# NOTE(review): labels / lbl2idx / idx2lbl are not used by any cell visible in
# this part of the notebook — confirm whether this call is still needed.
labels, lbl2idx, idx2lbl = data_preparation.get_labels_and_dicts(dataset)

# Per-task label names plus label->id and id->label dicts for each task dataset.
bin_lbls, bin_lbl2id, bin_id2lbl = data_preparation.get_labels_and_dicts(bin_ds)
ml_lbls, ml_lbl2id, ml_id2lbl = data_preparation.get_labels_and_dicts(ml_ds)

# Print both sets so the label ordering and id assignment can be inspected.
print(bin_lbls, bin_lbl2id, bin_id2lbl)
print(ml_lbls, ml_lbl2id, ml_id2lbl)

['DISCRIMINATORY'] {'DISCRIMINATORY': 0} {0: 'DISCRIMINATORY'}
['GENDER', 'RACE', 'SEXUALITY', 'DISABILITY', 'RELIGION', 'UNSPECIFIED'] {'GENDER': 0, 'RACE': 1, 'SEXUALITY': 2, 'DISABILITY': 3, 'RELIGION': 4, 'UNSPECIFIED': 5} {0: 'GENDER', 1: 'RACE', 2: 'SEXUALITY', 3: 'DISABILITY', 4: 'RELIGION', 5: 'UNSPECIFIED'}


### Initialization of a tokenizer (using pre-trained BERT tokenizer)

In [6]:
tokenizer = data_preparation.get_tokenizer()

# Round-trip a short sentence through the tokenizer as a sanity check:
# encode to token ids, then decode the ids back to text.
sample_sentence = "Testing tokenizer for encoding"
encoded_text = tokenizer.encode(sample_sentence)
decoded_text = tokenizer.decode(encoded_text)

# One print call with a newline separator emits the same two lines as two prints.
print(encoded_text, decoded_text, sep="\n")

[101, 5604, 19204, 17629, 2005, 17181, 102]
[CLS] testing tokenizer for encoding [SEP]


### Encode (preprocess) the dataset:

In [7]:
# Encode both task datasets with the shared tokenizer, passing each task's own
# label list. Each call emits one "Map" progress bar per split (train, test).
bin_ecd_ds = data_preparation.preprocess_dataset(bin_ds, bin_lbls, tokenizer)
ml_ecd_ds = data_preparation.preprocess_dataset(ml_ds, ml_lbls, tokenizer)

# Show the first encoded training example of the binary dataset.
bin_ds_ex = bin_ecd_ds['train'][0]
print(bin_ds_ex)

# Show the first encoded training example of the multilabel dataset.
ml_ds_ex = ml_ecd_ds['train'][0]
print(ml_ds_ex)

Map:   0%|          | 0/534 [00:00<?, ? examples/s]

Map:   0%|          | 0/230 [00:00<?, ? examples/s]

Map:   0%|          | 0/534 [00:00<?, ? examples/s]

Map:   0%|          | 0/230 [00:00<?, ? examples/s]

{'input_ids': tensor([  101,  2417, 17357, 16841,  2323,  2022,  1037,  2518,  1012,   102,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0, 

# Loading/Creating model objects:

In [13]:
# Build one model per task, sized by that task's label count and given its
# id->label / label->id dicts.
# The binary model relies on get_model's default task_type (the default value is
# not visible here — TODO confirm); the multilabel model selects
# "multi_label_regression" explicitly.
bin_class_model = train.get_model(num_lbls=len(bin_lbls), id2lbl_dict=bin_id2lbl, lbl2id_dict=bin_lbl2id)
ml_regress_model = train.get_model(task_type="multi_label_regression", num_lbls=len(ml_lbls), id2lbl_dict=ml_id2lbl, lbl2id_dict=ml_lbl2id)
# Display the binary model's architecture.
bin_class_model

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e

# Initializing training arguments:

In [12]:
# Training arguments for the binary classification model, with its own output
# directory under MODELS_DIR. All other settings come from train.training_args.
bin_targs = train.training_args(
    output_dir=MODELS_DIR / "binary_classification_models"
)

# Training arguments for the multilabel regression model, kept in a separate
# output directory from the binary model's.
ml_targs = train.training_args(
    output_dir=MODELS_DIR / "multilabel_regression_models"
)

# Display the resolved TrainingArguments for the binary task.
bin_targs

TrainingArguments(
_n_gpu=0,
accelerator_config={'split_batches': False, 'dispatch_batches': None, 'even_batches': True, 'use_seedable_sampler': True, 'non_blocking': False, 'gradient_accumulation_kwargs': None, 'use_configured_state': False},
adafactor=False,
adam_beta1=0.9,
adam_beta2=0.999,
adam_epsilon=1e-08,
auto_find_batch_size=False,
average_tokens_across_devices=False,
batch_eval_metrics=False,
bf16=False,
bf16_full_eval=False,
data_seed=None,
dataloader_drop_last=False,
dataloader_num_workers=0,
dataloader_persistent_workers=False,
dataloader_pin_memory=True,
dataloader_prefetch_factor=None,
ddp_backend=None,
ddp_broadcast_buffers=None,
ddp_bucket_cap_mb=None,
ddp_find_unused_parameters=None,
ddp_timeout=1800,
debug=[],
deepspeed=None,
disable_tqdm=False,
dispatch_batches=None,
do_eval=True,
do_predict=False,
do_train=False,
eval_accumulation_steps=None,
eval_delay=0,
eval_do_concat_batches=True,
eval_on_start=False,
eval_steps=None,
eval_strategy=IntervalStrategy.EPOCH,
eval_