In [2]:
from pathlib import Path

from transformers import (
    AutoModelForSequenceClassification
)

from nlpinitiative.config import (
    EXTERNAL_DATA_DIR,
    CONV_SCHEMA_DIR,
    BINARY_LABELS,
    CATEGORY_LABELS
)
from nlpinitiative.data_preparation import data_import, data_preparation, dataset_normalizer
from nlpinitiative.modeling import train

# Importing third-party datasets for use in NLP model training:

For the purposes of this project, we are going to rely on third-party datasets to make up for a lack of personally procured data. As such, we have implemented some functionality to make this easier for future developers/data analysts.

### Importing datasets from a local source (on your local system):
For the purposes of our applications, we consider "raw" datasets to be personally produced datasets, as opposed to datasets that have already been created by others ("external"). As such, importing from a local source stores the dataset in the data/raw directory by default. If the data being imported locally is a third-party dataset, set `tp_src=True` and the data will instead be stored in the data/external directory.

In [None]:
# Build the example path from the current user's home directory instead of
# hard-coding one developer's machine. as_posix() keeps the forward-slash form
# used by the original example path.
local_dataset_path = (Path.home() / "Downloads" / "dataset.csv").as_posix()

# tp_src=False marks the file as personally produced ("raw") data; pass
# tp_src=True for a third-party dataset.
local_import_example_df = data_import.import_from_local_source(local_dataset_path, tp_src=False)
local_import_example_df

### Demonstration of importing from remote/external source:
This function facilitates importing data from a given URL (primarily remote repositories like GitHub).

In [None]:
# Import the ETHOS binary hate-speech dataset from its GitHub repository URL.
# NOTE(review): this is a GitHub "blob" page URL rather than a raw-file URL;
# presumably import_from_ext_source resolves it to the raw CSV — confirm.
remote_import_example_df = data_import.import_from_ext_source("https://github.com/intelligence-csd-auth-gr/Ethos-Hate-Speech-Dataset/blob/master/ethos/ethos_data/Ethos_Dataset_Binary.csv")
remote_import_example_df

# Conversion of third-party datasets:
Since we are making use of third-party datasets, we need a means of converting the original dataset schema to a format that will utilize our labeling scheme. As such, we have implemented some functionality to facilitate this process.

### Normalizing third-party datasets to a standard format for our applications:
This function takes one or more datasets and converts them to the master schema used by our applications. All datasets passed in together should share the same general structure so that they can be merged prior to normalization.

In [None]:
# The two ETHOS exports (binary and multi-label) that are normalized together.
ethos_file_names = (
    "intelligence-csd-auth-gr_Ethos-Hate-Speech-Dataset_Ethos_Dataset_Binary.csv",
    "intelligence-csd-auth-gr_Ethos-Hate-Speech-Dataset_Ethos_Dataset_Multi_Label.csv",
)
srcs = [EXTERNAL_DATA_DIR / file_name for file_name in ethos_file_names]

# Schema-mapping file passed to the normalizer for the ETHOS sources.
conv = CONV_SCHEMA_DIR / "ethos_schema_mapping.json"

normalized_dataset = dataset_normalizer.convert_to_master_schema(
    srcs,
    conv,
    'ETHOS_dataset_converted',
)
normalized_dataset

### Storing the normalized dataset:

In [None]:
# Store the normalized dataset produced above under the name 'ETHOS_dataset_converted'.
# NOTE(review): the load step later in this notebook reads "ETHOS_dataset_converted.csv",
# so this presumably writes a CSV with that name — confirm the output location/format.
dataset_normalizer.store_normalized_dataset(normalized_dataset, 'ETHOS_dataset_converted')

# Data Preparation:
Before the dataset can be used to train the model, we must first prepare the data by converting it into a Dataset object and tokenizing the textual data (in addition to restructuring the data into a format that can be passed into a model).

### Loading csv as Dataset objects:
Below, we load the dataset that will be used to train both the binary classification model (whether the text is discriminatory or not) and the multilabel regression model (the types of discrimination).

In [3]:
# Load the normalized ETHOS CSV as a DatasetDict; the recorded output shows
# 'train' (534 rows) and 'test' (230 rows) splits with the TEXT column, the
# DISCRIMINATORY flag and six category columns.
# NOTE(review): the split happens inside get_dataset_from_file; whether it is
# seeded is not visible here — confirm for reproducibility.
dataset = data_preparation.get_dataset_from_file("ETHOS_dataset_converted.csv")
dataset

DatasetDict({
    train: Dataset({
        features: ['TEXT', 'DISCRIMINATORY', 'GENDER', 'RACE', 'SEXUALITY', 'DISABILITY', 'RELIGION', 'UNSPECIFIED'],
        num_rows: 534
    })
    test: Dataset({
        features: ['TEXT', 'DISCRIMINATORY', 'GENDER', 'RACE', 'SEXUALITY', 'DISABILITY', 'RELIGION', 'UNSPECIFIED'],
        num_rows: 230
    })
})

### Extracting separate datasets for binary classification and multilabel regression:

In [4]:
# Split the combined dataset into two DatasetDicts: bin_ds (features 'TEXT', 'label')
# for the binary model and ml_ds (features 'TEXT', 'labels') for the multilabel
# regression model, as shown in the recorded output.
# NOTE(review): the recorded output contains one printed 6-value list per example,
# which looks like a leftover debug print inside separate_datasets — confirm and silence it.
# NOTE(review): at least one printed row has values summing to more than 1
# (0.285714286 + 0.857142857) — verify the source data/normalization.
bin_ds, ml_ds = data_preparation.separate_datasets(dataset)
bin_ds, ml_ds

Map:   0%|          | 0/534 [00:00<?, ? examples/s]

[0.0, 0.0, 0.0, 0.0, 0.0, 0.0]
[0.0, 0.0, 1.0, 0.0, 0.0, 0.0]
[0.0, 0.0, 0.0, 0.0, 0.0, 0.0]
[0.0, 0.0, 0.0, 0.0, 0.0, 0.0]
[0.0, 0.0, 0.0, 0.0, 0.0, 0.0]
[0.0, 0.5, 0.0, 0.5, 0.0, 0.0]
[1.0, 0.0, 0.0, 0.0, 0.0, 0.0]
[0.0, 0.0, 1.0, 0.0, 0.0, 0.0]
[0.0, 0.0, 0.0, 0.0, 0.0, 0.0]
[0.0, 0.0, 0.0, 0.0, 0.0, 0.0]
[0.2, 0.8, 0.0, 0.0, 0.0, 0.0]
[0.0, 0.0, 0.0, 0.0, 1.0, 0.0]
[0.0, 0.0, 0.0, 0.0, 0.0, 0.0]
[0.0, 0.666666667, 0.0, 0.0, 0.333333333, 0.0]
[0.0, 0.5, 0.0, 0.0, 0.5, 0.0]
[0.666666667, 0.0, 0.333333333, 0.0, 0.0, 0.0]
[0.0, 0.0, 1.0, 0.0, 0.0, 0.0]
[0.0, 0.0, 0.0, 0.0, 0.0, 0.0]
[0.166666667, 0.0, 0.0, 0.0, 0.833333333, 0.0]
[0.0, 0.0, 0.0, 0.0, 0.0, 0.0]
[0.0, 0.0, 0.0, 0.0, 0.0, 0.0]
[1.0, 0.0, 0.0, 0.0, 0.0, 0.0]
[0.0, 1.0, 0.0, 0.0, 0.0, 0.0]
[0.142857143, 0.857142857, 0.0, 0.0, 0.0, 0.0]
[0.0, 1.0, 0.0, 0.0, 0.0, 0.0]
[0.0, 0.0, 0.0, 0.0, 1.0, 0.0]
[0.0, 0.0, 0.0, 0.0, 0.0, 0.0]
[0.0, 0.0, 0.8, 0.2, 0.0, 0.0]
[1.0, 0.0, 0.0, 0.0, 0.0, 0.0]
[0.0, 0.0, 0.0, 0.0, 0.0, 0.0]
[0.0, 

Map:   0%|          | 0/230 [00:00<?, ? examples/s]

[0.0, 0.0, 0.0, 0.0, 0.0, 0.0]
[0.0, 1.0, 0.0, 0.0, 0.0, 0.0]
[1.0, 0.0, 0.0, 0.0, 0.0, 0.0]
[0.0, 0.0, 0.0, 1.0, 0.0, 0.0]
[0.0, 0.0, 0.0, 0.0, 0.0, 0.0]
[1.0, 0.0, 0.0, 0.0, 0.0, 0.0]
[0.0, 0.0, 0.0, 0.0, 0.0, 0.0]
[0.0, 0.285714286, 0.0, 0.0, 0.857142857, 0.0]
[0.0, 0.0, 1.0, 0.0, 0.0, 0.0]
[0.0, 1.0, 0.0, 0.0, 0.0, 0.0]
[0.0, 0.0, 1.0, 0.0, 0.0, 0.0]
[0.0, 0.0, 0.0, 0.0, 0.0, 0.0]
[0.0, 0.0, 0.0, 0.0, 0.0, 0.0]
[0.0, 1.0, 0.0, 0.0, 0.0, 0.0]
[0.0, 0.0, 0.0, 0.0, 0.0, 0.0]
[0.0, 0.0, 0.0, 0.0, 0.0, 0.0]
[0.0, 0.0, 0.0, 0.0, 0.0, 0.0]
[0.0, 0.0, 0.0, 0.0, 0.0, 0.0]
[0.75, 0.0, 0.25, 0.0, 0.0, 0.0]
[1.0, 0.0, 0.0, 0.0, 0.0, 0.0]
[0.0, 0.0, 0.0, 0.0, 0.0, 0.0]
[0.0, 0.0, 0.0, 1.0, 0.0, 0.0]
[0.0, 0.0, 0.0, 0.0, 0.0, 0.0]
[0.0, 0.0, 1.0, 0.0, 0.0, 0.0]
[0.0, 0.0, 0.0, 0.0, 0.0, 0.0]
[0.0, 0.0, 0.0, 0.0, 0.0, 0.0]
[0.0, 0.166666667, 0.0, 0.0, 0.833333333, 0.0]
[0.0, 0.0, 0.0, 0.0, 0.0, 0.0]
[0.25, 0.0, 0.0, 0.75, 0.0, 0.0]
[0.0, 0.0, 0.0, 0.0, 1.0, 0.0]
[0.0, 0.0, 0.0, 0.0, 0.0, 0.0]
[0.

(DatasetDict({
     train: Dataset({
         features: ['TEXT', 'label'],
         num_rows: 534
     })
     test: Dataset({
         features: ['TEXT', 'label'],
         num_rows: 230
     })
 }),
 DatasetDict({
     train: Dataset({
         features: ['TEXT', 'labels'],
         num_rows: 534
     })
     test: Dataset({
         features: ['TEXT', 'labels'],
         num_rows: 230
     })
 }))

### Extracting labels and initializing dicts for converting from labels to ids and ids to labels:

In [5]:
# Label names plus label->id and id->label dicts for the combined dataset.
# NOTE(review): labels / lbl2idx / idx2lbl are not used by any later cell visible
# in this notebook — confirm whether this call is still needed.
labels, lbl2idx, idx2lbl = data_preparation.get_labels_and_dicts(dataset)

# Same extraction for the binary and multilabel datasets; bin_lbls and ml_lbls
# are passed to preprocess_dataset in the encoding step.
bin_lbls, bin_lbl2id, bin_id2lbl = data_preparation.get_labels_and_dicts(bin_ds)
ml_lbls, ml_lbl2id, ml_id2lbl = data_preparation.get_labels_and_dicts(ml_ds)

# NOTE(review): the recorded output shows only the target column name
# (['label'] / ['labels']) mapped to id 0, not the individual class/category
# names — confirm this is the intended mapping.
print(bin_lbls, bin_lbl2id, bin_id2lbl)
print(ml_lbls, ml_lbl2id, ml_id2lbl)

['label'] {'label': 0} {0: 'label'}
['labels'] {'labels': 0} {0: 'labels'}


### Initialization of a tokenizer (using pre-trained BERT tokenizer)

In [6]:
# Tokenizer shared by both preprocessing calls later in the notebook.
tokenizer = data_preparation.get_tokenizer()

# Round-trip a sample sentence: encode it to token ids, decode the ids back to
# text, and print each result on its own line.
encoded_text = tokenizer.encode("Testing tokenizer for encoding")
decoded_text = tokenizer.decode(encoded_text)
print(encoded_text, decoded_text, sep="\n")

[101, 5604, 19204, 17629, 2005, 17181, 102]
[CLS] testing tokenizer for encoding [SEP]


### Encode (preprocess) the dataset:

In [7]:
# Tokenize/encode both datasets with the shared tokenizer; each call maps over
# the train and test splits (four progress bars in the recorded output).
bin_ecd_ds = data_preparation.preprocess_dataset(bin_ds, bin_lbls, tokenizer)
ml_ecd_ds = data_preparation.preprocess_dataset(ml_ds, ml_lbls, tokenizer)

# Sanity check: show the first encoded training example of the binary dataset.
# The recorded output shows tensor values, with input_ids zero-padded after the
# final [SEP] token (id 102).
bin_ds_ex = bin_ecd_ds['train'][0]
print(bin_ds_ex)

# Same sanity check for the multilabel dataset.
ml_ds_ex = ml_ecd_ds['train'][0]
print(ml_ds_ex)

Map:   0%|          | 0/534 [00:00<?, ? examples/s]

Map:   0%|          | 0/230 [00:00<?, ? examples/s]

Map:   0%|          | 0/534 [00:00<?, ? examples/s]

Map:   0%|          | 0/230 [00:00<?, ? examples/s]

{'TEXT': 'Buy lethal guns ... Real ones, not toys!', 'label': tensor(0), 'input_ids': tensor([  101,  4965, 12765,  4409,  1012,  1012,  1012,  2613,  3924,  1010,
         2025, 10899,   999,   102,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,

# Loading/Creating model objects:

In [8]:
# Binary classifier from the project's helper; the recorded warnings show it is
# also a BertForSequenceClassification initialized from bert-base-uncased.
bin_class_model = train.get_model()
# Multilabel regression model: one output per discrimination category.
# NOTE(review): 6 matches the six category columns shown when the dataset was
# loaded; presumably this equals len(CATEGORY_LABELS) (imported but unused) —
# confirm and prefer the constant over the magic number.
ml_regress_model = AutoModelForSequenceClassification.from_pretrained(
    "bert-base-uncased",
    num_labels=6,
)
# Display the binary model's architecture.
bin_class_model

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e

# Initializing training arguments:

In [9]:
# Training arguments for the binary classifier and the multilabel regression
# model, built by the project's helpers with their default parameters.
bin_targs = train.bin_train_args()
ml_targs = train.ml_regr_train_args()
# Display the binary TrainingArguments (the recorded output shows per-epoch evaluation).
bin_targs

TrainingArguments(
_n_gpu=0,
accelerator_config={'split_batches': False, 'dispatch_batches': None, 'even_batches': True, 'use_seedable_sampler': True, 'non_blocking': False, 'gradient_accumulation_kwargs': None, 'use_configured_state': False},
adafactor=False,
adam_beta1=0.9,
adam_beta2=0.999,
adam_epsilon=1e-08,
auto_find_batch_size=False,
average_tokens_across_devices=False,
batch_eval_metrics=False,
bf16=False,
bf16_full_eval=False,
data_seed=None,
dataloader_drop_last=False,
dataloader_num_workers=0,
dataloader_persistent_workers=False,
dataloader_pin_memory=True,
dataloader_prefetch_factor=None,
ddp_backend=None,
ddp_broadcast_buffers=None,
ddp_bucket_cap_mb=None,
ddp_find_unused_parameters=None,
ddp_timeout=1800,
debug=[],
deepspeed=None,
disable_tqdm=False,
dispatch_batches=None,
do_eval=True,
do_predict=False,
do_train=False,
eval_accumulation_steps=None,
eval_delay=0,
eval_do_concat_batches=True,
eval_on_start=False,
eval_steps=None,
eval_strategy=IntervalStrategy.EPOCH,
eval_

# Initialize trainer objects with metrics:

In [10]:
# Trainer for the binary classifier; per the recorded repr, train.Trainer is the
# Hugging Face transformers Trainer. Evaluation uses the 'test' split.
bin_trainer = train.Trainer(
    model=bin_class_model,
    args=bin_targs,
    train_dataset=bin_ecd_ds['train'],
    eval_dataset=bin_ecd_ds['test'],
    compute_metrics=train.compute_bin_metrics
)

# Trainer for the multilabel regression model, using the project's
# RegressionTrainer and regression metrics (RMSE, MAE, R2 and Pearson in the
# recorded results).
ml_trainer = train.RegressionTrainer(
    model=ml_regress_model,
    args=ml_targs,
    train_dataset=ml_ecd_ds['train'],
    eval_dataset=ml_ecd_ds['test'],
    compute_metrics=train.compute_reg_metrics
)

bin_trainer

<transformers.trainer.Trainer at 0x1a29959a570>

# Training:

### Binary Model:

In [12]:
# Fine-tune the binary classifier, then evaluate it on the eval ('test') split.
# NOTE(review): the execution counts show this cell ran (In [12]) after the
# multilabel training cell (In [11]); re-check with Restart & Run All.
# NOTE(review): the recorded AUROC (~0.09) is far below 0.5 while accuracy is ~0.82,
# which suggests the scores passed to the AUROC/AUPRC computation in
# compute_bin_metrics are for the wrong class or inverted — verify.
bin_trainer.train()
bin_eval_res = bin_trainer.evaluate()
bin_eval_res

Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1,Auprc,Auroc
1,0.5111,0.491705,0.765217,0.767606,0.838462,0.801471,0.385779,0.14
2,0.2574,0.423067,0.808696,0.84127,0.815385,0.828125,0.377438,0.104
3,0.1283,0.427709,0.821739,0.880342,0.792308,0.834008,0.37556,0.094923


{'eval_loss': 0.42770877480506897,
 'eval_accuracy': 0.8217391304347826,
 'eval_precision': 0.8803418803418803,
 'eval_recall': 0.7923076923076923,
 'eval_f1': 0.8340080971659919,
 'eval_auprc': 0.3755603747651698,
 'eval_auroc': 0.09492307692307693,
 'eval_runtime': 7.228,
 'eval_samples_per_second': 31.821,
 'eval_steps_per_second': 4.012,
 'epoch': 3.0}

### Multilabel Regression Model:

In [11]:
# Fine-tune the multilabel regression model, then evaluate it on the eval ('test') split.
# NOTE(review): in the recorded results the last category has R2 exactly 0.0 and a
# Pearson correlation of nan in every epoch, which is consistent with a constant
# target for that category in the eval split — confirm, and check whether the nan
# is handled when the mean Pearson is computed.
ml_trainer.train()
ml_eval_res = ml_trainer.evaluate()
ml_eval_res

Epoch,Training Loss,Validation Loss,Mean Rmse,Mean Mae,Mean R2,Mean Pearson,Rmse Per Cat,Mae Per Cat,R2 Per Cat,Pearson Per Cat
1,0.0711,0.063997,0.23292,0.140595,0.040973,0.281628,"[0.254986047744751, 0.3465014696121216, 0.2630208730697632, 0.23720261454582214, 0.26977255940437317, 0.026036713272333145]","[0.17647646367549896, 0.25872179865837097, 0.13634462654590607, 0.0958322212100029, 0.1555357724428177, 0.02065996080636978]","[0.08757269382476807, 0.14866852760314941, -0.006066441535949707, -0.039989590644836426, 0.0556522011756897, 0.0]","[0.30391204357147217, 0.47104179859161377, 0.1522805392742157, 0.08596348017454147, 0.3949446380138397, nan]"
2,0.0586,0.056602,0.221741,0.149093,0.115549,0.400855,"[0.23982465267181396, 0.3065391480922699, 0.2557942271232605, 0.23585782945156097, 0.2564680278301239, 0.03595989570021629]","[0.1569744348526001, 0.23641261458396912, 0.16298368573188782, 0.1334248185157776, 0.1761503517627716, 0.028611140325665474]","[0.19285213947296143, 0.3337143659591675, 0.04845839738845825, -0.028230905532836914, 0.1465010643005371, 0.0]","[0.4697819948196411, 0.6133527755737305, 0.30025503039360046, 0.09799902141094208, 0.5228886604309082, nan]"
3,0.0531,0.052233,0.21472,0.137086,0.15876,0.462011,"[0.2302805334329605, 0.28118592500686646, 0.25040900707244873, 0.2395753264427185, 0.24357284605503082, 0.043293774127960205]","[0.1534026861190796, 0.20950518548488617, 0.1494540125131607, 0.12927226722240448, 0.14616164565086365, 0.03472209349274635]","[0.25581681728363037, 0.43937087059020996, 0.08810210227966309, -0.06089949607849121, 0.2301710844039917, 0.0]","[0.5154060125350952, 0.6746363639831543, 0.39871376752853394, 0.13309906423091888, 0.5882002711296082, nan]"


{'eval_loss': 0.05223297327756882,
 'eval_mean_rmse': 0.21471957862377167,
 'eval_mean_mae': 0.13708631694316864,
 'eval_mean_r2': 0.15876023471355438,
 'eval_mean_pearson': 0.4620110958814621,
 'eval_rmse_per_cat': [0.2302805334329605,
  0.28118592500686646,
  0.25040900707244873,
  0.2395753264427185,
  0.24357284605503082,
  0.043293774127960205],
 'eval_mae_per_cat': [0.1534026861190796,
  0.20950518548488617,
  0.1494540125131607,
  0.12927226722240448,
  0.14616164565086365,
  0.03472209349274635],
 'eval_r2_per_cat': [0.25581681728363037,
  0.43937087059020996,
  0.08810210227966309,
  -0.06089949607849121,
  0.2301710844039917,
  0.0],
 'eval_pearson_per_cat': [0.5154060125350952,
  0.6746363639831543,
  0.39871376752853394,
  0.13309906423091888,
  0.5882002711296082,
  nan],
 'eval_runtime': 7.4227,
 'eval_samples_per_second': 30.986,
 'eval_steps_per_second': 3.907,
 'epoch': 3.0}