In [1]:
from pathlib import Path

from nlpinitiative.data_preparation import data_import, data_preparation, dataset_normalizer
from nlpinitiative.config import (
    EXTERNAL_DATA_DIR,
    CONV_SCHEMA_DIR
)

[32m2025-02-14 15:23:06.881[0m | [1mINFO    [0m | [36mnlpinitiative.config[0m:[36m<module>[0m:[36m11[0m - [1mPROJ_ROOT path is: C:\Users\Daniel\Desktop\GitHub\NLPinitiative[0m


# Importing third-party datasets for use in NLP model training:

For the purposes of this project, we are going to rely on third-party datasets to make up for a lack of personally procured data. As such, we have implemented some functionality to make this easier for future developers/data analysts.

### Importing datasets from a local source (on your local system):
For the purposes of our applications, we consider "raw" datasets to be those we have produced ourselves, as opposed to datasets that have already been created by others ("external"). As such, importing from a local source stores the dataset within the data/raw directory by default. If the data being imported locally is a third-party dataset, set the 'tp_src' argument to True, in which case the data is stored within the data/external directory instead.

In [None]:
# Build the example path from the current user's home directory instead of a
# hard-coded, user-specific location, so the demo is not tied to one machine.
# as_posix() yields a forward-slash string, the same form as the literal path
# this replaces.
local_dataset_path = (Path.home() / "Downloads" / "dataset.csv").as_posix()

# tp_src=False marks the file as our own ("raw") data rather than third-party.
local_import_example_df = data_import.import_from_local_source(local_dataset_path, tp_src=False)
print(local_import_example_df)

### Demonstration of importing from remote/external source:
This function facilitates importing data from a given URL (primarily remote repositories like GitHub).

In [None]:
# URL of the ETHOS binary-label CSV as it appears on GitHub.
ethos_binary_url = "https://github.com/intelligence-csd-auth-gr/Ethos-Hate-Speech-Dataset/blob/master/ethos/ethos_data/Ethos_Dataset_Binary.csv"

# Import the remote dataset and show what was returned.
remote_import_example_df = data_import.import_from_ext_source(ethos_binary_url)
print(remote_import_example_df)

# Conversion of third-party datasets:
Since we are making use of third-party datasets, we need a means of converting the original dataset schema to a format that will utilize our labeling scheme. As such, we have implemented some functionality to facilitate this process.

### Normalizing third-party datasets to a standard format for our applications:
This function takes one or more datasets and converts them to our standard format. All datasets passed in together should share the same general structure so that they can be merged prior to normalization.

In [3]:
# Source CSVs (both ETHOS variants) located in the external data directory.
srcs = [
    EXTERNAL_DATA_DIR / file_name
    for file_name in (
        "intelligence-csd-auth-gr_Ethos-Hate-Speech-Dataset_Ethos_Dataset_Binary.csv",
        "intelligence-csd-auth-gr_Ethos-Hate-Speech-Dataset_Ethos_Dataset_Multi_Label.csv",
    )
]

# JSON mapping passed to the normalizer alongside the source files.
conv = CONV_SCHEMA_DIR / "ethos_schema_mapping.json"

# One name is used for both the conversion call and the store call below.
converted_name = 'ETHOS_dataset_converted'

normalized_dataset = dataset_normalizer.convert_to_master_schema(srcs, conv, converted_name)
print(normalized_dataset)

dataset_normalizer.store_normalized_dataset(normalized_dataset, converted_name)

                                                  TEXT  DISCRIMINATORY  \
0            You should know women's sports are a joke               1   
1      You look like Sloth with deeper Downâ€™s syndrome               1   
2    You look like Russian and speak like Indian. B...               1   
3                 Women deserve to be abused, I guess.               1   
4    Women are made for making babies and cooking d...               1   
..                                                 ...             ...   
993   From the midnight sun where the hot springs blow               0   
994                        Don't say I'm not your type               0   
995   And therefore never send to know for whom the...               0   
996                      And I can't stand another day               0   
997   All values, unless otherwise stated, are in U...               0   

       GENDER      RACE  SEXUALITY  DISABILITY  RELIGION  UNSPECIFIED  
0    1.000000  0.000000        0.0   

# Data Preparation:
Before the dataset can be used to train the model, we must first prepare the data by loading it as a Dataset object and tokenizing the textual data (in addition to restructuring the data into a format that can be passed into a model).

### Loading csv as Dataset object:

In [3]:
# File name of the converted ETHOS dataset to load.
converted_csv_name = "ETHOS_dataset_converted.csv"

dataset = data_preparation.get_dataset_from_file(converted_csv_name)
print(dataset)

DatasetDict({
    train: Dataset({
        features: ['TEXT', 'DISCRIMINATORY', 'NEUTRAL', 'GENDER', 'RACE', 'SEXUALITY', 'DISABILITY', 'RELIGION', 'UNSPECIFIED'],
        num_rows: 698
    })
    test: Dataset({
        features: ['TEXT', 'DISCRIMINATORY', 'NEUTRAL', 'GENDER', 'RACE', 'SEXUALITY', 'DISABILITY', 'RELIGION', 'UNSPECIFIED'],
        num_rows: 300
    })
})


### Extracting labels and initializing dicts for converting from labels to ids and ids to labels:

In [4]:
# Get the label names plus the label->id and id->label lookup dicts.
labels, lbl2idx, idx2lbl = data_preparation.get_labels_and_dicts(dataset)

# Show each of the three results on its own line.
print(labels, lbl2idx, idx2lbl, sep="\n")

['DISCRIMINATORY', 'NEUTRAL', 'GENDER', 'RACE', 'SEXUALITY', 'DISABILITY', 'RELIGION', 'UNSPECIFIED']
{'DISCRIMINATORY': 0, 'NEUTRAL': 1, 'GENDER': 2, 'RACE': 3, 'SEXUALITY': 4, 'DISABILITY': 5, 'RELIGION': 6, 'UNSPECIFIED': 7}
{0: 'DISCRIMINATORY', 1: 'NEUTRAL', 2: 'GENDER', 3: 'RACE', 4: 'SEXUALITY', 5: 'DISABILITY', 6: 'RELIGION', 7: 'UNSPECIFIED'}


### Initialization of a tokenizer (using pre-trained BERT tokenizer)

In [5]:
tokenizer = data_preparation.get_tokenizer()

# Round-trip a sample sentence: encode it to token ids, then decode it back.
sample_sentence = "Testing tokenizer for encoding"
encoded_text = tokenizer.encode(sample_sentence)
decoded_text = tokenizer.decode(encoded_text)

# Token ids first, decoded text second, one per line.
print(encoded_text, decoded_text, sep="\n")

[101, 5604, 19204, 17629, 2005, 17181, 102]
[CLS] testing tokenizer for encoding [SEP]


### Encode (preprocess) the dataset:

In [6]:
# Run preprocessing over the dataset using the labels and tokenizer from above.
ecd_dataset = data_preparation.preprocess_dataset(dataset, labels, tokenizer)

# Take the first entry of the 'train' split and list the fields it carries.
train_split = ecd_dataset['train']
dataset_entry_ex = train_split[0]
print(dataset_entry_ex.keys())

Map:   0%|          | 0/698 [00:00<?, ? examples/s]

Map:   0%|          | 0/300 [00:00<?, ? examples/s]

dict_keys(['input_ids', 'token_type_ids', 'attention_mask', 'labels'])
