In [2]:
from nlpinitiative.data_preparation import data_import, data_preparation, dataset_normalizer
from nlpinitiative.config import (
    RAW_DATA_DIR, 
    EXTERNAL_DATA_DIR, 
    INTERIM_DATA_DIR, 
    CONV_SCHEMA_DIR, 
    DATASET_COLS
)

[32m2025-02-08 12:59:26.570[0m | [1mINFO    [0m | [36mnlpinitiative.config[0m:[36m<module>[0m:[36m11[0m - [1mPROJ_ROOT path is: C:\Users\Daniel\Desktop\GitHub\NLPinitiative[0m
None of PyTorch, TensorFlow >= 2.0, or Flax have been found. Models won't be available and only tokenizers, configuration and file/data utilities can be used.


### Demonstration of importing from a local source:

In [3]:
# Load a dataset from a file on the local filesystem via the project's import helper.
# The recorded output shows a success log naming this path, followed by a printed table
# with columns such as reply_id, hateful_tweet_id, counter_hate_id, Q1 and Q2.
# NOTE(review): the path is absolute and specific to one machine (C:/Users/Daniel/...),
# so this cell will not run elsewhere as written — consider a path relative to a
# configurable data directory.
# NOTE(review): presumably the helper returns a pandas DataFrame — confirm.
local_import_example_df = data_import.import_from_local_source("C:/Users/Daniel/Downloads/dataset.csv")
# Plain-text dump of the imported data; the output is truncated with "..." for long tables.
print(local_import_example_df)

dataset
[32m2025-02-08 12:59:29.200[0m | [32m[1mSUCCESS [0m | [36mnlpinitiative.data_preparation.data_import[0m:[36mimport_from_local_source[0m:[36m89[0m - [32m[1mData from file, C:/Users/Daniel/Downloads/dataset.csv, imported[0m
                 reply_id     hateful_tweet_id      counter_hate_id   Q1   Q2  \
0     1364504065565093894  1364444425192005639  1364503631160954881  1.0  NaN   
1     1507516585485148164  1507475083954036739  1507490000958820354  1.0  NaN   
2      976720451148943360   959685036311064576   976558261498515456  1.0  NaN   
3      976567337078882309   959685036311064576   976558261498515456  1.0  NaN   
4     1359340305494011907  1359330367157764098  1359339633352478728  1.0  NaN   
...                   ...                  ...                  ...  ...  ...   
2616   473289864176078848   473289562903830528   473289697540993024  0.0  0.0   
2617   475688162468319232   475558857700171776   475565607841054721  1.0  NaN   
2618   403826060719964160  

### Demonstration of importing from a remote source:

In [4]:
# Import a dataset straight from a URL.
# The URL passed here is the GitHub "blob" page URL; per the logged output, the helper
# identifies it as a GitHub URL and rewrites it to the raw.githubusercontent.com form
# before detecting the file type ('.csv') and importing it.
# NOTE(review): this cell needs network access, so a fresh-kernel run offline will fail here.
remote_import_example_df = data_import.import_from_ext_source("https://github.com/intelligence-csd-auth-gr/Ethos-Hate-Speech-Dataset/blob/master/ethos/ethos_data/Ethos_Dataset_Binary.csv")
# Plain-text dump of the imported data.
print(remote_import_example_df)

[32m2025-02-08 12:59:31.892[0m | [1mINFO    [0m | [36mnlpinitiative.data_preparation.data_import[0m:[36mformat_url[0m:[36m45[0m - [1mSource url identified as GitHub URL, https://github.com/intelligence-csd-auth-gr/Ethos-Hate-Speech-Dataset/blob/master/ethos/ethos_data/Ethos_Dataset_Binary.csv[0m
[32m2025-02-08 12:59:31.893[0m | [1mINFO    [0m | [36mnlpinitiative.data_preparation.data_import[0m:[36mformat_url[0m:[36m47[0m - [1mURL Formatted, https://raw.githubusercontent.com/intelligence-csd-auth-gr/Ethos-Hate-Speech-Dataset/refs/heads/master/ethos/ethos_data/Ethos_Dataset_Binary.csv[0m
[32m2025-02-08 12:59:31.893[0m | [1mINFO    [0m | [36mnlpinitiative.data_preparation.data_import[0m:[36mimport_from_ext_source[0m:[36m101[0m - [1mFile type identified, '.csv'[0m
[32m2025-02-08 12:59:31.965[0m | [32m[1mSUCCESS [0m | [36mnlpinitiative.data_preparation.data_import[0m:[36mimport_from_ext_source[0m:[36m114[0m - [32m[1mSuccessfully imported the 

### Normalizing third-party datasets to a standard format for our applications:

In [5]:
# Source files to merge: the binary and multi-label variants of the ETHOS dataset,
# both expected to already exist under EXTERNAL_DATA_DIR.
# NOTE(review): presumably these files are produced by an earlier external import —
# confirm they exist on a fresh checkout.
srcs = [
    EXTERNAL_DATA_DIR / "intelligence-csd-auth-gr_Ethos-Hate-Speech-Dataset_Ethos_Dataset_Binary.csv",
    EXTERNAL_DATA_DIR / "intelligence-csd-auth-gr_Ethos-Hate-Speech-Dataset_Ethos_Dataset_Multi_Label.csv"
]
# JSON file describing how the ETHOS columns map onto the project's standard schema.
conv = CONV_SCHEMA_DIR / "ethos_schema_mapping.json"

# Convert the source files into the standard schema. The recorded output shows the
# resulting columns: TEXT, DISCRIMINATORY, NEUTRAL, GENDER, RACE, SEXUALITY, DISABILITY,
# RELIGION, UNSPECIFIED.
# NOTE(review): the third argument looks like the base name of the output file (a later
# cell loads "ETHOS_dataset_converted.csv") — confirm against the helper's signature.
normalized_dataset = dataset_normalizer.convert_to_master_schema(srcs, conv, 'ETHOS_dataset_converted')
print(normalized_dataset)

                                                  TEXT  DISCRIMINATORY  \
0            You should know women's sports are a joke             1.0   
1      You look like Sloth with deeper Down’s syndrome             1.0   
2    You look like Russian and speak like Indian. B...             1.0   
3                 Women deserve to be abused, I guess.             1.0   
4    Women are made for making babies and cooking d...             1.0   
..                                                 ...             ...   
993   From the midnight sun where the hot springs blow             0.0   
994                        Don't say I'm not your type             0.0   
995   And therefore never send to know for whom the...             0.0   
996                      And I can't stand another day             0.0   
997   All values, unless otherwise stated, are in U...             0.0   

     NEUTRAL    GENDER      RACE  SEXUALITY  DISABILITY  RELIGION  UNSPECIFIED  
0        0.0  1.000000  0.0000

### Loading a CSV file as a Dataset object:

In [6]:
# Load the normalized CSV as a dataset object. The recorded output shows a DatasetDict
# with a single 'train' split of 998 rows and the nine standard-schema columns.
# NOTE(review): only a bare filename is passed; presumably the helper resolves it against
# a project data directory — confirm which one.
dataset = data_preparation.get_dataset_from_file("ETHOS_dataset_converted.csv")
print(dataset)

DatasetDict({
    train: Dataset({
        features: ['TEXT', 'DISCRIMINATORY', 'NEUTRAL', 'GENDER', 'RACE', 'SEXUALITY', 'DISABILITY', 'RELIGION', 'UNSPECIFIED'],
        num_rows: 998
    })
})


### Extracting labels and initializing dicts for converting from labels to ids and ids to labels:

In [7]:
# Derive the label names and the two lookup tables from the dataset.
# Per the recorded output:
#   labels  - list of label column names (every dataset column shown above except TEXT)
#   lbl2idx - dict mapping each label name to an integer id (0..7)
#   idx2lbl - dict mapping each integer id back to its label name
labels, lbl2idx, idx2lbl = data_preparation.get_labels_and_dicts(dataset)
print(labels)
print(lbl2idx)
print(idx2lbl)

['DISCRIMINATORY', 'NEUTRAL', 'GENDER', 'RACE', 'SEXUALITY', 'DISABILITY', 'RELIGION', 'UNSPECIFIED']
{'DISCRIMINATORY': 0, 'NEUTRAL': 1, 'GENDER': 2, 'RACE': 3, 'SEXUALITY': 4, 'DISABILITY': 5, 'RELIGION': 6, 'UNSPECIFIED': 7}
{0: 'DISCRIMINATORY', 1: 'NEUTRAL', 2: 'GENDER', 3: 'RACE', 4: 'SEXUALITY', 5: 'DISABILITY', 6: 'RELIGION', 7: 'UNSPECIFIED'}


### Initialization of a tokenizer (using a pre-trained BERT tokenizer):

In [15]:
# Obtain the project's tokenizer. The recorded output shows a vocab.txt download
# progress bar, so the first run may need network access.
tokenizer = data_preparation.get_tokenizer()
# Round-trip a sample sentence: text -> token ids -> text.
encoded_text = tokenizer.encode("Testing tokenizer for encoding")
decoded_text = tokenizer.decode(encoded_text)
# Per the recorded output, the ids start with 101 and end with 102, and the decoded text
# is lower-cased and wrapped in [CLS] ... [SEP], so it is not identical to the input.
print(encoded_text)
print(decoded_text)

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

[101, 5604, 19204, 17629, 2005, 17181, 102]
[CLS] testing tokenizer for encoding [SEP]


In [18]:
# Encode the dataset for model input, then inspect the fields of the first training entry.
# NOTE(review): the recorded output for this cell is
#   TypeError: preprocess_dataset() takes 2 positional arguments but 3 were given
# so the notebook does not run to completion as saved. Either this call passes one
# argument too many, or the kernel held a stale copy of the module (the execution counts
# jump from 7 to 15 to 18, so cells were re-run out of order) — confirm against the
# current signature of data_preparation.preprocess_dataset and re-run from a fresh kernel.
ecd_dataset = data_preparation.preprocess_dataset(dataset, labels, tokenizer)
# First example of the 'train' split.
dataset_entry_ex = ecd_dataset['train'][0]
print(dataset_entry_ex.keys())

TypeError: preprocess_dataset() takes 2 positional arguments but 3 were given