In [None]:
from pathlib import Path

from nlpinitiative.data_preparation import data_import, data_preparation, dataset_normalizer
from nlpinitiative.config import (
    RAW_DATA_DIR,
    EXTERNAL_DATA_DIR,
    INTERIM_DATA_DIR,
    CONV_SCHEMA_DIR,
    DATASET_COLS
)

### Demonstration of importing from local source:

In [None]:
# Import a dataset from a file on the local machine.
# The path is derived from the current user's home directory instead of a
# hard-coded, user-specific absolute path, so the cell is not tied to one
# account. `.as_posix()` renders the path with forward slashes, i.e. as a plain
# string in the same form the previous literal used.
# NOTE(review): assumes the example file lives in the user's Downloads folder —
# adjust `local_dataset_path` if it is stored elsewhere.
local_dataset_path = (Path.home() / "Downloads" / "dataset.csv").as_posix()
local_import_example_df = data_import.import_from_local_source(local_dataset_path)
print(local_import_example_df)

### Demonstration of importing from remote source:

In [None]:
# Import the ETHOS binary hate-speech dataset from its GitHub location.
# NOTE(review): this is a GitHub "blob" page URL rather than a raw-content URL;
# presumably import_from_ext_source resolves it to the underlying CSV — confirm.
ethos_binary_url = "https://github.com/intelligence-csd-auth-gr/Ethos-Hate-Speech-Dataset/blob/master/ethos/ethos_data/Ethos_Dataset_Binary.csv"
remote_import_example_df = data_import.import_from_ext_source(ethos_binary_url)
print(remote_import_example_df)

### Normalizing third-party datasets to a standard format for our applications:

In [None]:
# File names of the ETHOS source datasets, resolved against EXTERNAL_DATA_DIR.
# NOTE(review): both files are expected to already exist in EXTERNAL_DATA_DIR;
# only the Binary dataset is imported in the visible cells of this notebook —
# confirm the Multi_Label file is fetched elsewhere before running this cell.
ethos_filenames = (
    "intelligence-csd-auth-gr_Ethos-Hate-Speech-Dataset_Ethos_Dataset_Binary.csv",
    "intelligence-csd-auth-gr_Ethos-Hate-Speech-Dataset_Ethos_Dataset_Multi_Label.csv",
)
srcs = [EXTERNAL_DATA_DIR / filename for filename in ethos_filenames]

# JSON file in CONV_SCHEMA_DIR passed to the normalizer as the conversion schema.
conv = CONV_SCHEMA_DIR / "ethos_schema_mapping.json"

# Convert the source datasets using the schema mapping; the third argument is
# presumably the output name for the converted dataset — TODO confirm.
normalized_dataset = dataset_normalizer.convert_to_master_schema(
    srcs,
    conv,
    'ETHOS_dataset_converted',
)
print(normalized_dataset)

### Loading a CSV file as a Dataset object:

In [None]:
# Load the converted ETHOS CSV through the data_preparation helper.
# NOTE(review): the bare file name is presumably resolved by
# get_dataset_from_file against a project data directory — confirm.
converted_dataset_filename = "ETHOS_dataset_converted.csv"
dataset = data_preparation.get_dataset_from_file(converted_dataset_filename)
print(dataset)

### Extracting labels and initializing dicts for converting from labels to ids and ids to labels:

In [None]:
# Derive the labels from the dataset together with the two lookup dicts
# (label -> id and id -> label), then show each on its own line.
labels, lbl2idx, idx2lbl = data_preparation.get_labels_and_dicts(dataset)
print(labels, lbl2idx, idx2lbl, sep="\n")

### Initialization of a tokenizer (using pre-trained BERT tokenizer)

In [None]:
# Obtain the tokenizer and run a round trip (encode, then decode) on a sample
# sentence to show what the encoded form looks like.
tokenizer = data_preparation.get_tokenizer()

sample_text = "Testing tokenizer for encoding"
encoded_text = tokenizer.encode(sample_text)
decoded_text = tokenizer.decode(encoded_text)

# Show the encoded form first, then the text recovered from it.
print(encoded_text, decoded_text, sep="\n")

### Encode (preprocess) the dataset:

In [None]:
# Encode the dataset with the labels and tokenizer prepared in earlier cells.
ecd_dataset = data_preparation.preprocess_dataset(dataset, labels, tokenizer)

# Inspect the first entry of the 'train' split and list the fields it carries.
train_split = ecd_dataset['train']
dataset_entry_ex = train_split[0]
print(dataset_entry_ex.keys())