In [None]:
from nlpinitiative.data_preparation import data_import, dataset_normalizer
from nlpinitiative.config import (
    EXTERNAL_DATA_DIR, 
    CONV_SCHEMA_DIR,
)

# Importing third-party datasets for use in NLP model training:

For the purposes of this project, we are going to rely on third-party datasets to make up for a lack of personally procured data. As such, we have implemented some functionality to make this easier for future developers/data analysts.

### Importing datasets from a local source (on your local system):
For the purposes of our applications, we consider "raw" datasets to be those we have produced ourselves, as opposed to datasets that have already been created by others ("external"). As such, importing from a local source will by default store the dataset within the data/raw directory. If the data being imported locally is a third-party dataset, the user can set the 'tp_src' argument to True, in which case the data will be stored within the data/external directory instead.

In [None]:
# Location of the dataset on the local filesystem.
# NOTE(review): this is an absolute, user-specific path; replace it with the
# path to your own file before running this cell.
local_dataset_path = "C:/Users/Daniel/Downloads/dataset.csv"

# tp_src=False marks the file as our own data rather than a third-party dataset.
local_import_example_df = data_import.import_from_local_source(
    local_dataset_path,
    tp_src=False,
)
local_import_example_df

### Demonstration of importing from remote/external source:
This function facilitates importing data from a given URL (primarily remote repositories like GitHub).

In [None]:
# URL of the remote dataset to import.
# NOTE(review): this is a GitHub "blob" page URL rather than a raw-file URL;
# confirm that import_from_ext_source resolves it to the underlying CSV.
ethos_binary_url = "https://github.com/intelligence-csd-auth-gr/Ethos-Hate-Speech-Dataset/blob/master/ethos/ethos_data/Ethos_Dataset_Binary.csv"

remote_import_example_df = data_import.import_from_ext_source(ethos_binary_url)
remote_import_example_df

# Conversion of third-party datasets:
Since we are making use of third-party datasets, we need a means of converting the original dataset schema to a format that will utilize our labeling scheme. As such, we have implemented some functionality to facilitate this process.

### Normalizing third-party datasets to a standard format for our applications:
This function takes one or more datasets and normalizes them to our standard format. All datasets passed in together should share the same general structure so that they can be merged prior to normalization.

In [None]:
# File names of the ETHOS datasets located in the external-data directory.
ethos_filenames = (
    "intelligence-csd-auth-gr_Ethos-Hate-Speech-Dataset_Ethos_Dataset_Binary.csv",
    "intelligence-csd-auth-gr_Ethos-Hate-Speech-Dataset_Ethos_Dataset_Multi_Label.csv",
)

# Full paths to the source datasets, and to the JSON schema mapping that is
# passed to the normalizer alongside them.
srcs = [EXTERNAL_DATA_DIR / filename for filename in ethos_filenames]
conv = CONV_SCHEMA_DIR / "ethos_schema_mapping.json"

# Name handed to the normalizer for the converted dataset.
converted_name = 'ETHOS_dataset_converted'

normalized_dataset = dataset_normalizer.convert_to_master_schema(
    srcs,
    conv,
    converted_name,
)
normalized_dataset

### Storing the normalized dataset:

In [None]:
# Store the normalized dataset. The second argument is presumably the name it
# is saved under -- confirm against dataset_normalizer.store_normalized_dataset.
output_name = 'ETHOS_dataset_converted'
dataset_normalizer.store_normalized_dataset(normalized_dataset, output_name)