In [2]:
import json
import os
from getpass import getpass
from pathlib import Path

from nlpinitiative.config import (
    INTERIM_DATA_DIR,
    RAW_DATA_DIR
)
from nlpinitiative.data_preparation.data_management import DataManager

# Initialization of Data Manager:
This is used for managing all dataset operations.

In [3]:
# Single DataManager instance shared by every later cell in this notebook
# (imports, normalization, statistics, and repo upload/download all go through it).
data_manager = DataManager()

# Importing third-party datasets for use in NLP model training:

For the purposes of this project, we are going to rely on third-party datasets to make up for a lack of personally procured data. As such, we have implemented some functionality to make this easier for future developers/data analysts.

### Importing datasets from a local source (on your local system):
For the purposes of our applications, we will consider the "raw" datasets to be personally produced datasets rather than those that have already been created ("external"). As such, importing from a local source will by default store the datasets within the data/raw directory. If the data to be imported locally is a third-party dataset, the user can set the `is_third_party` argument to True, in which case the data will be stored within the data/external directory instead.

In [None]:
# Location of the example CSV on the local machine. It is built from the current
# user's home directory rather than a hard-coded, user-specific absolute path so
# the cell can run on any machine; adjust this one line if the file lives elsewhere.
local_dataset_path = Path.home() / 'Downloads' / 'dataset.csv'

# Import as a "raw" (non-third-party) dataset. `as_posix()` keeps the
# forward-slash string form that the original call passed as `source`.
local_import_example_df = data_manager.import_data(
    import_type='local',
    source=local_dataset_path.as_posix(),
    dataset_name='TestDS',
    is_third_party=False,
    overwrite=True,
)

# Demo clean-up: remove the example file from the raw data directory.
# NOTE(review): assumes the imported copy keeps the source file's name - confirm.
data_manager.remove_file(filename=local_dataset_path.name, path=RAW_DATA_DIR)
local_import_example_df

### Demonstration of importing from remote/external source:
This function facilitates importing data from a given URL (primarily remote repositories like GitHub).

In [3]:
# GitHub page URL of the ETHOS binary dataset CSV; the log output below shows the
# data manager rewriting it to the raw.githubusercontent.com form before download.
ethos_binary_url = "https://github.com/intelligence-csd-auth-gr/Ethos-Hate-Speech-Dataset/blob/master/ethos/ethos_data/Ethos_Dataset_Binary.csv"

remote_import_example_df1 = data_manager.import_data(
    import_type='external',
    source=ethos_binary_url,
    dataset_name='EthosBinDS',
    overwrite=True,
)
remote_import_example_df1

[32m2025-03-19 16:45:15.721[0m | [1mINFO    [0m | [36mnlpinitiative.data_preparation.data_management[0m:[36m_format_url[0m:[36m82[0m - [1mSource url identified as GitHub URL, https://github.com/intelligence-csd-auth-gr/Ethos-Hate-Speech-Dataset/blob/master/ethos/ethos_data/Ethos_Dataset_Binary.csv[0m
[32m2025-03-19 16:45:15.721[0m | [1mINFO    [0m | [36mnlpinitiative.data_preparation.data_management[0m:[36m_format_url[0m:[36m84[0m - [1mURL Formatted, https://raw.githubusercontent.com/intelligence-csd-auth-gr/Ethos-Hate-Speech-Dataset/refs/heads/master/ethos/ethos_data/Ethos_Dataset_Binary.csv[0m


Unnamed: 0,comment,isHate
0,You should know women's sports are a joke,1.0
1,You look like Sloth with deeper Down’s syndrome,1.0
2,You look like Russian and speak like Indian. B...,1.0
3,"Women deserve to be abused, I guess.",1.0
4,Women are made for making babies and cooking d...,1.0
...,...,...
993,From the midnight sun where the hot springs blow,0.0
994,Don't say I'm not your type,0.0
995,And therefore never send to know for whom the...,0.0
996,And I can't stand another day,0.0


In [4]:
# GitHub page URL of the Multitarget-CONAN dataset CSV; the log output below shows
# the data manager rewriting it to the raw.githubusercontent.com form before download.
multitarget_conan_url = "https://github.com/marcoguerini/CONAN/blob/master/Multitarget-CONAN/Multitarget-CONAN.csv"

remote_import_example_df2 = data_manager.import_data(
    import_type='external',
    source=multitarget_conan_url,
    dataset_name='Multitarget-CONAN-DS',
    overwrite=True,
)
remote_import_example_df2

[32m2025-03-19 16:45:19.389[0m | [1mINFO    [0m | [36mnlpinitiative.data_preparation.data_management[0m:[36m_format_url[0m:[36m82[0m - [1mSource url identified as GitHub URL, https://github.com/marcoguerini/CONAN/blob/master/Multitarget-CONAN/Multitarget-CONAN.csv[0m
[32m2025-03-19 16:45:19.389[0m | [1mINFO    [0m | [36mnlpinitiative.data_preparation.data_management[0m:[36m_format_url[0m:[36m84[0m - [1mURL Formatted, https://raw.githubusercontent.com/marcoguerini/CONAN/refs/heads/master/Multitarget-CONAN/Multitarget-CONAN.csv[0m


Unnamed: 0,INDEX,HATE_SPEECH,COUNTER_NARRATIVE,TARGET,VERSION
0,0,Maybe the UN could talk to those asian and afr...,"You're right, ocean pollution is one of the ma...",other,V1
1,1,‘Juice (Jews) are worse that nukes.’ Caption t...,You'd rather be nuked then?,JEWS,V1
2,2,Foreigners on UK benefits leaps 41% in 5 years...,You seem to be worried about government expend...,MIGRANTS,V1
3,3,"Jews cannot be patriots, since their allegianc...",You can have parents and grandparents born els...,JEWS,V1
4,4,"Israel, Zionism and Jewish Supremacist control...",Wouldn't you say its colonialism from many dif...,JEWS,V1
...,...,...,...,...,...
4998,4998,Woman are basically childlike.,They are not childlike and are capable of grea...,WOMEN,V6_mix
4999,4999,women should be submissive to men.,"There are many women who want to be ""submissiv...",WOMEN,V6_mix
5000,5000,"Women are essentially childlike, unable to und...",Maybe you should try talking to a woman instea...,WOMEN,V6_mix
5001,5001,Women should be submissive to men.,Women are human beings deserving respect and e...,WOMEN,V6_mix


# Conversion of third-party datasets:
Since we are making use of third-party datasets, we need a means of converting the original dataset schema to a format that will utilize our labeling scheme. As such, we have implemented some functionality to facilitate this process.

### Normalizing third-party datasets to a standard format for our applications:
This function takes one or more datasets and normalizes them to the standard format used by our applications (all datasets passed in a single call should share the same general structure so that they can be merged prior to normalization).

In [3]:
# Dataset file(s) to normalize together in one call.
files = ["marcoguerini_CONAN_Multitarget-CONAN.csv"]
# JSON file passed as the conversion argument.
# NOTE(review): presumably maps the source columns onto our labeling scheme - confirm.
conv = "mutltitarget_conan_mapping.json"

normalized_dataset = data_manager.normalize_dataset(
    files,
    conv,
    'Multitarget_CONAN_normalized',
)
normalized_dataset

[32m2025-03-19 16:50:26.870[0m | [32m[1mSUCCESS [0m | [36mnlpinitiative.data_preparation.data_management[0m:[36mnormalize_dataset[0m:[36m217[0m - [32m[1mSuccessfully normalized dataset files [marcoguerini_CONAN_Multitarget-CONAN.csv][0m


Unnamed: 0,TEXT,DISCRIMINATORY,GENDER,RACE,SEXUALITY,DISABILITY,RELIGION,UNSPECIFIED
0,‘Juice (Jews) are worse that nukes.’ Caption t...,1,0.0,0.5,0.0,0.0,0.5,0.0
1,Foreigners on UK benefits leaps 41% in 5 years...,1,0.0,1.0,0.0,0.0,0.0,0.0
2,"Jews cannot be patriots, since their allegianc...",1,0.0,0.5,0.0,0.0,0.5,0.0
3,"Israel, Zionism and Jewish Supremacist control...",1,0.0,0.5,0.0,0.0,0.5,0.0
4,"Women are basically childlike, they remain thi...",1,1.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...
4732,Woman are basically childlike.,1,1.0,0.0,0.0,0.0,0.0,0.0
4733,women should be submissive to men.,1,1.0,0.0,0.0,0.0,0.0,0.0
4734,"Women are essentially childlike, unable to und...",1,1.0,0.0,0.0,0.0,0.0,0.0
4735,Women should be submissive to men.,1,1.0,0.0,0.0,0.0,0.0,0.0


# Generating a new Master Dataset from all normalized datasets:

In [5]:
# Build the master dataset (per the section heading, from all normalized datasets).
# No arguments are passed and the result is not assigned to a variable.
data_manager.build_master_dataset()

# Getting Dataset Statistics from Dataset:

These statistics help determine whether the dataset is imbalanced between categories, and can inform the weights to apply to compensate for any such imbalance.

In [None]:
# Path of the normalized ETHOS dataset inside the interim data directory.
# NOTE(review): no cell shown in this notebook creates this file - confirm it
# already exists before running on a fresh checkout.
ethos_normalized_path = INTERIM_DATA_DIR / 'ETHOS_dataset_normalized.csv'

ds_stats = data_manager.get_dataset_statistics(ethos_normalized_path)
# Pretty-print the statistics as indented JSON.
print(json.dumps(ds_stats, indent=4))

# Upload and download of dataset repo:

In [None]:
# Never store access tokens in the notebook: anything saved here is shared with
# everyone who can read the file or its history. The token that was previously
# hard-coded in this cell should be treated as compromised and revoked.
# Read the token from the HF_TOKEN environment variable, falling back to an
# interactive prompt that does not echo the value into the cell output.
token = os.environ.get('HF_TOKEN') or getpass('Hugging Face access token: ')
data_manager.push_dataset_dir(token)

HfHubHTTPError: 401 Client Error: Unauthorized for url: https://huggingface.co/api/datasets/dlsmallw/NLPinitiative-Dataset/preupload/main (Request ID: Root=1-67e61cfd-7b66d4a024a950e0513ff82b;5f89e177-adb0-42b3-bf0a-84c1cbb592b7)

Invalid credentials in Authorization header

In [None]:
# Pull the dataset repository, authenticating with the `token` variable assigned
# in the upload cell above - that cell must be run first on a fresh kernel.
data_manager.pull_dataset_repo(token)