# HuggingFacifying the Datasets

In [None]:
import sys
from pathlib import Path

from tqdm.notebook import tqdm

SRC_DIRECTORY = Path().cwd().resolve().parent
DATA_DIRECTORY = Path().cwd().resolve().parent.parent / "data"

if str(SRC_DIRECTORY) not in sys.path:
    sys.path.insert(0, str(SRC_DIRECTORY))

import pandas as pd

from datasets import Dataset, DatasetDict

HF_ORGANIZATION = "gtfintechlab"

In [None]:
class LabelMapper:
    def __init__(self, task):
        self.mappings = {
            "finer_ord": {
                0: "other",
                1: "person_b",
                2: "person_i",
                3: "location_b",
                4: "location_i",
                5: "organisation_b",
                6: "organisation_i",
            },
            "fomc_communication": {0: "dovish", 1: "hawkish", 2: "neutral"},
            "numclaim_detection": {0: "outofclaim", 1: "inclaim"},
            "sentiment_analysis": {0: "positive", 1: "negative", 2: "neutral"},
        }
        if task not in self.mappings:
            raise ValueError(f"Task {task} not found in mappings.")
        self.task = task

    def encode(self, label_name):
        reversed_mapping = {v: k for k, v in self.mappings[self.task].items()}
        return reversed_mapping.get(label_name, -1)

    def decode(self, label_number):
        return self.mappings[self.task].get(label_number, "undefined").upper()

---

## FOMC

In [169]:
def huggify_data_fomc(TASK=None, SEED=None, SPLITS=["train", "test"]):
    try:
        mapper = LabelMapper(TASK)
        # Initialize the nested structure
        hf_dataset = DatasetDict()

        # Load data
        for SPLIT in SPLITS:
            # Load and preprocess the dataframe
            data_split = pd.read_excel(
                DATA_DIRECTORY
                / TASK
                / SPLIT
                / f"lab-manual-split-combine-{SPLIT}-{SEED}.xlsx",
                index_col=0,
            )
            data_split.rename(columns={"label": "label_encoded"}, inplace=True)
            data_split["label_decoded"] = data_split["label_encoded"].apply(
                lambda x: mapper.decode(x)
            )

            # Convert the dataframe to Hugging Face's Dataset and store it in the nested dictionary
            hf_dataset[SPLIT] = Dataset.from_pandas(data_split)

        # Push to HF Hub
        hf_dataset.push_to_hub(
            f"{HF_ORGANIZATION}/{TASK}",
            config_name=str(SEED),
            private=True,
        )

    except Exception as e:
        print(e)

In [170]:
SPLITS = ["train", "test"]

TASK = "fomc_communication"

SEEDS = (5768, 78516, 944601)

for SEED in list(reversed(SEEDS)):
    huggify_data_fomc(TASK=TASK, SEED=SEED, SPLITS=SPLITS)

Pushing dataset shards to the dataset hub:   0%|          | 0/1 [00:00<?, ?it/s]

Pushing dataset shards to the dataset hub:   0%|          | 0/1 [00:00<?, ?it/s]

Pushing dataset shards to the dataset hub:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/2 [00:00<?, ?ba/s]

Pushing dataset shards to the dataset hub:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Downloading metadata:   0%|          | 0.00/754 [00:00<?, ?B/s]

Pushing dataset shards to the dataset hub:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/2 [00:00<?, ?ba/s]

Pushing dataset shards to the dataset hub:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Downloading metadata:   0%|          | 0.00/1.31k [00:00<?, ?B/s]

## Financial PhraseBank

In [50]:
def make_fpb_hub_datasets(seed=None):
    configs = [
        "sentences_allagree",
        "sentences_75agree",
        "sentences_66agree",
        "sentences_50agree",
    ]

    for config in tqdm(configs, desc="Configs"):
        try:
            fpb_dataset = load_dataset(
                "financial_phrasebank", config, trust_remote_code=True
            )
            config_short = config.replace("sentences_", "")

            texts = fpb_dataset["train"]["sentence"]
            labels = fpb_dataset["train"]["label"]

            splits = {}

            # Splitting the data
            train_texts, test_texts, train_labels, test_labels = train_test_split(
                texts, labels, test_size=0.2, random_state=seed
            )

            # Storing in the dictionary
            splits[seed] = {
                "train": Dataset.from_dict(
                    {
                        "context": train_texts,
                        "response": list(map(decode, train_labels)),
                    }
                ),
                "test": Dataset.from_dict(
                    {"context": test_texts, "response": list(map(decode, test_labels))}
                ),
            }

            # Push to HF Hub
            splits[seed]["train"].push_to_hub(
                f"{ORGANIZATION}/{DATASET}-{config_short}-{seed}",
                config_name="train",
                private=True,
            )
            splits[seed]["test"].push_to_hub(
                f"{ORGANIZATION}/{DATASET}-{config_short}-{seed}",
                config_name="test",
                private=True,
            )

            return splits
        except Exception as e:
            print(f"Error processing config {config}: {str(e)}")

In [51]:
# Execute the function
splits = make_fpb_hub_datasets(SEED)

Configs:   0%|          | 0/4 [00:00<?, ?it/s]

Pushing dataset shards to the dataset hub:   0%|          | 0/1 [00:00<?, ?it/s]

Downloading metadata:   0%|          | 0.00/803 [00:00<?, ?B/s]

Pushing dataset shards to the dataset hub:   0%|          | 0/1 [00:00<?, ?it/s]