In [1]:
import os

import pandas as pd
from datasets import load_dataset
from huggingface_hub import HfFolder, login

In [None]:
# Authenticate with the Hugging Face Hub using a token stored in a local
# file, so the secret itself never appears in the notebook.
with open(".hf_token", 'r') as token:
    # Strip surrounding whitespace: files saved from an editor usually end in
    # a newline, and that newline is not part of the token.
    login(token.read().strip())

In [3]:
# Identifier of the source dataset that is passed to `load_dataset` below.
dataset_id = "gtfintechlab/fomc_communication"
# NOTE(review): not referenced by any cell visible here -- presumably the
# project's own Hub repository for the processed data; confirm before relying on it.
repository_id = "TextCEsInFinance/fomc_dataset"

In [None]:
# Fetch the source dataset identified by `dataset_id`.
dataset = load_dataset(dataset_id, encoding='utf-8')

# Convert the train and test splits to pandas DataFrames for the
# preprocessing steps that follow.
train_dataset, test_dataset = (
    dataset[split_name].to_pandas() for split_name in ('train', 'test')
)

In [5]:
# Only these columns are carried forward; any other column is dropped.
columns = ['sentence', 'year', 'label']

train_dataset, test_dataset = (
    train_dataset.loc[:, columns],
    test_dataset.loc[:, columns],
)

In [None]:
# Class metadata, needed so that class names can be output directly when
# using the pipeline instead of mapping generic labels afterwards:
#   LABEL_0 -> dovish
#   LABEL_1 -> hawkish
#   LABEL_2 -> neutral

# Hardcoded because the names are not included in the repo.
# The position in the list is the numeric label id.
class_names = ['dovish', 'hawkish', 'neutral']
num_labels = len(class_names)
print(f"number of labels: {num_labels}")
print(f"the labels: {class_names}")

# Lookup table from numeric label id to class name.
id2label = dict(enumerate(class_names))

In [7]:
# For each split: add a `text_label` column holding the class name looked up
# from the numeric `label`, then rename the `sentence` column to `text`.
train_dataset = (
    train_dataset
    .assign(text_label=lambda frame: frame['label'].map(lambda label_id: id2label[label_id]))
    .rename(columns={"sentence": "text"})
)
test_dataset = (
    test_dataset
    .assign(text_label=lambda frame: frame['label'].map(lambda label_id: id2label[label_id]))
    .rename(columns={"sentence": "text"})
)

In [None]:
# Within each split keep one row per sentence; when a sentence repeats, the
# last occurrence is the one retained.
train_dataset = train_dataset.drop_duplicates(subset=['text'], keep='last')
test_dataset = test_dataset.drop_duplicates(subset=['text'], keep='last')

# Sentences that appear in both the train and the test split.
duplicates = pd.merge(train_dataset, test_dataset, how='inner', on=['text'])['text']

# Remove those shared sentences from the train split only; the test split is
# left untouched.
train_dataset = train_dataset.loc[~train_dataset['text'].isin(duplicates)]

# Sanity checks: no sentence is shared between the splits, and every sentence
# is unique within its own split.
assert pd.merge(train_dataset, test_dataset, how='inner', on=['text']).empty
assert train_dataset['text'].is_unique
assert test_dataset['text'].is_unique
print(f"Train: {len(train_dataset)}, Test: {len(test_dataset)}")

In [9]:
# Persist the cleaned splits as CSV files, writing the DataFrame index as a
# column named `index`.
# The output folder is created first: on a fresh checkout it may not exist,
# and `to_csv` does not create missing parent directories.
os.makedirs('./data/fomc', exist_ok=True)
# The encoding is spelled out so these files match the other CSV exports in
# this notebook.
train_dataset.to_csv('./data/fomc/fomc_train_preprocessed.csv', index_label='index', encoding='utf-8')
test_dataset.to_csv('./data/fomc/fomc_test_preprocessed.csv', index_label='index', encoding='utf-8')

In [10]:
# Prepare the groups that receive counterfactual targets: partition the test
# split by class name.
test_hawkish, test_dovish, test_neutral = (
    test_dataset[test_dataset['text_label'] == name]
    for name in ('hawkish', 'dovish', 'neutral')
)

# Midpoint of the neutral group, rounded down.
split = len(test_neutral) // 2

# The first half of the neutral sentences will be transformed into dovish,
# the remainder into hawkish.
test_neutral_to_dovish = test_neutral.iloc[:split]
test_neutral_to_hawkish = test_neutral.iloc[split:]

In [11]:
# Attach the target class id to every group, using the same ids as the label
# mapping (0 = dovish, 1 = hawkish, 2 = neutral).

# Hawkish and dovish sentences both get the neutral class as their target.
test_hawkish, test_dovish = (
    group.assign(target=2) for group in (test_hawkish, test_dovish)
)

# The two neutral halves get the class named in their variable.
test_neutral_to_dovish = test_neutral_to_dovish.assign(target=0)
test_neutral_to_hawkish = test_neutral_to_hawkish.assign(target=1)

In [14]:
# Recombine the four target-annotated groups into a single frame.
test_targets = pd.concat([test_hawkish, test_dovish, test_neutral_to_hawkish, test_neutral_to_dovish])
# Shuffle the rows and renumber them from 0. The seed is fixed so that
# re-running the notebook reproduces the same row order in the exported file.
test_targets = test_targets.sample(frac=1, random_state=42).reset_index(drop=True)

# Make sure the output folder exists before writing; `to_csv` does not create
# missing parent directories.
os.makedirs('./data/fomc', exist_ok=True)
test_targets.to_csv('./data/fomc/test_with_targets.csv', index_label='index', encoding='utf-8')

In [None]:
# For each value in 0..2, print it followed by the two other values in cyclic
# order. NOTE(review): presumably a scratch check of the alternative target
# class ids per label -- confirm whether this cell is still needed.
for i in range(3):
    print(i, *((i + offset) % 3 for offset in (1, 2)))

In [None]:
# Load the counterfactual dataset. This rebinds `dataset` and `test_dataset`,
# so from here on they no longer refer to the frames built in the cells above.
dataset = load_dataset(
    'TextCEsInFinance/fomc-communication-counterfactual',
    encoding='utf-8',
)

# Only the test split is used from this dataset.
test_dataset = dataset['test'].to_pandas()

In [None]:
# Show the row at positional index 340 of the loaded test split as the cell output.
test_dataset.iloc[340]

In [19]:
# Partition the reloaded test split by class name.
test_hawkish, test_dovish, test_neutral = (
    test_dataset[test_dataset['text_label'] == name]
    for name in ('hawkish', 'dovish', 'neutral')
)

In [None]:
# Row counts of the hawkish, dovish and neutral groups, in that order.
print(*map(len, (test_hawkish, test_dovish, test_neutral)))

In [32]:
# Halve the hawkish group at its midpoint (rounded down).
split = len(test_hawkish) // 2
# First half: no target is assigned here, so these rows keep the `target`
# value they were loaded with.
# NOTE(review): presumably that value is already 2, as the variable name
# suggests -- confirm against the loaded data.
test_hawkish_2 = test_hawkish.iloc[:split]
# Second half: target set to 0.
test_hawkish_0 = test_hawkish.iloc[split:].assign(target=0)

In [33]:
# Halve the dovish group at its midpoint (rounded down).
split = len(test_dovish) // 2
# First half: no target is assigned here, so these rows keep the `target`
# value they were loaded with.
# NOTE(review): presumably that value is already 2, as the variable name
# suggests -- confirm against the loaded data.
test_dovish_2 = test_dovish.iloc[:split]
# Second half: target set to 1.
test_dovish_1 = test_dovish.iloc[split:].assign(target=1)

In [None]:
# Stitch all groups back together and order the rows by the `index` column.
full = pd.concat(
    [test_neutral, test_hawkish_0, test_hawkish_2, test_dovish_1, test_dovish_2]
).sort_values('index')

# One subset per (label, target) combination; each variable is named
# <source class>_<target id>.
dovish_1 = full.loc[full['label'].eq(0) & full['target'].eq(1)]
dovish_2 = full.loc[full['label'].eq(0) & full['target'].eq(2)]
hawkish_0 = full.loc[full['label'].eq(1) & full['target'].eq(0)]
hawkish_2 = full.loc[full['label'].eq(1) & full['target'].eq(2)]
neutral_0 = full.loc[full['label'].eq(2) & full['target'].eq(0)]
neutral_1 = full.loc[full['label'].eq(2) & full['target'].eq(1)]

# Sizes of the six subsets, in the order they are defined above.
print(*map(len, (dovish_1, dovish_2, hawkish_0, hawkish_2, neutral_0, neutral_1)))

In [45]:
# Write the combined frame to `full.csv` in the working directory.
# No `index_label` is passed here, unlike the earlier CSV exports in this notebook.
full.to_csv('full.csv', encoding='utf-8')

In [None]:
# Display the combined frame as the cell output.
full