In [41]:
import datasets
import os
import json

In [42]:
# Directory holding the raw headline/label data. The DATA_PATH environment
# variable overrides it; the default is the original location.
data_path = os.environ.get("DATA_PATH", '/home/ubuntu/nemo_customiser_k8_utils/data')

# Without `split=`, load_dataset returns a DatasetDict, which has no
# `train_test_split`. Asking for the 'train' split yields a single Dataset,
# so the later split cell works after Restart & Run All.
dataset = datasets.load_dataset(data_path, split="train")

In [45]:
# Name of the column used for stratified splitting.
stratify_column_name = "label"

# Re-encode that column as a ClassLabel feature (integer class ids plus a
# name table); stratified splitting in `datasets` requires a ClassLabel column.
dataset = dataset.class_encode_column(column=stratify_column_name)

Casting to class labels: 100%|██████████| 153511/153511 [00:00<00:00, 646521.89 examples/s]


In [48]:
# Display the dataset summary (feature names and row count).
dataset

Dataset({
    features: ['headline', 'label'],
    num_rows: 153511
})

In [46]:
# Prompt template for each training example; `{headline}` is filled per row.
# The event-type list must match the label values present in the data
# (14 classes, including "labour issues" and "no event"); otherwise the model
# is told to choose from a list that does not contain the expected answer.
DATASET_PROMPT_FORMAT = """Given the following headline:
### START HEADLINE ###

{headline}

### END HEADLINE ###

What event type best classifies it? Choose from the following list:

-analyst rating
-price targets
-earnings
-labour issues
-mergers and acquisitions
-dividends
-regulatory
-stock price movement
-credit ratings
-products-services
-product approval
-guidance
-no event
-other

Provide only the event type putting it inside double square brackets and in a new line like:
[[label]]

### START EVENT OUTPUT ###

"""

In [49]:
# Stratified split into training / validation / test (0.80 / 0.15 / remainder).
train_ratio = 0.8
val_ratio = 0.15
seed = 42
test_ratio = 1 - train_ratio - val_ratio
save_splits = {}

# A DatasetDict has no `train_test_split`, so unwrap its 'train' split first.
# Binding the result to a new name (instead of reassigning `dataset`) keeps
# this cell safe to re-run.
full_dataset = dataset['train'] if isinstance(dataset, datasets.DatasetDict) else dataset

# Step 1: carve off the training set; the held-out part is validation + test.
split_dataset = full_dataset.train_test_split(
    test_size=val_ratio + test_ratio, seed=seed, stratify_by_column=stratify_column_name)

# Step 2: divide the held-out part so that test gets its share of it.
split_dataset2 = split_dataset['test'].train_test_split(
    test_size=test_ratio / (val_ratio + test_ratio), seed=seed,
    stratify_by_column=stratify_column_name)

save_splits['training'] = split_dataset['train']
save_splits['validation'] = split_dataset2['train']
save_splits['test'] = split_dataset2['test']

# The three splits must partition the input: no rows lost or duplicated.
assert sum(len(part) for part in save_splits.values()) == len(full_dataset)

print("len training: ", len(save_splits['training']))
print("len validation: ", len(save_splits['validation']))
print("len test: ", len(save_splits['test']))

len training:  122808
len validation:  23027
len test:  7676


In [50]:
# Write each split as a JSONL file of {"prompt": ..., "completion": ...} records.
dataset_root = "data"
os.makedirs(dataset_root, exist_ok=True)

# The loop variable is deliberately NOT called `dataset`: reusing that name
# would overwrite the notebook-level dataset and break re-runs of earlier cells.
for split_name, split_rows in save_splits.items():
    output_file = os.path.join(dataset_root, f"{split_name}.jsonl")
    print(f"Processing {split_name} to {output_file}")

    # After class encoding, the label column stores integer class ids; the
    # feature object carries the id -> name mapping.
    label_feature = split_rows.features["label"]

    with open(output_file, "w", encoding="utf-8") as f:
        for example in split_rows:
            if DATASET_PROMPT_FORMAT != "":
                _input = DATASET_PROMPT_FORMAT.format(headline=example["headline"])
            else:
                # No template configured: use the raw headline as the prompt.
                _input = example["headline"]

            _output = example["label"]
            # Decode class ids to label names so the completion is the label
            # text rather than a bare integer.
            if isinstance(_output, int) and hasattr(label_feature, "int2str"):
                _output = label_feature.int2str(_output)
            # NOTE(review): the prompt asks for the answer as `[[label]]`;
            # confirm whether the completion should be wrapped the same way.

            f.write(json.dumps({"prompt": _input, "completion": _output}) + "\n")

    print(f"{split_name} split saved to {output_file}")

Processing training to data/training.jsonl
training split saved to data/training.jsonl
Processing validation to data/validation.jsonl
validation split saved to data/validation.jsonl
Processing test to data/test.jsonl
test split saved to data/test.jsonl


In [35]:
import pandas as pd

In [37]:
# Read the gzip-compressed labelled-headline CSV from the data directory.
labels_csv = f"{data_path}/nemotron_49b_headline_labels_legal_approved.csv.gz"
df = pd.read_csv(labels_csv, compression='gzip')

In [39]:
# Count rows per label value to inspect the class distribution.
df.groupby('label').size()

label
analyst rating              16945
credit ratings                806
dividends                    2192
earnings                    19989
guidance                     9921
labour issues                3640
mergers and acquisitions     9292
no event                     8753
other                       15890
price targets               19842
product approval             1795
products-services           12892
regulatory                   8829
stock price movement        22725
dtype: int64