In [4]:
from datasets import DatasetDict, Dataset, ClassLabel, load_dataset
import pandas as pd
import numpy as np
import glob
from sklearn.model_selection import train_test_split
from src.utils import map_category


In [5]:
# Build stratified train/validation/test splits from the interim JSON-lines
# shards and persist them to disk as a Hugging Face DatasetDict.

SEED = 42  # single seed shared by both splits so the partition is reproducible

data_path = "data/interim/part-*.json"
# glob.glob yields matches in arbitrary (filesystem-dependent) order. Sorting
# fixes the row order of the concatenated frame, which the seeded shuffle in
# train_test_split depends on; without it the same seed can give different
# splits on different machines.
json_files = sorted(glob.glob(data_path))
if not json_files:
    # Fail with a clear message instead of pd.concat's "No objects to concatenate".
    raise FileNotFoundError(f"No input files match pattern {data_path!r}")

stream_df = pd.concat(
    [pd.read_json(file, lines=True) for file in json_files],
    ignore_index=True,
)

# map_category comes from src.utils; it turns each `main_category` value into
# the label used for classification (mapping not visible in this notebook).
stream_df["label"] = stream_df["main_category"].apply(map_category)
# NOTE(review): a row whose title or summary is missing ends up with a NaN
# `text`, because NaN propagates through string concatenation -- confirm the
# interim data has no missing values in these columns.
stream_df["text"] = stream_df["title"] + "\n" + stream_df["summary"]
stream_df = stream_df[["text", "label"]]

# 70% train; the remaining 30% is halved into validation and test (15% each).
# Both splits are stratified on the label so class proportions are preserved.
train_df, temp_df = train_test_split(
    stream_df,
    test_size=0.3,
    stratify=stream_df["label"],
    random_state=SEED,
)
val_df, test_df = train_test_split(
    temp_df,
    test_size=0.5,
    stratify=temp_df["label"],
    random_state=SEED,
)
# Every row must land in exactly one partition.
assert len(train_df) + len(val_df) + len(test_df) == len(stream_df)

stream_data = DatasetDict({
    "train": Dataset.from_pandas(train_df, preserve_index=False),
    "validation": Dataset.from_pandas(val_df, preserve_index=False),
    "test": Dataset.from_pandas(test_df, preserve_index=False),
})

# Sort the label names so the name -> integer id assignment of the ClassLabel
# feature does not depend on the order in which labels appear in the data.
labels = sorted(stream_df["label"].unique())
class_label = ClassLabel(names=labels)

# Convert the string `label` column of every split to the ClassLabel feature.
stream_data = stream_data.cast_column("label", class_label)

stream_data.save_to_disk("data/processed/stream_data")


Casting the dataset:   0%|          | 0/7800 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/1671 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/1672 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/7800 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/1671 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/1672 [00:00<?, ? examples/s]

In [6]:
# Number of validation examples per label, most frequent first.
val_label_counts = val_df["label"].value_counts()
val_label_counts


label
cs          712
math        287
cond-mat    119
physics     106
astro-ph    105
quant-ph     78
eess         72
hep          68
stat         34
gr-qc        29
nucl         15
q-bio        13
econ         12
math-ph       7
q-fin         7
nlin          7
Name: count, dtype: int64