In [1]:
from datasets import DatasetDict, Dataset, ClassLabel, load_dataset
import pandas as pd
import numpy as np
import glob
from sklearn.model_selection import train_test_split
from src.utils import map_category


In [2]:
# Load every streamed JSON-lines shard into one frame with `text` and `label` columns.
data_path = "data/interim/part-*.json"

# glob.glob yields matches in an unspecified, filesystem-dependent order. Sorting
# fixes the row order of stream_df, which the seeded train/val/test split
# further down relies on to be reproducible across machines and runs.
json_files = sorted(glob.glob(data_path))
if not json_files:
    # Fail with the offending pattern instead of pandas' generic
    # "No objects to concatenate" error.
    raise FileNotFoundError(f"No input files match pattern {data_path!r}")

stream_df = pd.concat(
    [pd.read_json(file, lines=True) for file in json_files],
    ignore_index=True,
)

# `label` is the mapped form of `main_category`; `text` is title and summary
# joined by a newline.
stream_df["label"] = stream_df["main_category"].apply(map_category)
stream_df["text"] = stream_df["title"] + "\n" + stream_df["summary"]
stream_df = stream_df[["text", "label"]]


In [3]:
# Show how many streamed rows fall under each mapped label (class balance check).
stream_df["label"].value_counts()


label
cs          4747
math        1913
cond-mat     795
physics      711
astro-ph     700
quant-ph     521
eess         485
stat         229
hep-ph       223
gr-qc        190
hep-th       154
q-bio         86
econ          79
nucl-th       72
hep-ex        50
nlin          47
math-ph       43
q-fin         43
nucl-ex       29
hep-lat       26
Name: count, dtype: int64

In [4]:
# Fetch the auxiliary dataset and have it hand back pandas objects when indexed.
aux_data = load_dataset("real-jiakai/arxiver-with-category")
aux_data.set_format(type="pandas")

# Materialise the whole train split, derive the same `text` / `label` columns
# used for the streamed data, and keep only those two.
aux_df = (
    aux_data["train"][:]
    .assign(
        label=lambda frame: frame["primary_category"].apply(map_category),
        text=lambda frame: frame["title"] + "\n" + frame["abstract"],
    )[["text", "label"]]
)


In [5]:
# Show how many auxiliary rows fall under each mapped label.
aux_df["label"].value_counts()


label
cs          26733
math         9611
cond-mat     4660
astro-ph     4453
physics      4163
quant-ph     2930
eess         2839
hep-ph       1642
stat         1546
gr-qc        1187
hep-th        918
q-bio         677
nucl-th       412
math-ph       365
econ          288
nlin          262
q-fin         231
hep-ex        203
hep-lat       137
nucl-ex       100
Name: count, dtype: int64

In [20]:
# Labels to top up from the auxiliary data: the nine classes that have fewer
# than 100 rows in the stream_df label counts displayed earlier.
minority_labels = [
    "econ",
    "nlin",
    "nucl-ex",
    "q-bio",
    "hep-lat",
    "hep-ex",
    "math-ph",
    "nucl-th",
    "q-fin",
]

# Keep only the auxiliary rows whose label is one of those classes.
is_minority = aux_df["label"].isin(minority_labels)
sub_aux_df = aux_df.loc[is_minority]

In [21]:
# Show how many rows per label were selected from the auxiliary data.
sub_aux_df["label"].value_counts()


label
q-bio      677
nucl-th    412
math-ph    365
econ       288
nlin       262
q-fin      231
hep-ex     203
hep-lat    137
nucl-ex    100
Name: count, dtype: int64

In [24]:
# Append the selected auxiliary rows to the streamed data.
# ignore_index=True gives the combined frame a fresh 0..n-1 index. Without it
# each input keeps its own index labels, so labels can repeat and any later
# label-based lookup (`.loc`) may return more than one row. This also matches
# how the streamed shards are concatenated when stream_df is built.
aug_stream_df = pd.concat([stream_df, sub_aux_df], ignore_index=True)
print(f"Shape after augmentation: {aug_stream_df.shape}")

# Class balance after augmentation.
aug_stream_df["label"].value_counts()


Shape after augmentation: (13818, 2)


label
cs          4747
math        1913
cond-mat     795
q-bio        763
physics      711
astro-ph     700
quant-ph     521
eess         485
nucl-th      484
math-ph      408
econ         367
nlin         309
q-fin        274
hep-ex       253
stat         229
hep-ph       223
gr-qc        190
hep-lat      163
hep-th       154
nucl-ex      129
Name: count, dtype: int64

In [25]:
# Stratified 80/10/10 split: hold out 20% of the rows, then cut that holdout in
# half for validation and test. Both cuts share one seed.
split_seed = 42

train_df, temp_df = train_test_split(
    aug_stream_df,
    test_size=0.2,
    stratify=aug_stream_df["label"],
    random_state=split_seed,
)
val_df, test_df = train_test_split(
    temp_df,
    test_size=0.5,
    stratify=temp_df["label"],
    random_state=split_seed,
)

# Label vocabulary is taken from the full augmented frame, in sorted order,
# so all three splits share the same ClassLabel encoding.
labels = sorted(aug_stream_df["label"].unique())
class_label = ClassLabel(names=labels)

# Build one Dataset per split (dropping the pandas index) and encode the
# string labels with the shared ClassLabel feature.
split_frames = {"train": train_df, "validation": val_df, "test": test_df}
aug_stream_data = DatasetDict(
    {
        split_name: Dataset.from_pandas(frame, preserve_index=False)
        for split_name, frame in split_frames.items()
    }
).cast_column("label", class_label)

aug_stream_data.save_to_disk("data/processed/aug_stream_data")


Casting the dataset:   0%|          | 0/11054 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/1382 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/1382 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/11054 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/1382 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/1382 [00:00<?, ? examples/s]