In [1]:
from datasets import DatasetDict, Dataset, ClassLabel, load_dataset
import pandas as pd
import glob
from sklearn.model_selection import train_test_split
from src.utils import map_category


In [2]:
data_path = "data/interim/part-*.json"
json_files = glob.glob(data_path)
stream_df = pd.concat([pd.read_json(file, lines=True) for file in json_files], ignore_index=True)

stream_df["label"] = stream_df["main_category"].apply(map_category)
stream_df["text"] = stream_df["title"] + "\n" + stream_df["summary"]
stream_df = stream_df[["text", "label"]]


In [3]:
stream_df["label"].value_counts()


label
cs          5458
math        2205
cond-mat     915
physics      822
astro-ph     802
quant-ph     599
eess         565
hep          527
stat         271
gr-qc        213
nucl         116
q-bio         98
econ          93
nlin          54
math-ph       52
q-fin         46
Name: count, dtype: int64

In [4]:
train_df, temp_df = train_test_split(
    stream_df, 
    test_size=0.3,
    stratify=stream_df["label"],
    random_state=42
)

val_df, test_df = train_test_split(
    temp_df,
    test_size=0.5,
    stratify=temp_df["label"],
    random_state=42
)

train_df["label"].value_counts()


label
cs          3821
math        1543
cond-mat     641
physics      575
astro-ph     561
quant-ph     419
eess         396
hep          369
stat         190
gr-qc        149
nucl          81
q-bio         69
econ          65
nlin          38
math-ph       36
q-fin         32
Name: count, dtype: int64

In [5]:
aux_data = load_dataset("real-jiakai/arxiver-with-category")
aux_data


DatasetDict({
    train: Dataset({
        features: ['id', 'title', 'abstract', 'authors', 'published_date', 'link', 'markdown', 'primary_category', 'categories'],
        num_rows: 63357
    })
})

In [5]:
aux_data = load_dataset("real-jiakai/arxiver-with-category")

aux_data.set_format(type="pandas")
aux_df = aux_data["train"][:]
aux_df["label"] = aux_df["primary_category"].apply(map_category)
aux_df["title"] = aux_df["title"].str.replace("\n  ", " ")
aux_df["text"] = aux_df["title"] + "\n" + aux_df["abstract"]
aux_df = aux_df[["text", "label"]]


README.md:   0%|          | 0.00/2.54k [00:00<?, ?B/s]

papers.parquet:   0%|          | 0.00/1.44G [00:00<?, ?B/s]

Generating train split:   0%|          | 0/63357 [00:00<?, ? examples/s]

In [6]:
aux_df["label"].value_counts()


label
cs          26733
math         9611
cond-mat     4660
astro-ph     4453
physics      4163
quant-ph     2930
hep          2900
eess         2839
stat         1546
gr-qc        1187
q-bio         677
nucl          512
math-ph       365
econ          288
nlin          262
q-fin         231
Name: count, dtype: int64

In [7]:
all_train_df = pd.concat([train_df, aux_df], ignore_index=True)

all_train_df["label"].value_counts()


label
cs          30554
math        11154
cond-mat     5301
astro-ph     5014
physics      4738
quant-ph     3349
hep          3269
eess         3235
stat         1736
gr-qc        1336
q-bio         746
nucl          593
math-ph       401
econ          353
nlin          300
q-fin         263
Name: count, dtype: int64

In [8]:
# aug_train_df = pd.concat([train_df, sub_aux_df])
print(f"Shape after augmentation: {all_train_df.shape}")
all_train_df["label"].value_counts()


Shape after augmentation: (72342, 2)


label
cs          30554
math        11154
cond-mat     5301
astro-ph     5014
physics      4738
quant-ph     3349
hep          3269
eess         3235
stat         1736
gr-qc        1336
q-bio         746
nucl          593
math-ph       401
econ          353
nlin          300
q-fin         263
Name: count, dtype: int64

In [9]:
all_stream_data = DatasetDict({
    "train": Dataset.from_pandas(all_train_df, preserve_index=False),
    "validation": Dataset.from_pandas(val_df, preserve_index=False),
    "test": Dataset.from_pandas(test_df, preserve_index=False)
})

labels = sorted(all_train_df["label"].unique())
class_label = ClassLabel(names=labels)

all_stream_data = all_stream_data.cast_column("label", class_label)

all_stream_data.save_to_disk("data/processed/all_stream_data")


Casting the dataset:   0%|          | 0/72342 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/1925 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/1926 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/72342 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/1925 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/1926 [00:00<?, ? examples/s]