In [1]:
from datasets import DatasetDict, Dataset, ClassLabel, load_dataset
import pandas as pd
import glob
from sklearn.model_selection import train_test_split
from src.utils import map_category


In [2]:
# Load every interim JSON-lines shard into one frame.
# glob.glob() returns paths in arbitrary, filesystem-dependent order, and the
# seeded train/val/test split further down depends on row order, so the shard
# list is sorted to keep the splits reproducible across machines and runs.
data_path = "data/interim/part-*.json"
json_files = sorted(glob.glob(data_path))
if not json_files:
    # Fail with an actionable message instead of pandas' generic
    # "No objects to concatenate" raised by pd.concat([]).
    raise FileNotFoundError(f"No interim shards match {data_path!r}")
stream_df = pd.concat([pd.read_json(file, lines=True) for file in json_files], ignore_index=True)

# label: map_category (from src.utils) applied to the paper's main_category.
# text: title and summary joined by a newline.
stream_df["label"] = stream_df["main_category"].apply(map_category)
stream_df["text"] = stream_df["title"] + "\n" + stream_df["summary"]
# Keep only the two columns used downstream.
stream_df = stream_df[["text", "label"]]


In [3]:
# Class distribution of the full stream dataset, before any splitting.
stream_df["label"].value_counts()


label
cs          4747
math        1913
cond-mat     795
physics      711
astro-ph     700
quant-ph     521
eess         485
stat         229
hep-ph       223
gr-qc        190
hep-th       154
q-bio         86
econ          79
nucl-th       72
hep-ex        50
nlin          47
math-ph       43
q-fin         43
nucl-ex       29
hep-lat       26
Name: count, dtype: int64

In [4]:
# 70 / 15 / 15 stratified split: first carve off 30% of the rows, then halve
# that hold-out into validation and test. Both steps stratify on the label
# column and share one fixed seed.
SPLIT_SEED = 42

train_df, temp_df = train_test_split(
    stream_df, test_size=0.3, stratify=stream_df["label"], random_state=SPLIT_SEED
)
val_df, test_df = train_test_split(
    temp_df, test_size=0.5, stratify=temp_df["label"], random_state=SPLIT_SEED
)

# Class counts in the training portion.
train_df["label"].value_counts()

label
cs          3323
math        1339
cond-mat     557
physics      498
astro-ph     490
quant-ph     365
eess         340
stat         160
hep-ph       156
gr-qc        133
hep-th       108
q-bio         60
econ          55
nucl-th       50
hep-ex        35
nlin          33
q-fin         30
math-ph       30
nucl-ex       20
hep-lat       18
Name: count, dtype: int64

In [5]:
# Fetch the auxiliary corpus and switch it to pandas output format so that
# slicing the "train" split yields a DataFrame.
aux_data = load_dataset("real-jiakai/arxiver-with-category")
aux_data.set_format(type="pandas")

# Build the same two-column (text, label) layout used for the stream data:
# text is title + newline + abstract, label is map_category(primary_category).
aux_raw_df = aux_data["train"][:]
aux_df = pd.DataFrame({
    "text": aux_raw_df["title"] + "\n" + aux_raw_df["abstract"],
    "label": aux_raw_df["primary_category"].apply(map_category),
})


In [6]:
# Class distribution of the auxiliary dataset (rows available for augmentation).
aux_df["label"].value_counts()


label
cs          26733
math         9611
cond-mat     4660
astro-ph     4453
physics      4163
quant-ph     2930
eess         2839
hep-ph       1642
stat         1546
gr-qc        1187
hep-th        918
q-bio         677
nucl-th       412
math-ph       365
econ          288
nlin          262
q-fin         231
hep-ex        203
hep-lat       137
nucl-ex       100
Name: count, dtype: int64

In [7]:
# Decide how many auxiliary rows to add per class so that every class in the
# training split is topped up towards TARGET_CLASS_SIZE, limited by how many
# auxiliary rows exist for that class.
TARGET_CLASS_SIZE = 500

train_counts = train_df["label"].value_counts()
aux_counts = aux_df["label"].value_counts()

samples_to_take = {}
for label in train_counts.index.union(aux_counts.index):
    # A label missing from either side counts as zero rows there.
    train_count = train_counts.get(label, 0)
    aux_count = aux_counts.get(label, 0)

    # Gap to the target; negative when the class already exceeds it.
    shortfall = TARGET_CLASS_SIZE - train_count
    # Never take more than the auxiliary set offers, and never a negative amount.
    samples_to_take[label] = max(min(shortfall, aux_count), 0)


In [8]:
# Inspect the per-class number of auxiliary rows that will be sampled.
samples_to_take

{'astro-ph': np.int64(10),
 'cond-mat': 0,
 'cs': 0,
 'econ': np.int64(288),
 'eess': np.int64(160),
 'gr-qc': np.int64(367),
 'hep-ex': np.int64(203),
 'hep-lat': np.int64(137),
 'hep-ph': np.int64(344),
 'hep-th': np.int64(392),
 'math': 0,
 'math-ph': np.int64(365),
 'nlin': np.int64(262),
 'nucl-ex': np.int64(100),
 'nucl-th': np.int64(412),
 'physics': np.int64(2),
 'q-bio': np.int64(440),
 'q-fin': np.int64(231),
 'quant-ph': np.int64(135),
 'stat': np.int64(340)}

In [9]:
# Top up the training split with auxiliary rows, class by class.
#
# Guards on top of plain sampling:
#   * Auxiliary rows whose text also occurs in the validation or test split
#     are excluded first, so held-out examples cannot leak into training if
#     the auxiliary corpus overlaps with the stream data. (Exact-text match
#     only; with no overlap the sampled rows are the same as without the filter.)
#   * The per-class draw is capped at the rows that remain, because
#     DataFrame.sample(n=...) raises when n exceeds the population.
heldout_texts = set(val_df["text"]) | set(test_df["text"])
aux_pool = aux_df[~aux_df["text"].isin(heldout_texts)]

sampled_aux = []

for label, take_n in samples_to_take.items():
    class_subset = aux_pool[aux_pool["label"] == label]
    n_rows = min(int(take_n), len(class_subset))
    if n_rows > 0:
        # Fixed seed so the same auxiliary rows are drawn on every run.
        sampled_aux.append(class_subset.sample(n=n_rows, random_state=42))

# pd.concat() raises on an empty list, so fall back to an empty frame with the
# same columns when no class needs (or can receive) extra rows.
if sampled_aux:
    aug_aux_df = pd.concat(sampled_aux, ignore_index=True)
else:
    aug_aux_df = aux_pool.iloc[0:0]

aug_train_df = pd.concat([train_df, aug_aux_df], ignore_index=True)

# Class counts after augmentation.
aug_train_df["label"].value_counts()


label
cs          3323
math        1339
cond-mat     557
astro-ph     500
q-bio        500
eess         500
quant-ph     500
hep-ph       500
physics      500
hep-th       500
stat         500
gr-qc        500
nucl-th      462
math-ph      395
econ         343
nlin         295
q-fin        261
hep-ex       238
hep-lat      155
nucl-ex      120
Name: count, dtype: int64

In [10]:
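# NOTE: disabled earlier experiment, kept for reference only. It selected every aux row of the
# listed rare classes; the capped per-class sampling above appears to have replaced it.
# sub_aux_df = aux_df.loc[aux_df["label"].isin(["econ", "nlin", "nucl-ex", "q-bio","hep-lat", "hep-ex", "math-ph", "nucl-th","q-fin", "hep-th"])]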


In [11]:
# sub_aux_df["label"].value_counts()


In [12]:
# Disabled earlier variant: concatenate train_df with the hand-picked sub_aux_df.
# aug_train_df = pd.concat([train_df, sub_aux_df])
# Report the size and class distribution of the augmented training set.
print(f"Shape after augmentation: {aug_train_df.shape}")
aug_train_df["label"].value_counts()


Shape after augmentation: (11988, 2)


label
cs          3323
math        1339
cond-mat     557
astro-ph     500
q-bio        500
eess         500
quant-ph     500
hep-ph       500
physics      500
hep-th       500
stat         500
gr-qc        500
nucl-th      462
math-ph      395
econ         343
nlin         295
q-fin        261
hep-ex       238
hep-lat      155
nucl-ex      120
Name: count, dtype: int64

In [13]:
# Wrap each split in a Dataset, dropping the pandas index. Only the training
# split is augmented; validation and test come straight from the stream data.
split_frames = {"train": aug_train_df, "validation": val_df, "test": test_df}
aug_stream_data = DatasetDict({
    split_name: Dataset.from_pandas(frame, preserve_index=False)
    for split_name, frame in split_frames.items()
})

# Cast the string label column to a ClassLabel feature whose names are the
# sorted distinct labels of the augmented training split.
labels = sorted(aug_train_df["label"].unique())
class_label = ClassLabel(names=labels)
aug_stream_data = aug_stream_data.cast_column("label", class_label)

# Persist all three splits for downstream use.
aug_stream_data.save_to_disk("data/processed/aug_stream_data")


Casting the dataset:   0%|          | 0/11988 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/1671 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/1672 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/11988 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/1671 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/1672 [00:00<?, ? examples/s]