In [1]:
from datasets import DatasetDict, Dataset, ClassLabel, load_dataset
import pandas as pd
import numpy as np
import glob
from sklearn.model_selection import train_test_split
from src.utils import map_category


In [2]:
# Load every interim JSON-lines shard into one frame and derive the model inputs.
data_path = "data/interim/part-*.json"

# glob.glob returns matches in arbitrary order; sorting keeps the row order
# (and therefore the seeded train/validation/test split) reproducible.
json_files = sorted(glob.glob(data_path))
if not json_files:
    # Fail with an actionable message rather than pandas' generic
    # "No objects to concatenate" error.
    raise FileNotFoundError(f"No input files match {data_path!r}")

stream_df = pd.concat(
    (pd.read_json(file, lines=True) for file in json_files),
    ignore_index=True,
)

# `label` is whatever map_category returns for each row's main_category.
stream_df["label"] = stream_df["main_category"].apply(map_category)
# `text` is the title and the summary joined by a single newline.
stream_df["text"] = stream_df["title"] + "\n" + stream_df["summary"]
# All original columns are kept alongside the derived `label` and `text`.


In [3]:
# Inspect the combined frame, including the derived `label` and `text` columns.
stream_df


Unnamed: 0,aid,title,summary,main_category,categories,published,label,text
0,http://arxiv.org/abs/2503.21663v1,DiPolMol-Py: A Python package for calculations...,"We present the python package DiPolMol-Py, whi...",physics.atom-ph,"physics.atom-ph,physics.comp-ph",2025-03-27T16:31:06Z,physics,DiPolMol-Py: A Python package for calculations...
1,http://arxiv.org/abs/2503.21708v1,Elementwise Layer Normalization,A recent paper proposed Dynamic Tanh (DyT) as ...,cs.LG,"cs.LG,cs.AI,cs.CL",2025-03-27T17:20:44Z,cs,Elementwise Layer Normalization\nA recent pape...
2,http://arxiv.org/abs/2503.21714v1,As easy as PIE: understanding when pruning cau...,Language Model (LM) pruning compresses the mod...,cs.CL,cs.CL,2025-03-27T17:26:32Z,cs,As easy as PIE: understanding when pruning cau...
3,http://arxiv.org/abs/2503.21723v1,OccRobNet : Occlusion Robust Network for Accur...,Occlusion is one of the challenging issues whe...,cs.CV,"cs.CV,cs.HC",2025-03-27T17:36:55Z,cs,OccRobNet : Occlusion Robust Network for Accur...
4,http://arxiv.org/abs/2503.21733v1,Fully dynamic biconnectivity in $\tilde{\mathc...,We present a deterministic fully-dynamic data ...,cs.DS,cs.DS,2025-03-27T17:47:18Z,cs,Fully dynamic biconnectivity in $\tilde{\mathc...
...,...,...,...,...,...,...,...,...
12831,http://arxiv.org/abs/2505.05392v1,Generalized chip firing and critical groups of...,Chip firing provides a way to study the sandpi...,math.CO,"math.CO,math.NT",2025-05-08T16:27:32Z,math,Generalized chip firing and critical groups of...
12832,http://arxiv.org/abs/2505.05433v1,Non-Markovianity in collision models with init...,Collision models (CMs) describe an open system...,quant-ph,quant-ph,2025-05-08T17:22:42Z,quant-ph,Non-Markovianity in collision models with init...
12833,http://arxiv.org/abs/2505.05439v1,Stabilization of Kac polynomials,We study the stabilization behavior of cohomol...,math.RT,"math.RT,math.AG",2025-05-08T17:30:18Z,math,Stabilization of Kac polynomials\nWe study the...
12834,http://arxiv.org/abs/2505.05444v1,The soft X-ray transient EP241021a: a cosmic e...,X-Ray Flashes (XRFs) are fast X-ray transients...,astro-ph.HE,astro-ph.HE,2025-05-08T17:35:31Z,astro-ph,The soft X-ray transient EP241021a: a cosmic e...


In [None]:
# Stratified 70/15/15 split: hold out 30% of the rows first, then cut that
# hold-out in half to obtain the validation and test frames.
train_df, temp_df = train_test_split(
    stream_df, test_size=0.3, stratify=stream_df["label"], random_state=42
)
val_df, test_df = train_test_split(
    temp_df, test_size=0.5, stratify=temp_df["label"], random_state=42
)

# Wrap each pandas split as a Dataset (dropping the pandas index) under its split name.
stream_data = DatasetDict({
    split_name: Dataset.from_pandas(split_frame, preserve_index=False)
    for split_name, split_frame in (
        ("train", train_df),
        ("validation", val_df),
        ("test", test_df),
    )
})

# Class names come from the full frame, in sorted order, so every split
# shares the same ClassLabel feature.
labels = sorted(stream_df["label"].unique())
class_label = ClassLabel(names=labels)
stream_data = stream_data.cast_column("label", class_label)


In [5]:
from pprint import pprint

# Show the first training example's text. Index the row before the column:
# this fetches a single example instead of selecting the entire "text"
# column just to read its first element.
pprint(stream_data["train"][0]["text"])


('VLM-R1: A Stable and Generalizable R1-style Large Vision-Language Model\n'
 'Recently DeepSeek R1 has shown that reinforcement learning (RL) can\n'
 'substantially improve the reasoning capabilities of Large Language Models\n'
 '(LLMs) through a simple yet effective design. The core of R1 lies in its\n'
 'rule-based reward formulation, which leverages tasks with deterministic\n'
 'ground-truth answers to enable precise and stable reward computation. In '
 'the\n'
 'visual domain, we similarly observe that a wide range of visual '
 'understanding\n'
 'tasks are inherently equipped with well-defined ground-truth annotations. '
 'This\n'
 'property makes them naturally compatible with rule-based reward mechanisms.\n'
 'Motivated by this observation, we investigate the extension of R1-style\n'
 'reinforcement learning to Vision-Language Models (VLMs), aiming to enhance\n'
 'their visual reasoning capabilities. To this end, we develop VLM-R1, a\n'
 "dedicated framework designed to harness

In [None]:
# Persist the DatasetDict (train / validation / test splits) to disk.
stream_data.save_to_disk("data/processed/stream_data")


In [3]:
# Per-class row counts in the validation frame, to check the stratified split.
val_df["label"].value_counts()


label
cs          818
math        331
cond-mat    137
physics     123
astro-ph    120
quant-ph     90
eess         84
hep          79
stat         41
gr-qc        32
nucl         18
q-bio        15
econ         14
math-ph       8
nlin          8
q-fin         7
Name: count, dtype: int64