In [6]:
from datasets import DatasetDict, Dataset, ClassLabel, load_dataset
import pandas as pd
import numpy as np
import glob
from sklearn.model_selection import train_test_split
from src.utils import map_category


In [7]:
# Load every interim JSON-lines shard into a single DataFrame.
data_path = "data/interim/part-*.json"

# glob.glob returns matches in arbitrary, filesystem-dependent order. Sorting
# fixes the concatenated row order so the seeded train/validation/test split
# performed later in this notebook is reproducible across machines and runs.
json_files = sorted(glob.glob(data_path))
if not json_files:
    # Without this guard pd.concat([]) fails with the opaque
    # "No objects to concatenate" message.
    raise FileNotFoundError(f"No input files match {data_path!r}")

stream_df = pd.concat(
    [pd.read_json(file, lines=True) for file in json_files],
    ignore_index=True,
)

# Derive the classification target from the raw category via the project
# helper `map_category` (its exact mapping lives in src.utils).
stream_df["label"] = stream_df["main_category"].apply(map_category)
# Model input: title and summary joined by a newline.
# NOTE(review): a missing title or summary makes the whole text NaN here —
# confirm the interim data has no nulls in these columns.
stream_df["text"] = stream_df["title"] + "\n" + stream_df["summary"]
# Keep only the two columns consumed downstream.
stream_df = stream_df[["text", "label"]]


In [None]:
# Two-stage stratified split: 70% train, then the remaining 30% is halved
# into validation and test (15% each). Both stages stratify on "label" and
# use the same fixed seed.
train_df, temp_df = train_test_split(
    stream_df, test_size=0.3, stratify=stream_df["label"], random_state=42
)
val_df, test_df = train_test_split(
    temp_df, test_size=0.5, stratify=temp_df["label"], random_state=42
)

# Wrap each pandas split as a Hugging Face Dataset, dropping the pandas index.
split_frames = {"train": train_df, "validation": val_df, "test": test_df}
stream_data = DatasetDict(
    {
        split_name: Dataset.from_pandas(frame, preserve_index=False)
        for split_name, frame in split_frames.items()
    }
)

# Encode the label column as a ClassLabel whose names are the sorted unique
# labels observed in the full frame (so every split shares one id mapping).
labels = sorted(stream_df["label"].unique())
class_label = ClassLabel(names=labels)
stream_data = stream_data.cast_column("label", class_label)


In [5]:
from pprint import pprint

# Preview the first training example's text.
# Index the row first, then the column: `["text"][0]` would read the whole
# text column just to show one element, whereas `[0]["text"]` only decodes
# the single requested row. The displayed value is the same.
pprint(stream_data["train"][0]["text"])


('VLM-R1: A Stable and Generalizable R1-style Large Vision-Language Model\n'
 'Recently DeepSeek R1 has shown that reinforcement learning (RL) can\n'
 'substantially improve the reasoning capabilities of Large Language Models\n'
 '(LLMs) through a simple yet effective design. The core of R1 lies in its\n'
 'rule-based reward formulation, which leverages tasks with deterministic\n'
 'ground-truth answers to enable precise and stable reward computation. In '
 'the\n'
 'visual domain, we similarly observe that a wide range of visual '
 'understanding\n'
 'tasks are inherently equipped with well-defined ground-truth annotations. '
 'This\n'
 'property makes them naturally compatible with rule-based reward mechanisms.\n'
 'Motivated by this observation, we investigate the extension of R1-style\n'
 'reinforcement learning to Vision-Language Models (VLMs), aiming to enhance\n'
 'their visual reasoning capabilities. To this end, we develop VLM-R1, a\n'
 "dedicated framework designed to harness

In [None]:
# Persist the processed DatasetDict (all three splits) for later stages.
processed_dir = "data/processed/stream_data"
stream_data.save_to_disk(processed_dir)


In [3]:
# Per-class row counts of the validation split, shown as the cell's output.
val_label_counts = val_df["label"].value_counts()
val_label_counts


label
cs          818
math        331
cond-mat    137
physics     123
astro-ph    120
quant-ph     90
eess         84
hep          79
stat         41
gr-qc        32
nucl         18
q-bio        15
econ         14
math-ph       8
nlin          8
q-fin         7
Name: count, dtype: int64