In [13]:
import random

import pandas as pd
from datasets import load_dataset

In [14]:
dataset = load_dataset("TimSchopf/arxiv_categories")

In [15]:
train_data = dataset["train"]
validation_data = dataset["validation"]
test_data = dataset["test"]

In [16]:
df_train = pd.DataFrame(train_data)
df_validation = pd.DataFrame(validation_data)
df_test = pd.DataFrame(test_data)

In [17]:
print(dataset)

DatasetDict({
    train: Dataset({
        features: ['id', 'title', 'abstract', 'categories', 'creation_date'],
        num_rows: 163168
    })
    validation: Dataset({
        features: ['id', 'title', 'abstract', 'categories', 'creation_date'],
        num_rows: 20396
    })
    test: Dataset({
        features: ['id', 'title', 'abstract', 'categories', 'creation_date'],
        num_rows: 20397
    })
})


In [18]:
print(df_train.head())

           id                                              title  \
0  2204.14117  A Comparative Study of Meter Detection Methods...   
1  2305.19887  The Markov chain embedding problem in a low ju...   
2   0910.5857  Chaotic Transport and Chronology of Complex As...   
3  1801.10207          FITing-Tree: A Data-aware Index Structure   
4   0803.0849    The Universal Cardinal Ordering of Fixed Points   

                                            abstract  \
0  In order to read meter values from a camera on...   
1  We consider the problem of finding the transit...   
2  We present a transport model that describes th...   
3  Index structures are one of the most important...   
4  We present the theorem which determines, by a ...   

                                 categories             creation_date  
0         [Computer Science Archive->cs.CV] 2022-04-24 13:59:57+00:00  
1            [Mathematics Archive->math.PR] 2023-05-31 14:24:25+00:00  
2  [Physics Archive->astro-ph->astro-p

In [19]:
# Function to filter rows based on category depth
def has_subsubcategory(category_list):
    """Returns True if any category in the list has at least 3 levels (two '->' separators)."""
    for category in category_list:
        if category.count("->") >= 2:
            return True
    return False


# Apply filtering
df_train_filtered = df_train[df_train["categories"].apply(has_subsubcategory)]
df_validation_filtered = df_validation[
    df_validation["categories"].apply(has_subsubcategory)
]
df_test_filtered = df_test[df_test["categories"].apply(has_subsubcategory)]

print(df_train_filtered["categories"].head(10))  # Check sample categories

2              [Physics Archive->astro-ph->astro-ph.EP]
4                      [Physics Archive->nlin->nlin.CD]
5              [Physics Archive->astro-ph->astro-ph.SR]
6           [Physics Archive->physics->physics.atom-ph]
8     [Physics Archive->astro-ph->astro-ph.GA, Physi...
10    [Physics Archive->astro-ph->astro-ph.GA, Physi...
12       [Physics Archive->cond-mat->cond-mat.mtrl-sci]
14    [Physics Archive->gr-qc, Physics Archive->hep-...
15    [Physics Archive->gr-qc, Physics Archive->hep-...
19    [Physics Archive->astro-ph->astro-ph.GA, Physi...
Name: categories, dtype: object


In [20]:
# Function to extract subsubcategory from category list
def extract_subsubcategory(categories):
    """Returns a list of subsubcategories (third level in category structure)."""
    subsubcategories = []
    for category in categories:
        parts = category.split("->")
        if len(parts) >= 3:  # Ensure there's a subsubcategory
            subsubcategories.append(parts[2])  # Take the third element
    return subsubcategories if subsubcategories else None


# Make an explicit copy to avoid SettingWithCopyWarning
df_train_filtered = df_train_filtered.copy()

# Apply function safely
df_train_filtered.loc[:, "subsubcategory"] = df_train_filtered["categories"].apply(
    extract_subsubcategory
)

# Remove rows where no subsubcategory was found
df_train_filtered = df_train_filtered.dropna(subset=["subsubcategory"])

# Explode in case there are multiple subsubcategories (optional, can be removed)
df_train_exploded = df_train_filtered.explode("subsubcategory")

# Select only required columns
df_train_exploded = df_train_exploded[["title", "abstract", "subsubcategory"]]


print(df_train_exploded)

                                                    title  \
2       Chaotic Transport and Chronology of Complex As...   
4         The Universal Cardinal Ordering of Fixed Points   
5       Mapping the hidden magnetic field of the quiet...   
6       Interaction of atomic quantum gases with a sin...   
8       The Rate and Spatial Distribution of Novae in ...   
...                                                   ...   
163161  Using a Differential Emission Measure and Dens...   
163163  Induced Chern-Simons modified gravity at finit...   
163165  GM2Calc: Precise MSSM prediction for $(g - 2)$...   
163167  Gravitationally Induced Particle Production th...   
163167  Gravitationally Induced Particle Production th...   

                                                 abstract   subsubcategory  
2       We present a transport model that describes th...      astro-ph.EP  
4       We present the theorem which determines, by a ...          nlin.CD  
5       The Sun is the only star whe

In [21]:
# Function to extract both subcategory and subsubcategory from the category list.
def extract_subcat_pair(categories):
    """
    Given a list of category strings (e.g. "Physics Archive->astro-ph->astro-ph.SR"),
    returns a list of tuples: (subcategory, subsubcategory) from each string.
    """
    pairs = []
    for category in categories:
        parts = category.split("->")
        if len(parts) >= 3:  # Only consider if there's a main, sub, and subsubcategory.
            # parts[1] is the subcategory, parts[2] is the subsubcategory.
            pairs.append((parts[1], parts[2]))
    return pairs if pairs else None


# Make an explicit copy to avoid SettingWithCopyWarning.
df_train_filtered = df_train_filtered.copy()

# Apply the function to extract (subcategory, subsubcategory) pairs.
df_train_filtered.loc[:, "subcat_pair"] = df_train_filtered["categories"].apply(
    extract_subcat_pair
)

# Remove rows where no subcat pair was found.
df_train_filtered = df_train_filtered.dropna(subset=["subcat_pair"])

# Explode the "subcat_pair" column so that each row gets a single (subcategory, subsubcategory) tuple.
df_train_exploded = df_train_filtered.explode("subcat_pair")

# Split the tuple into two new columns: "subcategory" and "subsubcategory".
df_train_exploded.loc[:, "subcategory"] = df_train_exploded["subcat_pair"].apply(
    lambda x: x[0]
)
df_train_exploded.loc[:, "subsubcategory"] = df_train_exploded["subcat_pair"].apply(
    lambda x: x[1]
)

# Select only the required columns.
df_train_exploded = df_train_exploded[
    ["title", "abstract", "subcategory", "subsubcategory"]
]

print(df_train_exploded)

                                                    title  \
2       Chaotic Transport and Chronology of Complex As...   
4         The Universal Cardinal Ordering of Fixed Points   
5       Mapping the hidden magnetic field of the quiet...   
6       Interaction of atomic quantum gases with a sin...   
8       The Rate and Spatial Distribution of Novae in ...   
...                                                   ...   
163161  Using a Differential Emission Measure and Dens...   
163163  Induced Chern-Simons modified gravity at finit...   
163165  GM2Calc: Precise MSSM prediction for $(g - 2)$...   
163167  Gravitationally Induced Particle Production th...   
163167  Gravitationally Induced Particle Production th...   

                                                 abstract subcategory  \
2       We present a transport model that describes th...    astro-ph   
4       We present the theorem which determines, by a ...        nlin   
5       The Sun is the only star where we can re

In [22]:
# Create a dictionary where keys are subsubcategories and values are lists of papers
subsubcat_dict = (
    df_train_exploded.groupby("subsubcategory")
    .apply(lambda x: x.to_dict(orient="records"))
    .to_dict()
)

# Step 1: Generate all positive pairs across all subsubcategories.
all_pairs = []  # List to hold all pairs, along with their subsubcategory if needed.
for subsubcat, papers in subsubcat_dict.items():
    # Only consider subsubcategories with at least 2 papers.
    if len(papers) < 2:
        continue
    # Create pairs from adjacent papers.
    pairs = list(zip(papers[:-1], papers[1:]))
    # Shuffle pairs to randomize order within the subsubcategory.
    random.shuffle(pairs)
    # Add these pairs to the overall list.
    all_pairs.extend(pairs)

# Check if there are at least 500 pairs available.
if len(all_pairs) < 500:
    raise ValueError("Not enough positive pairs available.")

# Step 2: Randomly sample 500 pairs from the complete list.
positive_pairs = random.sample(all_pairs, 500)

# Optionally, create a DataFrame to store the pairs.
df_positive_pairs = pd.DataFrame(
    [
        (
            p1["title"],
            p1["abstract"],
            p1["subcategory"],
            p1["subsubcategory"],
            p2["title"],
            p2["abstract"],
            p2["subcategory"],
            p2["subsubcategory"],
        )
        for p1, p2 in positive_pairs
    ],
    columns=[
        "title_1",
        "abstract_1",
        "subcategory_1",
        "subsubcategory_1",
        "title_2",
        "abstract_2",
        "subcategory_2",
        "subsubcategory_2",
    ],
)

print(df_positive_pairs)

  .apply(lambda x: x.to_dict(orient="records"))


                                               title_1  \
0    The $UBV$ Color Evolution of Classical Novae. ...   
1    Evolution along the sequence of S0 Hubble type...   
2    Three orbital model for the iron-based superco...   
3    On the radiation energy density in the jet of ...   
4    Search for high energy {\gamma}-ray emission f...   
..                                                 ...   
495  Metastable massive gravitons from an infinite ...   
496  Two-Moment Neutrino Flavor Transformation with...   
497  Compact Binaries in Star Clusters I - Black Ho...   
498  Dielectric properties and lattice dynamics of ...   
499  On the absolute value of the air-fluorescence ...   

                                            abstract_1 subcategory_1  \
0    Light curves and color evolutions of two class...      astro-ph   
1    Galaxy mergers are considered as questionable ...      astro-ph   
2    The theoretical need to study the properties o...      cond-mat   
3    We examine

In [23]:
# Build a dictionary mapping subcategory -> subsub_dict,
# where subsub_dict maps each subsubcategory to the list of paper records.
subcat_groups = {}
for subcat, group in df_train_exploded.groupby("subcategory"):
    subsub_dict = {}
    for subsub, subsub_group in group.groupby("subsubcategory"):
        subsub_dict[subsub] = subsub_group.to_dict(orient="records")
    # Only consider subcategories with at least two distinct subsubcategories.
    if len(subsub_dict) >= 2:
        subcat_groups[subcat] = subsub_dict

# Now, randomly sample negative pairs until we have 500.
negative_pairs = []
attempts = 0
max_attempts = 10000  # safeguard to avoid an infinite loop
while len(negative_pairs) < 500 and attempts < max_attempts:
    attempts += 1
    # Randomly choose a subcategory that has at least 2 subsubcategories.
    subcat = random.choice(list(subcat_groups.keys()))
    subsub_dict = subcat_groups[subcat]
    subsub_keys = list(subsub_dict.keys())
    # Randomly select two distinct subsubcategories from this subcategory.
    subsub1, subsub2 = random.sample(subsub_keys, 2)
    # Randomly pick one paper from each subsubcategory.
    paper1 = random.choice(subsub_dict[subsub1])
    paper2 = random.choice(subsub_dict[subsub2])
    negative_pairs.append((paper1, paper2))

if len(negative_pairs) < 500:
    raise ValueError("Could not sample 500 negative pairs after many attempts.")

# Convert the negative pairs into a DataFrame.
df_negative_pairs = pd.DataFrame(
    [
        (
            p1["title"],
            p1["abstract"],
            p1["subcategory"],
            p1["subsubcategory"],
            p2["title"],
            p2["abstract"],
            p2["subcategory"],
            p2["subsubcategory"],
        )
        for p1, p2 in negative_pairs
    ],
    columns=[
        "title_1",
        "abstract_1",
        "subcategory_1",
        "subsubcategory_1",
        "title_2",
        "abstract_2",
        "subcategory_2",
        "subsubcategory_2",
    ],
)

print(df_negative_pairs)

                                               title_1  \
0    The Ferromagnetic Potts model under an externa...   
1    Proper-time measurement in accelerated relativ...   
2    Searching for orbital decay in a heartbeat sta...   
3    Quantitative Method for the Optimal Subtractio...   
4    Koopman Operator and Phase Space Partition of ...   
..                                                 ...   
495  Decline in extinction rates and scale invarian...   
496  Scattering of Bunched Fractionally Charged Qua...   
497                         Forward Physics at the LHC   
498  Final-state interactions in two-nucleon knocko...   
499              Varying constants driven baryogenesis   

                                            abstract_1 subcategory_1  \
0    The q-state ferromagnetic Potts model under a ...      cond-mat   
1    Separate constituents of extended systems meas...       physics   
2    Theory suggests that the orbits of a large fra...      astro-ph   
3    We present

In [25]:
# For positive pairs DataFrame (e.g., df_positive_pairs)
df_positive_pairs["label"] = 1  # Mark as positive pairs
df_positive_pairs.to_csv("positive_pairs.csv", index=False)

# For negative pairs DataFrame (e.g., df_negative_pairs)
df_negative_pairs["label"] = 0  # Mark as negative pairs
df_negative_pairs.to_csv("negative_pairs.csv", index=False)