# Scrape arXiv

In [7]:
import os
from datetime import date, datetime, timedelta

import pandas as pd
import requests
from bs4 import BeautifulSoup


def scrape_arxiv_day(date_str, subject="cs.AI"):
    """
    Scrape the arXiv 'catchup' listing for a given date and subject.

    Args:
        date_str: Date text inserted verbatim into the catchup URL.
        subject: arXiv subject code inserted into the URL path.

    Returns:
        A pandas DataFrame with the columns arxiv_id, title, authors,
        abstract and link, one row per listed paper. The columns are present
        even when no paper was found.

    Raises:
        requests.HTTPError: If the server answers with an error status.
        requests.Timeout: If the server does not answer within the timeout.
    """
    columns = ["arxiv_id", "title", "authors", "abstract", "link"]
    title_prefix = "Title:"

    url = f"https://arxiv.org/catchup/{subject}/{date_str}?abs=True"
    print(f"Fetching: {url}")
    # A timeout keeps a stalled connection from hanging a multi-day scrape forever.
    resp = requests.get(url, timeout=30)
    resp.raise_for_status()

    soup = BeautifulSoup(resp.text, "html.parser")
    articles = []

    # Each paper entry is a <dt> (with link) followed by its <dd> (with metadata).
    # Pair every <dt> with its own next sibling instead of zipping two independent
    # lists, so one malformed entry cannot shift the metadata of all later papers.
    for dt in soup.find_all("dt"):
        # Extract arXiv ID and construct full abstract link
        abs_tag = dt.find("a", title="Abstract")
        if not abs_tag:
            continue
        dd = dt.find_next_sibling(True)  # True matches the next tag, skipping text nodes
        if dd is None or dd.name != "dd":
            continue
        arxiv_id = abs_tag.text.strip()
        abs_link = "https://arxiv.org" + abs_tag["href"]

        # Extract title; drop only the leading "Title:" label so that the same
        # text occurring inside the real title is preserved.
        title_tag = dd.find("div", class_="list-title")
        title = None
        if title_tag:
            title = title_tag.get_text(strip=True)
            if title.startswith(title_prefix):
                title = title[len(title_prefix):]
            title = title.strip()

        # Extract authors: each author is an <a> tag inside the div
        authors_tag = dd.find("div", class_="list-authors")
        authors = None
        if authors_tag:
            authors = ", ".join(a.get_text(strip=True) for a in authors_tag.find_all("a"))

        # Extract abstract
        abs_paragraph = dd.find("p", class_="mathjax")
        abstract = abs_paragraph.get_text(strip=True) if abs_paragraph else None

        articles.append({
            "arxiv_id": arxiv_id,
            "title": title,
            "authors": authors,
            "abstract": abstract,
            "link": abs_link
        })

    # Passing the column list keeps the schema stable on days without entries.
    return pd.DataFrame(articles, columns=columns)

def scrape_arxiv_range(start_date: date, end_date: date, skip_dates=None, subject="cs.AI"):
    """
    Scrape every day of an inclusive date range and stack the results.

    Args:
        start_date: First day to scrape.
        end_date: Last day to scrape (inclusive).
        skip_dates: Optional iterable of dates that must not be fetched.
        subject: arXiv subject code forwarded to scrape_arxiv_day.

    Returns:
        A DataFrame holding the rows of all scraped days plus a 'date' column
        containing str(day). If no day is scraped (empty range or every day
        skipped), an empty DataFrame with the expected columns is returned.
    """
    skipped = set(skip_dates) if skip_dates else set()
    span_days = (end_date - start_date).days

    frames = []
    for offset in range(span_days + 1):
        day = start_date + timedelta(days=offset)
        if day in skipped:
            continue
        day_df = scrape_arxiv_day(str(day), subject=subject)
        day_df['date'] = str(day)
        frames.append(day_df)

    if not frames:
        # pd.concat raises ValueError on an empty list; return an empty frame instead.
        return pd.DataFrame(columns=["arxiv_id", "title", "authors", "abstract", "link", "date"])
    return pd.concat(frames).reset_index(drop=True)

def scrape_arxiv_recent(n_days=3, df=None, subject="cs.AI"):
    """
    Scrape the last n_days days of listings, ending today.

    Args:
        n_days: Number of days to cover, counting today.
        df: Optional DataFrame of previously scraped papers with a 'date'
            column. Every date found there is skipped except the latest one,
            which is fetched again.
        subject: arXiv subject code forwarded to scrape_arxiv_range.

    Returns:
        The DataFrame produced by scrape_arxiv_range for the computed window.
    """
    # Read the clock once so both ends of the window refer to the same day.
    end_date = datetime.now().date()
    start_date = end_date - timedelta(days=n_days - 1)

    if df is None or len(df) == 0:
        skip_dates = []
    else:
        # Sort before dropping the last element: unique() follows row order, so
        # without sorting the "latest" date could be an arbitrary one.
        known_dates = sorted({ts.date() for ts in pd.to_datetime(df['date'].dropna().unique())})
        skip_dates = known_dates[:-1]

    return scrape_arxiv_range(start_date, end_date, skip_dates=skip_dates, subject=subject)

def update_paper_data(df_path, n_days=3, subject="cs.AI"):
    """
    Merge freshly scraped listings into the pickle stored at df_path.

    The stored frame is loaded if the file exists (otherwise an empty frame is
    used), the recent days are scraped, both frames are concatenated and
    de-duplicated on 'arxiv_id' keeping the last occurrence, and the result is
    written back to df_path.
    """
    # NOTE: pickle files can execute code on load; only read files you created.
    stored = pd.read_pickle(df_path) if os.path.exists(df_path) else pd.DataFrame()
    fresh = scrape_arxiv_recent(n_days=n_days, df=stored, subject=subject)
    merged = pd.concat([stored, fresh])
    merged = merged.drop_duplicates(subset='arxiv_id', keep='last')
    merged.to_pickle(df_path)

In [8]:
# Refresh 'paper_data.pkl' with the last 90 days of listings (default subject "cs.AI").
update_paper_data('paper_data.pkl', n_days=90)

Fetching: https://arxiv.org/catchup/cs.AI/2025-09-19?abs=True
Fetching: https://arxiv.org/catchup/cs.AI/2025-09-20?abs=True
Fetching: https://arxiv.org/catchup/cs.AI/2025-09-21?abs=True
Fetching: https://arxiv.org/catchup/cs.AI/2025-09-22?abs=True
Fetching: https://arxiv.org/catchup/cs.AI/2025-09-23?abs=True
Fetching: https://arxiv.org/catchup/cs.AI/2025-09-24?abs=True
Fetching: https://arxiv.org/catchup/cs.AI/2025-09-25?abs=True
Fetching: https://arxiv.org/catchup/cs.AI/2025-09-26?abs=True
Fetching: https://arxiv.org/catchup/cs.AI/2025-09-27?abs=True
Fetching: https://arxiv.org/catchup/cs.AI/2025-09-28?abs=True
Fetching: https://arxiv.org/catchup/cs.AI/2025-09-29?abs=True
Fetching: https://arxiv.org/catchup/cs.AI/2025-09-30?abs=True
Fetching: https://arxiv.org/catchup/cs.AI/2025-10-01?abs=True
Fetching: https://arxiv.org/catchup/cs.AI/2025-10-02?abs=True
Fetching: https://arxiv.org/catchup/cs.AI/2025-10-03?abs=True
Fetching: https://arxiv.org/catchup/cs.AI/2025-10-04?abs=True
Fetching

# Embed the Abstracts and compare against a reference

In [1]:
import torch, os, shutil
import torch.nn.functional as F
from torch import Tensor
from transformers import AutoTokenizer, AutoModel
import pandas as pd
from tqdm.notebook import tqdm
from pathlib import Path


# Load the tokenizer and model for 'Qwen/Qwen3-Embedding-0.6B'. Both are used as
# module-level objects by the cells below.
# padding_side='left' is passed so padded batches end on a real token, which is
# the layout last_token_pool checks for first.
tokenizer = AutoTokenizer.from_pretrained('Qwen/Qwen3-Embedding-0.6B', padding_side='left')
model = AutoModel.from_pretrained('Qwen/Qwen3-Embedding-0.6B')
# Optional: uncomment to move the model to a CUDA device.
# model = model.to('cuda')

def last_token_pool(last_hidden_states: Tensor, attention_mask: Tensor) -> Tensor:
    """
    Pick, for every row of the batch, the hidden state of its last attended token.

    If the final mask column is 1 for every row (the batch is left-padded), the
    last position is returned for all rows. Otherwise each row is indexed at
    (number of attended tokens - 1).
    """
    n_rows = attention_mask.shape[0]
    if attention_mask[:, -1].sum() == n_rows:
        return last_hidden_states[:, -1]

    last_positions = attention_mask.sum(dim=1) - 1
    row_index = torch.arange(last_hidden_states.shape[0], device=last_hidden_states.device)
    return last_hidden_states[row_index, last_positions]

def get_embeddings(model, tokenizer, texts, max_length=8192):
    """
    Tokenize texts, run them through model and return one unit-length vector per text.

    The texts are tokenized as a single padded, truncated batch (at most
    max_length tokens) and moved to model.device. The model runs without
    gradient tracking, the output is pooled with last_token_pool, and the
    pooled vectors are L2-normalised along dim 1.
    """
    encoded = tokenizer(
        texts,
        padding=True,
        truncation=True,
        max_length=max_length,
        return_tensors="pt",
    )
    encoded.to(model.device)

    with torch.no_grad():
        hidden = model(**encoded).last_hidden_state
        pooled = last_token_pool(hidden, encoded['attention_mask'])
        return F.normalize(pooled, p=2, dim=1)

def embed_df(df_pre, batch_size=100):
    """
    Return a copy of df_pre with an added 'embedding' column.

    Rows are processed in batches of batch_size. For each row the text
    "Title: ...; Authors: ...; Abstract: ..." is built from the 'title',
    'authors' and 'abstract' columns and embedded with get_embeddings, using
    the module-level model and tokenizer. Each batch is written to the scratch
    directory 'tmp_embed_dfs', which is wiped at the start and removed at the end.

    Args:
        df_pre: DataFrame with 'title', 'authors' and 'abstract' columns.
        batch_size: Number of rows embedded per model call (must be >= 1).

    Returns:
        A new DataFrame with a fresh 0..n-1 index, rows in the same order as
        df_pre, and an 'embedding' column holding one list of floats per row.

    Raises:
        ValueError: If batch_size is smaller than 1.
    """
    if batch_size < 1:
        raise ValueError("batch_size must be >= 1")

    # Nothing to embed: return an empty result instead of failing in pd.concat([]).
    if len(df_pre) == 0:
        empty = df_pre.reset_index(drop=True).copy(deep=True)
        empty['embedding'] = pd.Series(dtype=object)
        return empty

    tmp_dir = Path('tmp_embed_dfs')
    # Start from a clean scratch directory. Chunk files are named by row offset
    # only, so files left from another input must never be reused.
    if tmp_dir.exists():
        shutil.rmtree(tmp_dir)
    tmp_dir.mkdir()

    chunk_paths = []
    for start in tqdm(range(0, len(df_pre), batch_size)):
        chunk = df_pre.iloc[start:start + batch_size].reset_index(drop=True).copy(deep=True)
        texts = [
            f"Title: {t}; Authors: {au}; Abstract: {ab}"
            for t, au, ab in chunk[['title', 'authors', 'abstract']].to_numpy()
        ]

        chunk['embedding'] = get_embeddings(model, tokenizer, texts).tolist()

        chunk_path = tmp_dir / f"{start}.pkl"
        chunk.to_pickle(chunk_path)
        chunk_paths.append(chunk_path)

    # Read the chunks back in the order they were written. Directory listing
    # order is arbitrary and would shuffle the rows.
    df_fin = pd.concat([pd.read_pickle(p) for p in chunk_paths]).reset_index(drop=True)
    shutil.rmtree(tmp_dir)
    return df_fin

In [None]:
# Load the already-embedded papers (or start with an empty frame that only has
# an 'embedding' column) and the latest scraped listing.
if os.path.exists('paper_data_embed.pkl'):
    df = pd.read_pickle('paper_data_embed.pkl').reset_index(drop=True)
else:
    df = pd.DataFrame(columns=['embedding'])
df_new = pd.read_pickle('paper_data.pkl').reset_index(drop=True)

# Stack embedded rows first so keep='first' prefers them, then keep only the
# papers that still have no embedding.
df_new = pd.concat([df, df_new]).drop_duplicates(subset='arxiv_id', keep='first')
df_new = df_new.loc[df_new['embedding'].isna()]

# Embed the missing papers and persist the enlarged table.
if len(df_new) > 0:
    df_new = embed_df(df_new)

    df = pd.concat([df, df_new]).reset_index(drop=True)
    df.to_pickle('paper_data_embed.pkl')

  0%|          | 0/181 [00:00<?, ?it/s]

In [13]:
import torch
# Reference papers the scraped listings are ranked against. Each entry uses the
# same "Title: ...; Authors: ...; Abstract: ..." layout that embed_df builds for
# the scraped papers.
texts_of_interest = [
    "Title: AST: Audio Spectrogram Transformer; Authors: Yuan Gong, Yu-An Chung, James Glass; Abstract: In the past decade, convolutional neural networks (CNNs) have been widely adopted as the main building block for end-to-end audio classification models, which aim to learn a direct mapping from audio spectrograms to corresponding labels. To better capture long-range global context, a recent trend is to add a self-attention mechanism on top of the CNN, forming a CNN-attention hybrid model. However, it is unclear whether the reliance on a CNN is necessary, and if neural networks purely based on attention are sufficient to obtain good performance in audio classification. In this paper, we answer the question by introducing the Audio Spectrogram Transformer (AST), the first convolution-free, purely attention-based model for audio classification. We evaluate AST on various audio classification benchmarks, where it achieves new state-of-the-art results of 0.485 mAP on AudioSet, 95.6% accuracy on ESC-50, and 98.1% accuracy on Speech Commands V2.",
    "Title: SSAST: Self-Supervised Audio Spectrogram Transformer; Authors: Yuan Gong, Cheng-I Jeff Lai, Yu-An Chung, James Glass; Abstract: Recently, neural networks based purely on self-attention, such as the Vision Transformer (ViT), have been shown to outperform deep learning models constructed with convolutional neural networks (CNNs) on various vision tasks, thus extending the success of Transformers, which were originally developed for language processing, to the vision domain. A recent study showed that a similar methodology can also be applied to the audio domain. Specifically, the Audio Spectrogram Transformer (AST) achieves state-of-the-art results on various audio classification benchmarks. However, pure Transformer models tend to require more training data compared to CNNs, and the success of the AST relies on supervised pretraining that requires a large amount of labeled data and a complex training pipeline, thus limiting the practical usage of AST. This paper focuses on audio and speech classification, and aims to reduce the need for large amounts of labeled data for AST by leveraging self-supervised learning using unlabeled data. Specifically, we propose to pretrain the AST model with joint discriminative and generative masked spectrogram patch modeling (MSPM) using unlabeled audio from AudioSet and Librispeech. We evaluate our pretrained models on both audio and speech classification tasks including audio event classification, keyword spotting, emotion recognition, and speaker identification. The proposed self-supervised framework significantly boosts AST performance on all tasks, with an average improvement of 60.9%, leading to similar or even better results than a supervised pretrained AST. To the best of our knowledge, it is the first patch-based self-supervised learning framework in the audio and speech domain, and also the first self-supervised learning framework for AST.",
    "Title: SSAMBA: Self-Supervised Audio Representation Learning with Mamba State Space Model; Authors: Siavash Shams, Sukru Samet Dindar, Xilin Jiang, Nima Mesgarani; Abstract: Transformers have revolutionized deep learning across various tasks, including audio representation learning, due to their powerful modeling capabilities. However, they often suffer from quadratic complexity in both GPU memory usage and computational inference time, affecting their efficiency. Recently, state space models (SSMs) like Mamba have emerged as a promising alternative, offering a more efficient approach by avoiding these complexities. Given these advantages, we explore the potential of SSM-based models in audio tasks. In this paper, we introduce Self-Supervised Audio Mamba (SSAMBA), the first self-supervised, attention-free, and SSM-based model for audio representation learning. SSAMBA leverages the bidirectional Mamba to capture complex audio patterns effectively. We incorporate a self-supervised pretraining framework that optimizes both discriminative and generative objectives, enabling the model to learn robust audio representations from large-scale, unlabeled datasets. We evaluated SSAMBA on various tasks such as audio classification, keyword spotting, and speaker identification. Our results demonstrate that SSAMBA outperforms the Self-Supervised Audio Spectrogram Transformer (SSAST) in most tasks. Notably, SSAMBA is approximately 92.7% faster in batch inference speed and 95.4% more memory-efficient than SSAST for the tiny model size with an input token size of 22k. These efficiency gains, combined with superior performance, underscore the effectiveness of SSAMBA's architectural innovation, making it a compelling choice for a wide range of audio processing applications.",
]

# One weight per entry of texts_of_interest, stored as a (1, n_refs) row.
weights = torch.tensor([[1, 0.5, 0.5]])

# Embed the reference texts and collect the stored paper embeddings.
embeddings_of_interest = get_embeddings(model, tokenizer, texts_of_interest)
embeddings = torch.tensor(df['embedding'].tolist())

# scores[i, j] is the dot product of reference i with paper j. Each reference
# row is scaled by its weight and the rows are summed into one score per paper.
scores = embeddings_of_interest @ embeddings.T
weighted_scores = torch.sum(weights.T * scores, dim=0)

# Reorder the papers from highest to lowest weighted score.
ranking = weighted_scores.argsort().flip(dims=[0]).tolist()
df = df.iloc[ranking]

In [16]:
# Inspect one paper by its position in df: r is a row position, so r = 0 is
# the first row of the current ordering.
r = 0
ex = df.iloc[r]
# Print the full abstract as plain text, then show every field of the row.
print(ex['abstract'])
display(ex)

In recent years, self-supervised learning has amassed significant interest for training deep neural representations without labeled data. One such self-supervised learning approach is masked spectrogram modeling, where the objective is to learn semantically rich contextual representations by predicting removed or hidden portions of the input audio spectrogram. With the Transformer neural architecture at its core, masked spectrogram modeling has emerged as the prominent approach for learning general purpose audio representations, a.k.a. audio foundation models. Meanwhile, addressing the issues of the Transformer architecture, in particular the underlying Scaled Dot-product Attention operation, which scales quadratically with input sequence length, has led to renewed interest in recurrent sequence modeling approaches. Among them, Selective structured state space models (such as Mamba) and extended Long Short-Term Memory (xLSTM) are the two most promising approaches which have experienced

date                                                2025-09-24
arxiv_id                                      arXiv:2509.18691
title        An overview of neural architectures for self-s...
authors      Sarthak Yadav, Sergios Theodoridis, Zheng-Hua Tan
abstract     In recent years, self-supervised learning has ...
link                          https://arxiv.org/abs/2509.18691
embedding    [-0.033794302493333817, -0.025241412222385406,...
Name: 8022, dtype: object