<a href="https://colab.research.google.com/github/dhruvtre/Lossfunk_Code/blob/main/Arxiv_api.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install feedparser

Collecting feedparser
  Downloading feedparser-6.0.11-py3-none-any.whl.metadata (2.4 kB)
Collecting sgmllib3k (from feedparser)
  Downloading sgmllib3k-1.0.0.tar.gz (5.8 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Downloading feedparser-6.0.11-py3-none-any.whl (81 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m81.3/81.3 kB[0m [31m1.2 MB/s[0m eta [36m0:00:00[0m
[?25hBuilding wheels for collected packages: sgmllib3k
  Building wheel for sgmllib3k (setup.py) ... [?25l[?25hdone
  Created wheel for sgmllib3k: filename=sgmllib3k-1.0.0-py3-none-any.whl size=6046 sha256=9ebfd83635ec1c845681719e84e5227de5f0a1a4d26ecf16449a7ac611e9fe29
  Stored in directory: /root/.cache/pip/wheels/3b/25/2a/105d6a15df6914f4d15047691c6c28f9052cc1173e40285d03
Successfully built sgmllib3k
Installing collected packages: sgmllib3k, feedparser
Successfully installed feedparser-6.0.11 sgmllib3k-1.0.0


In [None]:
!pip install datetime

Collecting datetime
  Downloading DateTime-5.5-py3-none-any.whl.metadata (33 kB)
Collecting zope.interface (from datetime)
  Downloading zope.interface-7.2-cp311-cp311-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (44 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m44.4/44.4 kB[0m [31m2.0 MB/s[0m eta [36m0:00:00[0m
Downloading DateTime-5.5-py3-none-any.whl (52 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m52.6/52.6 kB[0m [31m3.3 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading zope.interface-7.2-cp311-cp311-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl (259 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m259.8/259.8 kB[0m [31m10.0 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: zope.interface, datetime
Successfully installed datetime-5.5 zope.interface-7.2


In [None]:
from datetime import datetime

### Making Search Query Function

In [None]:
# Cell A: Imports and helper function
import time
import requests
import feedparser
from typing import Literal
from urllib.parse import quote

def make_search_query(keywords: str | None, title: str | None, categories: list[str] | None, operator: str) -> str:
    """Build arXiv search query from keywords, title, and categories."""
    query_parts = []

    # Process keywords (search all fields)
    if keywords:
        # Split by whitespace and escape quotes
        terms = keywords.replace('"', '').split()
        if terms:
            keyword_query = f" {operator} ".join(f"all:{term}" for term in terms)
            query_parts.append(f"({keyword_query})")

    # Process title
    if title:
        # Split by whitespace and escape quotes
        terms = title.replace('"', '').split()
        if terms:
            title_query = f" {operator} ".join(f"ti:{term}" for term in terms)
            query_parts.append(f"({title_query})")

    # Process categories (always OR between categories)
    if categories:
        cat_query = " OR ".join(f"cat:{cat}" for cat in categories)
        query_parts.append(f"({cat_query})")

    # Join all parts with AND
    return " AND ".join(query_parts) if query_parts else ""

### Raw API Call Function

In [None]:
def make_arxiv_call(
    search_query: str,
    start: int = 0,
    max_results: int = 100,
    sort_by: str = "relevance",
    sort_order: str = "descending",
    retries: int = 3,
    backoff: float = 3.0,
) -> str:
    """Make HTTP call to arXiv API with retries and rate limiting.

    Returns the raw XML feed text.
    """
    # Build URL
    base_url = "http://export.arxiv.org/api/query"
    params = {
        "search_query": search_query,
        "start": start,
        "max_results": max_results,
        "sortBy": sort_by,
        "sortOrder": sort_order
    }

    # URL encode the parameters
    param_str = "&".join(f"{k}={quote(str(v))}" for k, v in params.items())
    url = f"{base_url}?{param_str}"

    print(f"Query URL: {url}")

    # Set headers with descriptive User-Agent
    headers = {
        "User-Agent": "arxiv-query-tool/1.0 (Python; research use)"
    }

    last_error = None

    for attempt in range(1, retries + 1):
        try:
            print(f"Attempt {attempt}/{retries}...")
            start_time = time.time()

            response = requests.get(url, headers=headers, timeout=30)
            response.raise_for_status()

            elapsed = time.time() - start_time
            print(f"Fetched in {elapsed:.2f}s")

            # Success - sleep to respect rate limit
            if attempt < retries:  # Don't sleep on last attempt
                print(f"Sleeping {backoff}s to respect rate limits...")
                time.sleep(backoff)

            return response.text

        except Exception as e:
            last_error = e
            print(f"Error on attempt {attempt}: {e}")

            # Sleep before retry
            if attempt < retries:
                print(f"Retrying in {backoff}s...")
                time.sleep(backoff)

    # All retries failed
    raise Exception(f"All {retries} attempts failed. Last error: {last_error}")

### Parsing Raw ARXIV Feed

In [None]:
def parse_arxiv_feed(feed_text: str) -> list[dict]:
    """Parse arXiv feed XML and extract paper metadata."""
    # Parse feed
    feed = feedparser.parse(feed_text)

    if hasattr(feed, 'bozo_exception') and feed.bozo_exception:
        raise Exception(f"Feed parsing error: {feed.bozo_exception}")

    # Extract papers
    papers = []
    # missing_doi_count = 0

    for entry in feed.entries:
        # Extract version from id
        arxiv_id = entry.id.split('/')[-1]  # Get ID from URL
        version = "v1"  # default
        if 'v' in arxiv_id:
            parts = arxiv_id.split('v')
            if len(parts) == 2 and parts[1].isdigit():
                version = f"v{parts[1]}"

        # # Extract DOI if available
        # doi = None
        # if hasattr(entry, 'arxiv_doi'):
        #     doi = entry.arxiv_doi
        # elif 'links' in entry:
        #     for link in entry.links:
        #         if link.get('title') == 'doi':
        #             doi = link.get('href', '').replace('http://dx.doi.org/', '')
        #             break

        # if not doi:
        #     missing_doi_count += 1

        # Build paper dict

        # Format dates to be more readable
        def format_date(date_str):
            """Convert ISO format to readable format"""
            try:
                dt = datetime.fromisoformat(date_str.replace('Z', '+00:00'))
                return dt.strftime('%Y-%m-%d %H:%M:%S UTC')
            except:
                return date_str
        updated_date = format_date(entry.updated)
        published_date = format_date(entry.published)


        paper = {
            "id": arxiv_id,
            "version": version,
            "title": entry.title.replace('\n', ' ').strip(),
            "summary": entry.summary.replace('\n', ' ').strip(),
            "authors": [author.name for author in entry.authors],
            "categories": [tag.term for tag in entry.tags],
            "published": published_date,
            "updated": updated_date,
            # "doi": doi,
            "pdf_url": f"https://arxiv.org/pdf/{arxiv_id}.pdf"
        }
        papers.append(paper)

    print(f"Parsed {len(papers)} papers")
    # if missing_doi_count > 0:
    #     print(f"Warning: {missing_doi_count} papers missing DOI")

    return papers

### Wrapper Function

In [None]:
def arxiv_query(
    *,
    keywords: str | None = None,
    title: str | None = None,
    categories: list[str] | None = None,
    operator: Literal["AND", "OR"] = "AND",
    start: int = 0,
    max_results: int = 100,
    sort_by: Literal["relevance", "submittedDate", "lastUpdatedDate"] = "relevance",
    sort_order: Literal["ascending", "descending"] = "descending",
    retries: int = 3,
    backoff: float = 3.0,
) -> list[dict]:
    """Query arXiv API and return a list of paper metadata dictionaries.

    Args:
        keywords: Search terms for all fields
        title: Search terms for title only
        categories: List of arXiv categories (e.g., ["cs.AI", "cs.LG"])
        operator: How to combine search terms within keywords/title
        start: Starting index for results
        max_results: Maximum number of results to return
        sort_by: Sort criterion for results
        sort_order: Sort direction
        retries: Number of retry attempts on failure
        backoff: Seconds to wait between requests

    Returns:
        List of dictionaries containing paper metadata
    """
    # Step 1: Build search query
    search_query = make_search_query(keywords, title, categories, operator)
    if not search_query:
        print("Warning: No search criteria provided")
        return []

    # Step 2: Make API call
    try:
        raw_feed = make_arxiv_call(
            search_query=search_query,
            start=start,
            max_results=max_results,
            sort_by=sort_by,
            sort_order=sort_order,
            retries=retries,
            backoff=backoff
        )
    except Exception as e:
        print(f"Failed to fetch data from arXiv: {e}")
        return []

    # Step 3: Parse results
    try:
        papers = parse_arxiv_feed(raw_feed)
        return papers
    except Exception as e:
        print(f"Failed to parse arXiv feed: {e}")
        return []

### Tests

In [None]:
# Trying with simple keywords only - return (10)
print("=== Test 1: Keywords only (relevance) ===")
results1a = arxiv_query(
    keywords="transformer attention mechanism",
    max_results=5,
    sort_by="relevance",
    sort_order="descending"
)
print(f"Found {len(results1a)} papers\n")

# Check the first paper in detail
if results1a:
    paper = results1a[0]
    print("First paper details:")
    for key, value in paper.items():
        print(f"  {key}: {value}")

=== Test 1: Keywords only (relevance) ===
Query URL: http://export.arxiv.org/api/query?search_query=%28all%3Atransformer%20AND%20all%3Aattention%20AND%20all%3Amechanism%29&start=0&max_results=10&sortBy=relevance&sortOrder=descending
Attempt 1/3...
Fetched in 11.51s
Sleeping 3.0s to respect rate limits...
Parsed 10 papers
Found 10 papers



In [None]:
# Test 2: Simple title only (relevance)
print("=== Test 2: Title only (relevance) ===")
results2a = arxiv_query(
    title="attention is all you need",
    max_results=1,
    sort_by="relevance",
    sort_order="descending"
)
print(f"Found {len(results2a)} papers\n")

# Check the first paper in detail
if results2a:
    paper = results2a[0]
    print("First paper details:")
    for key, value in paper.items():
        print(f"  {key}: {value}")

=== Test 2: Title only (relevance) ===
Query URL: http://export.arxiv.org/api/query?search_query=%28ti%3Aattention%20AND%20ti%3Ais%20AND%20ti%3Aall%20AND%20ti%3Ayou%20AND%20ti%3Aneed%29&start=0&max_results=1&sortBy=relevance&sortOrder=descending
Attempt 1/3...
Fetched in 5.33s
Sleeping 3.0s to respect rate limits...
Parsed 1 papers
Found 1 papers

First paper details:
  id: 1706.03762v7
  version: v7
  title: Attention Is All You Need
  summary: The dominant sequence transduction models are based on complex recurrent or convolutional neural networks in an encoder-decoder configuration. The best performing models also connect the encoder and decoder through an attention mechanism. We propose a new simple network architecture, the Transformer, based solely on attention mechanisms, dispensing with recurrence and convolutions entirely. Experiments on two machine translation tasks show these models to be superior in quality while being more parallelizable and requiring significantly less ti

In [None]:
# Reference for Arxiv Categories - #TODO: Make a dictionary https://arxiv.org/category_taxonomy

In [None]:
# Trying with category and title - return (1)
# Test 3: Category and title
print("=== Test 3: Category + Title ===")
results3 = arxiv_query(
    title="attention is all you need",
    categories=["cs.AI", "cs.LG"],
    max_results=1,
    sort_by="relevance",
    sort_order="descending"
)
print(f"Found {len(results3)} papers")
if results3:
    print("\nFirst paper details:")
    for key, value in results3[0].items():
        print(f"  {key}: {value}")
print("\n" + "="*50 + "\n")

=== Test 3: Category + Title ===
Query URL: http://export.arxiv.org/api/query?search_query=%28ti%3Aattention%20AND%20ti%3Ais%20AND%20ti%3Aall%20AND%20ti%3Ayou%20AND%20ti%3Aneed%29%20AND%20%28cat%3Acs.AI%20OR%20cat%3Acs.LG%29&start=0&max_results=1&sortBy=relevance&sortOrder=descending
Attempt 1/3...
Fetched in 9.53s
Sleeping 3.0s to respect rate limits...
Parsed 1 papers
Found 1 papers

First paper details:
  id: 1706.03762v7
  version: v7
  title: Attention Is All You Need
  summary: The dominant sequence transduction models are based on complex recurrent or convolutional neural networks in an encoder-decoder configuration. The best performing models also connect the encoder and decoder through an attention mechanism. We propose a new simple network architecture, the Transformer, based solely on attention mechanisms, dispensing with recurrence and convolutions entirely. Experiments on two machine translation tasks show these models to be superior in quality while being more paralleliza

In [20]:
# Trying with category and simple keywords - return (10)
# Test 4: Category and keywords
print("=== Test 4: Category + Keywords ===")
results4 = arxiv_query(
    keywords="diffusion models",
    categories=["cs.CV", "cs.LG"],
    max_results=1,
    sort_by="lastUpdatedDate",
    sort_order="descending"
)
print(f"Found {len(results4)} papers")
if results4:
    print("\nFirst paper details:")
    for key, value in results4[0].items():
        print(f"  {key}: {value}")
print("\n" + "="*50 + "\n")

=== Test 4: Category + Keywords ===
Query URL: http://export.arxiv.org/api/query?search_query=%28all%3Adiffusion%20AND%20all%3Amodels%29%20AND%20%28cat%3Acs.CV%20OR%20cat%3Acs.LG%29&start=0&max_results=1&sortBy=lastUpdatedDate&sortOrder=descending
Attempt 1/3...
Fetched in 7.94s
Sleeping 3.0s to respect rate limits...
Parsed 1 papers
Found 1 papers

First paper details:
  id: 2506.05350v1
  version: v1
  title: Contrastive Flow Matching
  summary: Unconditional flow-matching trains diffusion models to transport samples from a source distribution to a target distribution by enforcing that the flows between sample pairs are unique. However, in conditional settings (e.g., class-conditioned models), this uniqueness is no longer guaranteed--flows from different conditions may overlap, leading to more ambiguous generations. We introduce Contrastive Flow Matching, an extension to the flow matching objective that explicitly enforces uniqueness across all conditional flows, enhancing condition 

In [21]:
# Trying with category, keywords, title - return (10)
# Test 5: Category, keywords, and title
print("=== Test 5: Category + Keywords + Title ===")
results5 = arxiv_query(
    keywords="attention mechanism",
    title="transformer",
    categories=["cs.CL", "cs.LG"],
    max_results=1,
    sort_by="lastUpdatedDate",
    sort_order="descending"
)
print(f"Found {len(results5)} papers")
if results5:
    print("\nFirst paper details:")
    for key, value in results5[0].items():
        print(f"  {key}: {value}")
print("\n" + "="*50 + "\n")

=== Test 5: Category + Keywords + Title ===
Query URL: http://export.arxiv.org/api/query?search_query=%28all%3Aattention%20AND%20all%3Amechanism%29%20AND%20%28ti%3Atransformer%29%20AND%20%28cat%3Acs.CL%20OR%20cat%3Acs.LG%29&start=0&max_results=1&sortBy=lastUpdatedDate&sortOrder=descending
Attempt 1/3...
Fetched in 2.68s
Sleeping 3.0s to respect rate limits...
Parsed 1 papers
Found 1 papers

First paper details:
  id: 2506.05249v1
  version: v1
  title: On the Convergence of Gradient Descent on Learning Transformers with   Residual Connections
  summary: Transformer models have emerged as fundamental tools across various scientific and engineering disciplines, owing to their outstanding performance in diverse applications. Despite this empirical success, the theoretical foundations of Transformers remain relatively underdeveloped, particularly in understanding their training dynamics. Existing research predominantly examines isolated components--such as self-attention mechanisms and fee