In [2]:
"""
python_arXiv_parsing_example.py

This sample script illustrates a basic arXiv api call
followed by parsing of the results using the
feedparser python module.

Please see the documentation at
http://export.arxiv.org/api_help/docs/user-manual.html
for more information, or email the arXiv api
mailing list at arxiv-api@googlegroups.com.

urllib is included in the standard python library.
feedparser can be downloaded from http://feedparser.org/ .

Author: Julius B. Lucks

This is free software.  Feel free to do what you want
with it, but please play nice with the arXiv API!
"""

import urllib.request
import feedparser

# Base URL of the arXiv API query endpoint.
base_url = 'http://export.arxiv.org/api/query?'
# Equivalent search on the arXiv web site:
# https://arxiv.org/search/?query=LLM+Recommend&searchtype=all&source=header
# Search parameters
search_query = 'all:LLM+Recommend'  # search for "LLM Recommend" in all fields
start = 0  # retrieve the first 5 results
max_results = 5

query = 'search_query=%s&start=%i&max_results=%i' % (search_query,
                                                     start,
                                                     max_results)

# Opensearch metadata such as totalResults, startIndex,
# and itemsPerPage live in the opensearch namespace.
# Some entry metadata lives in the arXiv namespace.
# The two lines below are a hack that exposed both of these namespaces
# in feedparser v4.1; they are left disabled here.
# NOTE(review): presumably newer feedparser releases expose these
# namespaces without the hack -- confirm against the installed version.
# feedparser._FeedParserMixin.namespaces['http://a9.com/-/spec/opensearch/1.1/'] = 'opensearch'
# feedparser._FeedParserMixin.namespaces['http://arxiv.org/schemas/atom'] = 'arxiv'

# Perform a GET request using the base_url and query.  The context manager
# guarantees the HTTP connection is closed once the body has been read.
with urllib.request.urlopen(base_url + query) as http_response:
    response = http_response.read()
print("respnse:", response)
# Parse the raw Atom response using feedparser.
feed = feedparser.parse(response)

# Print out feed information.
print('Feed title: %s' % feed.feed.title)
print('Feed last updated: %s' % feed.feed.updated)

# Print opensearch metadata.
print('totalResults for this query: %s' % feed.feed.opensearch_totalresults)

print('itemsPerPage for this query: %s' % feed.feed.opensearch_itemsperpage)

print('startIndex for this query: %s' % feed.feed.opensearch_startindex)


# Run through each entry, and print out information.
for entry in feed.entries:
    print('e-print metadata')

    # The entry id is a URL; the arXiv identifier is the part after '/abs/'.
    print('arxiv-id: %s' % entry.id.split('/abs/')[-1])

    print('Published: %s' % entry.published)

    print('Title:  %s' % entry.title)


    # feedparser v4.1 only grabs the first author
    author_string = entry.author

    # Grab the affiliation in <arxiv:affiliation> if present
    # - this will only grab the first affiliation encountered
    #   (the first affiliation for the first author)
    # Please email the list with a way to get all of this information!
    if 'arxiv_affiliation' in entry:
        author_string += ' (%s)' % entry.arxiv_affiliation

    print('Last Author:  %s' % author_string)


    # feedparser v5.0.1 correctly handles multiple authors, print them all.
    # Authors without a name are skipped instead of suppressing the whole
    # line, which is what a blanket `except AttributeError` used to do.
    if 'authors' in entry:
        print('Authors:  %s' % ', '.join(
            author.name for author in entry.authors if 'name' in author))

    # Get the links to the abs page and pdf for this e-print.
    # `.get()` is used because a link is not guaranteed to carry a `title`
    # (or `rel`) attribute; plain attribute access would raise
    # AttributeError for such a link.
    for link in entry.links:
        if link.get('rel') == 'alternate':
            print('abs page link: %s' % link.href)

        elif link.get('title') == 'pdf':
            print('pdf link: %s' % link.href)


    # The journal reference, comments and primary_category sections live under
    # the arxiv namespace; both of the following are optional per entry.
    journal_ref = entry.get('arxiv_journal_ref', 'No journal ref found')
    print('Journal reference: %s' % journal_ref)


    comment = entry.get('arxiv_comment', 'No comment found')
    print('Comments: %s' % comment)


    # Collect all the categories of this entry.
    all_categories = [t['term'] for t in entry.tags]

    # Since the <arxiv:primary_category> element has no data, only
    # attributes, feedparser does not store anything inside
    # entry.arxiv_primary_category
    # This is a dirty hack to get the primary_category, just take the
    # first element in entry.tags.  If anyone knows a better way to do
    # this, please email the list!
    print('Primary Category: %s' % all_categories[0])

    print('All Categories: %s' % ', '.join(all_categories))


    # The abstract is in the <summary> element
    print('Abstract: %s' % entry.summary)



respnse: b'<?xml version="1.0" encoding="UTF-8"?>\n<feed xmlns="http://www.w3.org/2005/Atom">\n  <link href="http://arxiv.org/api/query?search_query%3Dall%3ALLM%20Recommend%26id_list%3D%26start%3D0%26max_results%3D5" rel="self" type="application/atom+xml"/>\n  <title type="html">ArXiv Query: search_query=all:LLM Recommend&amp;id_list=&amp;start=0&amp;max_results=5</title>\n  <id>http://arxiv.org/api/ohH+G+JroC1dnlYVAHDciywx8yY</id>\n  <updated>2024-06-18T00:00:00-04:00</updated>\n  <opensearch:totalResults xmlns:opensearch="http://a9.com/-/spec/opensearch/1.1/">31739</opensearch:totalResults>\n  <opensearch:startIndex xmlns:opensearch="http://a9.com/-/spec/opensearch/1.1/">0</opensearch:startIndex>\n  <opensearch:itemsPerPage xmlns:opensearch="http://a9.com/-/spec/opensearch/1.1/">5</opensearch:itemsPerPage>\n  <entry>\n    <id>http://arxiv.org/abs/2312.13557v1</id>\n    <updated>2023-12-21T03:50:09Z</updated>\n    <published>2023-12-21T03:50:09Z</published>\n    <title>Empowering Few-

In [None]:
class Search_param(str):
    """
    Holder for the search parameters that are appended to a base URL.

    The keywords are kept under the ``term`` key of ``search_keywords``.
    ``gen_search_param`` URL-encodes every stored parameter into the suffix
    needed for the first search request, e.g. ``term=alzheimer%27s+disease``.
    Extra parameters that tweak the result page can be added beforehand.

    Because the class derives from ``str``, the instance's own string value
    is the raw ``keywords`` argument passed to the constructor.

    TODO: search by date range, e.g.
    2020.1.1 - 2020.1.31:
    https://pubmed.ncbi.nlm.nih.gov/?term=computer+science&filter=dates.2020%2F1%2F1-2020%2F1%2F31

    2024.5.1 - 2024.5.31
    https://pubmed.ncbi.nlm.nih.gov/?term=computer+science&filter=dates.2024%2F5%2F1-2024%2F5%2F31
    """
    def __init__(self, keywords: str):
        # Surrounding whitespace is dropped before the keywords are stored.
        self.search_keywords = {'term': keywords.strip()}

    def gen_search_param(self) -> str:
        """Return all stored parameters as one URL-encoded query string."""
        encoded = urllib.parse.urlencode(self.search_keywords)
        return encoded

    def specify_web_size(self, size: int):
        """Set the ``size`` parameter (the size of the search result page)."""
        self.search_keywords.update(size=size)

    def specify_any_param(self, key: str, value):
        """
        Set an arbitrary parameter; a suitable key/value pair must be
        supplied, none is present by default.
        Observed so far: sort(date, pubdate, fauth, jour), sort_order(asc);
        see the PubMed search URL for more parameters.
        :param key: key to add to the URL
        :param value: value for that key in the URL
        :return:
        """
        self.search_keywords.update({key: value})

def scrawl_from_arxiv(keywords: str = None):
    base_url = 'http://export.arxiv.org/api/query?'
    param = Search_param(keywords)

    search_query = param.gen_search_param()
    
    start = 0  # retreive the first 5 results
    max_results = 5

    query = 'search_query=%s&start=%i&max_results=%i' % (search_query,
                                                     start,
                                                     max_results)