In [3]:
import feedparser

# 设置arXiv的查询参数
query = 'cat:nlp AND cat:cs.CL AND submittedDate:[2020-01-01 TO 2020-05-31] AND title:benchmark'
arxiv_feed = f'http://export.arxiv.org/api/query?search_query={query}&start=0&max_results=10'

# 解析arXiv的数据源
feed = feedparser.parse(arxiv_feed)

# 输出搜索结果
for entry in feed.entries:
    print('Title:', entry.title)
    print('Authors:', ', '.join(author.name for author in entry.authors))
    print('Summary:', entry.summary)
    print('Link:', entry.link)
    print('Publish Date:', entry.published)
    print('\n')

InvalidURL: URL can't contain control characters. '/api/query?search_query=cat:nlp AND cat:cs.CL AND submittedDate:[2020-01-01 TO 2020-05-31] AND title:benchmark&start=0&max_results=10' (found at least ' ')

In [4]:

import urllib.request
import feedparser

# Base api query url
base_url = 'http://export.arxiv.org/api/query?';

# Search parameters
search_query = 'all:electron&cat:cs.CL'  # search for electron in all fields
start = 0  # retreive the first 5 results
max_results = 5

query = 'search_query=%s&start=%i&max_results=%i' % (search_query,
                                                     start,
                                                     max_results)

# Opensearch metadata such as totalResults, startIndex,
# and itemsPerPage live in the opensearch namespase.
# Some entry metadata lives in the arXiv namespace.
# This is a hack to expose both of these namespaces in
# feedparser v4.1
# feedparser._FeedParserMixin.namespaces['http://a9.com/-/spec/opensearch/1.1/'] = 'opensearch'
# feedparser._FeedParserMixin.namespaces['http://arxiv.org/schemas/atom'] = 'arxiv'

# perform a GET request using the base_url and query
response = urllib.request.urlopen(base_url + query).read()
print("respnse:",response)
# parse the response using feedparser
feed = feedparser.parse(response)

# print out feed information
print('Feed title: %s' % feed.feed.title)
print('Feed last updated: %s' % feed.feed.updated)

# print opensearch metadata
print('totalResults for this query: %s' % feed.feed.opensearch_totalresults)

print('itemsPerPage for this query: %s' % feed.feed.opensearch_itemsperpage)

print('startIndex for this query: %s' % feed.feed.opensearch_startindex)


# Run through each entry, and print out information
for entry in feed.entries:
    print('e-print metadata')

    print('arxiv-id: %s' % entry.id.split('/abs/')[-1])

    print('Published: %s' % entry.published)

    print('Title:  %s' % entry.title)


    # feedparser v4.1 only grabs the first author
    author_string = entry.author

    # grab the affiliation in <arxiv:affiliation> if present
    # - this will only grab the first affiliation encountered
    #   (the first affiliation for the first author)
    # Please email the list with a way to get all of this information!
    try:
        author_string += ' (%s)' % entry.arxiv_affiliation
    except AttributeError:
        pass

    print('Last Author:  %s' % author_string)


    # feedparser v5.0.1 correctly handles multiple authors, print them all
    try:
        print('Authors:  %s' % ', '.join(author.name for author in entry.authors))

    except AttributeError:
        pass

    # get the links to the abs page and pdf for this e-print
    for link in entry.links:
        if link.rel == 'alternate':
            print('abs page link: %s' % link.href)

        elif link.title == 'pdf':
            print('pdf link: %s' % link.href)


    # The journal reference, comments and primary_category sections live under
    # the arxiv namespace
    try:
        journal_ref = entry.arxiv_journal_ref
    except AttributeError:
        journal_ref = 'No journal ref found'
    print('Journal reference: %s' % journal_ref)


    try:
        comment = entry.arxiv_comment
    except AttributeError:
        comment = 'No comment found'
    print('Comments: %s' % comment)


    # Since the <arxiv:primary_category> element has no data, only
    # attributes, feedparser does not store anything inside
    # entry.arxiv_primary_category
    # This is a dirty hack to get the primary_category, just take the
    # first element in entry.tags.  If anyone knows a better way to do
    # this, please email the list!
    print('Primary Category: %s' % entry.tags[0]['term'])


    # Lets get all the categories
    all_categories = [t['term'] for t in entry.tags]
    print('All Categories: %s' % (', ').join(all_categories))


    # The abstract is in the <summary> element
    print('Abstract: %s' % entry.summary)
————————————————

                            版权声明：本文为博主原创文章，遵循 CC 4.0 BY-SA 版权协议，转载请附上原文出处链接和本声明。
                        
原文链接：https://blog.csdn.net/ye6pipipihou/article/details/127170216

In [1]:
import feedparser

# 定义搜索关键词
query = 'benchmark'

# 定义arXiv搜索的URL
url = f'http://export.arxiv.org/api/query?search_query=all:{query}+AND+submittedDate:[20200101+TO+20200531]&start=0&max_results=10'

# 使用feedparser库解析URL
feed = feedparser.parse(url)

# 输出搜索结果
for entry in feed.entries:
    print("Title:", entry.title)
    print("Authors:", entry.author)
    print("Summary:", entry.summary)
    print("Published Date:", entry.published)
    print("arXiv Link:", entry.link)
    print()

Title: Benchmarking Graph Data Management and Processing Systems: A Survey
Authors: Toyotaro Suzumura
Summary: The development of scalable, representative, and widely adopted benchmarks
for graph data systems have been a question for which answers has been sought
for decades. We conduct an in-depth study of the existing literature on
benchmarks for graph data management and processing, covering 20 different
benchmarks developed during the last 15 years. We categorize the benchmarks
into three areas focusing on benchmarks for graph processing systems, graph
database benchmarks, and bigdata benchmarks with graph processing workloads.
This systematic approach allows us to identify multiple issues existing in this
area, including i) few benchmarks exist which can produce high workload
scenarios, ii) no significant work done on benchmarking graph stream processing
as well as graph based machine learning, iii) benchmarks tend to use
conventional metrics despite new meaningful metrics have be