In [21]:
import urllib.request as libreq
import feedparser

In [22]:
# Base api query url
base_url = 'http://export.arxiv.org/api/query?';

In [23]:
# Search parameters
search_query = 'all:language model' # search for electron in all fields
start = 0                     # retreive the first 5 results
max_results = 5

In [24]:
request = base_url + 'search_query=' + search_query + '&start=' + str(start) + '&max_results=' + str(max_results)

In [25]:
#with libreq.urlopen('http://export.arxiv.org/api/query?search_query=all:electron&start=0&max_results=2') as url:
with libreq.urlopen(request) as url:
    response = url.read()
feed = feedparser.parse(response)

In [26]:
# print out feed information
print('Feed title: {}'.format(feed.feed.title))
print('Feed last updated: {}'.format(feed.feed.updated))

# print opensearch metadata
print('totalResults for this query: {}'.format(feed.feed.opensearch_totalresults))
print('itemsPerPage for this query: {}'.format(feed.feed.opensearch_itemsperpage))
print('startIndex for this query: {}'.format(feed.feed.opensearch_startindex))

# Run through each entry, and print out information
for entry in feed.entries:
    print('--------------------')
    print('e-print metadata')
    print('arxiv-id: {}'.format(entry.id.split('/abs/')[-1]))
    print('Published: {}'.format(entry.published))
    print('Title:  {}'.format(entry.title))
    
    # feedparser v4.1 only grabs the first author
    author_string = entry.author
    
    # grab the affiliation in <arxiv:affiliation> if present
    # - this will only grab the first affiliation encountered
    #   (the first affiliation for the first author)
    # Please email the list with a way to get all of this information!
    try:
        author_string += ' ({})'.format(entry.arxiv_affiliation)
    except AttributeError:
        pass
    
    print('Last Author:  {}'.format(author_string))
    
    # feedparser v5.0.1 correctly handles multiple authors, print them all
    try:
        print('Authors:  {}'.format(', '.join(author.name for author in entry.authors)))
    except AttributeError:
        pass

    # get the links to the abs page and pdf for this e-print
    for link in entry.links:
        if link.rel == 'alternate':
            print('abs page link: {}'.format(link.href))
        elif link.title == 'pdf':
            print('pdf link: {}'.format(link.href))
    
    # The journal reference, comments and primary_category sections live under 
    # the arxiv namespace
    try:
        journal_ref = entry.arxiv_journal_ref
    except AttributeError:
        journal_ref = 'No journal ref found'
    print('Journal reference: {}'.format(journal_ref))
    
    try:
        comment = entry.arxiv_comment
    except AttributeError:
        comment = 'No comment found'
    print('Comments: {}'.format(comment))
    
    # Since the <arxiv:primary_category> element has no data, only
    # attributes, feedparser does not store anything inside
    # entry.arxiv_primary_category
    # This is a dirty hack to get the primary_category, just take the
    # first element in entry.tags.  If anyone knows a better way to do
    # this, please email the list!
    print('Primary Category: {}'.format(entry.tags[0]['term']))
    
    # Lets get all the categories
    all_categories = [t['term'] for t in entry.tags]
    print('All Categories: {}'.format(', '.join(all_categories)))
    
    # The abstract is in the <summary> element
    print('Abstract: {}'.format(entry.summary))

Feed title: ArXiv Query: search_query=all:language&id_list=&start=0&max_results=10
Feed last updated: 2020-05-16T00:00:00-04:00
totalResults for this query: 40899
itemsPerPage for this query: 10
startIndex for this query: 0
--------------------
e-print metadata
arxiv-id: 1604.08561v1
Published: 2016-04-28T19:10:47Z
Title:  Comparing Fifty Natural Languages and Twelve Genetic Languages Using
  Word Embedding Language Divergence (WELD) as a Quantitative Measure of
  Language Distance
Last Author:  Mohammad R. K. Mofrad
Authors:  Ehsaneddin Asgari, Mohammad R. K. Mofrad
abs page link: http://arxiv.org/abs/1604.08561v1
pdf link: http://arxiv.org/pdf/1604.08561v1
Journal reference: No journal ref found
Comments: No comment found
Primary Category: cs.CL
All Categories: cs.CL
Abstract: We introduce a new measure of distance between languages based on word
embedding, called word embedding language divergence (WELD). WELD is defined as
divergence between unified similarity distribution of words