In [1]:
import feedparser
from urllib.parse import urlencode
import datetime
from dateutil.relativedelta import relativedelta

In [2]:
# Base endpoint of the arXiv export API; 'query?' and the URL arguments are
# appended to it when a request is built.
root_url = 'http://export.arxiv.org/api/'

# Search expressions sent as the `search_query` argument, one request each.
keywords = ['cat: stat.ML']

# Paging and ordering arguments forwarded to the API.
start = 0
max_results = 50
sort_by = 'submittedDate'
sort_order = 'descending'

# Post-processing switches:
#   days  - width (in days) of the "recent" window used when filtering papers
#   prune - drop bulky feed keys from every returned entry
#   debug - print the time window and each paper's timestamp while filtering
days = 1
prune = True
debug = False

# paper_num_result = []
# dict_key_keyword = 'keyword'
# dict_key_n_of_papers = 'n'

In [3]:
# Pick the first search expression for ad-hoc experimentation.
# NOTE(review): the `for keyword in keywords` loop later in the notebook
# rebinds this name, so this cell looks like a leftover - confirm before removing.
keyword = keywords[0]

In [4]:
def make_list(root_url, keyword, prune, start, max_results, sort_by, sort_order, days, debug):
    """Fetch papers matching ``keyword`` and keep only the recent ones.

    The request arguments are forwarded to ``query``; its result is then
    filtered by ``select_recent_papers`` using ``days`` and ``debug``.

    Returns:
        The list produced by ``select_recent_papers``.
    """
    request_args = dict(
        root_url=root_url,
        search_query=keyword,
        prune=prune,
        start=start,
        max_results=max_results,
        sort_by=sort_by,
        sort_order=sort_order,
    )
    fetched_papers = query(**request_args)
    return select_recent_papers(fetched_papers, days=days, debug=debug)


def query(root_url,
          search_query,
          prune,
          start,
          max_results,
          sort_by,
          sort_order):
    """Run one search against the API at ``root_url`` and return its entries.

    The search arguments are URL-encoded, appended to ``root_url + 'query?'``
    and parsed with ``feedparser``. Every entry is normalised in place by
    ``modify_query_result`` and, when ``prune`` is truthy, slimmed down by
    ``prune_query_result``.

    Raises:
        Exception: if the parsed feed does not report HTTP status 200.

    Returns:
        The feed's ``'entries'`` list, with each entry post-processed.
    """
    params = {
        "search_query": search_query,
        "start": start,
        "max_results": max_results,
        "sortBy": sort_by,
        "sortOrder": sort_order,
    }
    feed = feedparser.parse(root_url + 'query?' + urlencode(params))

    # Guard clause: anything other than a 200 (including a missing status)
    # aborts the query.
    if feed.get('status') != 200:
        reported = str(feed.get('status', 'no status'))
        raise Exception("HTTP Error " + reported + " in query")

    entries = feed['entries']
    for entry in entries:
        modify_query_result(entry)
        if prune:
            prune_query_result(entry)
    return entries


def modify_query_result(result):
    """Normalise one feed entry in place.

    Adds ``pdf_url``, ``affiliation``, ``arxiv_url``, ``journal_reference``
    and ``doi``; strips trailing newlines from ``title``, ``summary`` and
    ``arxiv_comment``; and replaces ``authors`` with a list of author names.
    The source keys ``link``, ``arxiv_affiliation``, ``arxiv_journal_ref``
    and ``arxiv_doi`` are removed from the entry.

    Returns:
        None. ``result`` is mutated.
    """
    # Default to None; the href of the last link titled 'pdf' wins.
    result['pdf_url'] = None
    for link in result['links']:
        if link.get('title') != 'pdf':
            continue
        result['pdf_url'] = link['href']

    # Note: a missing affiliation is recorded as the string 'None'.
    result['affiliation'] = result.pop('arxiv_affiliation', 'None')
    result['arxiv_url'] = result.pop('link')

    for text_field in ('title', 'summary'):
        result[text_field] = result[text_field].rstrip('\n')
    result['authors'] = [author['name'] for author in result['authors']]

    comment = result['arxiv_comment'].rstrip('\n') if 'arxiv_comment' in result else None
    result['arxiv_comment'] = comment

    # Optional metadata is renamed when present and set to None otherwise.
    result['journal_reference'] = result.pop('arxiv_journal_ref', None)
    result['doi'] = result.pop('arxiv_doi', None)
        
def prune_query_result(result):
    """Remove a fixed set of keys from one feed entry, in place.

    Keys that are not present are silently ignored.

    Returns:
        None. ``result`` is mutated.
    """
    unwanted_fields = (
        'updated_parsed',
        'arxiv_primary_category',
        'summary_detail',
        'author',
        'author_detail',
        'links',
        'guidislink',
        'title_detail',
        'tags',
        'id',
    )
    for field in unwanted_fields:
        # pop with a default never raises, so absent keys are skipped.
        result.pop(field, None)

def select_recent_papers(from_papers_list, days, debug):
    """Return the papers published within the last ``days`` days.

    The window is ``[now - days, now)`` measured in UTC, which matches the
    'Z'-suffixed timestamps carried by the feed entries.

    Args:
        from_papers_list: iterable of entries, each with a
            ``'published_parsed'`` time tuple (and ``'published'`` when
            ``debug`` is on).
        days: width of the window in days.
        debug: when truthy, print the window and every paper's timestamp.

    Returns:
        A new list with the entries whose publication time lies in the window.
    """
    # Read the UTC clock directly instead of subtracting a fixed 9 hours from
    # local time: the old arithmetic was only right on a UTC+9 machine.
    # tzinfo is dropped so the value compares with the naive datetimes built
    # from 'published_parsed'.
    utc_now = datetime.datetime.now(datetime.timezone.utc).replace(tzinfo=None)

    from_when = utc_now - datetime.timedelta(days=days)
    to_when = utc_now

    if debug:
        # Label kept for output compatibility; the value is the machine's
        # local time, which is JST only on a UTC+9 host.
        print('JST: ', datetime.datetime.today())
        print('UTC_from: ', from_when)
        print('UTC_to  : ', to_when)
        print()
        print('recent papers\' timestamps are like below:')
        for paper in from_papers_list:
            print(paper['published'])

    return [paper for paper in from_papers_list
            if condition_to_select_papers(paper, from_when, to_when)]

def condition_to_select_papers(paper, from_when, to_when):
    """Tell whether ``paper`` was published in ``[from_when, to_when)``.

    The publication time is rebuilt from the first six fields
    (year .. second) of ``paper['published_parsed']``.
    """
    published_at = datetime.datetime(*paper['published_parsed'][:6])
    return from_when <= published_at < to_when

In [5]:
# Gather the recent papers for every search expression into one list.
# Accumulating (rather than reassigning inside the loop) keeps the results of
# earlier keywords and leaves `arxiv_lists` defined even if `keywords` is empty.
arxiv_lists = []
for keyword in keywords:
    arxiv_lists.extend(
        make_list(root_url, keyword, prune, start, max_results, sort_by, sort_order, days, debug)
    )

In [6]:
# Number of papers that fell inside the recent window.
print(len(arxiv_lists))
# Show the first paper as a sample; display nothing instead of raising
# IndexError when the window contains no papers.
arxiv_lists[0] if arxiv_lists else None

16


{'updated': '2020-06-03T17:54:26Z',
 'published': '2020-06-03T17:54:26Z',
 'published_parsed': time.struct_time(tm_year=2020, tm_mon=6, tm_mday=3, tm_hour=17, tm_min=54, tm_sec=26, tm_wday=2, tm_yday=155, tm_isdst=0),
 'title': 'Equivariant Flows: exact likelihood generative learning for symmetric\n  densities',
 'summary': 'Normalizing flows are exact-likelihood generative neural networks which\napproximately transform samples from a simple prior distribution to samples of\nthe probability distribution of interest. Recent work showed that such\ngenerative models can be utilized in statistical mechanics to sample\nequilibrium states of many-body systems in physics and chemistry. To scale and\ngeneralize these results, it is essential that the natural symmetries in the\nprobability density - in physics defined by the invariances of the target\npotential - are built into the flow. We provide a theoretical sufficient\ncriterion showing that the distribution generated by equivariant normal