In [32]:
%load_ext autoreload
%autoreload 2

import os
import copy 

import numpy as np

from astropy.table import Table

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [75]:
import sys
import time
import datetime

import xml.etree.ElementTree as ET

from urllib.parse import urlencode
from urllib.request import urlopen
from urllib.error import HTTPError

try:
    from nltk import tokenize
    use_nltk = True
except ImportError:
    use_nltk = False

# Namespace prefixes (ElementTree '{uri}' form) prepended to tag names when
# searching the OAI response and the arXiv metadata inside it.
OAI = '{http://www.openarchives.org/OAI/2.0/}'
ARXIV = '{http://arxiv.org/OAI/arXiv/}'
# Base of the ListRecords query; the query parameters are appended to it.
BASE = 'http://export.arxiv.org/oai2?verb=ListRecords&'

# Link templates, formatted with the arXiv identifier of a paper.
ABS_URL = "http://arxiv.org/abs/{:s}"
PDF_URL = "http://arxiv.org/pdf/{:s}.pdf"

# By default, this only works for astro-ph.
CAT = "physics:astro-ph"
SUBCAT = ['GA', 'CO', 'EP', 'HE', 'IM', 'SR']

# Accepted values of the `search_type` argument.
SEARCH_TYPE = ['today', 'yesterday', 'from_yesterday', 'past_seven', 'user']

# Midnight of the current day. The microsecond field has to be cleared as well,
# otherwise TODAY never compares equal to a date parsed from 'YYYY-MM-DD'.
TODAY = datetime.datetime.today().replace(hour=0, minute=0, second=0, microsecond=0)
# Midnight of the previous day (TODAY is already truncated to midnight).
YESTERDAY = TODAY - datetime.timedelta(days=1)

In [188]:
def _filter_sub_class(papers, sub_cat, no_crosslist=True):
    """
    Filter the search results and keep the ones in certain sub-category.
    """
    if no_crosslist:
        # The first sub-category needs to be the desired one
        return [
            "astro-ph.{:s}".format(sub_cat.strip()) == p['sub_cat'].split()[0].strip() 
            for p in papers]
    return ["astro-ph.{:s}".format(sub_cat.strip()) in p['sub_cat'] for p in papers]

def _get_text(meta, tag):
    """
    Extract the text of an arXiv-namespaced child element.

    Parameters
    ----------
    meta : xml.etree.ElementTree.Element or None
        Element whose direct child ``ARXIV + tag`` is looked up.
    tag : str
        Tag name without the namespace prefix.

    Returns
    -------
    str
        The child's text with surrounding whitespace stripped and newlines
        replaced by spaces, or ``''`` when `meta` is None, the child is
        missing, or the child has no text.
    """
    # Explicit None checks instead of a bare `except`, so that unrelated
    # errors (e.g. KeyboardInterrupt) are no longer silently swallowed.
    node = meta.find(ARXIV + tag) if meta is not None else None
    if node is None or node.text is None:
        return ''
    return node.text.strip().replace('\n', ' ')

def _date_str(date):
    """
    Convert the datetime into a string with '%Y-%m-%d' format.
    """
    return date.strftime('%Y-%m-%d')


def _parse_user_date(value, label):
    """Convert one user-supplied bound of the search window into a datetime."""
    if value is None:
        return TODAY
    if isinstance(value, datetime.datetime):
        return value
    try:
        return datetime.datetime.strptime(value, '%Y-%m-%d')
    except (TypeError, ValueError) as err:
        # Fail here with a clear message instead of handing the unparsed
        # value back to the caller, where it breaks later on `.strftime`.
        raise ValueError(
            "Invalid {:s} {!r}. Date format should be: YYYY-MM-DD".format(
                label, value)) from err


def gather_dates(search_type, date_from=None, date_until=None):
    """
    Get the from and until dates for the search.

    Parameters
    ----------
    search_type : str or None
        One of the values in SEARCH_TYPE (case and surrounding whitespace are
        ignored). None is treated as 'user'.
    date_from, date_until : str, datetime.datetime or None, optional
        Only used when `search_type` is 'user'. Strings must be formatted as
        'YYYY-MM-DD'; None falls back to TODAY. For every other search type
        these arguments are ignored.

    Returns
    -------
    tuple of datetime.datetime
        ``(date_from, date_until)``.

    Raises
    ------
    ValueError
        If `search_type` is not in SEARCH_TYPE, or a user-supplied date
        cannot be parsed.
    """
    if search_type is None:
        search_type = 'user'

    search_type = search_type.lower().strip()

    if search_type not in SEARCH_TYPE:
        raise ValueError(
            "Wrong search type: {!r}. Choose from: {}".format(
                search_type, ", ".join(SEARCH_TYPE)))

    if search_type == 'today':
        return TODAY, TODAY
    if search_type == 'yesterday':
        return YESTERDAY, YESTERDAY
    if search_type == 'from_yesterday':
        return YESTERDAY, TODAY
    if search_type == 'past_seven':
        return TODAY - datetime.timedelta(days=7), TODAY

    # 'user': take the window from the arguments
    return (_parse_user_date(date_from, 'date_from'),
            _parse_user_date(date_until, 'date_until'))


def organize_meta(meta):
    """
    Collect the fields of one arXiv metadata element into a dictionary.

    The keys are 'id', 'title', 'abstract', 'sub_cat' (read from the
    'categories' tag) and 'created', the latter parsed from 'YYYY-MM-DD'
    into a datetime.
    """
    # (output key, XML tag) pairs for the plain-text fields, in output order
    text_fields = (
        ('id', 'id'),
        ('title', 'title'),
        ('abstract', 'abstract'),
        ('sub_cat', 'categories'),
    )
    record = {key: _get_text(meta, tag) for key, tag in text_fields}

    created_text = _get_text(meta, 'created')
    record['created'] = datetime.datetime.strptime(created_text, '%Y-%m-%d')
    return record


def scrape(url, sleep_time=30, timeout=300, verbose=True):
    """
    Download all records matching an OAI ListRecords query.

    The query is repeated with the resumption token returned by the server
    until no token is left or `timeout` seconds have passed.

    Parameters
    ----------
    url : str
        Full URL of the first request.
    sleep_time : int, optional
        Seconds to wait after an HTTP 503 when the response carries no usable
        integer 'retry-after' header.
    timeout : float, optional
        Time budget in seconds. It is checked only after a successful batch,
        so waits caused by 503 responses are not interrupted by it.
    verbose : bool, optional
        Print progress messages.

    Returns
    -------
    list
        The ``arXiv`` metadata elements of all fetched records. The list is
        empty when the response contains no ``ListRecords`` element.

    Raises
    ------
    urllib.error.HTTPError
        For any HTTP error other than 503.
    """
    t_start = time.monotonic()
    results, batch = [], 1

    while True:
        if verbose:
            print('Fetching up to {:d} records...'.format(1000 * batch))

        # Each request is assumed to return at most 1000 records (hence the
        # progress message above); further batches are fetched via the token.
        try:
            # The context manager makes sure the connection is closed.
            with urlopen(url) as response:
                xml_output = response.read()
        except HTTPError as e:
            if e.code != 503:
                raise
            # Honour the server's 'retry-after' if it is a number of seconds,
            # otherwise fall back to the configured sleep time.
            try:
                wait = int(e.headers.get('retry-after', sleep_time))
            except (TypeError, ValueError):
                wait = sleep_time
            print('Got 503. Retrying after {0} seconds.'.format(wait))
            time.sleep(wait)
            continue

        batch += 1

        # Parse the full XML output
        xml_root = ET.fromstring(xml_output)

        list_records = xml_root.find(OAI + 'ListRecords')
        if list_records is None:
            # No records in the response (e.g. the server reported an error
            # such as an empty result set): return what has been collected.
            break

        for record in list_records.findall(OAI + 'record'):
            # Skip records that carry no arXiv metadata (OAI-PMH allows
            # header-only records, e.g. for deleted items).
            metadata = record.find(OAI + 'metadata')
            meta = metadata.find(ARXIV + 'arXiv') if metadata is not None else None
            if meta is not None:
                results.append(meta)

        token = list_records.find(OAI + 'resumptionToken')
        if token is None or token.text is None:
            break
        url = BASE + 'resumptionToken=%s' % token.text

        if time.monotonic() - t_start >= timeout:
            break

    if verbose:
        print('Total number of records {:d}'.format(len(results)))

    return results

def astroph_abstract(output='output.md', search_type='user', date_cushion=2.5,
                     date_from=None, date_until=None, sub_cat=None,
                     verbose=False, sleep_time=30, timeout=300, no_crosslist=True):
    """
    Gather the abstracts of the astro-ph within a period of time, and output a summary
    markdown file.

    Based on: https://github.com/Mahdisadjadi/arxivscraper by Mahdisadjadi

    Parameters
    ----------
    output : str, optional
        Path of the markdown file to write; an existing file is overwritten.
    search_type : str, optional
        Passed to `gather_dates` to choose the search window.
    date_cushion : float, optional
        Papers created more than this many days before the start of the
        window are dropped, to remove old papers that were merely updated.
    date_from, date_until : str or None, optional
        Window bounds, passed to `gather_dates`.
    sub_cat : str, list of str, tuple of str or None, optional
        Sub-category suffix(es) such as 'GA'. With several entries a paper is
        kept if it matches any of them. None or an empty sequence disables
        the filter.
    verbose, sleep_time, timeout
        Passed to `scrape`.
    no_crosslist : bool, optional
        Passed to `_filter_sub_class`.

    Returns
    -------
    astropy.table.Table
        The papers that passed the filters.
    """
    # Get the from and until date
    date_f, date_u = gather_dates(search_type, date_from=date_from, date_until=date_until)

    # Form the search URL
    search_url = BASE + 'from={:s}&until={:s}&metadataPrefix=arXiv&set={:s}'.format(
        _date_str(date_f).strip(), _date_str(date_u).strip(), CAT)

    metadata = scrape(search_url, sleep_time=sleep_time, timeout=timeout, verbose=verbose)

    paper_records = [organize_meta(meta) for meta in metadata]

    # Remove the recently updated one
    # TODO: This is not perfect
    # - If someone created a preprint long before the submission, it will be left out
    earliest_created = date_f - datetime.timedelta(days=date_cushion)
    papers = Table([p for p in paper_records if p['created'] >= earliest_created])

    # Filter the search results through sub-categories
    if len(papers) == 0 or not sub_cat:
        # Nothing to filter, or no filter requested
        papers_keep = papers
    elif isinstance(sub_cat, str):
        keep = np.asarray(
            _filter_sub_class(papers, sub_cat, no_crosslist=no_crosslist), dtype=bool)
        papers_keep = papers[keep]
    elif isinstance(sub_cat, (list, tuple)):
        # Keep a paper when it matches any of the requested sub-categories
        keep = np.logical_or.reduce(
            [_filter_sub_class(papers, s, no_crosslist=no_crosslist) for s in sub_cat])
        papers_keep = papers[keep]
    else:
        papers_keep = papers

    # Organize the results into markdown format (line-by-line)
    markdown_list = []

    if date_f == date_u:
        markdown_list.append("### {:s}".format(_date_str(date_f)))
    else:
        markdown_list.append("### {:s} to {:s}".format(
            _date_str(date_f), _date_str(date_u)))

    for p in papers_keep:
        abs_url = ABS_URL.format(p['id'])
        pdf_url = PDF_URL.format(p['id'])
        markdown_list.append(
            "\n##### [{:s}]({:s}) [(PDF)]({:s})\n".format(
                ' '.join(p['title'].split()), abs_url, pdf_url))

        abstract = p['abstract'].replace('\\,', ' ')
        # str.replace returns a new string, so the result has to be kept.
        # Dropping the period keeps 'et al.' from being read as a sentence end.
        abstract = abstract.replace('et al.', 'et al')
        abstract = ' '.join(abstract.split())

        if not use_nltk:
            markdown_list.append("- {:s}".format(abstract))
        else:
            # One bullet per sentence
            for s in tokenize.sent_tokenize(abstract):
                markdown_list.append("- {:s}".format(s))

    # Write the markdown to file; abstracts may contain non-ASCII characters,
    # so the encoding is fixed rather than left to the platform default.
    with open(output, 'w', encoding='utf-8') as f:
        for line in markdown_list:
            f.write("{:s}\n".format(line))

    return papers_keep


In [165]:
# Today's records, keeping only papers whose first-listed category is
# astro-ph.GA (no_crosslist defaults to True). The summary goes to output.md.
a = astroph_abstract(
    output='output.md', search_type='today', date_from=None, date_until=None, sub_cat='GA', verbose=True)

Fetching up to 1000 records...
Total number of records 115


In [191]:
# Past seven days, keeping only papers whose first-listed category is
# astro-ph.CO. Note: this overwrites output.md.
b = astroph_abstract(
    output='output.md', search_type='past_seven', sub_cat='CO', verbose=True)

Fetching up to 1000 records...
Total number of records 863


In [189]:
# Explicit date window (search_type defaults to 'user'), astro-ph.GA only.
# Note: this overwrites output.md.
c = astroph_abstract(
    output='output.md', date_from='2021-01-28', date_until='2021-02-03', sub_cat='GA', verbose=True)

Fetching up to 1000 records...
Total number of records 592


In [171]:
# Today's records for several sub-categories at once: a paper is kept when
# its first-listed category is any of GA, CO or IM. Note: this overwrites output.md.
d = astroph_abstract(
    output='output.md', search_type='today', date_from=None, date_until=None, sub_cat=['GA', 'CO', 'IM'], verbose=True)

Fetching up to 1000 records...
Total number of records 115


In [192]:
# Display the table returned by the past-seven-days astro-ph.CO search
b

id,title,abstract,sub_cat,created
str10,str175,str1920,str54,object
2101.10337,Extracting Dark-Matter Velocities from Halo Masses: A Reconstruction Conjecture,"The distribution of primordial dark-matter velocities can significantly influence the growth of cosmological structure. In principle, one can therefore exploit the halo-mass distribution in order to learn about the dark sector. In practice, however, this task is both theoretically and computationally intractable. In this paper, we propose a simple one-line conjecture which can be used to ""reconstruct"" the primordial dark-matter velocity distribution directly from the shape of the halo-mass function. Although our conjecture is completely heuristic, we show that it successfully reproduces the salient features of the underlying dark-matter velocity distribution -- even for non-trivial distributions which are highly non-thermal and/or multi-modal, such as might occur for non-minimal dark sectors. Our conjecture therefore provides an operational tool for probing the dark sector which does not rely on the existence of non-gravitational couplings between dark and visible states.",astro-ph.CO hep-ph,2021-01-25 00:00:00
2101.10340,Extreme-Value Distributions and Primordial Black-Hole Formation,"We argue that primordial black-hole formation must be described by means of extreme-value theory. This is a consequence of the large values of the energy density required to initiate the collapse of black holes in the early Universe and the finite duration of their collapse. Compared to the Gaussian description of the most extreme primordial density fluctuations, the holes' mass function is narrower and peaks towards larger masses. Secondly, thanks to the shallower fall-off of extreme-value distributions, the predicted abundance of primordial black holes is boosted by $10^{7}$ orders of magnitude when extrapolating the observed nearly scale-free power spectrum of the cosmic large-scale structure to primordial black-hole mass scales.",astro-ph.CO gr-qc hep-ph hep-th math-ph math.MP,2021-01-25 00:00:00
2101.10360,A convolutional-neural-network estimator of CMB constraints on dark matter energy injection,"We show that the impact of energy injection by dark matter annihilation on the cosmic microwave background power spectra can be apprehended via a residual likelihood map. By resorting to convolutional neural networks that can fully discover the underlying pattern of the map, we propose a novel way of constraining dark matter annihilation based on the Planck 2018 data. We demonstrate that the trained neural network can efficiently predict the likelihood and accurately place bounds on the annihilation cross-section in a $\textit{model-independent}$ fashion. The machinery will be made public in the near future.",astro-ph.CO hep-ph,2021-01-25 00:00:00
2101.10714,Relieving the $H_0$ tension with a new interacting dark energy model,"We investigate an extended cosmological model motivated by the asymptotic safety of gravitational field theory, in which the matter and radiation densities and the cosmological constant receive a correction parametrized by the parameters $\delta_G$ and $\delta_\Lambda$, leading to that both the evolutions of the matter and radiation densities and the cosmological constant slightly deviate from the standard forms. Here we explain this model as a scenario of vacuum energy interacting with matter and radiation. We consider two cases of the model: (i) ${\tilde\Lambda}$CDM with one additional free parameter $\delta_G$ and (ii) e${\tilde\Lambda}$CDM with two additional free parameters $\delta_G$ and $\delta_\Lambda$. We use two data combinations, CMB+BAO+SN (CBS) and CMB+BAO+SN+$H_0$ (CBSH), to constrain the models. We find that, in the case of using the CBS data, neither ${\tilde\Lambda}$CDM nor e${\tilde\Lambda}$CDM can effectively alleviate the $H_0$ tension. However, it is found that using the CBSH data the $H_0$ tension can be greatly relieved by the models. In particular, in the case of e${\tilde\Lambda}$CDM, the $H_0$ tension can be resolved to 0.6$\sigma$. We find that as an interacting dark energy model, ${\tilde\Lambda}$CDM is much better than $\Lambda(t)$CDM in the sense of both relieving the $H_0$ tension and fitting to the current observational data.",astro-ph.CO gr-qc hep-ph,2021-01-26 00:00:00
2101.11014,The cosmology dependence of galaxy clustering and lensing from a hybrid $N$-body-perturbation theory model,"We implement a model for the two-point statistics of biased tracers that combines dark matter dynamics from $N$-body simulations with an analytic Lagrangian bias expansion. Using Aemulus, a suite of $N$-body simulations built for emulation of cosmological observables, we emulate the cosmology dependence of these nonlinear spectra from redshifts $z = 0$ to $z=2$. We quantify the accuracy of our emulation procedure, which is sub-per cent at $k=1\, h {\rm Mpc}^{-1}$ for the redshifts probed by upcoming surveys and improves at higher redshifts. We demonstrate its ability to describe the statistics of complex tracer samples, including those with assembly bias and baryonic effects, reliably fitting the clustering and lensing statistics of such samples at redshift $z\simeq 0.4$ to scales of $k_{\rm max} \approx 0.6\, h\mathrm{Mpc}^{-1}$. We show that the emulator can be used for unbiased cosmological parameter inference in simulated joint clustering and galaxy--galaxy lensing analyses with data drawn from an independent $N$-body simulation. These results indicate that our emulator is a promising tool that can be readily applied to the analysis of current and upcoming datasets from galaxy surveys.",astro-ph.CO astro-ph.IM,2021-01-26 00:00:00
2101.11016,Explaining Excess Dipole in NVSS Data Using Superhorizon Perturbation,"Many observations in recent times have shown evidence against the standard assumption of isotropy in the Big Bang model. Introducing a superhorizon scalar metric perturbation has been able to explain some of these anomalies. In this work, we probe the net velocity arising due to the perturbation, which does not cancel out for large scale structure, unlike in the case of CMB. Thus, within this model's framework, our velocity with respect to the CMB is different from the velocity with respect to the large scale structure. Taking this extra velocity component into account, we study the superhorizon mode's implications for the excess dipole observed in the NRAO VLA Sky Survey (NVSS). We find that the mode can consistently explain both the CMB and NVSS observations. We also find that the model is consistent with the observed Hubble constant dipole and the Hubble bulk flow velocity. The model leads to several predictions which can be tested in future surveys. In particular, it implies that the observed dipole in large scale structure should be redshift dependent and should show an increase in amplitude with redshift. We also find that the Hubble parameter should show a dipole anisotropy whose amplitude must increase with redshift in the CMB frame. Similar anisotropic behaviour is expected for the observed redshift as a function of the luminosity distance.",astro-ph.CO,2021-01-26 00:00:00
2101.11088,"CosmoReionMC: A package for estimating cosmological and astrophysical parameters using CMB, Lyman-{\alpha} absorption and global 21 cm data","We present a Markov Chain Monte Carlo (MCMC)-based parameter estimation package, CosmoReionMC, to jointly constrain cosmological parameters of the $\Lambda$CDM model and the astrophysical parameters related to hydrogen reionization. The package is based on a previously developed physically motivated semi-analytical model for reionization, a similar semi-analytical model for computing the global 21~cm signal during the cosmic dawn and using an appropriately modified version of the publicly available CAMB for computing the CMB anisotropies. These calculations are then coupled to an MCMC ensemble sampler \texttt{emcee} to compute the posterior distributions of the model parameter. The model has twelve free parameters in total: five cosmological and seven related to the stellar populations. We constrain the parameters by matching the theoretical predictions with CMB data from Planck, observations related to the quasar absorption spectra and, for the first time, the global 21~cm signal from EDGES. We find that incorporating the quasar spectra data in the analysis tightens the bounds on the electron scattering optical depth $\tau$ and consequently the normalization $A_s$ of the primordial matter power spectrum (or equivalently $\sigma_8$). Furthermore, when we include the EDGES data in the analysis, we find that an early population of metal-free stars with efficient radio emission is necessary to match the absorption amplitude. The CosmoReionMC package should have interesting future applications, e.g., probing non-standard extensions to the $\Lambda$CDM model.",astro-ph.CO,2021-01-26 00:00:00
2101.11098,A possible mass distribution of primordial black holes implied by LIGO-Virgo,"The LIGO-Virgo Collaboration has so far detected around 90 black holes, some of which have masses larger than what were expected from the collapse of stars. The mass distribution of LIGO-Virgo black holes appears to have a peak at $\sim30M_{\odot}$ and two tails on the ends. By assuming that they all have a primordial origin, we analyze the GWTC-1 (O1\&O2) and GWTC-2 (O3a) datasets by performing maximum likelihood estimation on a broken power law mass function $f(m)$, with the result $f\propto m^{1.2}$ for $m<35M_{\odot}$ and $f\propto m^{-4}$ for $m>35M_{\odot}$. This appears to behave better than the popular log-normal mass function. Surprisingly, such a simple and unique distribution can be realized in our previously proposed mechanism of PBH formation, where the black holes are formed by vacuum bubbles that nucleate during inflation via quantum tunneling. Moreover, this mass distribution can also provide an explanation to supermassive black holes formed at high redshifts.",astro-ph.CO gr-qc hep-th,2021-01-26 00:00:00
2101.11181,A Generative Model of Galactic Dust Emission Using Variational Inference,"Emission from the interstellar medium can be a significant contaminant of measurements of the intensity and polarization of the cosmic microwave background (CMB). For planning CMB observations, and for optimizing foreground-cleaning algorithms, a description of the statistical properties of such emission can be helpful. Here we examine a machine learning approach to inferring the statistical properties of dust from either observational data or physics-based simulations. In particular, we apply a type of neural network called a Variational Auto Encoder (VAE) to maps of the intensity of emission from interstellar dust as inferred from Planck sky maps and demonstrate its ability to a) simulate new samples with similar summary statistics as the training set, b) provide fits to emission maps withheld from the training set, and c) produce constrained realizations. We find VAEs are easier to train than another popular architecture: that of Generative Adversarial Networks (GANs), and are better-suited for use in Bayesian inference.",astro-ph.CO astro-ph.GA,2021-01-26 00:00:00
2101.11244,Gravitational waves from type II axion-like curvaton model and its implication for NANOGrav result,"The recent report of NANOGrav is gathering attention since its signal can be explained by the stochastic gravitational waves (GWs) with $\Omega_{\rm GW}\sim 10^{-9}$ at $f\sim 10^{-8}$Hz. The PBH formation scenario is one of the candidates for the NANOGrav signal, which can simultaneously explain the observed $30 M_\odot$ black holes in the binary merger events in LIGO-Virgo collaboration. We focus on the type II axion-like curvaton model of the PBH formation. In type II model the complex field whose phase part is the axion rolls down from the origin of the potential. It is found that type II model achieves the broad power spectrum of the density perturbations and can simultaneously explain the LIGO-Virgo events and the NANOGrav signal. We also improve the treatment of the non-Gaussianity of perturbations in our model to estimate the amplitude of the induced GWs precisely.",astro-ph.CO hep-ph,2021-01-27 00:00:00


In [183]:
# Debugging: grab the category string of the second row of table `c`
sub_cat = c[1]['sub_cat']

In [184]:
# Debugging: split the category string into the individual category names
sub_cat.split()

['astro-ph.GA']

In [186]:
# Debugging: the same first-category comparison that _filter_sub_class
# performs when no_crosslist is True
'astro-ph.GA' == c[1]['sub_cat'].split()[0].strip()

True