Commit 16e9859: Add github

charmoniumQ committed Mar 17, 2022
1 parent 7d7a3ec commit 16e9859
Showing 6 changed files with 86 additions and 64 deletions.

5 changes: 5 additions & 0 deletions README.rst
@@ -49,3 +49,8 @@ guide`_.
$ pip install ascl_net_scraper
>>> import ascl_net_scraper
>>> codes = ascl_net_scraper.scrape_index(5)
>>> codes[0]
CodeRecord(ascl_id=None, title='2-DUST: Dust radiative transfer code', credit=['Ueta, Toshiya'], abstract='<p>...</p>', details_url='https://ascl.net/1604.006')
>>> codes[0].get_details()
DetailedCodeRecord(ascl_id=None, title='2-DUST: Dust radiative transfer code', credit=['Ueta, Toshiya'], abstract='<p>...</p>', url='https://ascl.net/1604.006', code_sites=['https://github.com/sundarjhu/2-DUST/'], used_in=['https://ui.adsabs.harvard.edu/abs/2004ApJ...614..371M'], described_in=['https://ui.adsabs.harvard.edu/abs/2003ApJ...586.1338U'], bibcode='2016ascl.soft04006U', preferred_citation_method='<p><a href="https://ui.adsabs.harvard.edu/abs/2003ApJ...586.1338U">https://ui.adsabs.harvard.edu/abs/2003ApJ...586.1338U</a></p>', discuss_url='/phpBB3/viewtopic.php?t=33976', views=...)
9 changes: 4 additions & 5 deletions ascl_net_scraper/__init__.py
@@ -1,6 +1,5 @@
__version__ = "0.2.0"
from .main import CodeRecord as CodeRecord
from .main import DetailedCodeRecord as DetailedCodeRecord
from .main import scrape_details as scrape_details
from .main import scrape_index_lazy as scrape_index_lazy
from .main import scrape_index_list as scrape_index_list
from .lib import CodeRecord as CodeRecord
from .lib import DetailedCodeRecord as DetailedCodeRecord
from .lib import scrape_details as scrape_details
from .lib import scrape_index as scrape_index
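
The net effect on the public API: the implementation module moves from main to lib, and the scrape_index_list/scrape_index_lazy pair is replaced by a single scrape_index export. A minimal sketch of the resulting surface (names taken from the imports above):

    import ascl_net_scraper

    # scrape_index replaces the old scrape_index_list; same List[CodeRecord] result.
    records = ascl_net_scraper.scrape_index(5)
    detailed = ascl_net_scraper.scrape_details(records[0].details_url)
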
29 changes: 4 additions & 25 deletions ascl_net_scraper/__main__.py
@@ -1,28 +1,7 @@
import logging
import os

from tqdm import tqdm

from .main import scrape_index_list, scrape_details
from .lib import scrape_index

# __name__ == "__main__" is needed so pytest ignores this.
if __name__ == "__main__":
logger = logging.getLogger("charmonium.freeze")
logger.setLevel(logging.DEBUG)
fh = logging.FileHandler("freeze.log")
fh.setLevel(logging.DEBUG)
fh.setFormatter(logging.Formatter("%(message)s"))
logger.addHandler(fh)
logger.debug("Program {}", os.getpid())

logger = logging.getLogger("charmonium.cache.ops")
logger.setLevel(logging.DEBUG)
fh = logging.FileHandler("cache.log")
fh.setLevel(logging.DEBUG)
fh.setFormatter(logging.Formatter("%(message)s"))
logger.addHandler(fh)
logger.debug("Program {}", os.getpid())

records = scrape_index_list(10)
for record in tqdm(records, total=len(records)):
detailed_record = scrape_details(record.details_url)
results = scrape_index(100)
for result in results:
result.get_details()
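
Since the package ships a __main__.py, the slimmed-down entry point can still be run directly; it now just scrapes the first 100 index entries and fetches their detail pages, warming the on-disk memoization cache as a side effect:

    $ python -m ascl_net_scraper
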
90 changes: 63 additions & 27 deletions ascl_net_scraper/main.py → ascl_net_scraper/lib.py
@@ -2,32 +2,30 @@

import re
from dataclasses import dataclass
from typing import (
    Iterable,
    List,
    Mapping,
    Optional,
    Tuple,
    TypeVar,
    cast,
)
from typing import Iterable, List, Mapping, Optional, Tuple, TypeVar, cast

import bs4
import requests
from charmonium.cache import MemoizedGroup, memoize
from tqdm import tqdm

DEFAULT_PARSER = "html5lib"
group = MemoizedGroup(fine_grain_persistence=True)
group = MemoizedGroup(fine_grain_persistence=True, size="10MiB")


@dataclass
class CodeRecord:
"""Information about a code representing one entry in <https://ascl.net/code/all>."""

ascl_id: Optional[Tuple[int, int]]
title: str
credit: List[str]
abstract: str
details_url: str

def get_details(self) -> DetailedCodeRecord:
return cast(DetailedCodeRecord, scrape_details(self.details_url))


def parse_ascl_id(ascl_id_str: str) -> Optional[Tuple[int, int]]:
    if m := re.match(r"\[ascl:(\d+).(\d+)\]", ascl_id_str):
@@ -46,23 +44,27 @@ def unwrap(obj: Optional[_T]) -> _T:


@memoize(group=group)
def scrape_index_list(
def scrape_index(
    max_count: Optional[int] = None,
    verbose: bool = True,
    parser: str = DEFAULT_PARSER,
) -> List[CodeRecord]:
    return list(scrape_index_lazy(max_count, verbose, parser))
    """Get `max_count` entries from <https://ascl.net/code/all>.
    Pass `None` to get all entries.
    This function caches the result, so if it is called with the same `max_count`, the result can be loaded from disk.
    """
    return list(scrape_index_lazy(max_count, verbose))


def scrape_index_lazy(
    max_count: Optional[int] = None,
    verbose: bool = True,
    parser: str = DEFAULT_PARSER,
) -> Iterable[CodeRecord]:
    max_count = max_count if max_count is not None else 300000
    # 300000 ~ 100 * current value on 2022-03-10
    response = requests.get(f"https://ascl.net/code/all/limit/{max_count}")
    soup = bs4.BeautifulSoup(response.text, parser)
    soup = bs4.BeautifulSoup(response.text, DEFAULT_PARSER)
    items = list(soup.select("div.codelist div.item"))
    u = unwrap
    item: bs4.Tag
@@ -75,46 +77,78 @@ def scrape_index_lazy(
                child.text for child in u(item.select_one("div.credit a")).children
            ],
            abstract=str(u(item.select_one("p"))),
            details_url="https://ascl.net/"
            + u(item.select_one("span.title a")).attrs["href"],
            details_url=(
                "https://ascl.net/" + u(item.select_one("span.title a")).attrs["href"]
            ).replace(".net//", ".net/"),
        )


github_regex = re.compile(r"(https?://github.com/[a-zA-Z0-9\.\-]*/[a-zA-Z0-9\.\-]*)")


@dataclass
class DetailedCodeRecord:
"""Detailed information about a code, for example <https://ascl.net/0000.000>."""

ascl_id: Optional[Tuple[int, int]]
title: str
credit: List[str]
abstract: str
url: str
code_site: List[str]
code_sites: List[str]
used_in: List[str]
described_in: List[str]
bibcode: Optional[str]
preferred_citation_method: Optional[str]
discuss_url: str
views: int

@staticmethod
def from_code_record(code_record: CodeRecord) -> DetailedCodeRecord:
return cast(DetailedCodeRecord, scrape_details(code_record.details_url))
@property
def github(self) -> Optional[str]:
return cast(Optional[str], get_github_for(self))

@memoize(group=group)
def get_github_for(record: DetailedCodeRecord) -> Optional[str]:
    # First, see if any code_site is a GitHub site.
    for site in record.code_sites:
        if re.match(github_regex, site):
            return site

    # Second, see if any code_site is part of a GitHub site.
    # This includes https://github.com/author/ (no repo)
    # and https://github.com/author/repo/blob/main/path (subpath in repo).
    for site in record.code_sites:
        if re.match(r"https?://github.com/[a-zA-Z0-9\.\-\/]*", site):
            return site

    # Third, see if any code_site links to a GitHub site.
    for site in record.code_sites:
        try:
            # A lot of old sites take forever to time out.
            text = requests.get(site, timeout=4).text
        except requests.exceptions.RequestException:
            # A lot of old sites are dead.
            continue
        if match := re.search(github_regex, text):
            return match.group(0)

    # Finally, give up.
    return None

def dl_to_dict(dl: bs4.Tag) -> Mapping[str, bs4.Tag]:
    children = list(dl.children)
    return {
        key.text: cast(bs4.Tag, val)
        for key, val in zip(children[::2], children[1::2])
        for key, val in zip(dl.find_all("dt"), dl.find_all("dd"))
    }


@memoize(group=group)
def scrape_details(
    url: str,
    parser: str = DEFAULT_PARSER,
) -> DetailedCodeRecord:
    """Get detailed information about a code."""
    response = requests.get(url)
    soup = bs4.BeautifulSoup(response.text, parser)
    soup = bs4.BeautifulSoup(response.text, DEFAULT_PARSER)
    item = soup.select_one("div.codelist div.item")
    assert item
    sites_soup = item.select_one("dl.sites")
@@ -128,7 +162,7 @@ def scrape_details(
        credit=[child.text for child in u(item.select_one("div.credit a")).children],
        abstract=str(item.select_one("p")),
        url=url,
        code_site=(
        code_sites=(
            [link.attrs["href"] for link in sites["Code site:"].select("a")]
            if "Code site:" in sites
            else []
@@ -144,7 +178,9 @@
            else []
        ),
        bibcode=bibcode.text if bibcode else None,
        preferred_citation_method=str(cite_method) if cite_method else None,
        preferred_citation_method=str(cite_method.select_one("p"))
        if cite_method
        else None,
        discuss_url=u(item.select_one("div.discuss > a")).attrs["href"],
        views=int(u(item.select_one("div.views")).text[7:]),
    )
10 changes: 5 additions & 5 deletions script.py
@@ -26,7 +26,7 @@
    cast,
)

#import autoimport
# import autoimport
import isort
import setuptools
import toml
@@ -338,10 +338,10 @@ def inner() -> Iterable[List[str]]:

@app.command()
def publish(
    version_part: VersionPart,
    verify: bool = True,
    docs: bool = True,
    bump: bool = True,
    version_part: VersionPart,
    verify: bool = True,
    docs: bool = True,
    bump: bool = True,
) -> None:
    if verify:
        asyncio.run(all_tests_inner(True))
7 changes: 5 additions & 2 deletions tests/test_main.py
@@ -1,13 +1,16 @@
from tqdm import tqdm
from ascl_net_scraper import __version__, scrape_details, scrape_index_list
import shutil
shutil.rmtree(".cache")
from ascl_net_scraper import __version__, scrape_details, scrape_index


def test_main() -> None:
    records = scrape_index_list(10)
    records = scrape_index(10)
    for record in tqdm(records, total=len(records)):
        detailed_record = scrape_details(record.details_url)
        assert record.ascl_id == detailed_record.ascl_id
        assert record.title == detailed_record.title
        assert record.credit == detailed_record.credit
        assert record.abstract == detailed_record.abstract
        assert record.details_url == detailed_record.url
        detailed_record.github
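
One caveat in the new test setup: shutil.rmtree(".cache") raises FileNotFoundError when no cache directory exists yet, so the module fails at import time on a fresh checkout. A more forgiving variant (an alternative, not what this commit does) would be:

    import shutil

    shutil.rmtree(".cache", ignore_errors=True)  # tolerate a missing cache directory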
