Skip to content

Commit

Permalink
Fix types
Browse files Browse the repository at this point in the history
  • Loading branch information
charmoniumQ committed Mar 15, 2022
1 parent 2633aff commit b03ef32
Show file tree
Hide file tree
Showing 6 changed files with 231 additions and 4 deletions.
5 changes: 5 additions & 0 deletions ascl_net_scraper/__init__.py
Original file line number Diff line number Diff line change
@@ -1 +1,6 @@
__version__ = "0.1.0"
from .main import CodeRecord as CodeRecord
from .main import DetailedCodeRecord as DetailedCodeRecord
from .main import scrape_details as scrape_details
from .main import scrape_index_lazy as scrape_index_lazy
from .main import scrape_index_list as scrape_index_list
168 changes: 168 additions & 0 deletions ascl_net_scraper/main.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,168 @@
from __future__ import annotations

import re
from dataclasses import dataclass
from typing import (
TYPE_CHECKING,
Iterable,
List,
Mapping,
Optional,
Tuple,
TypeVar,
cast,
)

import bs4
import requests
from charmonium.cache import MemoizedGroup, memoize
from tqdm import tqdm

# Parser name handed to BeautifulSoup when a caller does not choose one.
DEFAULT_PARSER = "html5lib"
# Memoization group shared by every @memoize-decorated function in this module.
group = MemoizedGroup(fine_grain_persistence=True)

import logging

# NOTE(review): this runs at import time. It sets two third-party loggers to
# DEBUG and attaches a file handler to each, so "freeze.log" and "cache.log"
# are opened in the current working directory as soon as the module is
# imported. Confirm this is intended outside of debugging sessions.
for _logger_name, _log_path in (
    ("charmonium.freeze", "freeze.log"),
    ("charmonium.cache.ops", "cache.log"),
):
    logger = logging.getLogger(_logger_name)
    logger.setLevel(logging.DEBUG)
    fh = logging.FileHandler(_log_path)
    fh.setLevel(logging.DEBUG)
    fh.setFormatter(logging.Formatter("%(asctime)s: %(message)s"))
    logger.addHandler(fh)


@dataclass
class CodeRecord:
    """Summary of one code entry as scraped from the ASCL index listing."""

    # Integer pair parsed from the "[ascl:N.M]" label by parse_ascl_id, or
    # None when the label text does not match that pattern.
    ascl_id: Optional[Tuple[int, int]]
    # Text of the entry's "span.title" element with surrounding whitespace stripped.
    title: str
    # Text of each child of the first link found under "div.credit".
    credit: List[str]
    # str() of the entry's first <p> element.
    abstract: str
    # "https://ascl.net/" followed by the href of the title link.
    details_url: str


def parse_ascl_id(ascl_id_str: str) -> Optional[Tuple[int, int]]:
    """Parse an ASCL identifier label of the form ``[ascl:N.M]``.

    Args:
        ascl_id_str: Label text; the pattern must match at the very start
            of the string.

    Returns:
        ``(N, M)`` as integers (leading zeros are therefore not preserved),
        or ``None`` when the text does not match.
    """
    # The dot is escaped so that only a literal "." separates the two
    # numbers; an unescaped "." would accept any character there and, for
    # example, split "[ascl:12345]" into (123, 5).
    if m := re.match(r"\[ascl:(\d+)\.(\d+)\]", ascl_id_str):
        return (int(m.group(1)), int(m.group(2)))
    return None


_T = TypeVar("_T")


def unwrap(obj: Optional[_T]) -> _T:
    """Return ``obj`` unchanged, rejecting ``None``.

    Used to turn a missing element lookup into an explicit error.

    Raises:
        ValueError: if ``obj`` is ``None``.
    """
    if obj is not None:
        return obj
    raise ValueError("Unable to parse page")


@memoize(group=group)
def scrape_index_list(
    max_count: Optional[int] = None,
    verbose: bool = True,
    parser: str = DEFAULT_PARSER,
) -> List[CodeRecord]:
    """Return every record from ``scrape_index_lazy`` as a list.

    The call is memoized through ``group``; all three arguments are passed
    straight through to ``scrape_index_lazy``.
    """
    records = scrape_index_lazy(
        max_count=max_count,
        verbose=verbose,
        parser=parser,
    )
    return list(records)


def scrape_index_lazy(
    max_count: Optional[int] = None,
    verbose: bool = True,
    parser: str = DEFAULT_PARSER,
) -> Iterable[CodeRecord]:
    """Fetch the ASCL index page and lazily yield one record per listed item.

    Args:
        max_count: Value placed in the ``limit`` segment of the index URL;
            ``None`` means 300000.
        verbose: When true, show a tqdm progress bar while iterating.
        parser: Parser name passed to BeautifulSoup.

    Yields:
        A ``CodeRecord`` for every ``div.codelist div.item`` element.

    Raises:
        requests.HTTPError: if the server answers with an error status.
        ValueError: if an item lacks one of the expected elements.
    """
    max_count = max_count if max_count is not None else 300000
    # 300000 ~ 100 * current value on 2022-03-10
    response = requests.get(f"https://ascl.net/code/all/limit/{max_count}")
    # Fail loudly on an HTTP error. Without this check an error page would
    # simply match no items, and the resulting empty sequence could then be
    # stored by the memoized scrape_index_list.
    response.raise_for_status()
    soup = bs4.BeautifulSoup(response.text, parser)
    items = list(soup.select("div.codelist div.item"))
    u = unwrap
    item: bs4.Tag
    for item in tqdm(iter(items), total=len(items), disable=not verbose):
        yield CodeRecord(
            ascl_id=parse_ascl_id(u(item.select_one("span.ascl_id")).text),
            title=u(item.select_one("span.title")).text.strip(),
            # NOTE(review): select_one returns only the first matching link,
            # so this collects the text of that one link's children. Confirm
            # whether every link under div.credit was intended instead.
            credit=[
                child.text for child in u(item.select_one("div.credit a")).children
            ],
            abstract=str(u(item.select_one("p"))),
            details_url="https://ascl.net/"
            + u(item.select_one("span.title a")).attrs["href"],
        )


@dataclass
class DetailedCodeRecord:
    """Full record for one code entry, as parsed by ``scrape_details``."""

    # Integer pair parsed from the "[ascl:N.M]" label, or None if it did not match.
    ascl_id: Optional[Tuple[int, int]]
    # Stripped text of the "span.title" element.
    title: str
    # Text of each child of the first link found under "div.credit".
    credit: List[str]
    # str() of the result of looking up the first <p> element.
    abstract: str
    # The URL the detail page was fetched from.
    url: str
    # Link targets listed under the "Code site:", "Used in:" and
    # "Described in:" entries of the "dl.sites" list; empty when absent.
    code_site: List[str]
    used_in: List[str]
    described_in: List[str]
    # Text of "dl.sites.bibcode > dd", or None when that element is missing.
    bibcode: Optional[str]
    # str() of the "div.cite_method" element, or None when it is missing.
    preferred_citation_method: Optional[str]
    # href of the link directly inside "div.discuss".
    discuss_url: str
    # Integer parsed from the "div.views" text after its first 7 characters.
    views: int

    @staticmethod
    def from_code_record(code_record: CodeRecord) -> DetailedCodeRecord:
        """Scrape the detail page that ``code_record.details_url`` points to."""
        detailed = scrape_details(code_record.details_url)
        return cast(DetailedCodeRecord, detailed)


def dl_to_dict(dl: bs4.Tag) -> Mapping[str, bs4.Tag]:
    """Pair up the element children of a ``<dl>`` as a mapping.

    Element children are taken two at a time: the text of the first in each
    pair becomes the key and the second becomes the value.

    Args:
        dl: The definition-list element to read.

    Returns:
        Mapping from each key element's text to the element that follows it.
    """
    # ``.children`` also yields text nodes (for example whitespace between
    # tags). Pairing on raw children would let such nodes shift the even/odd
    # alignment, so only real elements take part.
    elements = [child for child in dl.children if isinstance(child, bs4.Tag)]
    return {key.text: val for key, val in zip(elements[::2], elements[1::2])}


def _site_links(sites: Mapping[str, bs4.Tag], label: str) -> List[str]:
    """Return the href of every link under the ``label`` entry, or ``[]`` if absent."""
    entry = sites.get(label)
    if entry is None:
        return []
    return [link.attrs["href"] for link in entry.select("a")]


@memoize(group=group)
def scrape_details(
    url: str,
    parser: str = DEFAULT_PARSER,
) -> DetailedCodeRecord:
    """Fetch one ASCL detail page and parse it into a ``DetailedCodeRecord``.

    The call is memoized through ``group``.

    Args:
        url: Address of the detail page.
        parser: Parser name passed to BeautifulSoup.

    Returns:
        The parsed record; ``url`` is stored on it unchanged.

    Raises:
        requests.HTTPError: if the server answers with an error status.
        ValueError: if the item container or a required element is missing.
    """
    response = requests.get(url)
    # Surface HTTP failures directly instead of trying to parse an error page.
    response.raise_for_status()
    soup = bs4.BeautifulSoup(response.text, parser)
    u = unwrap
    # unwrap rather than assert: the check must survive ``python -O`` and
    # should fail the same way as every other missing element.
    item = u(soup.select_one("div.codelist div.item"))
    sites_soup = item.select_one("dl.sites")
    sites = dl_to_dict(sites_soup) if sites_soup else {}
    bibcode = item.select_one("dl.sites.bibcode > dd")
    cite_method = item.select_one("div.cite_method")
    return DetailedCodeRecord(
        ascl_id=parse_ascl_id(u(item.select_one("span.ascl_id")).text),
        title=u(item.select_one("span.title")).text.strip(),
        # NOTE(review): select_one returns only the first matching link, so
        # this collects the text of that one link's children. Confirm whether
        # every link under div.credit was intended instead.
        credit=[child.text for child in u(item.select_one("div.credit a")).children],
        # NOTE(review): not unwrapped, so a missing <p> is stored as the
        # string "None" rather than raising. Confirm this is intended.
        abstract=str(item.select_one("p")),
        url=url,
        code_site=_site_links(sites, "Code site:"),
        used_in=_site_links(sites, "Used in:"),
        described_in=_site_links(sites, "Described in:"),
        bibcode=bibcode.text if bibcode else None,
        preferred_citation_method=str(cite_method) if cite_method else None,
        discuss_url=u(item.select_one("div.discuss > a")).attrs["href"],
        # Skips the first 7 characters of the text before parsing the number;
        # presumably a "Views: " label. TODO confirm against the page.
        views=int(u(item.select_one("div.views")).text[7:]),
    )
53 changes: 52 additions & 1 deletion poetry.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

3 changes: 3 additions & 0 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -106,6 +106,9 @@ types-setuptools = "^57.4.10"
types-toml = "^0.10.4"
types-termcolor = "^1.1.3"
# rest
types-beautifulsoup4 = "^4.10.14"
types-requests = "^2.27.12"
tqdm-stubs = "^0.1.2"

[tool.poetry.dependencies]
python = "^3.8"
Expand Down
2 changes: 1 addition & 1 deletion script.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,7 @@
cast,
)

#import autoimport
# import autoimport
import isort
import setuptools
import toml
Expand Down
4 changes: 2 additions & 2 deletions tests/test_main.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
from ascl_net_scraper import __version__
from tqdm import tqdm
from ascl_net_scraper import __version__, scrape_details, scrape_index_list


def test_main() -> None:
Expand All @@ -10,4 +11,3 @@ def test_main() -> None:
assert record.credit == detailed_record.credit
assert record.abstract == detailed_record.abstract
assert record.details_url == detailed_record.url

0 comments on commit b03ef32

Please sign in to comment.