Commit 16e9859: Add github

charmoniumQ committed Mar 17, 2022
1 parent 7d7a3ec commit 16e9859
Showing 6 changed files with 86 additions and 64 deletions.

5 changes: 5 additions & 0 deletions README.rst
@@ -49,3 +49,8 @@ guide`_.
$ pip install ascl_net_scraper
>>> import ascl_net_scraper
>>> codes = ascl_net_scraper.scrape_index(5)
>>> codes[0]
CodeRecord(ascl_id=None, title='2-DUST: Dust radiative transfer code', credit=['Ueta, Toshiya'], abstract='<p>...</p>', details_url='https://ascl.net/1604.006')
>>> codes[0].get_details()
DetailedCodeRecord(ascl_id=None, title='2-DUST: Dust radiative transfer code', credit=['Ueta, Toshiya'], abstract='<p>...</p>', url='https://ascl.net/1604.006', code_sites=['https://github.com/sundarjhu/2-DUST/'], used_in=['https://ui.adsabs.harvard.edu/abs/2004ApJ...614..371M'], described_in=['https://ui.adsabs.harvard.edu/abs/2003ApJ...586.1338U'], bibcode='2016ascl.soft04006U', preferred_citation_method='<p><a href="https://ui.adsabs.harvard.edu/abs/2003ApJ...586.1338U">https://ui.adsabs.harvard.edu/abs/2003ApJ...586.1338U</a></p>', discuss_url='/phpBB3/viewtopic.php?t=33976', views=...)
9 changes: 4 additions & 5 deletions ascl_net_scraper/__init__.py
@@ -1,6 +1,5 @@
__version__ = "0.2.0"
from .main import CodeRecord as CodeRecord
from .main import DetailedCodeRecord as DetailedCodeRecord
from .main import scrape_details as scrape_details
from .main import scrape_index_lazy as scrape_index_lazy
from .main import scrape_index_list as scrape_index_list
from .lib import CodeRecord as CodeRecord
from .lib import DetailedCodeRecord as DetailedCodeRecord
from .lib import scrape_details as scrape_details
from .lib import scrape_index as scrape_index
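
The net effect on the public API: the implementation module moves from main to lib, and the scrape_index_list/scrape_index_lazy pair is replaced by a single scrape_index export. A minimal sketch of the resulting surface (names taken from the imports above):

    import ascl_net_scraper

    # scrape_index replaces the old scrape_index_list; same List[CodeRecord] result.
    records = ascl_net_scraper.scrape_index(5)
    detailed = ascl_net_scraper.scrape_details(records[0].details_url)
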
29 changes: 4 additions & 25 deletions ascl_net_scraper/__main__.py
@@ -1,28 +1,7 @@
import logging
import os

from tqdm import tqdm

from .main import scrape_index_list, scrape_details
from .lib import scrape_index

# __name__ == "__main__" is needed so pytest ignores this.
if __name__ == "__main__":
logger = logging.getLogger("charmonium.freeze")
logger.setLevel(logging.DEBUG)
fh = logging.FileHandler("freeze.log")
fh.setLevel(logging.DEBUG)
fh.setFormatter(logging.Formatter("%(message)s"))
logger.addHandler(fh)
logger.debug("Program {}", os.getpid())

logger = logging.getLogger("charmonium.cache.ops")
logger.setLevel(logging.DEBUG)
fh = logging.FileHandler("cache.log")
fh.setLevel(logging.DEBUG)
fh.setFormatter(logging.Formatter("%(message)s"))
logger.addHandler(fh)
logger.debug("Program {}", os.getpid())

records = scrape_index_list(10)
for record in tqdm(records, total=len(records)):
detailed_record = scrape_details(record.details_url)
results = scrape_index(100)
for result in results:
result.get_details()
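
Since the package ships a __main__.py, the slimmed-down entry point can still be run directly; it now just scrapes the first 100 index entries and fetches their detail pages, warming the on-disk memoization cache as a side effect:

    $ python -m ascl_net_scraper
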
90 changes: 63 additions & 27 deletions ascl_net_scraper/main.py → ascl_net_scraper/lib.py
@@ -2,32 +2,30 @@

import re
from dataclasses import dataclass
from typing import (
    Iterable,
    List,
    Mapping,
    Optional,
    Tuple,
    TypeVar,
    cast,
)
from typing import Iterable, List, Mapping, Optional, Tuple, TypeVar, cast

import bs4
import requests
from charmonium.cache import MemoizedGroup, memoize
from tqdm import tqdm

DEFAULT_PARSER = "html5lib"
group = MemoizedGroup(fine_grain_persistence=True)
group = MemoizedGroup(fine_grain_persistence=True, size="10MiB")


@dataclass
class CodeRecord:
"""Information about a code representing one entry in <https://ascl.net/code/all>."""

ascl_id: Optional[Tuple[int, int]]
title: str
credit: List[str]
abstract: str
details_url: str

def get_details(self) -> DetailedCodeRecord:
return cast(DetailedCodeRecord, scrape_details(self.details_url))


def parse_ascl_id(ascl_id_str: str) -> Optional[Tuple[int, int]]:
    if m := re.match(r"\[ascl:(\d+).(\d+)\]", ascl_id_str):
@@ -46,23 +44,27 @@ def unwrap(obj: Optional[_T]) -> _T:


@memoize(group=group)
def scrape_index_list(
def scrape_index(
    max_count: Optional[int] = None,
    verbose: bool = True,
    parser: str = DEFAULT_PARSER,
) -> List[CodeRecord]:
    return list(scrape_index_lazy(max_count, verbose, parser))
    """Get `max_count` entries from <https://ascl.net/code/all>.
    Pass `None` to get all entries.
    This function caches the result, so if it is called with the same `max_count`, the result can be loaded from disk.
    """
    return list(scrape_index_lazy(max_count, verbose))


def scrape_index_lazy(
    max_count: Optional[int] = None,
    verbose: bool = True,
    parser: str = DEFAULT_PARSER,
) -> Iterable[CodeRecord]:
    max_count = max_count if max_count is not None else 300000
    # 300000 ~ 100 * current value on 2022-03-10
    response = requests.get(f"https://ascl.net/code/all/limit/{max_count}")
    soup = bs4.BeautifulSoup(response.text, parser)
    soup = bs4.BeautifulSoup(response.text, DEFAULT_PARSER)
    items = list(soup.select("div.codelist div.item"))
    u = unwrap
    item: bs4.Tag
@@ -75,46 +77,78 @@ def scrape_index_lazy(
                child.text for child in u(item.select_one("div.credit a")).children
            ],
            abstract=str(u(item.select_one("p"))),
            details_url="https://ascl.net/"
            + u(item.select_one("span.title a")).attrs["href"],
            details_url=(
                "https://ascl.net/" + u(item.select_one("span.title a")).attrs["href"]
            ).replace(".net//", ".net/"),
        )


github_regex = re.compile(r"(https?://github.com/[a-zA-Z0-9\.\-]*/[a-zA-Z0-9\.\-]*)")


@dataclass
class DetailedCodeRecord:
"""Detailed information about a code, for example <https://ascl.net/0000.000>."""

ascl_id: Optional[Tuple[int, int]]
title: str
credit: List[str]
abstract: str
url: str
code_site: List[str]
code_sites: List[str]
used_in: List[str]
described_in: List[str]
bibcode: Optional[str]
preferred_citation_method: Optional[str]
discuss_url: str
views: int

@staticmethod
def from_code_record(code_record: CodeRecord) -> DetailedCodeRecord:
return cast(DetailedCodeRecord, scrape_details(code_record.details_url))
@property
def github(self) -> Optional[str]:
return cast(Optional[str], get_github_for(self))

@memoize(group=group)
def get_github_for(record: DetailedCodeRecord) -> Optional[str]:
    # First, see if any code_site is a GitHub site.
    for site in record.code_sites:
        if re.match(github_regex, site):
            return site

    # Second, see if any code_site is part of a GitHub site.
    # This includes https://github.com/author/ (no repo)
    # and https://github.com/author/repo/blob/main/path (subpath in repo).
    for site in record.code_sites:
        if re.match(r"https?://github.com/[a-zA-Z0-9\.\-\/]*", site):
            return site

    # Third, see if any code_site links to a GitHub site.
    for site in record.code_sites:
        try:
            # A lot of old sites take forever to time out.
            text = requests.get(site, timeout=4).text
        except requests.exceptions.RequestException:
            # A lot of old sites are dead.
            continue
        if match := re.search(github_regex, text):
            return match.group(0)

    # Finally, give up.
    return None

def dl_to_dict(dl: bs4.Tag) -> Mapping[str, bs4.Tag]:
    children = list(dl.children)
    return {
        key.text: cast(bs4.Tag, val)
        for key, val in zip(children[::2], children[1::2])
        for key, val in zip(dl.find_all("dt"), dl.find_all("dd"))
    }


@memoize(group=group)
def scrape_details(
    url: str,
    parser: str = DEFAULT_PARSER,
) -> DetailedCodeRecord:
    """Get detailed information about a code."""
    response = requests.get(url)
    soup = bs4.BeautifulSoup(response.text, parser)
    soup = bs4.BeautifulSoup(response.text, DEFAULT_PARSER)
    item = soup.select_one("div.codelist div.item")
    assert item
    sites_soup = item.select_one("dl.sites")
@@ -128,7 +162,7 @@ def scrape_details(
        credit=[child.text for child in u(item.select_one("div.credit a")).children],
        abstract=str(item.select_one("p")),
        url=url,
        code_site=(
        code_sites=(
            [link.attrs["href"] for link in sites["Code site:"].select("a")]
            if "Code site:" in sites
            else []
@@ -144,7 +178,9 @@
            else []
        ),
        bibcode=bibcode.text if bibcode else None,
        preferred_citation_method=str(cite_method) if cite_method else None,
        preferred_citation_method=str(cite_method.select_one("p"))
        if cite_method
        else None,
        discuss_url=u(item.select_one("div.discuss > a")).attrs["href"],
        views=int(u(item.select_one("div.views")).text[7:]),
    )
10 changes: 5 additions & 5 deletions script.py
@@ -26,7 +26,7 @@
    cast,
)

#import autoimport
# import autoimport
import isort
import setuptools
import toml
@@ -338,10 +338,10 @@ def inner() -> Iterable[List[str]]:

@app.command()
def publish(
    version_part: VersionPart,
    verify: bool = True,
    docs: bool = True,
    bump: bool = True,
    version_part: VersionPart,
    verify: bool = True,
    docs: bool = True,
    bump: bool = True,
) -> None:
    if verify:
        asyncio.run(all_tests_inner(True))
7 changes: 5 additions & 2 deletions tests/test_main.py
@@ -1,13 +1,16 @@
from tqdm import tqdm
from ascl_net_scraper import __version__, scrape_details, scrape_index_list
import shutil
shutil.rmtree(".cache")
from ascl_net_scraper import __version__, scrape_details, scrape_index


def test_main() -> None:
    records = scrape_index_list(10)
    records = scrape_index(10)
    for record in tqdm(records, total=len(records)):
        detailed_record = scrape_details(record.details_url)
        assert record.ascl_id == detailed_record.ascl_id
        assert record.title == detailed_record.title
        assert record.credit == detailed_record.credit
        assert record.abstract == detailed_record.abstract
        assert record.details_url == detailed_record.url
        detailed_record.github
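
One caveat in the new test setup: shutil.rmtree(".cache") raises FileNotFoundError when no cache directory exists yet, so the module fails at import time on a fresh checkout. A more forgiving variant (an alternative, not what this commit does) would be:

    import shutil

    shutil.rmtree(".cache", ignore_errors=True)  # tolerate a missing cache directory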
