def filter_posts(posts, blog, key):
    """Filter posts if filter is set in blog settings.

    Used for RSS and Atom feeds. The blog's ``filter`` setting is read as
    ``"<key>:<value>[,<value>...]"``. When the setting is missing, does not
    split into exactly two parts on ``":"``, or names a key other than
    ``key``, every post is kept.

    Args:
        posts: Iterable of post dicts.
        blog: Blog settings dict; only the ``"filter"`` entry is read.
        key: Name of the post field the filter applies to.

    Returns:
        A new list with the posts whose ``key`` value is one of the allowed
        values (string value) or contains at least one of them (collection
        value). Posts without the field never match.
    """
    # Parse the setting once instead of once per post; `or ""` also covers
    # an explicit `"filter": None` entry.
    parts = (blog.get("filter") or "").split(":")
    if len(parts) != 2 or parts[0] != key:
        return list(posts)
    allowed = set(parts[1].split(","))

    def match_filter(post):
        """Match filter."""
        value = post.get(key, None)
        if value is None:
            # A post without the field cannot match; previously this
            # crashed in set(None).
            return False
        if isinstance(value, str):
            return value in allowed
        # Only string items are compared, so unhashable items (e.g. dicts)
        # are ignored rather than raising.
        # NOTE(review): if Atom categories arrive as dicts (e.g. with an
        # "@term" entry) they will never match here - confirm feed shape.
        return any(isinstance(item, str) and item in allowed for item in value)

    return [x for x in posts if match_filter(x)]


def id_as_str(id: str) -> Optional[str]:
    """Get id as string, strip scheme and doi.org host.

    Args:
        id: Identifier given as a URL, e.g. ``https://doi.org/10.5555/1234``.
            The parameter name shadows the builtin but is kept so keyword
            callers continue to work.

    Returns:
        The URL path without its leading slash when the host is ``doi.org``,
        host plus path for any other host, or ``None`` when ``id`` is
        ``None``/empty or has no host.
    """
    if not id:
        return None
    u = furl(id)
    # NOTE(review): furl appears to report a missing host as None rather
    # than "" - confirm. A truthiness check covers both, so host-less input
    # returns None instead of failing on `None + str`.
    if not u.host:
        return None
    if u.host == "doi.org":
        return str(u.path).lstrip("/")
    return u.host + str(u.path)
accept headers for content negotiation SUPPORTED_ACCEPT_HEADERS = [ "application/vnd.commonmeta+json", diff --git a/tests/test-utils.py b/tests/test-utils.py index caeccb2..c62cd36 100644 --- a/tests/test-utils.py +++ b/tests/test-utils.py @@ -24,6 +24,7 @@ write_html, format_markdown, is_valid_url, + id_as_str, ) @@ -450,6 +451,12 @@ def test_format_html(): assert result == "

This is a test

def test_id_as_str():
    """id_as_str drops the scheme, and the doi.org host for DOI URLs"""
    doi_url = "https://doi.org/10.5555/1234"
    assert id_as_str(doi_url) == "10.5555/1234"

    web_url = "https://www.gooogle.com/blabla"
    assert id_as_str(web_url) == "www.gooogle.com/blabla"


# def test_sanitize_cool_suffix():
#     "sanitize cool suffix"
#     suffix = "sfzv4-xdb68"