Skip to content

Commit

Permalink
Fix filtering of RSS feeds
Browse files Browse the repository at this point in the history
  • Loading branch information
mfenner committed Apr 15, 2024
1 parent da1cdfd commit 89f5966
Show file tree
Hide file tree
Showing 4 changed files with 43 additions and 11 deletions.
33 changes: 23 additions & 10 deletions api/posts.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@
import time
import traceback
from urllib.parse import unquote
from commonmeta import validate_doi, normalize_id, validate_url, validate_prefix
from commonmeta import validate_doi, normalize_id, validate_url, validate_prefix, doi_from_url
from Levenshtein import ratio

from api.utils import (
Expand All @@ -31,6 +31,7 @@
get_markdown,
write_html,
validate_uuid,
id_as_str,
EXCLUDED_TAGS,
)
from api.works import get_single_work
Expand Down Expand Up @@ -266,6 +267,8 @@ async def extract_all_posts_by_blog(
posts = py_.get(json, "feed.entry", [])
if not update_all:
posts = filter_updated_posts(posts, blog, key="published")
if blog.get("filter", None):
posts = filter_posts(posts, blog, key="category")
posts = posts[start_page:end_page]
extract_posts = [extract_atom_post(x, blog) for x in posts]
blog_with_posts["entries"] = await asyncio.gather(*extract_posts)
Expand All @@ -281,7 +284,9 @@ async def extract_all_posts_by_blog(
if not update_all:
posts = filter_updated_posts(posts, blog, key="pubDate")
if blog.get("filter", None):
print(f"Unfiltered posts: {len(posts)}")
posts = filter_posts(posts, blog, key="category")
print(f"Filtered posts: {len(posts)}")
posts = posts[start_page:end_page]
extract_posts = [extract_rss_post(x, blog) for x in posts]
blog_with_posts["entries"] = await asyncio.gather(*extract_posts)
Expand Down Expand Up @@ -1030,9 +1035,11 @@ def format_author(author, published_at):
]
tags = py_.uniq(tags)[:5]

# upsert post into works table if it has a DOI
# upsert post with commonmeta if it has a DOI
if post.get("doi", None):
work = await get_single_work(post.get("doi"))
id_ = id_as_str(post.get("doi"))
work = await get_single_work(id_)
print(work)

return {
"authors": authors,
Expand Down Expand Up @@ -1074,12 +1081,18 @@ def parse_date(date):

def filter_posts(posts, blog, key):
    """Filter posts if a filter is set in blog settings.

    Used for RSS and Atom feeds. The blog "filter" setting has the form
    "<key>:<value1>,<value2>,...". A post passes when its *key* field —
    either a single string or an iterable of strings (e.g. multiple
    categories) — matches one of the listed values.

    Returns the posts unchanged when no filter applies to *key*.
    """
    # Parse the filter once, outside the per-post loop (it is loop-invariant).
    parts = blog.get("filter", "").split(":")
    if len(parts) != 2 or parts[0] != key:
        # No filter configured, or the filter targets a different key.
        return posts
    allowed = set(parts[1].split(","))

    def match_filter(post):
        """Return True if the post's *key* field matches the filter."""
        value = post.get(key, None)
        if value is None:
            # Post has no such field, so it cannot match an active filter.
            # (The previous code crashed here with set(None).)
            return False
        if isinstance(value, str):
            return value in allowed
        # Iterable of values: any overlap with the allowed set counts.
        return not allowed.isdisjoint(value)

    return [x for x in posts if match_filter(x)]


def upsert_single_post(post):
Expand Down Expand Up @@ -1242,7 +1255,7 @@ async def format_reference(id_, index):
"""Format reference."""
id_ = normalize_id(id_)
if validate_url(id_) in ["DOI", "URL"]:
work = await get_single_work(id_)
work = await get_single_work(id_as_str(id_))
if not work:
return None
identifier = py_.get(work, "id", None)
Expand Down
13 changes: 13 additions & 0 deletions api/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,7 @@
from commonmeta.constants import Commonmeta
from commonmeta.date_utils import get_date_from_unix_timestamp
from commonmeta.doi_utils import validate_prefix, get_doi_ra
from idutils import is_doi
import frontmatter
import pandoc
# from pandoc.types import Str
Expand Down Expand Up @@ -1143,3 +1144,15 @@ def translate_titles(markdown):
markdown["citation-title"] = citation_title.get(lang, "Citation")
markdown["copyright-title"] = copyright_title.get(lang, "Copyright")
return markdown


def id_as_str(id: str) -> Optional[str]:
    """Return *id* as a bare string identifier.

    Strips the URL scheme, and for doi.org URLs also the host, e.g.
    "https://doi.org/10.5555/1234" -> "10.5555/1234" and
    "https://example.com/page" -> "example.com/page".

    Returns None for None input or for strings without a recognizable
    host (the previous furl-based version raised TypeError on those,
    because furl reports the host as None rather than "").
    """
    from urllib.parse import urlparse  # stdlib; avoids the furl None-host pitfall

    if id is None:
        return None
    u = urlparse(id)
    # hostname is None for scheme-less strings such as a bare DOI.
    host = u.hostname or ""
    if host == "doi.org":
        return u.path.lstrip("/")
    if host:
        return host + u.path
    return None
1 change: 0 additions & 1 deletion api/works.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,6 @@
from typing import Optional
import httpx


# supported accept headers for content negotiation
SUPPORTED_ACCEPT_HEADERS = [
"application/vnd.commonmeta+json",
Expand Down
7 changes: 7 additions & 0 deletions tests/test-utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,7 @@
write_html,
format_markdown,
is_valid_url,
id_as_str,
)


Expand Down Expand Up @@ -450,6 +451,12 @@ def test_format_html():
assert result == "<p>This is a <em>test</em></p>\n"


def test_id_as_str():
    """Strip scheme (and the doi.org host) from identifier URLs."""
    doi_url = "https://doi.org/10.5555/1234"
    web_url = "https://www.gooogle.com/blabla"
    assert id_as_str(doi_url) == "10.5555/1234"
    assert id_as_str(web_url) == "www.gooogle.com/blabla"


# def test_sanitize_cool_suffix():
# "sanitize cool suffix"
# suffix = "sfzv4-xdb68"
Expand Down

0 comments on commit 89f5966

Please sign in to comment.