Skip to content

Commit

Permalink
Fix filtering of RSS feeds
Browse files Browse the repository at this point in the history
  • Loading branch information
mfenner committed Apr 15, 2024
1 parent da1cdfd commit 89f5966
Show file tree
Hide file tree
Showing 4 changed files with 43 additions and 11 deletions.
33 changes: 23 additions & 10 deletions api/posts.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@
import time
import traceback
from urllib.parse import unquote
from commonmeta import validate_doi, normalize_id, validate_url, validate_prefix
from commonmeta import validate_doi, normalize_id, validate_url, validate_prefix, doi_from_url
from Levenshtein import ratio

from api.utils import (
Expand All @@ -31,6 +31,7 @@
get_markdown,
write_html,
validate_uuid,
id_as_str,
EXCLUDED_TAGS,
)
from api.works import get_single_work
Expand Down Expand Up @@ -266,6 +267,8 @@ async def extract_all_posts_by_blog(
posts = py_.get(json, "feed.entry", [])
if not update_all:
posts = filter_updated_posts(posts, blog, key="published")
if blog.get("filter", None):
posts = filter_posts(posts, blog, key="category")
posts = posts[start_page:end_page]
extract_posts = [extract_atom_post(x, blog) for x in posts]
blog_with_posts["entries"] = await asyncio.gather(*extract_posts)
Expand All @@ -281,7 +284,9 @@ async def extract_all_posts_by_blog(
if not update_all:
posts = filter_updated_posts(posts, blog, key="pubDate")
if blog.get("filter", None):
print(f"Unfiltered posts: {len(posts)}")
posts = filter_posts(posts, blog, key="category")
print(f"Filtered posts: {len(posts)}")
posts = posts[start_page:end_page]
extract_posts = [extract_rss_post(x, blog) for x in posts]
blog_with_posts["entries"] = await asyncio.gather(*extract_posts)
Expand Down Expand Up @@ -1030,9 +1035,11 @@ def format_author(author, published_at):
]
tags = py_.uniq(tags)[:5]

# upsert post into works table if it has a DOI
# upsert post with commonmeta if it has a DOI
if post.get("doi", None):
work = await get_single_work(post.get("doi"))
id_ = id_as_str(post.get("doi"))
work = await get_single_work(id_)
print(work)

return {
"authors": authors,
Expand Down Expand Up @@ -1074,12 +1081,18 @@ def parse_date(date):

def filter_posts(posts, blog, key):
    """Filter posts if a filter is set in blog settings.

    Used for RSS and Atom feeds. The blog "filter" setting has the form
    "<key>:<value1>,<value2>,...". A post passes when its *key* field —
    either a single string or an iterable of strings (e.g. multiple
    categories) — matches one of the listed values.

    Returns the posts unchanged when no filter applies to *key*.
    """
    # Parse the filter once, outside the per-post loop (it is loop-invariant).
    parts = blog.get("filter", "").split(":")
    if len(parts) != 2 or parts[0] != key:
        # No filter configured, or the filter targets a different key.
        return posts
    allowed = set(parts[1].split(","))

    def match_filter(post):
        """Return True if the post's *key* field matches the filter."""
        value = post.get(key, None)
        if value is None:
            # Post has no such field, so it cannot match an active filter.
            # (The previous code crashed here with set(None).)
            return False
        if isinstance(value, str):
            return value in allowed
        # Iterable of values: any overlap with the allowed set counts.
        return not allowed.isdisjoint(value)

    return [x for x in posts if match_filter(x)]


def upsert_single_post(post):
Expand Down Expand Up @@ -1242,7 +1255,7 @@ async def format_reference(id_, index):
"""Format reference."""
id_ = normalize_id(id_)
if validate_url(id_) in ["DOI", "URL"]:
work = await get_single_work(id_)
work = await get_single_work(id_as_str(id_))
if not work:
return None
identifier = py_.get(work, "id", None)
Expand Down
13 changes: 13 additions & 0 deletions api/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,7 @@
from commonmeta.constants import Commonmeta
from commonmeta.date_utils import get_date_from_unix_timestamp
from commonmeta.doi_utils import validate_prefix, get_doi_ra
from idutils import is_doi
import frontmatter
import pandoc
# from pandoc.types import Str
Expand Down Expand Up @@ -1143,3 +1144,15 @@ def translate_titles(markdown):
markdown["citation-title"] = citation_title.get(lang, "Citation")
markdown["copyright-title"] = copyright_title.get(lang, "Copyright")
return markdown


def id_as_str(id: str) -> Optional[str]:
    """Return *id* as a bare string identifier.

    Strips the URL scheme, and for doi.org URLs also the host, e.g.
    "https://doi.org/10.5555/1234" -> "10.5555/1234" and
    "https://example.com/page" -> "example.com/page".

    Returns None for None input or for strings without a recognizable
    host (the previous furl-based version raised TypeError on those,
    because furl reports the host as None rather than "").
    """
    from urllib.parse import urlparse  # stdlib; avoids the furl None-host pitfall

    if id is None:
        return None
    u = urlparse(id)
    # hostname is None for scheme-less strings such as a bare DOI.
    host = u.hostname or ""
    if host == "doi.org":
        return u.path.lstrip("/")
    if host:
        return host + u.path
    return None
1 change: 0 additions & 1 deletion api/works.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,6 @@
from typing import Optional
import httpx


# supported accept headers for content negotiation
SUPPORTED_ACCEPT_HEADERS = [
"application/vnd.commonmeta+json",
Expand Down
7 changes: 7 additions & 0 deletions tests/test-utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,7 @@
write_html,
format_markdown,
is_valid_url,
id_as_str,
)


Expand Down Expand Up @@ -450,6 +451,12 @@ def test_format_html():
assert result == "<p>This is a <em>test</em></p>\n"


def test_id_as_str():
    """Strip scheme (and the doi.org host) from identifier URLs."""
    doi_url = "https://doi.org/10.5555/1234"
    web_url = "https://www.gooogle.com/blabla"
    assert id_as_str(doi_url) == "10.5555/1234"
    assert id_as_str(web_url) == "www.gooogle.com/blabla"


# def test_sanitize_cool_suffix():
# "sanitize cool suffix"
# suffix = "sfzv4-xdb68"
Expand Down

0 comments on commit 89f5966

Please sign in to comment.