allow refresh of metadata stored in rogue scholar
mfenner committed Mar 29, 2024
1 parent 535fbd5 commit 33a101c
Showing 13 changed files with 13,680 additions and 120 deletions.
27 changes: 9 additions & 18 deletions api/__init__.py
@@ -50,7 +50,7 @@
format_relationships,
translate_titles,
)
from api.posts import extract_all_posts, extract_all_posts_by_blog, update_posts
from api.posts import extract_all_posts, extract_all_posts_by_blog, update_all_posts, update_all_posts_by_blog
from api.blogs import extract_single_blog, extract_all_blogs
from api.works import SUPPORTED_ACCEPT_HEADERS, get_formatted_work
from api.schema import Blog, Post, Work, PostQuery
@@ -277,9 +277,12 @@ async def post_blog_posts(slug: str, suffix: Optional[str] = None):
return {"error": "Unauthorized."}, 401
elif slug and suffix == "posts":
try:
result = await extract_all_posts_by_blog(
slug, page=page, offset=offset, update_all=(update == "all")
)
if update == "self":
result = await update_all_posts_by_blog(slug, page=page)
else:
result = await extract_all_posts_by_blog(
slug, page=page, offset=offset, update_all=(update == "all")
)
if isinstance(result, dict) and result.get("error", None):
return result, 400
return jsonify(result)
@@ -367,7 +370,6 @@ async def post_posts():

page = int(request.args.get("page") or "1")
update = request.args.get("update")
content_text = request.args.get("content_text")

if (
request.headers.get("Authorization", None) is None
@@ -377,19 +379,8 @@
return {"error": "Unauthorized."}, 401
else:
try:
if content_text == "content_text":
response = typesense.collections["posts"].documents.search(
{
"q": "",
"query_by": "content_text",
"sort_by": "published_at:desc",
"per_page": 50,
"page": page if page and page > 0 else 1,
"filter_by": "content_text:content_text",
"include_fields": "id,doi,content_text",
}
)
updated_posts = await update_posts(response.get("hits", []))
if update == "self":
updated_posts = await update_all_posts(page=page)
return jsonify(updated_posts)
else:
extracted_posts = await extract_all_posts(
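The handlers above now accept update=self to refresh the metadata of posts already stored in Rogue Scholar instead of re-extracting them from the blog feeds. A minimal sketch of how the refreshed endpoints might be called, assuming routes of POST /posts and POST /blogs/<slug>/posts and a token in the Authorization header (base URL, route paths, auth scheme, and blog slug are assumptions, not shown in this diff):

# Illustrative sketch only: exercising the new "update=self" mode.
# Base URL, routes, auth scheme, and blog slug are assumptions for this example.
import requests

API_URL = "https://api.rogue-scholar.org"  # assumed base URL
TOKEN = "..."  # admin token expected by the handlers above

# refresh metadata for all stored posts, one page at a time
requests.post(
    f"{API_URL}/posts",
    params={"page": 1, "update": "self"},
    headers={"Authorization": f"Bearer {TOKEN}"},
    timeout=60,
)

# refresh metadata for the stored posts of a single blog (slug is hypothetical)
requests.post(
    f"{API_URL}/blogs/example-blog/posts",
    params={"page": 1, "update": "self"},
    headers={"Authorization": f"Bearer {TOKEN}"},
    timeout=60,
)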
196 changes: 172 additions & 24 deletions api/posts.py
@@ -29,12 +29,14 @@
compact,
fix_xml,
get_markdown,
write_html,
EXCLUDED_TAGS,
)
from api.works import get_single_work
from api.supabase import (
supabase_admin_client as supabase_admin,
supabase_client as supabase,
postsWithContentSelect,
)
from api.typesense import typesense_client as typesense

@@ -65,23 +67,30 @@ async def extract_all_posts(page: int = 1, update_all: bool = False):
return results


async def update_posts(posts: list):
"""Update posts."""
async def update_all_posts(page: int = 1):
"""Update all posts."""

try:
blogs = (
supabase.table("blogs")
.select("slug")
.in_("status", ["active", "archived", "pending"])
.order("title", desc=False)
.execute()
)
tasks = []
for blog in blogs.data:
task = update_all_posts_by_blog(blog["slug"], page)
tasks.append(task)

def update_post(post):
id_ = py_.get(post, "document.id")
if len(id_) == 5:
print(id_, py_.get(post, "document.doi", None))
typesense.collections["posts"].documents[id_].delete()
return {}
return py_.get(post, "document.content_text", "")
raw_results = await asyncio.gather(*tasks)

return [update_post(x) for x in posts]
except Exception:
print(traceback.format_exc())
return {}
# flatten list of lists
results = []
for result in raw_results:
if result:
results.append(result[0])

return results


async def extract_all_posts_by_blog(
@@ -290,6 +299,47 @@ async def extract_all_posts_by_blog(
return []


async def update_all_posts_by_blog(slug: str, page: int = 1):
"""Update all posts by blog."""

try:
response = (
supabase.table("blogs")
.select(
"id, slug, feed_url, current_feed_url, home_page_url, archive_prefix, feed_format, created_at, updated_at, mastodon, generator, generator_raw, language, category, favicon, title, description, category, status, user_id, authors, plan, use_api, relative_url, filter, secure"
)
.eq("slug", slug)
.maybe_single()
.execute()
)
blog = response.data
if not blog:
return {}

start_page = (page - 1) * 50 if page > 0 else 0
end_page = (page - 1) * 50 + 50 if page > 0 else 50
blog_with_posts = {}

response = (
supabase.table("posts")
.select(postsWithContentSelect)
.eq("blog_slug", blog["slug"])
.order("published_at", desc=True)
.range(start_page, end_page)
.execute()
)
update_posts = [update_rogue_scholar_post(x, blog) for x in response.data]
blog_with_posts["entries"] = await asyncio.gather(*update_posts)
return [upsert_single_post(i) for i in blog_with_posts["entries"]]
except TimeoutError:
print(f"Timeout error in blog {blog['slug']}.")
return []
except Exception as e:
print(f"{e} error in blog {blog['slug']}.")
print(traceback.format_exc())
return []


async def extract_wordpress_post(post, blog):
"""Extract WordPress post from REST API."""

@@ -897,6 +947,85 @@ def format_author(author, published_at):
return {}


async def update_rogue_scholar_post(post, blog):
"""Update Rogue Scholar post."""
try:
print(post)

def format_author(author, published_at):
"""Format author. Optionally lookup real name from username,
and ORCID from name."""

return normalize_author(
author.get("name", None), published_at, author.get("url", None)
)

published_at = post.get("published_at")
content_text = post.get("content_text")
content_html = write_html(content_text)

# use default author for blog if no post author found and no author header in content
authors_ = get_contributors(content_html) or wrap(post.get("authors", None))
if len(authors_) == 0 or authors_[0].get("name", None) is None:
authors_ = wrap(blog.get("authors", None))
authors = [format_author(i, published_at) for i in authors_ if i]

summary = get_summary(content_html)
abstract = post.get("abstract", None)
abstract = get_abstract(summary, abstract)
reference = await get_references(content_html)
relationships = get_relationships(content_html)
title = get_title(post.get("title"))
url = normalize_url(post.get("url"), secure=blog.get("secure", True))
archive_url = (
blog["archive_prefix"] + url if blog.get("archive_prefix", None) else None
)
images = get_images(content_html, url, blog["home_page_url"])
image = post.get("image", None) or get_image(images)

# optionally remove tag that is used to filter posts
if blog.get("filter", None) and blog.get("filter", "").startswith("tag"):
tag = blog.get("filter", "").split(":")[1]
tags = [
normalize_tag(i)
for i in wrap(post.get("tags", None))
if i != tag and i not in EXCLUDED_TAGS
]
else:
tags = [
normalize_tag(i)
for i in wrap(post.get("tags", None))
if i not in EXCLUDED_TAGS
]
tags = py_.uniq(tags)[:5]

return {
"authors": authors,
"blog_name": blog.get("title"),
"blog_slug": blog.get("slug"),
"content_text": content_text,
"summary": summary,
"abstract": abstract,
"published_at": published_at,
"updated_at": post.get("updated_at"),
"image": image,
"images": images,
"language": detect_language(content_text),
"category": blog.get("category", None),
"reference": reference,
"relationships": relationships,
"tags": tags,
"title": title,
"url": url,
"archive_url": archive_url,
"guid": post.get("guid"),
"status": blog.get("status"),
}
except Exception:
print(blog.get("slug", None), traceback.format_exc())
return {}


def filter_updated_posts(posts, blog, key):
"""Filter posts by date updated."""

@@ -1000,7 +1129,7 @@ def get_name(string):
if not string:
return None
m = re.search(r"\w+\s\w+", string)
return m.group(0)
return m.group(0) if m else None

def get_url(string):
"""Get url from string."""
@@ -1011,19 +1140,36 @@ def get_url(string):
return None
return f.url

def get_contributor(contributor):
"""Get contributor."""
if not contributor:
return None
name = get_name(contributor.text)
url = get_url(contributor.find("a", href=True))
if not name or not url:
return None
return {"name": name, "url": url}

soup = get_soup(content_html)

# find author header and extract name and optional orcid
headers = soup.find_all(["h2", "h3", "h4"])
contributor = next(
(header.next_sibling for header in headers if "Author" in header.text),
headers = soup.find_all(["h1", "h2", "h3", "h4"])
author_header = next(
(i for i in headers if "Author" in i.text),
None,
)
if not contributor:
if not author_header:
return None
name = get_name(contributor.text)
url = get_url(contributor.find("a", href=True))
return {"name": name, "url": url}
author_string = author_header.find_next_sibling(["p", "ul", "ol"])
contributors = []

# support for multiple authors
if author_string.name in ["ul", "ol"]:
for li in author_string.find_all("li"):
contributors.append(li)
else:
contributors.append(author_string)
return [get_contributor(contributor) for contributor in contributors]


async def get_references(content_html: str):
@@ -1144,9 +1290,11 @@ def get_summary(content_html: str = None, maxlen: int = 450):
return string.strip()


def get_abstract(summary: str, abstract: str):
def get_abstract(summary: str, abstract: Optional[str]):
"""Get abstract if not beginning of post.
Use Levenshtein distance to compare summary and abstract."""
if abstract is None:
return None
le = min(len(abstract), 100)
rat = ratio(summary[:le], abstract[:le])
return abstract if rat <= 0.75 else None
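The contributor parsing above is reworked to return a list: get_contributors now looks for an "Author"/"Authors" heading (h1-h4) and reads either the following paragraph or each item of a following <ul>/<ol>. A rough sketch of the markup it is meant to handle; the import path, names, and ORCID URLs below are illustrative placeholders, and the exact return shape is an assumption:

# Illustrative sketch only: sample input for the updated contributor parsing.
# Import path and expected output are assumptions; the ORCID iDs are placeholders.
from api.posts import get_contributors

content_html = """
<h3>Authors</h3>
<ul>
  <li><a href="https://orcid.org/0000-0000-0000-0001">Jane Doe</a></li>
  <li><a href="https://orcid.org/0000-0000-0000-0002">John Roe</a></li>
</ul>
"""

contributors = get_contributors(content_html)
# expected to yield one dict per list item, roughly:
# [{"name": "Jane Doe", "url": "https://orcid.org/0000-0000-0000-0001"},
#  {"name": "John Roe", "url": "https://orcid.org/0000-0000-0000-0002"}]
print(contributors)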
40 changes: 39 additions & 1 deletion api/utils.py
@@ -389,6 +389,34 @@
"id": "https://ror.org/019wvm592",
},
],
"https://orcid.org/0009-0003-3823-6609": [
{
"name": "Australian National University",
"id": "https://ror.org/019wvm592",
"start_date": "2023-07-01",
},
],
"https://orcid.org/0009-0003-3823-6609": [
{
"name": "Australian National University",
"id": "https://ror.org/019wvm592",
"start_date": "2023-07-01",
},
],
"https://orcid.org/0009-0009-9720-9233": [
{
"name": "Swinburne University of Technology",
"id": "https://ror.org/031rekg67",
"start_date": "2023-06-16",
},
],
"https://orcid.org/0009-0008-8672-3168": [
{
"name": "Swinburne University of Technology",
"id": "https://ror.org/031rekg67",
"start_date": "2024-01-01",
},
],
}


@@ -863,6 +891,16 @@ def get_markdown(content_html: str) -> str:
return ""


def write_html(markdown: str):
"""Get html from markdown"""
try:
doc = pandoc.read(markdown, format="commonmark_x")
return pandoc.write(doc, format="html")
except Exception as e:
print(e)
return ""


def write_epub(markdown: str):
"""Get epub from markdown"""
try:
Expand Down Expand Up @@ -903,7 +941,7 @@ def write_jats(markdown: str):
return ""


def format_markdown(content: str, metadata) -> str:
def format_markdown(content: str, metadata) -> frontmatter.Post:
"""format markdown"""
post = frontmatter.Post(content, **metadata)
post["date"] = datetime.fromtimestamp(metadata.get("date", 0), tz=timezone.utc).isoformat("T", "seconds")
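The new write_html helper complements get_markdown: it converts the Markdown stored as content_text back into HTML with pandoc, so the update path in api/posts.py can rerun the HTML-based extractors (summary, images, contributors, references) on posts that are already in the database. A minimal usage sketch; the exact markup pandoc emits may differ slightly:

# Illustrative sketch only: Markdown -> HTML round trip via the new helper.
from api.utils import write_html

markdown = "**Rogue Scholar** stores posts as [CommonMark](https://commonmark.org) Markdown."
print(write_html(markdown))
# roughly: <p><strong>Rogue Scholar</strong> stores posts as
# <a href="https://commonmark.org">CommonMark</a> Markdown.</p>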