From 759a129d4b9a63ec9f8af46efb156a6d11f0b102 Mon Sep 17 00:00:00 2001
From: Emily Bourne
Date: Fri, 12 Sep 2025 16:24:47 +0200
Subject: [PATCH 01/11] Add a script to check for new publications that should
 be indexed on the website

---
 .../workflows/check-for-new-publications.yml |  54 ++++
 scripts/update_publications.py               | 231 ++++++++++++++++++
 scripts/venue_abbreviations.yml              |  36 +++
 3 files changed, 321 insertions(+)
 create mode 100644 .github/workflows/check-for-new-publications.yml
 create mode 100644 scripts/update_publications.py
 create mode 100644 scripts/venue_abbreviations.yml

diff --git a/.github/workflows/check-for-new-publications.yml b/.github/workflows/check-for-new-publications.yml
new file mode 100644
index 0000000..d08141a
--- /dev/null
+++ b/.github/workflows/check-for-new-publications.yml
@@ -0,0 +1,54 @@
+name: Update Publications
+
+on:
+  schedule:
+    # Runs at 03:00 on the first day of each month
+    - cron: '0 3 1 * *'
+  workflow_dispatch: # allows manual trigger
+
+jobs:
+  update-publications:
+    runs-on: ubuntu-latest
+    steps:
+      - name: Checkout repository
+        uses: actions/checkout@v3
+      - name: Set up Python
+        uses: actions/setup-python@v4
+        with:
+          python-version: '3.11'
+      - name: Install dependencies
+        run: |
+          python -m pip install --upgrade pip
+          pip install requests pyyaml unidecode
+      - name: Get last run date
+        run: |
+          last_run_iso=$(gh run list --workflow "Update Publications" --status success --limit 1 --json createdAt --jq '.[0].createdAt' 2>/dev/null || echo "")
+          if [ -z "$last_run_iso" ]; then
+            last_run_iso="2019-01-01T00:00:00Z" # fallback default
+          fi
+          last_run=$(date -u -d "$last_run_iso" +"%Y-%m-%d")
+          echo "LAST_RUN=$last_run" >> $GITHUB_ENV
+      - name: Create branch
+        run: |
+          branch_name="update-publications-$(date +'%Y%m%d')"
+          git checkout -b $branch_name
+          echo "branch_name=${branch_name}" >> $GITHUB_ENV
+      - name: Run publication update script
+        run: python scripts/update_publications.py
+      - name: Commit changes
+        run: |
+          git add content/publication
+          git commit -m "Automated update of publications" || echo "No changes to commit"
+      - name: Push branch
+        run: git push origin HEAD
+        env:
+          GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}
+      - name: Create Pull Request
+        run: |
+          branch_name=$(git rev-parse --abbrev-ref HEAD)
+          gh pr create \
+            --title "Update publications" \
+            --body "Automated update of publications since ${LAST_RUN}." \
+            --base main \
+            --head $branch_name
+
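
Note on the contract between this workflow and the script it calls: the "Get last run date" step exports LAST_RUN through $GITHUB_ENV, and scripts/update_publications.py (added below) reads it back from the environment. Two things are easy to miss here: the gh CLI only authenticates inside Actions when a GH_TOKEN is available to the step, and the `2>/dev/null || echo ""` means any failure of `gh run list` is silently absorbed by the 2019-01-01 fallback. For local testing, a minimal harness along these lines reproduces the same environment (hypothetical, not part of the patch; assumes it is run from the repository root):

    # Hypothetical smoke-test harness: export LAST_RUN the way the workflow
    # does, then run the update script as a subprocess.
    import os
    import subprocess

    env = dict(os.environ, LAST_RUN="2019-01-01")
    subprocess.run(["python", "scripts/update_publications.py"], env=env, check=True)
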
diff --git a/scripts/update_publications.py b/scripts/update_publications.py
new file mode 100644
index 0000000..2c9952c
--- /dev/null
+++ b/scripts/update_publications.py
@@ -0,0 +1,231 @@
+import requests
+import os
+import re
+import datetime
+import yaml
+from pathlib import Path
+import unidecode
+
+# === Config ===
+PROJECT_NAME = "gysela" # change to your project keyword
+AUTHOR_DIR = Path(__file__).parent.parent / "content" / "authors"
+PUBLICATION_DIR = Path(__file__).parent.parent / "content" / "publication"
+last_run=os.environ['LAST_RUN']
+#LAST_RUN_FILE = Path("last_run.txt")
+VENUE_ABBREVIATIONS_FILE = Path("venue_abbreviations.yml")
+
+existing_slugs = {p.stem for p in PUBLICATION_DIR.iterdir() if p.is_dir()}
+
+# === Helpers ===
+def load_abbrev_map():
+    if VENUE_ABBREVIATIONS_FILE.exists():
+        with open(VENUE_ABBREVIATIONS_FILE) as f:
+            return yaml.safe_load(f)
+    return {}
+
+def load_key_authors():
+    key_authors = []
+    for md_file in Path(AUTHOR_DIR).rglob("*.md"):
+        with open(md_file, encoding="utf-8") as f:
+            content = f.read()
+            if content.startswith("---"):
+                front_matter = content.split("---", 2)[1]
+                data = yaml.safe_load(front_matter)
+                if "name" in data and "organizations" in data:
+                    orgs = [o["name"] for o in data.get("organizations", []) if "name" in o]
+                    key_authors.append({
+                        "name": " ".join(data["name"].split(" ")[1:]),
+                        "organizations": orgs
+                    })
+    return key_authors
+
+def get_first_author_surname(authorships):
+    if authorships:
+        first_author = authorships[0]["author"]["display_name"]
+        surname = first_author.split()[-1]
+        return unidecode.unidecode(surname).lower()
+    return "unknown"
+
+def get_all_authors(authorships):
+    return " and ".join(a["author"]["display_name"] for a in authorships) if authorships else "Unknown"
+
+def author_matches(work_authorships, key_authors):
+    for a in work_authorships:
+        author_name = a["raw_author_name"]
+        institutions = [i["raw_affiliation_string"] for i in a["affiliations"]]
+        for ka in key_authors:
+            if (ka["name"].lower() in author_name.lower()) and \
+               any(org.lower() in instit.lower() for org in ka["organizations"] for instit in institutions):
+                return True
+    return False
+
+def make_slug(meta, abbrev_map):
+    surname = get_first_author_surname(meta["authorships"])
+    if meta["venue_full"] in abbrev_map:
+        venue = abbrev_map[meta["venue_full"]]['slug']
+    else:
+        venue = meta["venue_full"]
+    year = str(meta["year"])
+    slug_base = f"{surname}-{venue}-{year}"
+    slug = slug_base
+    i = 2
+    while slug in existing_slugs:
+        slug = f'{slug_base}_{i}'
+        i += 1
+    existing_slugs.add(slug)
+    return slug
+
+def extract_metadata(work, abbrev_map):
+    """Extract shared metadata for front_matter and bibtex."""
+    title = work.get("title", "")
+    authorships = work.get("authorships", [])
+    authors_list = [a["author"]["display_name"] for a in authorships]
+    authors_bibtex = get_all_authors(authorships)
+    surname = get_first_author_surname(authorships)
+    venue_host = work.get("host_venue", {}).get("display_name")
+    venue_primary = work.get("primary_location", {})
+    if venue_primary:
+        venue_primary = venue_primary.get("source", {})
+    if venue_primary:
+        venue_primary = venue_primary.get("display_name")
+    venue_full = venue_primary or venue_host or ""
+    year = work.get("publication_year", "")
+    doi = work.get("doi")
+    url = f"https://doi.org/{doi}" if doi else None
+    pub_date = work.get("publication_date", "1900-01-01")
+    biblio = work.get("biblio", {})
+    volume = biblio.get("volume")
+    issue = biblio.get("issue")
biblio.get("issue") + first_page = biblio.get("first_page") + last_page = biblio.get("last_page") + pages = f"{first_page}--{last_page}" if first_page and last_page else None + abstract = work.get("abstract_inverted_index") and " ".join(work["abstract_inverted_index"].keys()) or "" + return { + "title": title, + "authors_list": authors_list, + "authors_bibtex": authors_bibtex, + "authorships": authorships, + "venue_full": venue_full, + "year": year, + "doi": doi, + "url": url, + "pub_date": pub_date, + "volume": volume, + "issue": issue, + "pages": pages, + "surname": surname, + "abstract": abstract + } + +def to_bibtex(meta, slug, abbrev_map): + if meta["venue_full"] in abbrev_map: + venue = abbrev_map[meta["venue_full"]]['bibtex'] + else: + venue = meta["venue_full"] + fields = { + "title": meta["title"], + "author": meta["authors_bibtex"], + "journal": venue, + "year": meta["year"], + "volume": meta["volume"], + "number": meta["issue"], + "pages": meta["pages"], + "doi": meta["doi"], + "url": meta["url"] + } + lines = [f"@article{{{slug},"] + lines.extend(f" {k} = {{{v}}}," for k, v in fields.items() if v) + lines[-1] = lines[-1].rstrip(",") # drop trailing comma + lines.append("}") + return "\n".join(lines) + +def write_index_md(folder, meta): + front_matter = { + "title": meta["title"], + "subtitle": "", + "summary": "", + "authors": meta["authors_list"], + "tags": [], + "categories": [], + "date": meta["pub_date"], + "lastmod": datetime.datetime.now().isoformat(), + "featured": False, + "draft": False, + "image": {"caption": "", "focal_point": "", "preview_only": False}, + "projects": [], + "publishDate": datetime.datetime.now().isoformat(), + "publication_types": ["1"], + "abstract": meta["abstract"], + "publication": meta["venue_full"], + "doi": meta["doi"] or "" + } + index_md = "---\n" + yaml.dump(front_matter, sort_keys=False) + "---\n" + (folder / "_index.md").write_text(index_md, encoding="utf-8") + +# === Main === +def main(): + #last_run = load_last_run() + abbrev_map = load_abbrev_map() + key_authors = load_key_authors() + + found_doi = set() + + for PROJECT_NAME in ('gysela', 'gyselax', 'gyselalib'): + url = "https://api.openalex.org/works" + params = { + "search": PROJECT_NAME, + "filter": f"from_publication_date:{last_run}", + "per-page": 50 + } + response = requests.get(url, params=params) + response.raise_for_status() + data = response.json() + results = data.get("results", []) + print(f"Found {len(results)} results for {PROJECT_NAME} since {last_run}") + + for work in results: + if work.get("type") == "preprint": + continue + meta = extract_metadata(work, abbrev_map) + if "arxiv" in meta["venue_full"].lower(): + print("Discarding preprint : ", meta["title"]) + continue + gysela_in_title = PROJECT_NAME in meta["title"].lower() + gysela_in_abstract = PROJECT_NAME in meta["abstract"].lower() + written_by_key_author = author_matches(meta["authorships"], key_authors) + if not (gysela_in_title or gysela_in_abstract) and \ + not written_by_key_author: + print("Discarding citation : ", meta["title"], meta["authors_list"]) + continue + + if meta["doi"] in found_doi: + continue + found_doi.add(meta["doi"]) + + print("Saving :") + print(" ", meta["title"]) + print(" ", meta["authors_list"]) + if gysela_in_title or gysela_in_abstract: + print("Mentioning Gysela prominently") + if written_by_key_author: + print("Written by permanent contributor") + print() + + slug = make_slug(meta, abbrev_map) + folder = PUBLICATION_DIR / slug + folder.mkdir(parents=True, exist_ok=True) + + # 
From a36401f9f5f2ac6869feae157e8c32d6ea1e0d00 Mon Sep 17 00:00:00 2001
From: Emily Bourne
Date: Fri, 12 Sep 2025 16:25:18 +0200
Subject: [PATCH 02/11] Change trigger to test

---
 .github/workflows/check-for-new-publications.yml | 1 +
 1 file changed, 1 insertion(+)

diff --git a/.github/workflows/check-for-new-publications.yml b/.github/workflows/check-for-new-publications.yml
index d08141a..b4ba888 100644
--- a/.github/workflows/check-for-new-publications.yml
+++ b/.github/workflows/check-for-new-publications.yml
@@ -5,6 +5,7 @@ on:
     # Runs at 03:00 on the first day of each month
     - cron: '0 3 1 * *'
   workflow_dispatch: # allows manual trigger
+  push:
 
 jobs:
   update-publications:

From 6bdbf02ed5f9f524764064469f07f6238518f75c Mon Sep 17 00:00:00 2001
From: Emily Bourne
Date: Fri, 12 Sep 2025 16:26:29 +0200
Subject: [PATCH 03/11] Missing token

---
 .github/workflows/check-for-new-publications.yml | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/.github/workflows/check-for-new-publications.yml b/.github/workflows/check-for-new-publications.yml
index b4ba888..c55ff0f 100644
--- a/.github/workflows/check-for-new-publications.yml
+++ b/.github/workflows/check-for-new-publications.yml
@@ -43,7 +43,7 @@ jobs:
       - name: Push branch
         run: git push origin HEAD
         env:
-          GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}
+          GH_TOKEN: ${{ github.token }}
       - name: Create Pull Request
         run: |
           branch_name=$(git rev-parse --abbrev-ref HEAD)
@@ -52,4 +52,5 @@ jobs:
           gh pr create \
             --title "Update publications" \
             --body "Automated update of publications since ${LAST_RUN}." \
             --base main \
             --head $branch_name
-
+        env:
+          GH_TOKEN: ${{ github.token }}

From 2997923765bf43e989a83c2cc62b206a436d43a9 Mon Sep 17 00:00:00 2001
From: Emily Bourne
Date: Fri, 12 Sep 2025 16:29:16 +0200
Subject: [PATCH 04/11] Increase page limit

---
 scripts/update_publications.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/scripts/update_publications.py b/scripts/update_publications.py
index 2c9952c..722cc83 100644
--- a/scripts/update_publications.py
+++ b/scripts/update_publications.py
@@ -175,7 +175,7 @@ def main():
         params = {
             "search": PROJECT_NAME,
             "filter": f"from_publication_date:{last_run}",
-            "per-page": 50
+            "per-page": 100
         }
         response = requests.get(url, params=params)
         response.raise_for_status()
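
Note on the page limit bump: OpenAlex caps `per-page` at 200, and this request still only reads the first page of results, so anything beyond the per-page window in one month is dropped. If that ever becomes a concern, OpenAlex's cursor pagination covers it; a sketch (hypothetical, not part of the patch; `fetch_all_works` is an invented name):

    import requests

    def fetch_all_works(params):
        """Yield every matching work by following OpenAlex cursor pagination."""
        url = "https://api.openalex.org/works"
        params = dict(params, cursor="*")
        while True:
            response = requests.get(url, params=params)
            response.raise_for_status()
            data = response.json()
            yield from data.get("results", [])
            cursor = data.get("meta", {}).get("next_cursor")
            if not cursor:
                break
            params["cursor"] = cursor
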
From 4994785ec0068a01e0d4f46c2a9d5ef685ce5127 Mon Sep 17 00:00:00 2001
From: Emily Bourne
Date: Fri, 12 Sep 2025 16:37:04 +0200
Subject: [PATCH 05/11] Exit if no publications found

---
 .../workflows/check-for-new-publications.yml | 18 +++++++++++++++++-
 1 file changed, 17 insertions(+), 1 deletion(-)

diff --git a/.github/workflows/check-for-new-publications.yml b/.github/workflows/check-for-new-publications.yml
index c55ff0f..02bcc84 100644
--- a/.github/workflows/check-for-new-publications.yml
+++ b/.github/workflows/check-for-new-publications.yml
@@ -13,6 +13,7 @@ jobs:
     steps:
       - name: Checkout repository
         uses: actions/checkout@v3
+
       - name: Set up Python
         uses: actions/setup-python@v4
         with:
@@ -25,26 +26,41 @@ jobs:
         run: |
           last_run_iso=$(gh run list --workflow "Update Publications" --status success --limit 1 --json createdAt --jq '.[0].createdAt' 2>/dev/null || echo "")
           if [ -z "$last_run_iso" ]; then
+            echo "No last run found"
             last_run_iso="2019-01-01T00:00:00Z" # fallback default
           fi
           last_run=$(date -u -d "$last_run_iso" +"%Y-%m-%d")
+          echo "LAST_RUN=$last_run"
           echo "LAST_RUN=$last_run" >> $GITHUB_ENV
       - name: Create branch
         run: |
           branch_name="update-publications-$(date +'%Y%m%d')"
-          git checkout -b $branch_name
           echo "branch_name=${branch_name}" >> $GITHUB_ENV
       - name: Run publication update script
         run: python scripts/update_publications.py
+      - name: Check for changes
+        id: check_changes
+        run: |
+          if git diff --quiet content/publication; then
+            echo "No new publications found."
+            echo "has_new=false" >> $GITHUB_ENV
+          else
+            echo "has_new=true" >> $GITHUB_ENV
+          fi
       - name: Commit changes
+        if: env.has_new
         run: |
+          git checkout main
+          git checkout -b $branch_name
           git add content/publication
           git commit -m "Automated update of publications" || echo "No changes to commit"
       - name: Push branch
+        if: env.has_new
         run: git push origin HEAD
         env:
           GH_TOKEN: ${{ github.token }}
       - name: Create Pull Request
+        if: env.has_new
         run: |
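
One caveat with the new check: `git diff --quiet` only reports modifications to tracked files, and the script mostly *creates* new publication folders, which stay untracked until `git add`. The test can therefore report "no changes" even when publications were written. `git status --porcelain`, which also lists untracked paths, is a safer probe; here in Python for local verification (hypothetical, not part of the patch; `has_new_publications` is an invented name):

    import subprocess

    def has_new_publications():
        """True if content/publication has modified *or* untracked entries."""
        out = subprocess.run(
            ["git", "status", "--porcelain", "content/publication"],
            capture_output=True, text=True, check=True,
        ).stdout
        return bool(out.strip())
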
+ echo "has_new=false" >> $GITHUB_ENV + else + echo "has_new=true" >> $GITHUB_ENV + fi - name: Commit changes + if: env.has_new run: | + git checkout main + git checkout -b $branch_name git add content/publication git commit -m "Automated update of publications" || echo "No changes to commit" - name: Push branch + if: env.has_new run: git push origin HEAD env: GH_TOKEN: ${{ github.token }} - name: Create Pull Request + if: env.has_new run: | branch_name=$(git rev-parse --abbrev-ref HEAD) gh pr create \ From eb394b9f3aae9181206957b264a35afdf4097436 Mon Sep 17 00:00:00 2001 From: Emily Bourne Date: Fri, 12 Sep 2025 16:43:55 +0200 Subject: [PATCH 06/11] Ensure main is available --- .github/workflows/check-for-new-publications.yml | 2 ++ 1 file changed, 2 insertions(+) diff --git a/.github/workflows/check-for-new-publications.yml b/.github/workflows/check-for-new-publications.yml index 02bcc84..15f915d 100644 --- a/.github/workflows/check-for-new-publications.yml +++ b/.github/workflows/check-for-new-publications.yml @@ -13,6 +13,8 @@ jobs: steps: - name: Checkout repository uses: actions/checkout@v3 + with: + fetch-depth: 0 - name: Set up Python uses: actions/setup-python@v4 From ad3a34a9742e930767f3f11c8960f8da16e22a48 Mon Sep 17 00:00:00 2001 From: Emily Bourne Date: Fri, 12 Sep 2025 16:46:58 +0200 Subject: [PATCH 07/11] correct condition --- .github/workflows/check-for-new-publications.yml | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/.github/workflows/check-for-new-publications.yml b/.github/workflows/check-for-new-publications.yml index 15f915d..d8d4166 100644 --- a/.github/workflows/check-for-new-publications.yml +++ b/.github/workflows/check-for-new-publications.yml @@ -45,24 +45,24 @@ jobs: run: | if git diff --quiet content/publication; then echo "No new publications found." 
- echo "has_new=false" >> $GITHUB_ENV + echo "has_new=false" >> $GITHUB_OUTPUT else - echo "has_new=true" >> $GITHUB_ENV + echo "has_new=true" >> $GITHUB_OUTPUT fi - name: Commit changes - if: env.has_new + if: steps.check_new.outputs.has_new == 'true' run: | git checkout main git checkout -b $branch_name git add content/publication git commit -m "Automated update of publications" || echo "No changes to commit" - name: Push branch - if: env.has_new + if: steps.check_new.outputs.has_new == 'true' run: git push origin HEAD env: GH_TOKEN: ${{ github.token }} - name: Create Pull Request - if: env.has_new + if: steps.check_new.outputs.has_new == 'true' run: | branch_name=$(git rev-parse --abbrev-ref HEAD) gh pr create \ From 587b95eefccabd4dd7425735cbf1aaef0bf8694e Mon Sep 17 00:00:00 2001 From: Emily Bourne Date: Fri, 12 Sep 2025 16:48:27 +0200 Subject: [PATCH 08/11] Remove test trigger --- .github/workflows/check-for-new-publications.yml | 1 - 1 file changed, 1 deletion(-) diff --git a/.github/workflows/check-for-new-publications.yml b/.github/workflows/check-for-new-publications.yml index d8d4166..f372a4d 100644 --- a/.github/workflows/check-for-new-publications.yml +++ b/.github/workflows/check-for-new-publications.yml @@ -5,7 +5,6 @@ on: # Runs at 03:00 on the first day of each month - cron: '0 3 1 * *' workflow_dispatch: # allows manual trigger - push: jobs: update-publications: From cdfbb1f01a1ff4b8300d33afca4c8ee559f9c09d Mon Sep 17 00:00:00 2001 From: Emily Bourne Date: Fri, 12 Sep 2025 16:55:31 +0200 Subject: [PATCH 09/11] Clean up script --- scripts/update_publications.py | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/scripts/update_publications.py b/scripts/update_publications.py index 722cc83..7535230 100644 --- a/scripts/update_publications.py +++ b/scripts/update_publications.py @@ -10,9 +10,8 @@ PROJECT_NAME = "gysela" # change to your project keyword AUTHOR_DIR = Path(__file__).parent.parent / "content" / "authors" PUBLICATION_DIR = Path(__file__).parent.parent / "content" / "publication" -last_run=os.environ['LAST_RUN'] -#LAST_RUN_FILE = Path("last_run.txt") VENUE_ABBREVIATIONS_FILE = Path("venue_abbreviations.yml") +LAST_RUN=os.environ['LAST_RUN'] existing_slugs = {p.stem for p in PUBLICATION_DIR.iterdir() if p.is_dir()} @@ -164,7 +163,6 @@ def write_index_md(folder, meta): # === Main === def main(): - #last_run = load_last_run() abbrev_map = load_abbrev_map() key_authors = load_key_authors() @@ -174,22 +172,27 @@ def main(): url = "https://api.openalex.org/works" params = { "search": PROJECT_NAME, - "filter": f"from_publication_date:{last_run}", + "filter": f"from_publication_date:{LAST_RUN}", "per-page": 100 } response = requests.get(url, params=params) response.raise_for_status() data = response.json() results = data.get("results", []) - print(f"Found {len(results)} results for {PROJECT_NAME} since {last_run}") + print(f"Found {len(results)} results for {PROJECT_NAME} since {LAST_RUN}") for work in results: + # Discard preprints if work.get("type") == "preprint": continue + meta = extract_metadata(work, abbrev_map) + + # Discard preprints if "arxiv" in meta["venue_full"].lower(): - print("Discarding preprint : ", meta["title"]) continue + + # Check relevance gysela_in_title = PROJECT_NAME in meta["title"].lower() gysela_in_abstract = PROJECT_NAME in meta["abstract"].lower() written_by_key_author = author_matches(meta["authorships"], key_authors) @@ -198,6 +201,7 @@ def main(): print("Discarding citation : ", meta["title"], 
meta["authors_list"]) continue + # Discard if already found if meta["doi"] in found_doi: continue found_doi.add(meta["doi"]) @@ -222,10 +226,6 @@ def main(): bibtex = to_bibtex(meta, slug, abbrev_map) (folder / "cite.bib").write_text(bibtex, encoding="utf-8") - #today = datetime.date.today().isoformat() - #save_last_run(today) - #print(f"Updated last run date to {today}") - if __name__ == "__main__": main() From 5a632ad265ddb76e68e2df071e7d3d7c964ad139 Mon Sep 17 00:00:00 2001 From: Emily Bourne Date: Fri, 12 Sep 2025 16:59:02 +0200 Subject: [PATCH 10/11] Improve key --- scripts/update_publications.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/update_publications.py b/scripts/update_publications.py index 7535230..29adfd4 100644 --- a/scripts/update_publications.py +++ b/scripts/update_publications.py @@ -172,7 +172,7 @@ def main(): url = "https://api.openalex.org/works" params = { "search": PROJECT_NAME, - "filter": f"from_publication_date:{LAST_RUN}", + "filter": f"from_indexed_date:{LAST_RUN}", "per-page": 100 } response = requests.get(url, params=params) From 0ce7b981538c60e5d7ab49f7400c97a9dee03faf Mon Sep 17 00:00:00 2001 From: Emily Bourne Date: Fri, 12 Sep 2025 17:05:19 +0200 Subject: [PATCH 11/11] Add a 2 month buffer for indexing and check against existing DOIs --- .../workflows/check-for-new-publications.yml | 9 +++--- scripts/update_publications.py | 29 +++++++++++++------ 2 files changed, 25 insertions(+), 13 deletions(-) diff --git a/.github/workflows/check-for-new-publications.yml b/.github/workflows/check-for-new-publications.yml index f372a4d..e4029ba 100644 --- a/.github/workflows/check-for-new-publications.yml +++ b/.github/workflows/check-for-new-publications.yml @@ -30,9 +30,10 @@ jobs: echo "No last run found" last_run_iso="2019-01-01T00:00:00Z" # fallback default fi - last_run=$(date -u -d "$last_run_iso" +"%Y-%m-%d") - echo "LAST_RUN=$last_run" - echo "LAST_RUN=$last_run" >> $GITHUB_ENV + # Remove 2 months to allow lots of time for indexing + CHECK_FROM=$(date -u -d "$last_run_iso -2 months" +"%Y-%m-%d") + echo "CHECK_FROM=$CHECK_FROM" + echo "CHECK_FROM=$CHECK_FROM" >> $GITHUB_ENV - name: Create branch run: | branch_name="update-publications-$(date +'%Y%m%d')" @@ -66,7 +67,7 @@ jobs: branch_name=$(git rev-parse --abbrev-ref HEAD) gh pr create \ --title "Update publications" \ - --body "Automated update of publications since ${LAST_RUN}." \ + --body "Automated update of publications since ${CHECK_FROM}." 
diff --git a/scripts/update_publications.py b/scripts/update_publications.py
index 29adfd4..94771e6 100644
--- a/scripts/update_publications.py
+++ b/scripts/update_publications.py
@@ -11,7 +11,7 @@
 AUTHOR_DIR = Path(__file__).parent.parent / "content" / "authors"
 PUBLICATION_DIR = Path(__file__).parent.parent / "content" / "publication"
 VENUE_ABBREVIATIONS_FILE = Path("venue_abbreviations.yml")
-LAST_RUN=os.environ['LAST_RUN']
+CHECK_FROM=os.environ['CHECK_FROM']
 
 existing_slugs = {p.stem for p in PUBLICATION_DIR.iterdir() if p.is_dir()}
 
@@ -24,7 +24,7 @@ def load_abbrev_map():
 
 def load_key_authors():
     key_authors = []
-    for md_file in Path(AUTHOR_DIR).rglob("*.md"):
+    for md_file in AUTHOR_DIR.rglob("*.md"):
         with open(md_file, encoding="utf-8") as f:
             content = f.read()
             if content.startswith("---"):
@@ -38,6 +38,18 @@ def load_key_authors():
                 })
     return key_authors
 
+def load_known_dois():
+    dois = set()
+    for md_file in PUBLICATION_DIR.rglob("*.md"):
+        with open(md_file, encoding="utf-8") as f:
+            content = f.read()
+            if content.startswith("---"):
+                front_matter = content.split("---", 2)[1]
+                data = yaml.safe_load(front_matter)
+                if "doi" in data:
+                    dois.add(data["doi"])
+    return dois
+
 def get_first_author_surname(authorships):
     if authorships:
         first_author = authorships[0]["author"]["display_name"]
@@ -159,27 +171,26 @@ def write_index_md(folder, meta):
         "doi": meta["doi"] or ""
     }
     index_md = "---\n" + yaml.dump(front_matter, sort_keys=False) + "---\n"
-    (folder / "_index.md").write_text(index_md, encoding="utf-8")
+    (folder / "index.md").write_text(index_md, encoding="utf-8")
 
 # === Main ===
 def main():
     abbrev_map = load_abbrev_map()
     key_authors = load_key_authors()
-
-    found_doi = set()
+    dois = load_known_dois()
 
     for PROJECT_NAME in ('gysela', 'gyselax', 'gyselalib'):
         url = "https://api.openalex.org/works"
         params = {
             "search": PROJECT_NAME,
-            "filter": f"from_indexed_date:{LAST_RUN}",
+            "filter": f"from_publication_date:{CHECK_FROM}",
             "per-page": 100
         }
         response = requests.get(url, params=params)
         response.raise_for_status()
         data = response.json()
         results = data.get("results", [])
-        print(f"Found {len(results)} results for {PROJECT_NAME} since {LAST_RUN}")
+        print(f"Found {len(results)} results for {PROJECT_NAME} since {CHECK_FROM}")
 
         for work in results:
             # Discard preprints
@@ -198,9 +211,9 @@ def main():
                 continue
 
             # Discard if already found
-            if meta["doi"] in found_doi:
+            if meta["doi"] in dois:
                 continue
-            found_doi.add(meta["doi"])
+            dois.add(meta["doi"])
 
             print("Saving :")
             print(" ", meta["title"])
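
To close, a worked example of the cite.bib output: given a made-up metadata record (hypothetical data throughout) and the venue table from patch 01, to_bibtex from scripts/update_publications.py produces a standard @article entry, skipping empty fields such as the missing issue number:

    meta = {
        "title": "Example GYSELA paper",
        "authors_bibtex": "Alice Martin and Bob Dupont",
        "venue_full": "Journal of Computational Physics",
        "year": 2025,
        "volume": "512",
        "issue": None,
        "pages": "1--20",
        "doi": "10.1000/example",
        "url": "https://doi.org/10.1000/example",
    }
    abbrev_map = {"Journal of Computational Physics": {"bibtex": "J. Comput. Phys."}}
    print(to_bibtex(meta, "martin-jcp-2025", abbrev_map))
    # @article{martin-jcp-2025,
    #  title = {Example GYSELA paper},
    #  author = {Alice Martin and Bob Dupont},
    #  journal = {J. Comput. Phys.},
    #  year = {2025},
    #  volume = {512},
    #  pages = {1--20},
    #  doi = {10.1000/example},
    #  url = {https://doi.org/10.1000/example}
    # }

The slug "martin-jcp-2025" mirrors what make_slug would build from the first author's surname, the venue's slug abbreviation, and the year.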