In [1]:
"""
This script makes some queries to PubMed and extracts author
affiliation information for the purpose of writing a rule-based
function that extracts top-level institution names.
"""

import json
from collections import Counter

from gilda import Term
from gilda.ner import annotate
from gilda.process import normalize
from indra.literature import pubmed_client
from pyobo.gilda_utils import get_grounder

# Location of the JSON file mapping ROR identifiers to the GeoNames
# identifier(s) of each organization's country
ROR_COUNTRIES_PATH = "/Users/cthoyt/dev/rorio/countries.json"

with open(ROR_COUNTRIES_PATH) as countries_file:
    ror_to_country_geoname = json.load(countries_file)

# Grounder over ROR entries, requested with skip_obsolete=True
organization_grounder = get_grounder("ror", skip_obsolete=True)

In [2]:
# Sanity checks: the top grounding for each known name must be the expected ROR id
_expected_ror_ids = {
    "University of Bonn": "041nas322",
    "Fraunhofer SCAI": "00trw9c49",
    # FIXME
    # "The University of Queensland": "00rqy9422",
}
for _organization_name, _expected_ror_id in _expected_ror_ids.items():
    assert _expected_ror_id == organization_grounder.ground(_organization_name)[0].term.id

In [3]:
geonames_grounder = get_grounder("geonames")

# Extra country spellings seen in affiliation strings, given as
# (text as written, GeoNames identifier, entry name to report)
_extra_country_synonyms = [
    ("United States of America", "6252001", "United States"),
    ("USA", "6252001", "United States"),
    ("UK", "2635167", "United Kingdom"),
    ("PR China", "1814991", "China"),
    ("People's Republic of China", "1814991", "China"),
    ("Sweeden", "2661886", "Sweden"),
    ("Czech Republic", "3077311", "Czechia"),
    ("Brasil", "3469034", "Brazil"),
    ("Brasília", "3469034", "Brazil"),
    ("Republic of South Africa", "953987", "South Africa"),
    ("Republic of Korea", "1835841", "South Korea"),
]

extra_terms = [
    Term(
        normalize(synonym),
        synonym,
        "geonames",
        geonames_id,
        entry_name,
        "synonym",
        "geonames",
    )
    for synonym, geonames_id, entry_name in _extra_country_synonyms
]

# Register each extra term in the grounder's index under its normalized text
for extra_term in extra_terms:
    same_text_terms = geonames_grounder.entries.setdefault(extra_term.norm_text, [])
    same_text_terms.append(extra_term)

In [4]:
# PubMed search queries; each is an author name wrapped in double quotes
queries = [
    '"Steven B Bradfute"',
    '"Charles Tapley Hoyt"',
    '"Benjamin Gyori"',
]

# NOTE(review): INDRA documents get_metadata_for_ids as handling up to 200
# PMIDs per call, so prolific authors are fetched in batches - confirm the
# limit against the installed INDRA version.
PMID_BATCH_SIZE = 200

# Metadata records accumulated over all queries via dict.update, so a later
# record for the same key replaces an earlier one
records = {}
for query in queries:
    pmids = pubmed_client.get_ids(query, use_text_word=False)
    for start in range(0, len(pmids), PMID_BATCH_SIZE):
        pmid_batch = pmids[start : start + PMID_BATCH_SIZE]
        records.update(pubmed_client.get_metadata_for_ids(pmid_batch, detailed_authors=True))

In [23]:
def _clean_affiliation_name(name: str) -> str:
    """Normalize a raw affiliation string before counting and grounding.

    :param name: The affiliation text as it appears in the PubMed record.
    :returns: The text with whitespace collapsed, surrounding periods removed,
        leading digits (e.g., footnote markers) removed, and a trailing
        email address (plus an "Electronic address" label) removed. May be
        the empty string if nothing is left.
    """
    # Collapse every run of whitespace (newlines and tabs included) to one space
    name = " ".join(name.split())
    name = name.replace(" , ", ", ").strip(".").strip()

    # Drop all leading digits, not just the first one, so multi-digit
    # markers like "12Department of ..." are fully removed
    while name and name[0].isdigit():
        name = name[1:]
    name = name.lstrip()

    # Remove trailing email, assumed to be the last space-separated token
    if "@" in name.split(",")[-1]:
        name = name.rsplit(" ", 1)[0].strip().strip(":").rstrip()
        name = name.removesuffix("Electronic address").rstrip()
        name = name.rstrip(".").rstrip()

    return name


# Number of occurrences of each cleaned affiliation string over all authors
counter = Counter()
for record in records.values():
    for author in record.get("authors", []):
        for affiliation in author.get("affiliations", []):
            name = _clean_affiliation_name(affiliation["name"])
            if name:  # nothing usable is left for e.g. "." or a bare number
                counter[name] += 1

Next steps:

1. Identify the country of each affiliation
2. Use that country to filter the ROR groundings, keeping only organizations located in it

In [47]:
def _get_country(parts):
    last = parts[-1]
    sm = geonames_grounder.ground(last)
    if not sm:
        return None, None
    if len(sm) == 1:
        return sm[0].term.id, sm[0].term.entry_name
    # This happens for Mexico and Singapore since they have a city the same name as the country
    # in many places, there is no country written. This almost 100% USA, which sometimes just writes states
    # print("got multiple for", last)
    # print(sm)
    return None, None


def _get_country_for_ror(ror_id):
    """Return the geonames id for the ROR's country."""
    rv = ror_to_country_geoname[ror_id]
    if isinstance(rv, list):
        return rv[0]
    return rv


def _get_country_for_geonames(geonames_id):
    raise NotImplementedError


def _get_institution(parts: str, country_id: str | None, country_name):
    # don't need to look at the last bit again if country is given
    rparts = parts[:-1] if country_id else parts
    for part in reversed(rparts):
        if part.isdigit():  # zip code
            continue
        if len(part) < 3:
            continue  # probably a state
        if " " in part and part.split()[-1].isdigit():  # something plus space plus zip code
            continue
        if geonames_grounder.ground(part):  # it's a city/state/country
            continue

        matches = organization_grounder.ground(part)
        if not matches and part.endswith(")"):
            # Try stripping off some acronym in parentheses
            part = part.rsplit("(", 1)[0]
            matches = organization_grounder.ground(part)
        if not matches and part.startswith("The "):
            part = part.removeprefix("The").strip()
            matches = organization_grounder.ground(part)
        for suffix in ["School of Medicine", "College of Medicine", "Faculty of Medicine"]:
            if not matches and part.endswith(suffix):
                part = part.removeprefix(suffix).strip()
                matches = organization_grounder.ground(part)
        if not matches:
            continue
        if len(matches) == 1:
            return matches[0].term.id, matches[0].term.entry_name

        # Do some disambiguating
        if country_id is None:
            print(f"Got multiple matches but didn't have a country to disambiguate on: {part}")
            for m in matches:
                print("  https://ror.org/" + m.term.id, m.term.entry_name)
            print()
            continue

        for match in matches:
            match_country_geonames = _get_country_for_ror(match.term.id)
            if match_country_geonames == country_id:
                return match.term.id, match.term.entry_name

        print(
            f"\nfrom {parts}\n could not disambiguate '{part}' based on country '{country_name}':"
        )
        for m in matches:
            print("    https://ror.org/" + m.term.id, m.term.entry_name)
        print()

    print(f"\nfailed: {parts}", f"(country = {country_name})" if country_name else "")


from collections import defaultdict

# Leading affiliation parts starting with one of these are skipped before grounding
SUBUNIT_PREFIXES = ("Division", "Department", "Faculty of", "Laboratory of", "Office of")

passed = 0
# Affiliation strings that could not be grounded, keyed by country name (or None)
failed = defaultdict(list)
for text in sorted(counter):
    parts = [part.strip() for part in text.split(",")]
    # chomp away many; stop when no parts are left rather than indexing an
    # empty list
    while parts and parts[0].startswith(SUBUNIT_PREFIXES):
        parts = parts[1:]
    if not parts:
        # Every part matched a skipped prefix, so there is nothing left to ground
        failed[None].append(text)
        continue
    country_geonames_id, country_name = _get_country(parts)
    institution = _get_institution(parts, country_geonames_id, country_name)
    if institution:
        passed += 1
    else:
        failed[country_name].append(text)

print(passed, sum(map(len, failed.values())))

In [50]:
# Tally how many affiliation strings failed for each country name
_failure_counts = Counter()
for _failed_country, _failed_texts in failed.items():
    _failure_counts[_failed_country] = len(_failed_texts)
_failure_counts

In [52]:
# Show the failed affiliation strings that were stored under the key None,
# i.e., those for which no country name was returned
failed[None]

In [None]:
# Write each cleaned affiliation string and its count as a two-column TSV
with open("affiliations.tsv", "w", encoding="utf-8") as file:
    for index, (name, count) in enumerate(sorted(counter.items())):
        if index == 0:
            # Preview the annotations for the first affiliation only. The
            # original cell did this and then stopped, so nothing was written.
            # It also referenced an undefined `grounder`; the ROR grounder
            # defined in this notebook is used instead.
            matches = annotate(name, grounder=organization_grounder)
            print(matches)

        print(name, count, sep="\t", file=file)


# NOTE(review): no `main` is defined in the cells visible here, so this guard
# would raise NameError when it runs - confirm that `main` is defined
# elsewhere, or remove the guard.
if __name__ == "__main__":
    main()