This notebook queries PubMed and extracts author affiliation information, with the goal of developing a rule-based function that extracts top-level institution names.

In [1]:
from collections import Counter, defaultdict

from gilda import Term
from gilda.process import normalize
from indra.literature import pubmed_client
from pyobo.gilda_utils import get_grounder
from pyobo.sources.ror import get_ror_to_country_geonames

# Lookup used further down to compare the country of a ROR match (keyed by
# ROR ID) against the country detected in the affiliation string
ror_to_country_geoname = get_ror_to_country_geonames()

# Grounder for organization names backed by ROR, requested with skip_obsolete=True
organization_grounder = get_grounder("ror", skip_obsolete=True)

INFO: [2024-09-23 13:51:35] pyobo.sources.geonames - got 252 countries
INFO: [2024-09-23 13:51:35] pyobo.sources.geonames - got 252 country records


ror:   0%|          | 0.00/107k [00:00<?, ?record/s]

  0%|          | 0/1 [00:00<?, ?it/s]

[ror] mapping:   0%|          | 0.00/107k [00:00<?, ?name/s]

[ror] mapping:   0%|          | 0.00/65.1k [00:00<?, ?synonym/s]

INFO: [2024-09-23 13:51:49] gilda.term - Filtering 209567 terms for uniqueness...
INFO: [2024-09-23 13:51:49] gilda.term - Got 209435 unique terms...


In [2]:
# Sanity checks: the top-ranked grounding of each name must carry the expected ROR ID
expected_top_ror_ids = {
    "University of Bonn": "041nas322",
    "Fraunhofer SCAI": "00trw9c49",
}
for organization_name, expected_ror_id in expected_top_ror_ids.items():
    top_match = organization_grounder.ground(organization_name)[0]
    if top_match.term.id != expected_ror_id:
        raise AssertionError

# FIXME the check below is disabled (presumably it does not pass yet); the
# matches are kept in a variable so they can be inspected
queensland_matches = organization_grounder.ground("The University of Queensland")
# assert "00rqy9422" == queensland_matches[0].term.id # noqa:ERA001

In [3]:
geonames_grounder = get_grounder("geonames")

# Extra synonyms to register with the GeoNames grounder, written as
# (synonym text, GeoNames ID, GeoNames entry name)
extra_synonyms = [
    ("United States of America", "6252001", "United States"),
    ("USA", "6252001", "United States"),
    ("UK", "2635167", "United Kingdom"),
    ("PR China", "1814991", "China"),
    ("People's Republic of China", "1814991", "China"),
    ("Sweeden", "2661886", "Sweden"),
    ("Czech Republic", "3077311", "Czechia"),
    ("Brasil", "3469034", "Brazil"),
    ("Brasília", "3469034", "Brazil"),
    ("Republic of South Africa", "953987", "South Africa"),
    ("Republic of Korea", "1835841", "South Korea"),
]

# Every synonym becomes a gilda Term whose normalized text is the lookup key
extra_terms = [
    Term(normalize(synonym), synonym, "geonames", geonames_id, entry_name, "synonym", "geonames")
    for synonym, geonames_id, entry_name in extra_synonyms
]

# Add the new terms next to whatever the grounder already stores under the same key
for term in extra_terms:
    geonames_grounder.entries.setdefault(term.norm_text, []).append(term)

  0%|          | 0/1 [00:00<?, ?it/s]

[geonames] mapping:   0%|          | 0.00/10.2k [00:00<?, ?name/s]

[geonames] mapping:   0%|          | 0.00/5.32k [00:00<?, ?synonym/s]

INFO: [2024-09-23 13:51:50] gilda.term - Filtering 116720 terms for uniqueness...
INFO: [2024-09-23 13:51:50] gilda.term - Got 111853 unique terms...


In [4]:
# Quoted author-name searches that are sent to PubMed
queries = [
    '"Steven B Bradfute"',
    '"Charles Tapley Hoyt"',
    '"Benjamin Gyori"',
]

# Pool the metadata of all hits across the queries into one mapping
records = {}
for query in queries:
    pmids = pubmed_client.get_ids(query, use_text_word=False)
    metadata = pubmed_client.get_metadata_for_ids(pmids, detailed_authors=True)
    records.update(metadata)

In [5]:
def _clean_affiliation_name(name: str) -> str:
    """Normalize a raw affiliation string before it is counted.

    The cleanup collapses all whitespace runs to single spaces, fixes
    ``" , "`` spacing, trims surrounding periods, drops a single leading
    digit, and removes a trailing e-mail address together with an
    ``Electronic address`` label.

    :param name: The affiliation text as delivered in the PubMed record.
    :returns: The cleaned text, which may be empty.
    """
    # split()/join collapses runs of any length and any kind of whitespace,
    # so the " , " fix-up and the period trimming see canonical spacing
    name = " ".join(name.split()).replace(" , ", ", ").strip(".").strip()

    # Drop a single leading digit; guard against an empty string
    if name and name[0].isdigit():
        name = name[1:].lstrip()

    # Remove trailing email
    if "@" in name.split(",")[-1]:
        # The e-mail is taken to be the last space-separated token
        name = name.rsplit(" ", 1)[0].strip().strip(":").rstrip()
        name = name.removesuffix("Electronic address").rstrip()
        name = name.rstrip(".").rstrip()

    return name


# Count how often each cleaned affiliation string occurs over all authors
counter = Counter()
for record in records.values():
    for author in record.get("authors", []):
        for affiliation in author.get("affiliations", []):
            raw_name = affiliation["name"]
            if not raw_name:
                # NOTE(review): assumes a missing affiliation may show up as
                # None or "" here; confirm against the pubmed client
                continue
            name = _clean_affiliation_name(raw_name)
            if name:  # nothing useful left after cleaning
                counter[name] += 1

Next steps:

1. Identify country of affiliation
2. Use the country to narrow the ROR groundings down to organizations located in that country

In [6]:
def _get_country(parts):
    last = parts[-1]
    sm = geonames_grounder.ground(last)
    if not sm:
        return None, None
    if len(sm) == 1:
        return sm[0].term.id, sm[0].term.entry_name
    # This happens for Mexico and Singapore since they have a city the same name
    # as the country in many places, there is no country written. This almost 100%
    # USA, which sometimes just writes states
    return None, None


def _get_country_for_geonames(geonames_id):
    """Unimplemented placeholder; calling it always fails.

    NOTE(review): judging by the name, this is meant to map a GeoNames ID to
    the country it belongs to - confirm the intent before implementing.

    :param geonames_id: Currently unused.
    :raises NotImplementedError: Always.
    """
    raise NotImplementedError


def _get_institution(parts: str, country_id: str | None, country_name):  # noqa:C901
    # don't need to look at the last bit again if country is given
    rparts = parts[:-1] if country_id else parts
    for part in reversed(rparts):
        if part.isdigit():  # zip code
            continue
        if len(part) < 3:
            continue  # probably a state
        if " " in part and part.split()[-1].isdigit():  # something plus space plus zip code
            continue
        if geonames_grounder.ground(part):  # it's a city/state/country
            continue

        matches = organization_grounder.ground(part)
        if not matches and part.endswith(")"):
            # Try stripping off some acronym in parentheses
            part = part.rsplit("(", 1)[0]
            matches = organization_grounder.ground(part)
        if not matches and part.startswith("The "):
            part = part.removeprefix("The").strip()
            matches = organization_grounder.ground(part)
        for suffix in ["School of Medicine", "College of Medicine", "Faculty of Medicine"]:
            if not matches and part.endswith(suffix):
                part = part.removeprefix(suffix).strip()
                matches = organization_grounder.ground(part)
        if not matches:
            continue
        if len(matches) == 1:
            return matches[0].term.id, matches[0].term.entry_name

        # Do some disambiguating
        if country_id is None:
            print(f"Got multiple matches but didn't have a country to disambiguate on: {part}")
            for m in matches:
                print("  https://ror.org/" + m.term.id, m.term.entry_name)
            print()
            continue

        for match in matches:
            match_country_geonames = ror_to_country_geoname.get(match.term.id)
            if match_country_geonames == country_id:
                return match.term.id, match.term.entry_name

        print(
            f"\nfrom {parts}\n could not disambiguate '{part}' based on country '{country_name}':"
        )
        for m in matches:
            print("    https://ror.org/" + m.term.id, m.term.entry_name)
        print()

    print(f"\nfailed: {parts}", f"(country = {country_name})" if country_name else "")


# Leading affiliation parts starting with one of these are dropped before grounding
SUBUNIT_PREFIXES = ("Division", "Department", "Faculty of", "Laboratory of", "Office of")

passed = 0
# Affiliation strings that could not be grounded, keyed by detected country name
# (None when no country was detected)
failed = defaultdict(list)
for text in sorted(counter):
    parts = [part.strip() for part in text.split(",")]
    # chomp away many; stop once nothing is left so that parts[0] stays valid
    while parts and parts[0].startswith(SUBUNIT_PREFIXES):
        parts = parts[1:]
    if not parts:
        # Every part was a sub-unit, so there is nothing left to ground
        failed[None].append(text)
        continue
    country_geonames_id, country_name = _get_country(parts)
    institution = _get_institution(parts, country_geonames_id, country_name)
    if institution:
        passed += 1
    else:
        failed[country_name].append(text)

# Number of grounded affiliation strings, then number of failures
print(passed, sum(map(len, failed.values())))


failed: ['Ada Health GmbH', 'Berlin', 'Germany'] (country = Germany)

failed: ['Adagio Therapeutics Inc.', 'Waltham', 'MA 02451', 'USA'] (country = United States)

failed: ['Adimab LLC', 'Lebanon', 'NH 03766', 'USA'] (country = United States)

failed: ['Aichi Agricultural Research Center', 'Nagakute', 'Aichi', 'Japan'] (country = Japan)

failed: ['Altdeep Boston MA 02115 USA'] 

failed: ['AnacletoLab', 'Dipartimento di Informatica', 'Universit`a degli Studi di Milano', 'Via Celoria 18', '20133', 'Milan', 'Italy'] (country = Italy)

failed: ['Answer ALS Consortium', 'LA', 'CA 70184', 'USA'] (country = United States)

failed: ['Antibody Discovery', 'Mapp Biopharmaceutical', 'San Diego', 'CA', 'USA'] (country = United States)

failed: ['Applied Tumour Immunity Clinical Cooperation Unit', 'National Centre for Tumour Diseases (NCT)', 'German Cancer Research Centre (DKFZ)', 'Heidelberg', 'Germany'] (country = Germany)

failed: ['Autophagy Inflammation and Metabolism Center of Biomedical Res

In [7]:
# Number of affiliation strings that failed to ground, per detected country name
Counter({country_name: len(strings) for country_name, strings in failed.items()})

Counter({'United States': 123,
         None: 52,
         'Germany': 24,
         'United Kingdom': 9,
         'France': 9,
         'China': 8,
         'Japan': 7,
         'The Netherlands': 7,
         'Italy': 5,
         'Brazil': 5,
         'Belgium': 4,
         'Chile': 4,
         'Massachusetts': 3,
         'Ohio': 3,
         'India': 3,
         'Czechia': 3,
         'Spain': 3,
         'Argentina': 2,
         'Australia': 2,
         'California': 2,
         'Turkey': 2,
         'Illinois': 2,
         'Austria': 2,
         'Guinea': 2,
         'Atlanta': 2,
         'Canada': 2,
         'Sweden': 2,
         'Maryland': 2,
         'Greece': 2,
         'Louisiana': 2,
         'Salt Lake City': 2,
         'Arizona': 1,
         'South Africa': 1,
         'Russia': 1,
         'Switzerland': 1,
         'Sierra Leone': 1,
         'Finland': 1,
         'Hungary': 1,
         'Norway': 1,
         'Ghana': 1,
         'New Zealand': 1,
         'Tucson': 1,

In [8]:
# Failed affiliation strings for which no country was detected
failed[None]

['Altdeep Boston MA 02115 USA',
 'Banner University Medical Center, Tucson, Arizona (H.H.)',
 'CORe Community Inc., and Department of Epidemiology, Columbia University Mailman School of Public Health, New York, New York (M.H.)',
 'Christiana Care Health Services Inc, Newark, Delaware',
 'Department of Emergency Medicine, University of Illinois Chicago, Chicago, Illinois (J.Y.L.)',
 'Department of Immunobiology, University of Arizona College of Medicine-Tucson, and Arizona Center on Aging, Tucson, Arizona (J.ŽN.)',
 'Department of Medicine, Division of Gastroenterology, Howard University College of Medicine, Washington, DC (A.O.L.)',
 'Department of Medicine, Division of Hematology/Oncology, University of Illinois Chicago, Chicago, Illinois (J.G.Q.)',
 'Department of Medicine, Division of Infectious Diseases, Boston University Medical Campus, Boston, Massachusetts (J.M.)',
 'Department of Medicine, Division of Infectious Diseases, University of California San Francisco, San Francisco, C