In [39]:
import sys
from collections import Counter, defaultdict
from pathlib import Path

import humanize
import networkx as nx
import pandas as pd
from gilda.resources import sqlite_adapter
from indra.literature import pubmed_client
from more_itertools import batched
from tqdm.auto import tqdm

from orcid_downloader import get_gilda_grounder, ground_researcher

In [38]:
%%time
# Load the Gilda grounder and print the size of its ``entries`` container in
# binary units (KiB/MiB/...).
# NOTE(review): ``sys.getsizeof`` is shallow - it reports only the container
# object itself, not the keys and term objects it references, so the printed
# number understates the real memory footprint.
grounder = get_gilda_grounder()
print(humanize.naturalsize(sys.getsizeof(grounder.entries), binary=True))

In [44]:
import json
import sqlite3


def build(grounding_entries, path=None):
    """Write grounding entries into a new SQLite database file.

    One row is stored per normalized text. The terms belonging to that text
    are serialized as a JSON list built from each term's ``to_json()`` output.

    :param grounding_entries: Mapping from normalized text to an iterable of
        term objects that each provide a ``to_json()`` method.
    :param path: Location of the SQLite database to create. The ``None``
        default is kept for signature compatibility only; there is no
        fallback location, so a path must be given.
    :raises TypeError: If ``path`` is ``None``.
    :raises sqlite3.OperationalError: If the ``terms`` table already exists
        in the database at ``path``, e.g. when this is run a second time
        against the same file.
    """
    if path is None:
        raise TypeError("build() requires a database path; got path=None")

    print(f"Starting SQLite database at {path}")
    conn = sqlite3.connect(path)
    # Always release the connection, even if a statement below fails, so the
    # database file is not left open by a half-finished build.
    try:
        cur = conn.cursor()

        # Create the table
        print("Creating the table")
        cur.execute("CREATE TABLE terms (norm_text text not null primary key, terms text)")

        # Insert terms; the generator keeps only one serialized row in memory
        # at a time while still driving the progress bar.
        print("Inserting terms")
        rows = (
            (norm_text, json.dumps([term.to_json() for term in terms]))
            for norm_text, terms in tqdm(grounding_entries.items(), unit_scale=True)
        )
        cur.executemany("INSERT INTO terms (norm_text, terms) VALUES (?, ?)", rows)

        # Build index
        # NOTE(review): norm_text is already the primary key, so this extra
        # index looks redundant; it is kept so the resulting schema is the
        # same as before.
        print("Making index")
        cur.execute("CREATE INDEX norm_index ON terms (norm_text);")

        print("committing")
        conn.commit()
    finally:
        print("closing")
        conn.close()
    print("done")

In [48]:
import pickle

# Pair every normalized-text key of the grounder with the pickle-serialized
# bytes of its terms, materializing the whole list in memory.
# NOTE(review): nothing in the visible part of this notebook consumes
# ``db_rows`` - confirm it is still needed before keeping this cell.
db_rows = [
    (norm_text, pickle.dumps(terms))
    for norm_text, terms in tqdm(grounder.entries.items(), unit_scale=True)
]

In [46]:
%%time
build(grounder.entries, path="/Users/cthoyt/.data/orcid/2023/gilda.db")

In [2]:
%%time
# Time a single researcher-name lookup; the matches are shown as the cell output.
ground_researcher("charlie hoyt")

In [29]:
def get_metadata_batched(pmids, batch_size=200):
    """Fetch PubMed metadata, including detailed authors, in fixed-size batches.

    :param pmids: Iterable of PubMed identifiers. It is sorted before
        batching, so any iterable (set, list, generator) is accepted.
    :param batch_size: Number of identifiers passed to each
        ``pubmed_client.get_metadata_for_ids`` call.
        NOTE(review): 200 is believed to be the largest batch that function
        accepts - confirm before passing a bigger value.
    :returns: A dict merging the results returned for every batch.
    :raises ValueError: If ``batch_size`` is smaller than 1.
    """
    if batch_size < 1:
        raise ValueError(f"batch_size must be at least 1, got {batch_size}")

    # Materialize once: gives a deterministic order and a length even when
    # the input is a generator.
    ordered_pmids = sorted(pmids)
    # Ceiling division, so an exact multiple of batch_size (or an empty input)
    # does not report one batch too many on the progress bar.
    n_batches = -(-len(ordered_pmids) // batch_size)

    results = {}
    for batch in tqdm(
        batched(ordered_pmids, batch_size),
        total=n_batches,
        unit=f"batch of {batch_size}",
        desc="Looking up",
    ):
        results.update(pubmed_client.get_metadata_for_ids(batch, detailed_authors=True))
    return results


def ground_extended(first_name, last_name):
    """Ground an author name, retrying with only the first given-name token.

    The full "<first> <last>" name is tried first. If the stripped first name
    contains a space, a second candidate is built from its first
    whitespace-separated token plus the last name. Candidates are looked up
    in that order with ``ground_researcher``, stopping at the first non-empty
    result.

    :param first_name: Given name(s); surrounding whitespace is stripped.
    :param last_name: Family name; surrounding whitespace is stripped.
    :returns: The first non-empty result of ``ground_researcher``, or an
        empty list if no candidate produced any match.
    """
    given = first_name.strip()
    family = last_name.strip()

    candidates = [f"{given} {family}"]
    if " " in given:
        leading_token = given.split()[0]
        candidates.append(leading_token + " " + family)

    # ``map`` is lazy, so the fallback candidate is only looked up when the
    # full name produced nothing.
    return next((hits for hits in map(ground_researcher, candidates) if hits), [])


def process_pmid_results(rr):
    """Ground the authors of a set of PubMed records.

    Every author with both a first and a last name is looked up with
    ``ground_extended``. Authors missing either name part are skipped.

    :param rr: Mapping from PubMed identifier to a metadata record. Each
        record must have an ``"authors"`` list whose items are dicts with
        ``"first_name"`` and ``"last_name"`` keys.
    :returns: A 4-tuple ``(annotations, orcid_to_papers, ambiguous, misses)``:

        - ``annotations``: list of ``(pubmed, term id)`` pairs, one for each
          author occurrence that grounded to exactly one match (term ids are
          presumably ORCID identifiers - verify against the grounder).
        - ``orcid_to_papers``: plain dict from term id to the sorted list of
          PubMed identifiers annotated with it.
        - ``ambiguous``: ``Counter`` of full names that had several matches.
        - ``misses``: ``Counter`` of full names that had no match.
    """
    annotations = []
    ambiguous = Counter()
    misses = Counter()
    for pubmed, data in tqdm(rr.items(), unit_scale=True, desc="Grounding"):
        for author in data["authors"]:
            first_name = author["first_name"]
            last_name = author["last_name"]
            # Both parts are required to build a lookup name; a missing last
            # name would otherwise fail on string concatenation below.
            if not first_name or not last_name:
                continue

            full_name = f"{first_name} {last_name}"
            matches = ground_extended(first_name, last_name)
            if len(matches) == 1:
                annotations.append((pubmed, matches[0].term.id))
            elif matches:
                # TODO: when several candidates match, try to disambiguate,
                # e.g. by comparing the author's affiliations.
                ambiguous[full_name] += 1
            else:
                misses[full_name] += 1

    papers_by_orcid = defaultdict(set)
    for pubmed, orcid in annotations:
        papers_by_orcid[orcid].add(pubmed)
    # Convert to a plain dict with sorted lists for stable, serializable output.
    orcid_to_papers = {orcid: sorted(papers) for orcid, papers in papers_by_orcid.items()}

    return annotations, orcid_to_papers, ambiguous, misses

In [30]:
BRADFUTE_ORCID = "0000-0002-1985-751X"
# Look up the PubMed identifiers for the quoted author query, fetch their
# metadata, and ground every author on those papers.
bradfute_pmids = pubmed_client.get_ids('"Steven B Bradfute"', use_text_word=False)
bradfute_records = get_metadata_batched(bradfute_pmids)

(
    bradfute_annotations,
    bradfute_orcid_to_paper,
    bradfute_ambiguous,
    bradfute_misses,
) = process_pmid_results(bradfute_records)

print(
    f"There are {len(bradfute_annotations):,} paper-author annotations "
    f"with {len(bradfute_orcid_to_paper)} unique authors and {len(bradfute_pmids):,} "
    f"papers."
)

# The mapping is a plain dict, so an ORCID that was never annotated needs an
# explicit default instead of raising KeyError.
n_bradfute_annotated = len(bradfute_orcid_to_paper.get(BRADFUTE_ORCID, []))
# Guard against dividing by zero when the search returned no papers.
bradfute_fraction = n_bradfute_annotated / len(bradfute_pmids) if bradfute_pmids else 0.0
print(
    f"There are {n_bradfute_annotated:,} ({bradfute_fraction:.1%}) "
    f"papers annotated with Bradfute."
)

In [31]:
from IPython.display import Markdown

# Render the names that had no match, most frequent first, as a Markdown table.
# No column labels are given, so the table headers are the default 0 (name)
# and 1 (count).
Markdown(pd.DataFrame(bradfute_misses.most_common()).to_markdown())

In [32]:
from IPython.display import Markdown

# Render the names that had several matches, most frequent first, as a
# Markdown table. No column labels are given, so the table headers are the
# default 0 (name) and 1 (count).
Markdown(pd.DataFrame(bradfute_ambiguous.most_common()).to_markdown())

In [50]:
# Histogram of how many terms are stored under each key of the grounder's entries.
lookup_counter = Counter(map(len, tqdm(grounder.entries.values(), unit_scale=True)))

In [8]:
# Collect the distinct values of the "pubmed" column (read as the fourth
# column of the TSV). The file is resolved under the current user's home
# directory instead of a hard-coded /Users/<name> prefix, so the cell is not
# tied to a single machine.
pmids = set(
    pd.read_csv(
        Path.home().joinpath(
            "dev/kestrel/src/kestrel/ner/pubmed/pathogen_platform_pubmed.tsv"
        ),
        sep="\t",
        usecols=[3],
    )["pubmed"]
)
len(pmids)