In [39]:
import sys
from collections import Counter, defaultdict
from pathlib import Path

import humanize
import networkx as nx
import pandas as pd
from gilda.resources import sqlite_adapter
from indra.literature import pubmed_client
from more_itertools import batched
from orcid_downloader import ground_researcher, get_gilda_grounder
from tqdm.auto import tqdm

In [38]:
%%time
# Load the grounder and report the size of its `entries` lookup table.
# NOTE: sys.getsizeof is shallow -- it measures only the container object held
# in `grounder.entries`, not the keys and values it references, so the real
# memory footprint is larger than the number printed here.
grounder = get_gilda_grounder()
print(humanize.naturalsize(sys.getsizeof(grounder.entries), binary=True))

3.7 GiB
CPU times: user 319 µs, sys: 73 µs, total: 392 µs
Wall time: 393 µs


In [44]:
import sqlite3
import json
def build(grounding_entries, path=None):
    """Write grounding entries into a new SQLite database with one ``terms`` table.

    Every key of ``grounding_entries`` becomes one row: the key is stored in
    the ``norm_text`` column and the JSON-encoded list of ``term.to_json()``
    results is stored in the ``terms`` column. Progress is reported with
    ``print`` and a ``tqdm`` bar.

    :param grounding_entries: mapping from a normalised text string to an
        iterable of objects that each provide a ``to_json()`` method.
    :param path: location of the SQLite file, handed unchanged to
        ``sqlite3.connect``. NOTE(review): the ``None`` default is not a
        usable value for ``sqlite3.connect`` -- callers are expected to pass
        a real path.
    :raises sqlite3.OperationalError: if a ``terms`` table already exists in
        the database at ``path``.
    """
    print(f'Starting SQLite database at {path}')
    conn = sqlite3.connect(path)
    # The connection is closed even when serialisation fails or the run is
    # interrupted; closing without a commit discards the pending inserts.
    try:
        cur = conn.cursor()

        # Create the table
        print('Creating the table')
        cur.execute(
            "CREATE TABLE terms (norm_text text not null primary key, terms text)"
        )

        # Insert terms: rows are produced lazily so the serialised payloads
        # are never all held in memory at once.
        print('Inserting terms')
        rows = (
            (norm_text, json.dumps([t.to_json() for t in terms]))
            for norm_text, terms in tqdm(grounding_entries.items(), unit_scale=True)
        )
        cur.executemany("INSERT INTO terms (norm_text, terms) VALUES (?, ?)", rows)

        # Build index
        # NOTE(review): the PRIMARY KEY on norm_text already gives SQLite an
        # implicit unique index, so this one looks redundant; it is kept so
        # the resulting schema is unchanged.
        print('Making index')
        cur.execute("CREATE INDEX norm_index ON terms (norm_text);")
        print("committing")
        conn.commit()
        print("closing")
    finally:
        conn.close()
    print("done")

In [48]:
import pickle

# Pair every entry key with the pickled form of the value stored under it.
db_rows = list(
    (key, pickle.dumps(value))
    for key, value in tqdm(grounder.entries.items(), unit_scale=True)
)

  0%|          | 0.00/108M [00:00<?, ?it/s]

KeyboardInterrupt: 

In [46]:
%%time
build(grounder.entries, path="/Users/cthoyt/.data/orcid/2023/gilda.db")

Starting SQLite database at /Users/cthoyt/.data/orcid/2023/gilda.db
Creating the table
Inserting terms


  0%|          | 0.00/108M [00:00<?, ?it/s]

KeyboardInterrupt: 

In [2]:
%%time
# Smoke test: ground a single researcher name. The returned list of scored
# matches is the value this cell displays.
ground_researcher("charlie hoyt")

CPU times: user 15min 39s, sys: 1h 30min 29s, total: 1h 46min 8s
Wall time: 9h 10min 37s


[ScoredMatch(Term(charlie hoyt,Charlie Hoyt,orcid,0000-0003-4423-4370,Charles Tapley Hoyt,synonym,orcid,None,None,None),0.5400948258091115,Match(query=charlie hoyt,ref=Charlie Hoyt,exact=False,space_mismatch=False,dash_mismatches=set(),cap_combos=[('all_lower', 'initial_cap'), ('all_lower', 'initial_cap')]))]

In [29]:
def get_metadata_batched(pmids, batch_size=200):
    """Look up PubMed metadata, with detailed author records, in fixed-size batches.

    :param pmids: iterable of PubMed identifiers. They are sorted before
        batching so the requests are issued in a deterministic order.
    :param batch_size: positive number of identifiers sent per call to
        ``pubmed_client.get_metadata_for_ids``. NOTE(review): the default of
        200 presumably mirrors a per-request limit of the client -- confirm
        before raising it.
    :return: a single dict that merges the result of every batch via
        ``dict.update``.
    """
    ordered = sorted(pmids)
    # Ceiling division: an exact multiple of batch_size (or an empty input)
    # must not advertise one batch more than will actually be processed.
    n_batches = (len(ordered) + batch_size - 1) // batch_size
    results = {}
    for batch in tqdm(
        batched(ordered, batch_size),
        total=n_batches,
        unit=f"batch of {batch_size}",
        desc="Looking up",
    ):
        results.update(pubmed_client.get_metadata_for_ids(batch, detailed_authors=True))
    return results


def ground_extended(first_name, last_name):
    """Ground an author name, retrying without middle names if needed.

    The whitespace-trimmed "first last" string is tried first. When the first
    name holds a space, a second candidate made of only its leading token plus
    the last name is tried next. The matches for the first candidate that
    yields any are returned; an empty list means nothing matched.
    """
    given = first_name.strip()
    family = last_name.strip()
    candidates = [f"{given} {family}"]
    if " " in given:
        leading_token = given.split(maxsplit=1)[0]
        candidates.append(leading_token + " " + family)
    # map() is lazy, so later candidates are only looked up when the earlier
    # ones came back empty.
    return next((hits for hits in map(ground_researcher, candidates) if hits), [])


def process_pmid_results(rr):
    """Ground every named author of a set of PubMed records.

    :param rr: dict mapping a PubMed identifier to a record whose
        ``'authors'`` entry is a list of dicts with ``'first_name'`` and
        ``'last_name'`` keys.
    :return: a 4-tuple of

        - ``annotations``: list of ``(pubmed, term id)`` pairs, one for each
          author that grounded to exactly one match,
        - ``orcid_to_papers``: dict from that term id to the sorted list of
          PubMed identifiers it was annotated on,
        - ``ambiguous``: ``Counter`` of "first last" names that had more than
          one match,
        - ``misses``: ``Counter`` of "first last" names that had no match.
    """
    annotations = []
    ambiguous = Counter()
    misses = Counter()
    for pubmed, data in tqdm(rr.items(), unit_scale=True, desc="Grounding"):
        for author in data['authors']:
            first_name = author['first_name']
            if not first_name:
                # Authors without a first name are not grounded at all.
                continue
            last_name = author['last_name']
            matches = ground_extended(first_name, last_name)
            if len(matches) == 1:
                annotations.append((pubmed, matches[0].term.id))
                continue
            # TODO: when there are multiple matches, see if any of the
            # author's affiliations can disambiguate them.
            tally = ambiguous if matches else misses
            tally[first_name + " " + last_name] += 1

    orcid_to_papers = defaultdict(set)
    for pubmed, orcid in annotations:
        orcid_to_papers[orcid].add(pubmed)
    # Hand back a plain dict with sorted lists so lookups of unknown ids fail
    # loudly and the output order is deterministic.
    orcid_to_papers = {k: sorted(v) for k, v in orcid_to_papers.items()}

    return annotations, orcid_to_papers, ambiguous, misses

In [30]:
# Case study: fetch every paper found for "Steven B Bradfute", ground all of
# their authors, and check how often Bradfute himself was uniquely grounded.
BRADFUTE_ORCID = '0000-0002-1985-751X'
bradfute_pmids = pubmed_client.get_ids('"Steven B Bradfute"', use_text_word=False)
bradfute_records = get_metadata_batched(bradfute_pmids)

(
    bradfute_annotations, 
    bradfute_orcid_to_paper, 
    bradfute_ambiguous,
    bradfute_misses,
) = process_pmid_results(bradfute_records)

print(
    f"There are {len(bradfute_annotations):,} paper-author annotations "
    f"with {len(bradfute_orcid_to_paper)} unique authors and {len(bradfute_pmids):,} "
    f"papers."
)

# The ORCID is only a key when at least one paper grounded to it uniquely, so
# fall back to "no papers" instead of raising KeyError.
n_bradfute_annotated = len(bradfute_orcid_to_paper.get(BRADFUTE_ORCID, []))
# Guard the ratio against an empty search result.
bradfute_fraction = n_bradfute_annotated / len(bradfute_pmids) if bradfute_pmids else 0.0
print(
    f"There are {n_bradfute_annotated:,} ({bradfute_fraction:.1%}) "
    f"papers annotated with Bradfute."
)

Looking up:   0%|          | 0/1 [00:00<?, ?batch of 200/s]

Grounding:   0%|          | 0.00/76.0 [00:00<?, ?it/s]

There are 882 paper-author annotations with 491 unique authors and 76 papers.
There are 72 (94.7%) papers annotated with Bradfute.


In [31]:
from IPython.display import Markdown
# Show the author names that had no match as a Markdown table, most frequent
# first (column 0 is the name, column 1 its count).
Markdown(pd.DataFrame(bradfute_misses.most_common()).to_markdown())

|    | 0                           |   1 |
|---:|:----------------------------|----:|
|  0 | Kelly S Stuthman            |   4 |
|  1 | María A Ayllón              |   3 |
|  2 | Ali Mirazimi                |   3 |
|  3 | Rayapati Naidu              |   3 |
|  4 | Jennifer Pocius             |   2 |
|  5 | Lloyd H Michael             |   2 |
|  6 | William C Raschke           |   2 |
|  7 | Siham Nakamura              |   2 |
|  8 | Amy C Shurtleff             |   2 |
|  9 | Sarah Yarborough            |   2 |
| 10 | Christine Merle             |   2 |
| 11 | Manfred Theisen             |   2 |
| 12 | Elena Dal Bó                |   2 |
| 13 | Kurt C Schwalm              |   2 |
| 14 | David S Peabody             |   2 |
| 15 | Alexandra Serris            |   2 |
| 16 | Teresa D Gallardo           |   1 |
| 17 | Vinciane Gaussin            |   1 |
| 18 | Xuan Chi                    |   1 |
| 19 | Andrew J McCooey            |   1 |
| 20 | Alan Poindexter             |   1 |
| 21 | Rahshaana Green             |   1 |
| 22 | Kathyjo A Jackson           |   1 |
| 23 | Gretchen Darlington         |   1 |
| 24 | Denise R Braun              |   1 |
| 25 | Joan B Geisbert             |   1 |
| 26 | Jason Paragas               |   1 |
| 27 | Derron A Alves              |   1 |
| 28 | Sean VanTongeren            |   1 |
| 29 | Warren V Kalina             |   1 |
| 30 | Megan P Tierney             |   1 |
| 31 | Olga Sirin                  |   1 |
| 32 | Mehveen G Merchant          |   1 |
| 33 | Ricky L Ulrich              |   1 |
| 34 | Loreen Lofts                |   1 |
| 35 | Meagan T Cooper             |   1 |
| 36 | Sean A VanTongeren          |   1 |
| 37 | Christine A Mech            |   1 |
| 38 | Mayumi Matsuoka             |   1 |
| 39 | Jerome Jacques              |   1 |
| 40 | Sufi Morshed                |   1 |
| 41 | Lian Dong                   |   1 |
| 42 | Jacqueline D Gearhart       |   1 |
| 43 | Cary Retterer               |   1 |
| 44 | Marc A Hogenbirk            |   1 |
| 45 | Esteban Roberts             |   1 |
| 46 | Sharon Master               |   1 |
| 47 | Prafullakumar Tailor        |   1 |
| 48 | Cathleen M Lind             |   1 |
| 49 | Lori T Eccleston            |   1 |
| 50 | Kevin B Spurgers            |   1 |
| 51 | Radha K Maheshwari          |   1 |
| 52 | Michal H Mudd               |   1 |
| 53 | Yangsheng Yu                |   1 |
| 54 | Lindsey G Luo               |   1 |
| 55 | Michael A Drebot            |   1 |
| 56 | Serpil Karadağ              |   1 |
| 57 | William Marciel de Souza    |   1 |
| 58 | Giovanni P Martelli         |   1 |
| 59 | Nicole Mielke-Ehret         |   1 |
| 60 | Hans-Peter Mühlbach         |   1 |
| 61 | Bertus K Rima               |   1 |
| 62 | Yukio Shirako               |   1 |
| 63 | Jana Širmarová              |   1 |
| 64 | David G Grenache            |   1 |
| 65 | Surekha Surendranathan      |   1 |
| 66 | Nathan Donart               |   1 |
| 67 | Ivy Foo-Hurwitz             |   1 |
| 68 | M Lisa Phipps               |   1 |
| 69 | Chris J Sheehan             |   1 |
| 70 | Michal Mudd                 |   1 |
| 71 | Janae Martinez              |   1 |
| 72 | Phillip Seidenberg          |   1 |
| 73 | Claudette Feuvrier          |   1 |
| 74 | Hidenori Horikawa           |   1 |
| 75 | François Lieffrig           |   1 |
| 76 | Laurane Pallandre           |   1 |
| 77 | Françoise Pozet             |   1 |
| 78 | Kenta Tsunekawa             |   1 |
| 79 | Junki Yamasaki              |   1 |
| 80 | Boris Julg                  |   1 |
| 81 | Erin Quinlan                |   1 |
| 82 | Marila Gennaro              |   1 |
| 83 | Rachel Atchley-Challenner   |   1 |
| 84 | Urania Argueta              |   1 |
| 85 | Sylvia I Davila Nieves      |   1 |
| 86 | Vivian Gainer               |   1 |
| 87 | Minjoung Go                 |   1 |
| 88 | Janko Z Nikolich            |   1 |
| 89 | Elizabeth Ojemakinde        |   1 |
| 90 | Russel R Bakken             |   1 |
| 91 | Virendra Kumar Baranwal     |   1 |
| 92 | Kar Mun Chooi               |   1 |
| 93 | JoëlleGoüy de Bellocq       |   1 |
| 94 | Ines Günther                |   1 |
| 95 | Yusuke Hasegawa             |   1 |
| 96 | Venkidusamy Kavi Sidharthan |   1 |
| 97 | Akio Tatara                 |   1 |
| 98 | Kazutaka Yano               |   1 |

In [32]:
from IPython.display import Markdown
# Show the author names that had more than one match as a Markdown table,
# most frequent first (column 0 is the name, column 1 its count).
Markdown(pd.DataFrame(bradfute_ambiguous.most_common()).to_markdown())

|     | 0                      |   1 |
|----:|:-----------------------|----:|
|   0 | Chunyan Ye             |  16 |
|   1 | Elizabeth C Clarke     |   8 |
|   2 | Kartik Chandran        |   7 |
|   3 | John M Dye             |   5 |
|   4 | Scott M Anthony        |   5 |
|   5 | Douglas J Perkins      |   4 |
|   6 | Gustavo Palacios       |   4 |
|   7 | Natarajan Ayithan      |   3 |
|   8 | Martin Beer            |   3 |
|   9 | Éric Bergeron          |   3 |
|  10 | Inmaculada Casas       |   3 |
|  11 | Ralf Dürrwald          |   3 |
|  12 | Andrew J Easton        |   3 |
|  13 | María Laura García     |   3 |
|  14 | Anthony Griffiths      |   3 |
|  15 | Stephan Günther        |   3 |
|  16 | John Hammond           |   3 |
|  17 | Holly R Hughes         |   3 |
|  18 | Amy J Lambert          |   3 |
|  19 | Jiànróng Lǐ            |   3 |
|  20 | Sergio H Marshall      |   3 |
|  21 | John W McCauley        |   3 |
|  22 | José A Navarro         |   3 |
|  23 | Anna Papa              |   3 |
|  24 | Daniel R Pérez         |   3 |
|  25 | Florian Pfaff          |   3 |
|  26 | Martin Schwemmle       |   3 |
|  27 | Jin-Won Song           |   3 |
|  28 | Nikos Vasilakis        |   3 |
|  29 | Peter J Walker         |   3 |
|  30 | Yong-Zhen Zhang        |   3 |
|  31 | Yan Guo                |   3 |
|  32 | Daniel K Reed          |   2 |
|  33 | Arthur O Anderson      |   2 |
|  34 | Krishna Kota           |   2 |
|  35 | Michael A Mandell      |   2 |
|  36 | Santosh Chauhan        |   2 |
|  37 | Russell R Bakken       |   2 |
|  38 | Eduardo Anaya          |   2 |
|  39 | Ravi Durvasula         |   2 |
|  40 | Zachary R Stromberg    |   2 |
|  41 | Qiuying Cheng          |   2 |
|  42 | Gregory J Mertz        |   2 |
|  43 | Suresh Kumar           |   2 |
|  44 | Ivan V Kuzmin          |   2 |
|  45 | Beatriz Navarro        |   2 |
|  46 | Maria S Salvato        |   2 |
|  47 | John V Williams        |   2 |
|  48 | F Murilo Zerbini       |   2 |
|  49 | Joseph A Cook          |   2 |
|  50 | Michelle Harkins       |   2 |
|  51 | Andrew M Skidmore      |   2 |
|  52 | Markus Keller          |   2 |
|  53 | Felix A Rey            |   2 |
|  54 | Andrew S Herbert       |   2 |
|  55 | J Felix Drexler        |   2 |
|  56 | Guozhong Feng          |   2 |
|  57 | Seiji Hongō            |   2 |
|  58 | Kenji Kubota           |   2 |
|  59 | Jun-Min Li             |   2 |
|  60 | Yutaro Neriya          |   2 |
|  61 | Sofia Paraskevopoulou  |   2 |
|  62 | Benjamin Chen          |   2 |
|  63 | Teruya Nakamura        |   1 |
|  64 | Robert J Schwartz      |   1 |
|  65 | Carlos A Ramos         |   1 |
|  66 | Patricia Fonseca       |   1 |
|  67 | Daniela M Oliveira     |   1 |
|  68 | Yayun Zheng            |   1 |
|  69 | David Steffen          |   1 |
|  70 | Kuan-Yin K Lin         |   1 |
|  71 | Alice J Chen           |   1 |
|  72 | C Joseph Fisk          |   1 |
|  73 | D Anthony Alves        |   1 |
|  74 | Tsung-Hsien Chang      |   1 |
|  75 | Steven Jones           |   1 |
|  76 | Mark A Smith           |   1 |
|  77 | Richard S Hotchkiss    |   1 |
|  78 | Steven C Wood          |   1 |
|  79 | Xiaoli Chi             |   1 |
|  80 | John N Misasi          |   1 |
|  81 | James M Cunningham     |   1 |
|  82 | M Javad Aman           |   1 |
|  83 | Nicolas Dupont         |   1 |
|  84 | Tom Egil Hansen        |   1 |
|  85 | Michael Mandell        |   1 |
|  86 | Diana Fisher           |   1 |
|  87 | Jeffrey W Cohen        |   1 |
|  88 | Zahra Ahmed            |   1 |
|  89 | Tomonori Kimura        |   1 |
|  90 | Anna Waller            |   1 |
|  91 | Kiran Bhaskar          |   1 |
|  92 | Britney Martinez       |   1 |
|  93 | Christopher L Cooper   |   1 |
|  94 | Guangshun Wang         |   1 |
|  95 | Yan Liu                |   1 |
|  96 | Amanda K Zhang         |   1 |
|  97 | St Patrick Reid        |   1 |
|  98 | Jay S Raval            |   1 |
|  99 | Michelle S Harkins     |   1 |
| 100 | Simon J Anthony        |   1 |
| 101 | Tomáš Bartonička       |   1 |
| 102 | Brian H Bird           |   1 |
| 103 | Carol Blair            |   1 |
| 104 | John Chamberlain       |   1 |
| 105 | Biao Chen              |   1 |
| 106 | J Christopher S Clegg  |   1 |
| 107 | Ian Crozier            |   1 |
| 108 | Alberto M R Dávila     |   1 |
| 109 | Mohamed Hassan         |   1 |
| 110 | Bernd Hoffmann         |   1 |
| 111 | Seiji Hongo            |   1 |
| 112 | Rodrigo Jardim         |   1 |
| 113 | Qi Jin                 |   1 |
| 114 | Karen E Keller         |   1 |
| 115 | Robert A Lamb          |   1 |
| 116 | Eric M Leroy           |   1 |
| 117 | Dexin Li               |   1 |
| 118 | Mifang Liang           |   1 |
| 119 | Wénwén Liú             |   1 |
| 120 | Yàn Liú                |   1 |
| 121 | Robert R Martin        |   1 |
| 122 | Sébastien Massart      |   1 |
| 123 | Renata C Oliveira      |   1 |
| 124 | Susan Payne            |   1 |
| 125 | Amadou Alpha Sall      |   1 |
| 126 | Muhammad Z Shabbir     |   1 |
| 127 | Xiǎohóng Shí           |   1 |
| 128 | Zhènglì Shí            |   1 |
| 129 | Peter Simmonds         |   1 |
| 130 | David M Stone          |   1 |
| 131 | Petra Straková         |   1 |
| 132 | Hui Wang               |   1 |
| 133 | Jianwei Wang           |   1 |
| 134 | Xifeng Wang            |   1 |
| 135 | Lin-Fa Wang            |   1 |
| 136 | Tàiyún Wèi             |   1 |
| 137 | Heather Wells          |   1 |
| 138 | Zhìqiáng Wú            |   1 |
| 139 | Xin Yang               |   1 |
| 140 | Xuejie Yu              |   1 |
| 141 | Tong Zhang             |   1 |
| 142 | Guohui Zhou            |   1 |
| 143 | Xueping Zhou           |   1 |
| 144 | Matthew J Garcia       |   1 |
| 145 | Shruti Bansal          |   1 |
| 146 | Matthew N Rush         |   1 |
| 147 | Jennifer S Martinez    |   1 |
| 148 | Maria Eugenia Dieterle |   1 |
| 149 | Ruheena Javed          |   1 |
| 150 | Ashish Jain            |   1 |
| 151 | Karthikeyan Tangavelou |   1 |
| 152 | Jing Li                |   1 |
| 153 | Shuguang Leng          |   1 |
| 154 | Jeremy Edwards         |   1 |
| 155 | Gregory Mertz          |   1 |
| 156 | Mark Unruh             |   1 |
| 157 | J Pedro Teixeira       |   1 |
| 158 | Rémy Bruggmann         |   1 |
| 159 | Mark Crane             |   1 |
| 160 | Dongsheng Luo          |   1 |
| 161 | Peter G Mohr           |   1 |
| 162 | Richard N Morrison     |   1 |
| 163 | Stephen R Welch        |   1 |
| 164 | Jana M Ritter          |   1 |
| 165 | Katherine A Davies     |   1 |
| 166 | Alexandra Fowler       |   1 |
| 167 | Christian R Gomez      |   1 |
| 168 | Thomas J Connors       |   1 |
| 169 | William Brian Reeves   |   1 |
| 170 | Michael A Portman      |   1 |
| 171 | Sarah E Jolley         |   1 |
| 172 | Bruce Levy             |   1 |
| 173 | Upinder Singh          |   1 |
| 174 | Carolin C M Schulte    |   1 |
| 175 | George A Alba          |   1 |
| 176 | Shamik Bhattacharyya   |   1 |
| 177 | Hector Bonilla         |   1 |
| 178 | Mario Castro           |   1 |
| 179 | James Chan             |   1 |
| 180 | Peter Chen             |   1 |
| 181 | Helen Y Chu            |   1 |
| 182 | Rebecca G Clifton      |   1 |
| 183 | Vivian Fonseca         |   1 |
| 184 | Kelly S Gibson         |   1 |
| 185 | Jenny E Han            |   1 |
| 186 | James Heath            |   1 |
| 187 | Carla Hernandez        |   1 |
| 188 | Rachel Hess            |   1 |
| 189 | Susan E Hoover         |   1 |
| 190 | Beatrice Huang         |   1 |
| 191 | Brenna L Hughes        |   1 |
| 192 | Janice John            |   1 |
| 193 | Michael R Jordan       |   1 |
| 194 | John D Kelly           |   1 |
| 195 | Jonathan D Klein       |   1 |
| 196 | Allison A Lambert      |   1 |
| 197 | Michele T Longo        |   1 |
| 198 | Carlos A Luciano       |   1 |
| 199 | Jason H Maley          |   1 |
| 200 | Hector Mendez-Figueroa |   1 |
| 201 | Shawn N Murphy         |   1 |
| 202 | Robert B Neuman        |   1 |
| 203 | Anna Palatnik          |   1 |
| 204 | Samuel Parry           |   1 |
| 205 | Thomas F Patterson     |   1 |
| 206 | John G Quigley         |   1 |
| 207 | Uma Reddy              |   1 |
| 208 | Rebecca Reece          |   1 |
| 209 | W B Reeves             |   1 |
| 210 | Dwight J Rouse         |   1 |
| 211 | Jeffrey A Sparks       |   1 |
| 212 | Barbara S Taylor       |   1 |
| 213 | John M Thorp           |   1 |
| 214 | Katherine R Tuttle     |   1 |
| 215 | Zachary S Wallace      |   1 |
| 216 | Steven J Weiner        |   1 |
| 217 | Lynn M Yee             |   1 |
| 218 | Yan Sun                |   1 |
| 219 | Junya Abe              |   1 |
| 220 | Wenjie Gong            |   1 |
| 221 | Colleen M Higgins      |   1 |
| 222 | Kenichi Ikeda          |   1 |
| 223 | Ran Liu                |   1 |
| 224 | Tomoyuki Okada         |   1 |
| 225 | Xin Tian               |   1 |
| 226 | Han Xia                |   1 |
| 227 | Zhiming Yuan           |   1 |
| 228 | Guilin Zhang           |   1 |
| 229 | Song Zhang             |   1 |
| 230 | Lu Zhao                |   1 |
| 231 | C Kim                  |   1 |
| 232 | K Coombs               |   1 |

In [50]:
# Tally how many entry keys hold 1, 2, 3, ... items in their value.
lookup_counter = Counter(
    map(len, tqdm(grounder.entries.values(), unit_scale=True))
)

  0%|          | 0.00/108M [00:00<?, ?it/s]

In [8]:
# Collect the distinct PubMed identifiers from the fourth column (named
# 'pubmed') of the pathogen-platform TSV. The file is located relative to the
# current user's home directory instead of one hard-coded account.
# NOTE(review): this still assumes the kestrel checkout lives at ~/dev/kestrel.
PATHOGEN_PMIDS_PATH = (
    Path.home() / "dev" / "kestrel" / "src" / "kestrel" / "ner" / "pubmed"
    / "pathogen_platform_pubmed.tsv"
)
pmids = set(
    pd.read_csv(PATHOGEN_PMIDS_PATH, sep='\t', usecols=[3])['pubmed']
)
len(pmids)

7824