In [1]:
from collections import Counter, defaultdict

import pandas as pd
from indra.literature import pubmed_client
from IPython.display import Markdown
from more_itertools import batched
from tqdm.auto import tqdm

from orcid_downloader import ground_researcher

In [2]:
def _get_metadata_batched(pmids, batch_size=200):
    """Look up PubMed metadata for many identifiers in fixed-size batches.

    :param pmids: An iterable of PubMed identifiers. It is sorted and
        materialized once, so one-shot iterables are accepted.
    :param batch_size: The maximum number of identifiers passed to each call
        of ``pubmed_client.get_metadata_for_ids``. Defaults to 200.
    :returns: A single dictionary obtained by merging the dictionary returned
        for every batch (requested with ``detailed_authors=True``).
    :raises ValueError: If ``batch_size`` is smaller than one.
    """
    if batch_size < 1:
        raise ValueError(f"batch_size must be at least 1, got {batch_size}")

    pmid_list = sorted(pmids)
    # Ceiling division gives the exact number of batches. The previous
    # ``1 + n // size`` estimate was one too high whenever n was a multiple
    # of the batch size (including n == 0), so the bar never reached 100%.
    n_batches = (len(pmid_list) + batch_size - 1) // batch_size

    results = {}
    for batch in tqdm(
        batched(pmid_list, batch_size),
        total=n_batches,
        unit=f"batch of {batch_size}",
        desc="Looking up",
    ):
        results.update(pubmed_client.get_metadata_for_ids(batch, detailed_authors=True))
    return results


def _process_pmid_results(rr):
    """Ground the authors of each PubMed record with ``ground_researcher``.

    :param rr: A dictionary from PubMed identifier to a metadata record. Each
        record is read through its ``"authors"`` entry, and each author through
        its ``"first_name"`` and ``"last_name"`` entries.
    :returns: A four-tuple of

        1. a list of ``(pubmed, identifier)`` pairs, one for every author name
           that grounded to exactly one match (the identifier is
           ``match.term.id``, presumably an ORCID - confirm against
           ``ground_researcher``),
        2. a dictionary from each such identifier to the sorted list of PubMed
           identifiers it was annotated on,
        3. a counter of full names that grounded to more than one match,
        4. a counter of full names that did not ground at all.
    """
    annotations = []
    ambiguous = Counter()
    misses = Counter()
    for pubmed, data in tqdm(rr.items(), unit_scale=True, desc="Grounding"):
        for author in data["authors"]:
            first_name = author["first_name"]
            if not first_name:
                continue
            last_name = author["last_name"]
            # Mirror the first-name guard: without a last name there is no
            # full name to ground. Previously a ``None`` here was grounded as
            # the literal text "<first> None" and then raised a TypeError when
            # the name was concatenated for the counters.
            if not last_name:
                continue

            # Build the name once so that the text that is grounded and the
            # text that is counted can never drift apart.
            name = f"{first_name} {last_name}"
            matches = ground_researcher(name)
            if len(matches) == 1:
                annotations.append((pubmed, matches[0].term.id))
            elif matches:
                # TODO: when there are multiple matches, try to disambiguate
                #  them using the author's affiliations
                ambiguous[name] += 1
            else:
                misses[name] += 1

    # Invert the annotations; a set removes repeated papers per identifier
    orcid_to_papers = defaultdict(set)
    for pubmed, orcid in annotations:
        orcid_to_papers[orcid].add(pubmed)
    # Return a plain dictionary with deterministic (sorted) paper lists
    orcid_to_papers = {k: sorted(v) for k, v in orcid_to_papers.items()}

    return annotations, orcid_to_papers, ambiguous, misses

In [3]:
# The identifier under which Bradfute's own papers are looked up in the
# grounding results below
BRADFUTE_ORCID = "0000-0002-1985-751X"
# Query PubMed with the quoted name, then fetch the metadata of every hit
bradfute_pmids = pubmed_client.get_ids('"Steven B Bradfute"', use_text_word=False)
bradfute_records = _get_metadata_batched(bradfute_pmids)

(
    bradfute_annotations,
    bradfute_orcid_to_paper,
    bradfute_ambiguous,
    bradfute_misses,
) = _process_pmid_results(bradfute_records)

print(
    f"There are {len(bradfute_annotations):,} paper-author annotations "
    f"with {len(bradfute_orcid_to_paper)} unique authors and {len(bradfute_pmids):,} "
    f"papers."
)

# The mapping is a plain dictionary, so use .get(): if no author grounded to
# this identifier the count is zero instead of a KeyError
n_bradfute_annotated = len(bradfute_orcid_to_paper.get(BRADFUTE_ORCID, []))
# Guard the ratio so an empty search result does not raise ZeroDivisionError
bradfute_fraction = n_bradfute_annotated / len(bradfute_pmids) if bradfute_pmids else 0.0
print(
    f"There are {n_bradfute_annotated:,} ({bradfute_fraction:.1%}) "
    f"papers annotated with Bradfute."
)

Looking up:   0%|          | 0/1 [00:00<?, ?batch of 200/s]

Grounding:   0%|          | 0.00/81.0 [00:00<?, ?it/s]

There are 967 paper-author annotations with 525 unique authors and 81 papers.
There are 76 (93.8%) papers annotated with Bradfute.


In [4]:
# Author names that did not ground to any match, most frequent first.
# Explicit column labels replace the default "0"/"1" headers in the table.
Markdown(
    pd.DataFrame(bradfute_misses.most_common(), columns=["author", "count"]).to_markdown()
)

|    | 0                           |   1 |
|---:|:----------------------------|----:|
|  0 | Kelly S Stuthman            |   4 |
|  1 | William C Raschke           |   2 |
|  2 | Amy C Shurtleff             |   2 |
|  3 | Kurt C Schwalm              |   2 |
|  4 | Boris Julg                  |   2 |
|  5 | Vinciane Gaussin            |   1 |
|  6 | Joan B Geisbert             |   1 |
|  7 | Sean VanTongeren            |   1 |
|  8 | Warren V Kalina             |   1 |
|  9 | Sean A VanTongeren          |   1 |
| 10 | Cary Retterer               |   1 |
| 11 | Kevin B Spurgers            |   1 |
| 12 | Michael A Drebot            |   1 |
| 13 | Nicole Mielke-Ehret         |   1 |
| 14 | Hans-Peter Mühlbach         |   1 |
| 15 | Yukio Shirako               |   1 |
| 16 | Jana Širmarová              |   1 |
| 17 | David G Grenache            |   1 |
| 18 | Surekha Surendranathan      |   1 |
| 19 | Ivy Foo-Hurwitz             |   1 |
| 20 | Claudette Feuvrier          |   1 |
| 21 | François Lieffrig           |   1 |
| 22 | Laurane Pallandre           |   1 |
| 23 | Françoise Pozet             |   1 |
| 24 | Rachel Atchley-Challenner   |   1 |
| 25 | Urania Argueta              |   1 |
| 26 | Vivian Gainer               |   1 |
| 27 | Janko Z Nikolich            |   1 |
| 28 | Elizabeth Ojemakinde        |   1 |
| 29 | Venkidusamy Kavi Sidharthan |   1 |
| 30 | Jaecy K Banther-McConnell   |   1 |
| 31 | Dustin Arsnoe               |   1 |
| 32 | Diana Riner                 |   1 |
| 33 | Mary Grace Stobierski       |   1 |
| 34 | Janko Ž Nikolich            |   1 |

In [5]:
# Author names that grounded to more than one match, most frequent first.
# Explicit column labels replace the default "0"/"1" headers in the table.
Markdown(
    pd.DataFrame(bradfute_ambiguous.most_common(), columns=["author", "count"]).to_markdown()
)

|     | 0                        |   1 |
|----:|:-------------------------|----:|
|   0 | Chunyan Ye               |  17 |
|   1 | Elizabeth C Clarke       |   8 |
|   2 | Kartik Chandran          |   8 |
|   3 | John M Dye               |   5 |
|   4 | Scott M Anthony          |   5 |
|   5 | Douglas J Perkins        |   4 |
|   6 | Gustavo Palacios         |   4 |
|   7 | Natarajan Ayithan        |   3 |
|   8 | María A Ayllón           |   3 |
|   9 | Martin Beer              |   3 |
|  10 | Éric Bergeron            |   3 |
|  11 | Inmaculada Casas         |   3 |
|  12 | Ralf Dürrwald            |   3 |
|  13 | Andrew J Easton          |   3 |
|  14 | María Laura García       |   3 |
|  15 | Anthony Griffiths        |   3 |
|  16 | Stephan Günther          |   3 |
|  17 | John Hammond             |   3 |
|  18 | Holly R Hughes           |   3 |
|  19 | Amy J Lambert            |   3 |
|  20 | Jiànróng Lǐ              |   3 |
|  21 | Sergio H Marshall        |   3 |
|  22 | John W McCauley          |   3 |
|  23 | Rayapati Naidu           |   3 |
|  24 | José A Navarro           |   3 |
|  25 | Anna Papa                |   3 |
|  26 | Daniel R Pérez           |   3 |
|  27 | Florian Pfaff            |   3 |
|  28 | Renato O Resende         |   3 |
|  29 | Martin Schwemmle         |   3 |
|  30 | Jin-Won Song             |   3 |
|  31 | Nikos Vasilakis          |   3 |
|  32 | Peter J Walker           |   3 |
|  33 | Yong-Zhen Zhang          |   3 |
|  34 | Yan Guo                  |   3 |
|  35 | Joseph A Cook            |   3 |
|  36 | Lloyd H Michael          |   2 |
|  37 | Daniel K Reed            |   2 |
|  38 | Arthur O Anderson        |   2 |
|  39 | Siham Nakamura           |   2 |
|  40 | Krishna Kota             |   2 |
|  41 | Michael A Mandell        |   2 |
|  42 | Santosh Chauhan          |   2 |
|  43 | Russell R Bakken         |   2 |
|  44 | Eduardo Anaya            |   2 |
|  45 | Sarah Yarborough         |   2 |
|  46 | Christine Merle          |   2 |
|  47 | Manfred Theisen          |   2 |
|  48 | Ravi Durvasula           |   2 |
|  49 | Zachary R Stromberg      |   2 |
|  50 | Qiuying Cheng            |   2 |
|  51 | Gregory J Mertz          |   2 |
|  52 | Suresh Kumar             |   2 |
|  53 | Ivan V Kuzmin            |   2 |
|  54 | Beatriz Navarro          |   2 |
|  55 | Maria S Salvato          |   2 |
|  56 | John V Williams          |   2 |
|  57 | F Murilo Zerbini         |   2 |
|  58 | David S Peabody          |   2 |
|  59 | Michelle Harkins         |   2 |
|  60 | Andrew M Skidmore        |   2 |
|  61 | Laura C Polanco          |   2 |
|  62 | Alexandra Serris         |   2 |
|  63 | Markus Keller            |   2 |
|  64 | Felix A Rey              |   2 |
|  65 | Andrew S Herbert         |   2 |
|  66 | William M de Souza       |   2 |
|  67 | J Felix Drexler          |   2 |
|  68 | Guozhong Feng            |   2 |
|  69 | Seiji Hongō              |   2 |
|  70 | Kenji Kubota             |   2 |
|  71 | Jun-Min Li               |   2 |
|  72 | Yutaro Neriya            |   2 |
|  73 | Sofia Paraskevopoulou    |   2 |
|  74 | Benjamin Chen            |   2 |
|  75 | Upinder Singh            |   2 |
|  76 | Peter Chen               |   2 |
|  77 | Minjoung Go              |   2 |
|  78 | Carla Hernandez          |   2 |
|  79 | Rachel Hess              |   2 |
|  80 | Thomas F Patterson       |   2 |
|  81 | John G Quigley           |   2 |
|  82 | Dwight J Rouse           |   2 |
|  83 | Barbara S Taylor         |   2 |
|  84 | C Kim                    |   2 |
|  85 | Teresa D Gallardo        |   1 |
|  86 | Teruya Nakamura          |   1 |
|  87 | Xuan Chi                 |   1 |
|  88 | Robert J Schwartz        |   1 |
|  89 | Andrew J McCooey         |   1 |
|  90 | Carlos A Ramos           |   1 |
|  91 | Patricia Fonseca         |   1 |
|  92 | Alan Poindexter          |   1 |
|  93 | Daniela M Oliveira       |   1 |
|  94 | Rahshaana Green          |   1 |
|  95 | Yayun Zheng              |   1 |
|  96 | Kathyjo A Jackson        |   1 |
|  97 | David Steffen            |   1 |
|  98 | Jason Paragas            |   1 |
|  99 | Derron A Alves           |   1 |
| 100 | Kuan-Yin K Lin           |   1 |
| 101 | Megan P Tierney          |   1 |
| 102 | Alice J Chen             |   1 |
| 103 | Olga Sirin               |   1 |
| 104 | Mehveen G Merchant       |   1 |
| 105 | C Joseph Fisk            |   1 |
| 106 | Ricky L Ulrich           |   1 |
| 107 | Loreen Lofts             |   1 |
| 108 | Meagan T Cooper          |   1 |
| 109 | D Anthony Alves          |   1 |
| 110 | Christine A Mech         |   1 |
| 111 | Tsung-Hsien Chang        |   1 |
| 112 | Mayumi Matsuoka          |   1 |
| 113 | Steven Jones             |   1 |
| 114 | Mark A Smith             |   1 |
| 115 | Richard S Hotchkiss      |   1 |
| 116 | Jerome Jacques           |   1 |
| 117 | Sufi Morshed             |   1 |
| 118 | Steven C Wood            |   1 |
| 119 | Xiaoli Chi               |   1 |
| 120 | Lian Dong                |   1 |
| 121 | Jacqueline D Gearhart    |   1 |
| 122 | John N Misasi            |   1 |
| 123 | James M Cunningham       |   1 |
| 124 | M Javad Aman             |   1 |
| 125 | Esteban Roberts          |   1 |
| 126 | Sharon Master            |   1 |
| 127 | Nicolas Dupont           |   1 |
| 128 | Tom Egil Hansen          |   1 |
| 129 | Michael Mandell          |   1 |
| 130 | Prafullakumar Tailor     |   1 |
| 131 | Diana Fisher             |   1 |
| 132 | Cathleen M Lind          |   1 |
| 133 | Jeffrey W Cohen          |   1 |
| 134 | Radha K Maheshwari       |   1 |
| 135 | Zahra Ahmed              |   1 |
| 136 | Tomonori Kimura          |   1 |
| 137 | Anna Waller              |   1 |
| 138 | Kiran Bhaskar            |   1 |
| 139 | Britney Martinez         |   1 |
| 140 | Yangsheng Yu             |   1 |
| 141 | Christopher L Cooper     |   1 |
| 142 | Guangshun Wang           |   1 |
| 143 | Yan Liu                  |   1 |
| 144 | Amanda K Zhang           |   1 |
| 145 | Lindsey G Luo            |   1 |
| 146 | St Patrick Reid          |   1 |
| 147 | Jay S Raval              |   1 |
| 148 | Michelle S Harkins       |   1 |
| 149 | Simon J Anthony          |   1 |
| 150 | Tomáš Bartonička         |   1 |
| 151 | Brian H Bird             |   1 |
| 152 | Carol Blair              |   1 |
| 153 | John Chamberlain         |   1 |
| 154 | Biao Chen                |   1 |
| 155 | J Christopher S Clegg    |   1 |
| 156 | Ian Crozier              |   1 |
| 157 | Patrick L Di Bello       |   1 |
| 158 | Mohamed Hassan           |   1 |
| 159 | Bernd Hoffmann           |   1 |
| 160 | Seiji Hongo              |   1 |
| 161 | Rodrigo Jardim           |   1 |
| 162 | Qi Jin                   |   1 |
| 163 | Serpil Karadağ           |   1 |
| 164 | Karen E Keller           |   1 |
| 165 | Robert A Lamb            |   1 |
| 166 | Elba R S Lemos           |   1 |
| 167 | Eric M Leroy             |   1 |
| 168 | Dexin Li                 |   1 |
| 169 | Mifang Liang             |   1 |
| 170 | Wénwén Liú               |   1 |
| 171 | Yàn Liú                  |   1 |
| 172 | William Marciel de Souza |   1 |
| 173 | Giovanni P Martelli      |   1 |
| 174 | Robert R Martin          |   1 |
| 175 | Sébastien Massart        |   1 |
| 176 | Renata C Oliveira        |   1 |
| 177 | Susan Payne              |   1 |
| 178 | Bertus K Rima            |   1 |
| 179 | Amadou Alpha Sall        |   1 |
| 180 | Muhammad Z Shabbir       |   1 |
| 181 | Xiǎohóng Shí             |   1 |
| 182 | Zhènglì Shí              |   1 |
| 183 | Peter Simmonds           |   1 |
| 184 | David M Stone            |   1 |
| 185 | Petra Straková           |   1 |
| 186 | Hui Wang                 |   1 |
| 187 | Jianwei Wang             |   1 |
| 188 | Xifeng Wang              |   1 |
| 189 | Lin-Fa Wang              |   1 |
| 190 | Tàiyún Wèi               |   1 |
| 191 | Heather Wells            |   1 |
| 192 | Zhìqiáng Wú              |   1 |
| 193 | Xin Yang                 |   1 |
| 194 | Xuejie Yu                |   1 |
| 195 | Tong Zhang               |   1 |
| 196 | Guohui Zhou              |   1 |
| 197 | Xueping Zhou             |   1 |
| 198 | Matthew J Garcia         |   1 |
| 199 | Shruti Bansal            |   1 |
| 200 | M Lisa Phipps            |   1 |
| 201 | Chris J Sheehan          |   1 |
| 202 | Jennifer S Martinez      |   1 |
| 203 | Maria Eugenia Dieterle   |   1 |
| 204 | Ruheena Javed            |   1 |
| 205 | Ashish Jain              |   1 |
| 206 | Karthikeyan Tangavelou   |   1 |
| 207 | Jing Li                  |   1 |
| 208 | Janae Martinez           |   1 |
| 209 | Shuguang Leng            |   1 |
| 210 | Jeremy Edwards           |   1 |
| 211 | Gregory Mertz            |   1 |
| 212 | Mark Unruh               |   1 |
| 213 | J Pedro Teixeira         |   1 |
| 214 | Rémy Bruggmann           |   1 |
| 215 | Mark Crane               |   1 |
| 216 | Hidenori Horikawa        |   1 |
| 217 | Dongsheng Luo            |   1 |
| 218 | Peter G Mohr             |   1 |
| 219 | Richard N Morrison       |   1 |
| 220 | Kenta Tsunekawa          |   1 |
| 221 | Junki Yamasaki           |   1 |
| 222 | Stephen R Welch          |   1 |
| 223 | Jana M Ritter            |   1 |
| 224 | Katherine A Davies       |   1 |
| 225 | Alexandra Fowler         |   1 |
| 226 | Christian R Gomez        |   1 |
| 227 | Thomas J Connors         |   1 |
| 228 | William Brian Reeves     |   1 |
| 229 | Michael A Portman        |   1 |
| 230 | Marila Gennaro           |   1 |
| 231 | Sarah E Jolley           |   1 |
| 232 | Bruce Levy               |   1 |
| 233 | Tiffany A Walker         |   1 |
| 234 | Carolin C M Schulte      |   1 |
| 235 | George A Alba            |   1 |
| 236 | Shamik Bhattacharyya     |   1 |
| 237 | Hector Bonilla           |   1 |
| 238 | Mario Castro             |   1 |
| 239 | James Chan               |   1 |
| 240 | Rebecca G Clifton        |   1 |
| 241 | Vivian Fonseca           |   1 |
| 242 | Jenny E Han              |   1 |
| 243 | James Heath              |   1 |
| 244 | Susan E Hoover           |   1 |
| 245 | Beatrice Huang           |   1 |
| 246 | Brenna L Hughes          |   1 |
| 247 | Janice John              |   1 |
| 248 | Michael R Jordan         |   1 |
| 249 | John D Kelly             |   1 |
| 250 | Jonathan D Klein         |   1 |
| 251 | Allison A Lambert        |   1 |
| 252 | Michele T Longo          |   1 |
| 253 | Carlos A Luciano         |   1 |
| 254 | Jason H Maley            |   1 |
| 255 | Hector Mendez-Figueroa   |   1 |
| 256 | Shawn N Murphy           |   1 |
| 257 | Robert B Neuman          |   1 |
| 258 | Anna Palatnik            |   1 |
| 259 | Samuel Parry             |   1 |
| 260 | Uma Reddy                |   1 |
| 261 | Rebecca Reece            |   1 |
| 262 | W B Reeves               |   1 |
| 263 | Jeffrey A Sparks         |   1 |
| 264 | John M Thorp             |   1 |
| 265 | Katherine R Tuttle       |   1 |
| 266 | Zachary S Wallace        |   1 |
| 267 | Steven J Weiner          |   1 |
| 268 | Lynn M Yee               |   1 |
| 269 | Yan Sun                  |   1 |
| 270 | Russel R Bakken          |   1 |
| 271 | Junya Abe                |   1 |
| 272 | Kar Mun Chooi            |   1 |
| 273 | Wenjie Gong              |   1 |
| 274 | JoëlleGoüy de Bellocq    |   1 |
| 275 | Ines Günther             |   1 |
| 276 | Yusuke Hasegawa          |   1 |
| 277 | Colleen M Higgins        |   1 |
| 278 | Kenichi Ikeda            |   1 |
| 279 | Ran Liu                  |   1 |
| 280 | Tomoyuki Okada           |   1 |
| 281 | Akio Tatara              |   1 |
| 282 | Xin Tian                 |   1 |
| 283 | Han Xia                  |   1 |
| 284 | Kazutaka Yano            |   1 |
| 285 | Zhiming Yuan             |   1 |
| 286 | Guilin Zhang             |   1 |
| 287 | Song Zhang               |   1 |
| 288 | Lu Zhao                  |   1 |
| 289 | K Coombs                 |   1 |
| 290 | William L Johnson        |   1 |
| 291 | Ivana Mali               |   1 |
| 292 | Seonghyeon Lee           |   1 |
| 293 | Jieun Park               |   1 |
| 294 | Won-Keun Kim             |   1 |
| 295 | Trevor Shoemaker         |   1 |
| 296 | Janet Y Lin              |   1 |
| 297 | Brittany D Taylor        |   1 |
| 298 | Elizabeth R Duffy        |   1 |
| 299 | Jenny Han                |   1 |
| 300 | J Daniel Kelly           |   1 |
| 301 | Jeremy Wood              |   1 |
| 302 | David Warren             |   1 |