In [1]:
# Notebook parameters.
# Project id: bound into read_big_query and substituted into the SQL template as `project`.
project_id = 'elife-data-pipeline'
# Dataset substituted into the SQL template as `dataset`.
source_dataset = 'de_dev'
# NOTE(review): output_dataset and output_table_prefix are not referenced by any
# visible cell of this notebook - confirm whether they are still required.
output_dataset = 'de_dev'
output_table_prefix = 'data_science_'
# Base location under which the result file (editor_pubmed_ids.tsv.gz) is written.
state_path = 's3://ci-elife-data-pipeline/airflow-config/data-science/state-dev'
# Size of the thread pool used for the concurrent PMID lookups.
max_workers = 10
# Only the first `max_editors` rows are processed; a falsy value processes all editors.
max_editors = 10

In [2]:
import json
import re
import os
import logging
from functools import partial
from concurrent.futures import ThreadPoolExecutor
from typing import List, Optional

import numpy as np
import pandas as pd
import requests
from tqdm.auto import tqdm

import data_science_pipeline.configure_warnings

from data_science_pipeline.sql import get_sql
from data_science_pipeline.utils.pubmed import (
    normalize_url,
    resolve_url_if_not_ncbi_domain,
    is_ncbi_search_url,
    get_ncbi_search_term,
    parse_term_query
)
from data_science_pipeline.utils.europepmc import (
    EuropePMCApi,
    europepmc_requests_retry_session
)
# from data_science_pipeline.utils.io import serialize_object_to
from data_science_pipeline.utils.pandas import to_csv
from data_science_pipeline.utils.jupyter import (
    read_big_query as _read_big_query,
)

In [3]:
# Notebook-level logger; used to report PubMed URLs that could not be converted.
LOGGER = logging.getLogger(__name__)

In [4]:
# Destination of the result table written by the final cell.
# NOTE(review): os.path.join uses the platform separator; state_path is an s3:// URL,
# so this presumably relies on running on a POSIX system - confirm.
editor_pubmed_ids_path = os.path.join(state_path, 'editor_pubmed_ids.tsv.gz')

In [5]:
# Bind the project id once so that later cells only need to pass the SQL.
read_big_query = partial(_read_big_query, project_id=project_id)

In [6]:
# Placeholder values substituted into the SQL templates loaded via get_sql.
default_query_props = {'project': project_id, 'dataset': source_dataset}

In [7]:
# Fill the project / dataset placeholders of the SQL template, then load the
# query result into a DataFrame.
editor_pubmed_links_sql = get_sql('editor-pubmed-links.sql').format(**default_query_props)
editor_pubmed_links_df = read_big_query(editor_pubmed_links_sql)
print(len(editor_pubmed_links_df))
editor_pubmed_links_df.head()

> ```sql
> SELECT
>   Person_ID AS person_id,
>   Name AS name,
>   Pubmed_URL AS pubmed_url
> FROM `elife-data-pipeline.de_dev.mv_Editorial_Editor_Profile` AS Editor
> ```

Downloading: 100%|██████████| 616/616 [00:00<00:00, 753.46rows/s]

616





Unnamed: 0,person_id,name,pubmed_url
0,126873,Yuuki Watanabe,https://www.ncbi.nlm.nih.gov/pubmed/?term=Wata...
1,178962,Chima Nwaogu,https://www.ncbi.nlm.nih.gov/pubmed/?term=Nwao...
2,70207,Bernhard Schmid,https://www.ncbi.nlm.nih.gov/pubmed/?term=Schm...
3,14193,Merijn Kant,http://www.ncbi.nlm.nih.gov/pubmed?term=Kant%2...
4,7970,Laurent Keller,http://www.ncbi.nlm.nih.gov/pubmed/?term=Kelle...


In [8]:
# Normalise every editor's PubMed URL and, where it is not on the NCBI domain,
# resolve it. The `pubmed_url` column is overwritten with the result.
editor_pubmed_links_df['pubmed_url'] = editor_pubmed_links_df['pubmed_url'].map(
    lambda url: resolve_url_if_not_ncbi_domain(normalize_url(url))
)
editor_pubmed_links_df.head()

Unnamed: 0,person_id,name,pubmed_url
0,126873,Yuuki Watanabe,https://www.ncbi.nlm.nih.gov/pubmed/?term=Wata...
1,178962,Chima Nwaogu,https://www.ncbi.nlm.nih.gov/pubmed/?term=Nwao...
2,70207,Bernhard Schmid,https://www.ncbi.nlm.nih.gov/pubmed/?term=Schm...
3,14193,Merijn Kant,https://www.ncbi.nlm.nih.gov/pubmed?term=Kant%...
4,7970,Laurent Keller,https://www.ncbi.nlm.nih.gov/pubmed/?term=Kell...


In [9]:
def get_editor_pubmed_paper_ids(
        europepmc_api: EuropePMCApi,
        pubmed_url: str) -> Optional[List[str]]:
    """
    Look up the PubMed ids of the papers matched by an editor's PubMed search URL.

    The search term is extracted from the URL and parsed; the author names found in
    the ``include`` section of the parsed query are then passed to
    ``europepmc_api.get_author_pmids``.

    Args:
        europepmc_api: API client used to look up the PMIDs of the author(s).
        pubmed_url: the editor's PubMed URL.

    Returns:
        The result of ``europepmc_api.get_author_pmids``, or ``None`` when
        ``pubmed_url`` is not an NCBI search URL or when the conversion / lookup
        failed. Failures are logged with a traceback rather than raised, so that
        a single bad URL does not abort the whole batch.
    """
    if not is_ncbi_search_url(pubmed_url):
        return None
    try:
        search_term = get_ncbi_search_term(pubmed_url)
        parsed_term_query = parse_term_query(search_term)
        author_names = parsed_term_query.get('include', {}).get('author')
        return europepmc_api.get_author_pmids(author_names)
    except Exception:  # pylint: disable=broad-except
        # Deliberately best-effort, but catch `Exception` rather than using a bare
        # `except:` so that KeyboardInterrupt / SystemExit are not swallowed.
        LOGGER.error('failed to convert pubmed_url: %s', pubmed_url, exc_info=True)
        return None


# Always work on a copy, so that adding the `pubmed_ids` column below never
# mutates `editor_pubmed_links_df` (previously it did when `max_editors` was falsy).
if max_editors:
    editor_pubmed_links_result_df = editor_pubmed_links_df[:max_editors].copy()
else:
    editor_pubmed_links_result_df = editor_pubmed_links_df.copy()

with europepmc_requests_retry_session() as session:
    europepmc_api = EuropePMCApi(session)
    with ThreadPoolExecutor(max_workers=max_workers) as executor:
        # executor.map yields results in input order, so the collected list lines
        # up row-for-row with the DataFrame even though lookups run concurrently.
        editor_pubmed_links_result_df['pubmed_ids'] = list(tqdm(
            executor.map(
                partial(get_editor_pubmed_paper_ids, europepmc_api),
                editor_pubmed_links_result_df['pubmed_url'].values
            ),
            total=len(editor_pubmed_links_result_df),
            leave=False
        ))

HBox(children=(FloatProgress(value=0.0, max=10.0), HTML(value='')))

In [10]:
# Preview the first rows, including the newly added `pubmed_ids` column.
editor_pubmed_links_result_df.head()

Unnamed: 0,person_id,name,pubmed_url,pubmed_ids
0,126873,Yuuki Watanabe,https://www.ncbi.nlm.nih.gov/pubmed/?term=Wata...,"[31778207, 31187501, 31340216, 30777873, 30232..."
1,178962,Chima Nwaogu,https://www.ncbi.nlm.nih.gov/pubmed/?term=Nwao...,"[32246110, 31764994, 30659607, 30956931]"
2,70207,Bernhard Schmid,https://www.ncbi.nlm.nih.gov/pubmed/?term=Schm...,"[32486982, 32504789, 32427483, 32407371, 32333..."
3,14193,Merijn Kant,https://www.ncbi.nlm.nih.gov/pubmed?term=Kant%...,"[26019168, 23238958, 11256409]"
4,7970,Laurent Keller,https://www.ncbi.nlm.nih.gov/pubmed/?term=Kell...,"[32353492, 32415359, 32271631, 32298574, 32244..."


In [11]:
# Persist the result table (editors with their looked-up PubMed ids) to the state location.
# NOTE(review): the `pubmed_ids` column holds Python lists (or None) - confirm how
# to_csv serialises them and that downstream readers expect that format.
print('saving to: %s' % editor_pubmed_ids_path)
to_csv(editor_pubmed_links_result_df, editor_pubmed_ids_path)
print('done')

saving to: s3://ci-elife-data-pipeline/airflow-config/data-science/state-dev/editor_pubmed_ids.tsv.gz
done
