In [1]:
# Notebook configuration values used by the cells below.

# Project id bound into read_big_query and included in default_query_props.
project_id = 'elife-data-pipeline'
# Dataset included in default_query_props.
source_dataset = 'de_dev'
# Dataset and table-name prefix used to build the name of the
# editor pubmed links table that is read below.
output_dataset = 'de_dev'
output_table_prefix = 'data_science_'
# Base location under which 'editor_pubmed_ids.tsv.gz' is written.
state_path = 's3://ci-elife-data-pipeline/airflow-config/data-science/state-dev'
# Number of threads given to the ThreadPoolExecutor running the lookups.
max_workers = 10
# Only the first max_editors rows are processed; a falsy value processes all rows.
max_editors = 1000

In [2]:
import os
import logging
from functools import partial
from concurrent.futures import ThreadPoolExecutor
from typing import List, Optional

import numpy as np
import pandas as pd
import requests
from tqdm.auto import tqdm

import data_science_pipeline.configure_warnings

from data_science_pipeline.sql import get_sql
from data_science_pipeline.utils.europepmc import (
    EuropePMCApi,
    europepmc_requests_retry_session
)
from data_science_pipeline.utils.bq import load_file_and_replace_bq_table_with_auto_schema
from data_science_pipeline.utils.pandas import to_csv
from data_science_pipeline.utils.jupyter import (
    read_big_query as _read_big_query,
)

In [3]:
# Notebook-level logger; get_editor_pubmed_paper_ids uses it to report failed lookups.
LOGGER = logging.getLogger(__name__)

In [4]:
# Name of the table holding the parsed editor pubmed links:
# "<output_dataset>.<output_table_prefix>editor_pubmed_links".
editor_parsed_pubmed_links_table_name = (
    '{output_dataset}.{prefix}{suffix}'
    .format(
        suffix='editor_pubmed_links',
        prefix=output_table_prefix,
        output_dataset=output_dataset
    )
)

# Destination of the exported editor pubmed ids, located under state_path.
editor_pubmed_ids_path = os.path.join(
    state_path,
    'editor_pubmed_ids.tsv.gz'
)

In [5]:
# Pre-bind project_id so that later calls only need to pass the query itself.
read_big_query = partial(_read_big_query, project_id=project_id)

In [6]:
# Project and source dataset collected into a single mapping.
default_query_props = {
    'project': project_id,
    'dataset': source_dataset
}

In [7]:
# Load every row of the parsed editor pubmed links table, report the
# row count and show the first rows.
editor_parsed_pubmed_links_df = read_big_query(
    'SELECT * FROM `{table_name}`'.format(table_name=editor_parsed_pubmed_links_table_name)
)
print(editor_parsed_pubmed_links_df.shape[0])
editor_parsed_pubmed_links_df.head()

> ```sql
> SELECT * FROM `de_dev.data_science_editor_pubmed_links`
> ```

Downloading: 100%|██████████| 616/616 [00:00<00:00, 641.66rows/s]

616





Unnamed: 0,name,parsed_search_term,person_id,pubmed_url,resolved_pubmed_url,search_term
0,Rebecca Seal,,12445,http://seallab.neurobio.pitt.edu/https://www.n...,https://seallab.neurobio.pitt.edu/https://www....,
1,Phillip Newmark,,4633,https://www.ncbi.nlm.nih.gov/myncbi/browse/col...,https://www.ncbi.nlm.nih.gov/myncbi/browse/col...,
2,Ashish Lal,,73888,https://www.ncbi.nlm.nih.gov/sites/myncbi/1lgr...,https://www.ncbi.nlm.nih.gov/sites/myncbi/1lgr...,
3,Cynthia Wolberger,,5005,https://www.ncbi.nlm.nih.gov/myncbi/cynthia.wo...,https://www.ncbi.nlm.nih.gov/myncbi/cynthia.wo...,
4,Anna Diehl,,143521,https://www.google.com/search?safe=vss&sa=Goog...,https://www.google.com/search?safe=vss&sa=Goog...,


In [8]:
# Show the first three rows as named tuples, i.e. in the form
# that get_editor_pubmed_paper_ids receives them.
list(editor_parsed_pubmed_links_df.iloc[:3].itertuples())

[Pandas(Index=0, name='Rebecca Seal', parsed_search_term=None, person_id='12445', pubmed_url='http://seallab.neurobio.pitt.edu/https://www.ncbi.nlm.nih.gov/pubmed/?term=Seal%20RP%5BAuthor%5D&cauthor=true&cauthor_uid=30359601', resolved_pubmed_url='https://seallab.neurobio.pitt.edu/https://www.ncbi.nlm.nih.gov/pubmed/?term=Seal%20RP%5BAuthor%5D&cauthor=true&cauthor_uid=30359601', search_term=None),
 Pandas(Index=1, name='Phillip Newmark', parsed_search_term=None, person_id='4633', pubmed_url='https://www.ncbi.nlm.nih.gov/myncbi/browse/collection/40388473/?sort=date&direction=descending', resolved_pubmed_url='https://www.ncbi.nlm.nih.gov/myncbi/browse/collection/40388473/?sort=date&direction=descending', search_term=None),
 Pandas(Index=2, name='Ashish Lal', parsed_search_term=None, person_id='73888', pubmed_url='https://www.ncbi.nlm.nih.gov/sites/myncbi/1lgrv1iQ78gk8/bibliography/48627287/public/?sort=date&direction=descending', resolved_pubmed_url='https://www.ncbi.nlm.nih.gov/sites/my

In [9]:
# Inspect the parsed_search_term value of the very first row.
list(editor_parsed_pubmed_links_df.iloc[:1].itertuples())[0].parsed_search_term

In [10]:
def get_editor_pubmed_paper_ids(europepmc_api: 'EuropePMCApi', row) -> Optional[List[str]]:
    """Look up the PubMed ids for the author names of a single editor row.

    Args:
        europepmc_api: API client providing ``get_author_pmids``.
        row: a row object (e.g. from ``DataFrame.itertuples``) with a
            ``parsed_search_term`` attribute and, for logging purposes,
            a ``pubmed_url`` attribute.

    Returns:
        Whatever ``europepmc_api.get_author_pmids`` returns for the author names
        found under ``parsed_search_term['include']['author']``, or ``None`` when
        the row has no parsed search term or the lookup failed.
    """
    parsed_search_term = row.parsed_search_term
    if not parsed_search_term:
        return None
    try:
        author_names = parsed_search_term.get('include', {}).get('author')
        return europepmc_api.get_author_pmids(author_names)
    except Exception:  # pylint: disable=broad-except
        # Best effort: a failure for one editor must not abort the whole run.
        # The URL is read from the row (the previous code referenced an undefined
        # name here, which turned every failure into a NameError).
        LOGGER.error(
            'failed to convert pubmed_url: %s',
            getattr(row, 'pubmed_url', None),
            exc_info=True
        )
        return None


# Select the rows to process (the first max_editors rows, or all rows when
# max_editors is falsy). Always take a copy: a 'pubmed_ids' column is added to
# this frame afterwards, which must not modify editor_parsed_pubmed_links_df.
if max_editors:
    editor_pubmed_links_result_df = editor_parsed_pubmed_links_df[:max_editors].copy()
else:
    editor_pubmed_links_result_df = editor_parsed_pubmed_links_df.copy()

# Resolve the PubMed ids of every selected row using a pool of worker threads.
# executor.map yields results in input order, so the resulting list lines up
# with the rows of the frame.
with europepmc_requests_retry_session() as session:
    europepmc_api = EuropePMCApi(session)
    with ThreadPoolExecutor(max_workers=max_workers) as executor:
        editor_pubmed_links_result_df['pubmed_ids'] = list(tqdm(
            executor.map(
                partial(get_editor_pubmed_paper_ids, europepmc_api),
                editor_pubmed_links_result_df.itertuples()
            ),
            leave=False,
            total=len(editor_pubmed_links_result_df)
        ))

HBox(children=(FloatProgress(value=0.0, max=10.0), HTML(value='')))

In [11]:
# Build the export frame: same rows and columns, with each pubmed_ids
# value joined into a single '|'-separated string.
editor_pubmed_links_formatted_df = editor_pubmed_links_result_df.assign(
    pubmed_ids=editor_pubmed_links_result_df['pubmed_ids'].str.join('|')
)
print(editor_pubmed_links_formatted_df.shape[0])
editor_pubmed_links_formatted_df.head()

10


Unnamed: 0,name,parsed_search_term,person_id,pubmed_url,resolved_pubmed_url,search_term,pubmed_ids
0,Rebecca Seal,,12445,http://seallab.neurobio.pitt.edu/https://www.n...,https://seallab.neurobio.pitt.edu/https://www....,,
1,Phillip Newmark,,4633,https://www.ncbi.nlm.nih.gov/myncbi/browse/col...,https://www.ncbi.nlm.nih.gov/myncbi/browse/col...,,
2,Ashish Lal,,73888,https://www.ncbi.nlm.nih.gov/sites/myncbi/1lgr...,https://www.ncbi.nlm.nih.gov/sites/myncbi/1lgr...,,
3,Cynthia Wolberger,,5005,https://www.ncbi.nlm.nih.gov/myncbi/cynthia.wo...,https://www.ncbi.nlm.nih.gov/myncbi/cynthia.wo...,,
4,Anna Diehl,,143521,https://www.google.com/search?safe=vss&sa=Goog...,https://www.google.com/search?safe=vss&sa=Goog...,,


In [12]:
# Write the formatted frame to editor_pubmed_ids_path using the project's
# to_csv helper, printing the destination before and 'done' after the write.
print('saving to: %s' % editor_pubmed_ids_path)
to_csv(editor_pubmed_links_formatted_df, editor_pubmed_ids_path)
print('done')

saving to: s3://ci-elife-data-pipeline/airflow-config/data-science/state-dev/editor_pubmed_ids.tsv.gz
done
