In [1]:
# Notebook parameters.
# NOTE(review): presumably overridden by a parameterising runner (e.g. papermill) — confirm.

# Google Cloud project passed to the BigQuery read and write helpers.
project_id = 'elife-data-pipeline'
# Dataset referenced by `default_query_props`.
source_dataset = 'de_dev'
# Dataset and table name prefix used to build the input / output table names.
output_dataset = 'de_dev'
output_table_prefix = 'data_science_'
# Base location onto which the `editor_pubmed_ids.tsv.gz` state file name is joined.
state_path = 's3://ci-elife-data-pipeline/airflow-config/data-science/state-dev'
# Number of threads used for the concurrent Europe PMC lookups.
max_workers = 10
# Maximum number of not yet processed editors to handle in one run.
max_editors = 100
# Passed to `EuropePMCApi` via `params={'email': ...}`.
email = 'd.ecer@elifesciences.org'

In [2]:
import os
import logging
from datetime import datetime
from functools import partial
from concurrent.futures import ThreadPoolExecutor 
from typing import List

import numpy as np
import pandas as pd
from tqdm.auto import tqdm

import data_science_pipeline.configure_warnings

from data_science_pipeline.sql import get_sql
from data_science_pipeline.utils.europepmc import (
    EuropePMCApi,
    europepmc_requests_retry_session
)
from data_science_pipeline.utils.bq import to_gbq, is_bq_not_found_exception
from data_science_pipeline.utils.jupyter import (
    read_big_query as _read_big_query,
)

In [3]:
# Configure the root logger, then explicitly set the project package to INFO.
logging.basicConfig(level='INFO')
logging.getLogger('data_science_pipeline').setLevel(logging.INFO)

# Logger used by the helper functions defined in this notebook.
LOGGER = logging.getLogger(__name__)

In [4]:
# Both tables follow the `<dataset>.<prefix><suffix>` naming scheme, so the
# dataset and prefix are bound once and only the suffix varies.
_format_output_table_name = partial(
    '{output_dataset}.{prefix}{suffix}'.format,
    output_dataset=output_dataset,
    prefix=output_table_prefix
)

# Table the editor PubMed links are read from.
editor_parsed_pubmed_links_table_name = _format_output_table_name(
    suffix='editor_pubmed_links'
)

# Table the retrieved PubMed ids are appended to.
editor_parsed_pubmed_ids_table_name = _format_output_table_name(
    suffix='editor_pubmed_ids'
)

# State file location below `state_path`.
editor_pubmed_ids_path = os.path.join(state_path, 'editor_pubmed_ids.tsv.gz')

In [5]:
# Bind the project id once so that later cells only pass the SQL text.
read_big_query = partial(_read_big_query, project_id=project_id)

In [6]:
# Common query properties: the project and the source dataset.
default_query_props = {
    'project': project_id,
    'dataset': source_dataset
}

In [7]:
# Only the person ids are needed to tell which editors were already processed.
_sql = 'SELECT person_id FROM `{table}`'.format(
    table=editor_parsed_pubmed_ids_table_name
)

try:
    existing_editor_ids_df = read_big_query(_sql)
except Exception as error:
    # A missing output table is treated as "no editors processed yet";
    # any other failure is re-raised unchanged.
    if is_bq_not_found_exception(error):
        print('table not found: %s' % editor_parsed_pubmed_ids_table_name)
        existing_editor_ids_df = pd.DataFrame(columns=['person_id'])
    else:
        raise
print(len(existing_editor_ids_df))
existing_editor_ids_df.head()

> ```sql
> SELECT person_id FROM `de_dev.data_science_editor_pubmed_ids`
> ```

table not found: de_dev.data_science_editor_pubmed_ids
0


Unnamed: 0,person_id


In [8]:
# Unique person ids already present in the output table; used below to
# exclude editors that have been processed before.
existing_editor_ids_set = set(existing_editor_ids_df['person_id'])
len(existing_editor_ids_set)

0

In [9]:
# Load every row and column of the editor PubMed links table.
editor_parsed_pubmed_links_df = read_big_query(
    'SELECT * FROM `{table_name}`'.format(
        table_name=editor_parsed_pubmed_links_table_name
    )
)
# Show the row count followed by a preview of the first rows.
print(len(editor_parsed_pubmed_links_df))
editor_parsed_pubmed_links_df.head()

> ```sql
> SELECT * FROM `de_dev.data_science_editor_pubmed_links`
> ```

Downloading: 100%|██████████| 616/616 [00:00<00:00, 636.68rows/s]

616





Unnamed: 0,name,parsed_search_term,person_id,pubmed_url,resolved_pubmed_url,search_term
0,Rebecca Seal,,12445,http://seallab.neurobio.pitt.edu/https://www.n...,https://seallab.neurobio.pitt.edu/https://www....,
1,Phillip Newmark,,4633,https://www.ncbi.nlm.nih.gov/myncbi/browse/col...,https://www.ncbi.nlm.nih.gov/myncbi/browse/col...,
2,Ashish Lal,,73888,https://www.ncbi.nlm.nih.gov/sites/myncbi/1lgr...,https://www.ncbi.nlm.nih.gov/sites/myncbi/1lgr...,
3,Cynthia Wolberger,,5005,https://www.ncbi.nlm.nih.gov/myncbi/cynthia.wo...,https://www.ncbi.nlm.nih.gov/myncbi/cynthia.wo...,
4,Anna Diehl,,143521,https://www.google.com/search?safe=vss&sa=Goog...,https://www.google.com/search?safe=vss&sa=Goog...,


In [10]:
# Drop the editors whose person_id is already present in the output table.
remaining_editor_parsed_pubmed_links_df = editor_parsed_pubmed_links_df.loc[
    ~editor_parsed_pubmed_links_df['person_id'].isin(existing_editor_ids_set)
]
print(len(remaining_editor_parsed_pubmed_links_df))
remaining_editor_parsed_pubmed_links_df.head(3)

616


Unnamed: 0,name,parsed_search_term,person_id,pubmed_url,resolved_pubmed_url,search_term
0,Rebecca Seal,,12445,http://seallab.neurobio.pitt.edu/https://www.n...,https://seallab.neurobio.pitt.edu/https://www....,
1,Phillip Newmark,,4633,https://www.ncbi.nlm.nih.gov/myncbi/browse/col...,https://www.ncbi.nlm.nih.gov/myncbi/browse/col...,
2,Ashish Lal,,73888,https://www.ncbi.nlm.nih.gov/sites/myncbi/1lgr...,https://www.ncbi.nlm.nih.gov/sites/myncbi/1lgr...,


In [11]:
# Apply the per-run limit; a falsy `max_editors` keeps all remaining editors.
processing_editor_parsed_pubmed_links_df = (
    remaining_editor_parsed_pubmed_links_df[:max_editors]
    if max_editors
    else remaining_editor_parsed_pubmed_links_df
)
len(processing_editor_parsed_pubmed_links_df)

100

In [12]:
def handle_http_error(error: BaseException, data: dict = None):
    """
    Log an error as a warning, together with the associated data.

    Passed as the `on_error` callback when constructing `EuropePMCApi`.

    Args:
        error: the error to report.
        data: optional data describing what was being processed.
    """
    LOGGER.warning('error: %s, data=%s', error, data)


def get_editor_pubmed_paper_ids(europepmc_api: EuropePMCApi, row) -> List[str]:
    """
    Look up the PubMed ids for the author names in a row's parsed search term.

    Args:
        europepmc_api: API client providing `get_author_pmids`.
        row: a row object (e.g. from `DataFrame.itertuples()`) exposing
            `parsed_search_term` and, for error reporting, `pubmed_url`.

    Returns:
        The result of `europepmc_api.get_author_pmids(...)`, or `None` when
        the row has no parsed search term or the lookup failed.
    """
    parsed_search_term = row.parsed_search_term
    if not parsed_search_term:
        return None
    try:
        author_names = parsed_search_term.get('include', {}).get('author')
        return europepmc_api.get_author_pmids(author_names)
    except Exception:  # pylint: disable=broad-except
        # Best effort: a failure for one editor must not abort the whole batch.
        # `Exception` (rather than a bare `except`) lets KeyboardInterrupt and
        # SystemExit propagate. The URL is read from the row, because no
        # `pubmed_url` variable exists in this scope.
        LOGGER.error(
            'failed to convert pubmed_url: %s',
            getattr(row, 'pubmed_url', None),
            exc_info=True
        )
        return None


# `processing_editor_parsed_pubmed_links_df` has already been limited to
# `max_editors` rows. Slicing it again here would yield an empty frame for
# `max_editors = 0`, which the limiting cell treats as "no limit".
# The copy keeps the added column out of the source frame.
editor_pubmed_links_result_df = processing_editor_parsed_pubmed_links_df.copy()

with europepmc_requests_retry_session() as session:
    europepmc_api = EuropePMCApi(
        session,
        on_error=handle_http_error,
        params={'email': email}
    )
    with ThreadPoolExecutor(max_workers=max_workers) as executor:
        # `executor.map` yields results in input order, so the list lines up
        # with the rows of the frame.
        editor_pubmed_links_result_df['pubmed_ids'] = list(tqdm(
            executor.map(
                lambda row: get_editor_pubmed_paper_ids(europepmc_api, row),
                editor_pubmed_links_result_df.itertuples()
            ),
            total=len(editor_pubmed_links_result_df),
            leave=False
        ))

HBox(children=(FloatProgress(value=0.0), HTML(value='')))

In [14]:
# One provenance record, with a single timestamp for the whole batch,
# is attached to every row.
_provenance = {
    'source': 'europepmc',
    'imported_timestamp': datetime.utcnow().isoformat()
}
editor_pubmed_links_result_df['provenance'] = [
    _provenance for _ in range(len(editor_pubmed_links_result_df))
]

In [15]:
# Append the results to the output table, unless there is nothing to write.
if len(editor_pubmed_links_result_df) > 0:
    print('writing to:', editor_parsed_pubmed_ids_table_name)
    to_gbq(
        editor_pubmed_links_result_df,
        project_id=project_id,
        destination_table=editor_parsed_pubmed_ids_table_name,
        if_exists='append'
    )
    print('done')
else:
    print('no data to upload')

INFO:root:Processed 100 lines


writing to: de_dev.data_science_editor_pubmed_ids


INFO:data_science_pipeline.utils.bq_schema:Created table elife-data-pipeline.de_dev.data_science_editor_pubmed_ids
INFO:data_science_pipeline.utils.bq:loading from /tmp/tmpuk655bsn/data.jsonl
INFO:data_science_pipeline.utils.bq:Loaded 100 rows into de_dev:data_science_editor_pubmed_ids.


done
