In [22]:
# --- BigQuery location -------------------------------------------------------
# Project passed to the BigQuery read helper and to the upload call.
project_id = 'elife-data-pipeline'
# Dataset used for the default query properties.
source_dataset = 'de_dev'
# Dataset (and table-name prefix) of the tables this notebook reads and appends to.
output_dataset = 'de_dev'
output_table_prefix = 'data_science_'

# --- Europe PMC lookup ---------------------------------------------------------
# Size of the thread pool used for the author lookups.
max_workers = 10
# Upper bound on the number of editors processed per run (a falsy value disables the limit).
max_editors = 100
# Sent as the `email` request parameter of the Europe PMC API client.
email = 'd.ecer@elifesciences.org'

In [36]:
import logging
from concurrent.futures import ThreadPoolExecutor
from datetime import datetime
from functools import partial
from typing import List, Optional

import pandas as pd
import pytz
from tqdm.auto import tqdm

import data_science_pipeline.configure_warnings  # pylint: disable=unused-import
import data_science_pipeline.configure_notebook_logging  # pylint: disable=unused-import

from data_science_pipeline.utils.europepmc import (
    EuropePMCApi,
    europepmc_requests_retry_session
)
from data_science_pipeline.utils.bq import to_gbq, is_bq_not_found_exception
from data_science_pipeline.utils.jupyter import (
    read_big_query as _read_big_query,
)

In [24]:
# Root logging at INFO, and the project package explicitly at INFO as well.
logging.basicConfig(level='INFO')
logging.getLogger('data_science_pipeline').setLevel(logging.INFO)

# Logger used by the helper functions defined in this notebook.
LOGGER = logging.getLogger(__name__)

In [25]:
# Input: editors with their parsed PubMed links.
editor_parsed_pubmed_links_table_name = (
    f'{output_dataset}.{output_table_prefix}editor_pubmed_links'
)

# Output (also read back to find already processed editors): PubMed ids per editor.
editor_parsed_pubmed_ids_table_name = (
    f'{output_dataset}.{output_table_prefix}editor_pubmed_ids'
)

In [26]:
# Bind the configured project once so the cells below only pass the SQL text.
read_big_query = partial(_read_big_query, project_id=project_id)

In [27]:
# Project / dataset pair of the source data.
default_query_props = {'project': project_id, 'dataset': source_dataset}

In [28]:
# Latest stored row per editor (by provenance.imported_timestamp) from the ids table.
_sql = '''
    SELECT
        person_id,
        pubmed_url,
        imported_timestamp
    FROM (
        SELECT
            person_id,
            pubmed_url,
            provenance.imported_timestamp as imported_timestamp,
            ROW_NUMBER() OVER (PARTITION BY person_id ORDER BY provenance.imported_timestamp DESC) as rn
        FROM `{table}`
    ) WHERE rn = 1
    '''.format(table=editor_parsed_pubmed_ids_table_name)

try:
    existing_editor_ids_and_pubmed_url_df = read_big_query(_sql)
except Exception as exc:  # pylint: disable=broad-except
    if is_bq_not_found_exception(exc):
        # The ids table does not exist yet: continue with an empty frame
        # that has the same columns as the query result.
        print('table not found: {}'.format(editor_parsed_pubmed_ids_table_name))
        existing_editor_ids_and_pubmed_url_df = pd.DataFrame(
            columns=['person_id', 'pubmed_url', 'imported_timestamp']
        )
    else:
        # Any other failure is unexpected and must not be hidden.
        raise

print(
    "existing_editor_ids_and_pubmed_url_df length: ",
    len(existing_editor_ids_and_pubmed_url_df)
)
existing_editor_ids_and_pubmed_url_df.head(3)

> ```sql
> 
>     SELECT
>         person_id,
>         pubmed_url,
>         imported_timestamp
>     FROM (
>         SELECT
>             person_id,
>             pubmed_url,
>             provenance.imported_timestamp as imported_timestamp,
>             ROW_NUMBER() OVER (PARTITION BY person_id ORDER BY provenance.imported_timestamp DESC) as rn
>         FROM `de_dev.data_science_editor_pubmed_ids`
>     ) WHERE rn = 1
>     
> ```

existing_editor_ids_and_pubmed_url_df length:  774


  df[column] = pandas.Series(df[column], dtype=dtypes[column])


Unnamed: 0,person_id,pubmed_url,imported_timestamp
0,187435,https://www.ncbi.nlm.nih.gov/pubmed/?term=Chin...,2021-09-15 08:18:22.210420+00:00
1,63858,https://www.ncbi.nlm.nih.gov/pubmed/?term=Hela...,2021-09-15 08:18:22.210420+00:00
2,1152,http://www.ncbi.nlm.nih.gov/pubmed?term=N%C3%B...,2021-09-15 08:18:22.210420+00:00


In [29]:
# Ids of the editors that already have a row in the ids table (for fast membership checks).
existing_editor_ids_set = set(
    existing_editor_ids_and_pubmed_url_df['person_id']
)
print("existing_editor_ids_set length : %d" % len(existing_editor_ids_set))

existing_editor_ids_set length : 774


In [30]:
# All editors from the links table that have a parsed search term.
editor_parsed_pubmed_links_df = read_big_query(
    f'SELECT * FROM `{editor_parsed_pubmed_links_table_name}`\n'
    'WHERE parsed_search_term IS NOT NULL'
)
print(
    "editor_parsed_pubmed_links_df length: ",
    len(editor_parsed_pubmed_links_df)
)
editor_parsed_pubmed_links_df.head(3)

> ```sql
> SELECT * FROM `de_dev.data_science_editor_pubmed_links`
> WHERE parsed_search_term IS NOT NULL
> ```

editor_parsed_pubmed_links_df length:  763


Unnamed: 0,is_ncbi_bibliography_url,name,parsed_search_term,person_id,pubmed_url,relevant_pubmed_ids,relevant_pubmed_urls,resolved_pubmed_url,search_term
0,False,Barnabas Daru,"{'exclude': None, 'include': {'author': ['Daru...",187442,https://pubmed.ncbi.nlm.nih.gov/?term=Daru+BH&...,"[32355257, 29083043, 30455213, 28919204]","[https://www.ncbi.nlm.nih.gov/pubmed/32355257,...",https://pubmed.ncbi.nlm.nih.gov/?term=Daru+BH&...,Daru BH
1,False,Yuuki Watanabe,"{'exclude': None, 'include': {'author': ['Wata...",126873,https://www.ncbi.nlm.nih.gov/pubmed/?term=Wata...,"[25902489, 23341596, 20946384]","[https://www.ncbi.nlm.nih.gov/pubmed/25902489,...",https://www.ncbi.nlm.nih.gov/pubmed/?term=Wata...,Watanabe YY[Author]
2,False,Safi Darden,"{'exclude': None, 'include': {'author': ['Dard...",43038,https://pubmed.ncbi.nlm.nih.gov/?size=200&term...,"[32900316, 32531279, 31748621, 30800389]","[https://www.ncbi.nlm.nih.gov/pubmed/32900316,...",https://pubmed.ncbi.nlm.nih.gov/?size=200&term...,Darden SK


In [31]:
# Left join: every editor from the links table is kept; the previously stored
# URL arrives as `pubmed_url_existing` together with its `imported_timestamp`
# (both missing for editors that have no stored row yet).
merged_editor_parsed_pubmed_links_df = pd.merge(
    editor_parsed_pubmed_links_df,
    existing_editor_ids_and_pubmed_url_df,
    on='person_id',
    how='left',
    suffixes=('', '_existing')
)

print(
    "merged_editor_parsed_pubmed_links_df length: ",
    len(merged_editor_parsed_pubmed_links_df)
)
merged_editor_parsed_pubmed_links_df.head(3)

merged_editor_parsed_pubmed_links_df length:  763


Unnamed: 0,is_ncbi_bibliography_url,name,parsed_search_term,person_id,pubmed_url,relevant_pubmed_ids,relevant_pubmed_urls,resolved_pubmed_url,search_term,pubmed_url_existing,imported_timestamp
0,False,Barnabas Daru,"{'exclude': None, 'include': {'author': ['Daru...",187442,https://pubmed.ncbi.nlm.nih.gov/?term=Daru+BH&...,"[32355257, 29083043, 30455213, 28919204]","[https://www.ncbi.nlm.nih.gov/pubmed/32355257,...",https://pubmed.ncbi.nlm.nih.gov/?term=Daru+BH&...,Daru BH,https://pubmed.ncbi.nlm.nih.gov/?term=Daru+BH&...,2021-09-15 08:18:22.210420+00:00
1,False,Yuuki Watanabe,"{'exclude': None, 'include': {'author': ['Wata...",126873,https://www.ncbi.nlm.nih.gov/pubmed/?term=Wata...,"[25902489, 23341596, 20946384]","[https://www.ncbi.nlm.nih.gov/pubmed/25902489,...",https://www.ncbi.nlm.nih.gov/pubmed/?term=Wata...,Watanabe YY[Author],https://www.ncbi.nlm.nih.gov/pubmed/?term=Wata...,2021-09-15 08:18:22.210420+00:00
2,False,Safi Darden,"{'exclude': None, 'include': {'author': ['Dard...",43038,https://pubmed.ncbi.nlm.nih.gov/?size=200&term...,"[32900316, 32531279, 31748621, 30800389]","[https://www.ncbi.nlm.nih.gov/pubmed/32900316,...",https://pubmed.ncbi.nlm.nih.gov/?size=200&term...,Darden SK,https://pubmed.ncbi.nlm.nih.gov/?size=200&term...,2021-09-15 08:18:22.210420+00:00


In [32]:
# Editors that already have a stored row, but whose current PubMed URL differs
# from the stored one. The helper columns from the merge are removed again.
editors_with_changed_pubmed_url_df = (
    merged_editor_parsed_pubmed_links_df
    .loc[
        lambda df: (
            df['pubmed_url_existing'].notna()
            & df['pubmed_url'].ne(df['pubmed_url_existing'])
        )
    ]
    .drop(columns=['pubmed_url_existing', 'imported_timestamp'])
)

print(
    "editors_with_changed_pubmed_url_df length: ",
    len(editors_with_changed_pubmed_url_df)
)
editors_with_changed_pubmed_url_df.head(3)

editors_with_changed_pubmed_url_df length:  7


Unnamed: 0,is_ncbi_bibliography_url,name,parsed_search_term,person_id,pubmed_url,relevant_pubmed_ids,relevant_pubmed_urls,resolved_pubmed_url,search_term
756,False,Rebekah Gundry,"{'exclude': None, 'include': {'author': ['Gund...",,https://pubmed.ncbi.nlm.nih.gov/?size=200&term...,"[25068131, 31972267, 32053146, 30686762]","[https://www.ncbi.nlm.nih.gov/pubmed/25068131,...",https://pubmed.ncbi.nlm.nih.gov/?size=200&term...,Gundry RL
757,False,Joanne Lemieux,"{'exclude': None, 'include': {'author': ['Lemi...",,https://pubmed.ncbi.nlm.nih.gov/?term=Lemieux+...,"[25009246, 30420764, 28827454, 17210913, 12893...","[https://pubmed.ncbi.nlm.nih.gov/25009246, htt...",https://pubmed.ncbi.nlm.nih.gov/?term=Lemieux+...,Lemieux MJ
758,False,C Brandon Ogbunugafor,"{'exclude': None, 'include': {'author': ['Ogbu...",,https://pubmed.ncbi.nlm.nih.gov/?term=Ogbunuga...,"[31015194, 28812552, 26808374, 23594543]","[https://www.ncbi.nlm.nih.gov/pubmed/31015194,...",https://pubmed.ncbi.nlm.nih.gov/?term=Ogbunuga...,Ogbunugafor CB


In [37]:
# Editors whose stored ids were imported more than 15 days ago and are
# therefore due for a refresh.
# `utc=True` keeps the comparison valid when the column holds no timestamps at
# all (e.g. the ids table did not exist yet): without it the all-missing column
# is parsed as timezone-naive and cannot be subtracted from the
# timezone-aware "now". Rows without a timestamp never count as stale.
editors_with_not_currently_updated_info_df = merged_editor_parsed_pubmed_links_df[
    (
        pd.Timestamp.now(tz=pytz.UTC)
        -
        pd.to_datetime(
            merged_editor_parsed_pubmed_links_df['imported_timestamp'],
            utc=True
        )
    ).dt.days > 15
]

print("editors_with_not_currently_updated_info length: ", len(editors_with_not_currently_updated_info_df))
editors_with_not_currently_updated_info_df.head(3)

editors_with_not_currently_updated_info length:  8


Unnamed: 0,is_ncbi_bibliography_url,name,parsed_search_term,person_id,pubmed_url,relevant_pubmed_ids,relevant_pubmed_urls,resolved_pubmed_url,search_term,pubmed_url_existing,imported_timestamp
4,False,Yuxin Chen,"{'exclude': None, 'include': {'author': ['Chen...",90531,https://pubmed.ncbi.nlm.nih.gov/?term=Chen+Y&c...,"[32123320, 30287660, 27197403, 29618550]","[https://www.ncbi.nlm.nih.gov/pubmed/32123320,...",https://pubmed.ncbi.nlm.nih.gov/?term=Chen+Y&c...,Chen Y,https://pubmed.ncbi.nlm.nih.gov/?term=Chen+Y&c...,2021-08-20 15:25:00.233909+00:00
245,False,Hang Zhang,"{'exclude': None, 'include': {'author': ['Zhan...",134740,https://pubmed.ncbi.nlm.nih.gov/?term=Zhang+H&...,"[33674418, 32843344, 32253359, 31086374, 31790...","[https://pubmed.ncbi.nlm.nih.gov/33674418/, ht...",https://pubmed.ncbi.nlm.nih.gov/?term=Zhang+H&...,Zhang H,https://pubmed.ncbi.nlm.nih.gov/?term=Zhang+H&...,2021-08-20 15:25:00.233909+00:00
265,False,Caigang Liu,"{'exclude': None, 'include': {'author': ['Liu ...",156274,https://pubmed.ncbi.nlm.nih.gov/?sort=date&siz...,"[23401456, 30509973, 28674022]","[https://pubmed.ncbi.nlm.nih.gov/23401456, htt...",https://pubmed.ncbi.nlm.nih.gov/?sort=date&siz...,Liu C,https://pubmed.ncbi.nlm.nih.gov/?sort=date&siz...,2021-08-20 15:25:00.233909+00:00


In [None]:
# Editors present in the links table that have no row in the ids table yet.
new_added_editors_df = editor_parsed_pubmed_links_df.loc[
    lambda df: ~df['person_id'].isin(existing_editor_ids_set)
]

print("new_added_editors_df length: ", len(new_added_editors_df))
new_added_editors_df.head(3)

In [None]:
# Editors to (re-)process: newly added ones, ones whose PubMed URL changed,
# and ones whose stored ids are stale.
remaining_editor_parsed_pubmed_links_df = (
    pd.concat([
        new_added_editors_df,
        editors_with_changed_pubmed_url_df,
        editors_with_not_currently_updated_info_df
    ])
    # The stale selection still carries the helper columns from the merge;
    # drop them so every row has the same columns as the links table and the
    # helper columns are not carried into the upload.
    .drop(columns=['pubmed_url_existing', 'imported_timestamp'], errors='ignore')
    # An editor can match more than one selection (e.g. changed URL and stale);
    # keep a single row so the editor is looked up and appended only once.
    .drop_duplicates(subset=['person_id', 'pubmed_url'], keep='first')
)

print("remaining_editor_parsed_pubmed_links_df length: ", len(remaining_editor_parsed_pubmed_links_df))
remaining_editor_parsed_pubmed_links_df.head(3)

In [None]:
# Apply the per-run limit; a falsy `max_editors` means "process everything".
processing_editor_parsed_pubmed_links_df = (
    remaining_editor_parsed_pubmed_links_df[:max_editors]
    if max_editors
    else remaining_editor_parsed_pubmed_links_df
)
len(processing_editor_parsed_pubmed_links_df)

In [None]:
def get_editor_pubmed_paper_ids(europepmc_api: EuropePMCApi, row) -> Optional[List[str]]:
    """Look up the PubMed ids of one editor's papers via the Europe PMC API.

    Args:
        europepmc_api: API client; its ``get_author_pmids`` method is called
            with the author names taken from the row.
        row: record exposing a ``parsed_search_term`` attribute (here a row
            produced by ``DataFrame.itertuples()``).

    Returns:
        The result of ``europepmc_api.get_author_pmids`` for the author names
        under ``parsed_search_term['include']['author']``, or ``None`` when
        the row has no parsed search term or the lookup fails. Failures are
        logged rather than raised so that a single editor cannot abort the
        whole batch.
    """
    parsed_search_term = row.parsed_search_term
    if not parsed_search_term:
        return None
    # Defined up front so the error log below can always reference it.
    author_names = None
    try:
        # `include` may be present but null (as `exclude` is in the data),
        # so fall back to an empty mapping instead of calling `.get` on None.
        author_names = (parsed_search_term.get('include') or {}).get('author')
        return europepmc_api.get_author_pmids(author_names)
    except Exception:  # pylint: disable=broad-except
        # `Exception` (not a bare except) so KeyboardInterrupt / SystemExit still propagate.
        LOGGER.error('failed to retrieve pubmed ids for author names: %s', author_names, exc_info=1)
        return None


# Work on a copy so adding the result column leaves the selection untouched.
# The `max_editors` limit has already been applied to the selection; slicing
# by it again here would turn `max_editors = 0` (treated as "no limit" when
# the selection is built) into an empty run.
editor_pubmed_links_result_df = processing_editor_parsed_pubmed_links_df.copy()

with europepmc_requests_retry_session() as session:
    europepmc_api = EuropePMCApi(
        session,
        params={'email': email}
    )
    # Look the editors up concurrently; `executor.map` yields the results in
    # input order, so they line up with the rows of the frame.
    with ThreadPoolExecutor(max_workers=max_workers) as executor:
        editor_pubmed_links_result_df['pubmed_ids'] = list(tqdm(
            executor.map(
                partial(get_editor_pubmed_paper_ids, europepmc_api),
                editor_pubmed_links_result_df.itertuples()
            ),
            total=len(editor_pubmed_links_result_df),
            leave=False
        ))

In [None]:
# Keep only the editors for which the lookup returned a value
# (failed or skipped lookups are stored as None).
non_empty_editor_pubmed_links_result_df = editor_pubmed_links_result_df[
    editor_pubmed_links_result_df['pubmed_ids'].notna()
].copy()
# Printed explicitly: a bare `len(...)` that is not the last expression of the
# cell is never displayed.
print(
    "non_empty_editor_pubmed_links_result_df length: ",
    len(non_empty_editor_pubmed_links_result_df)
)
non_empty_editor_pubmed_links_result_df.head(3)

In [None]:
# Stamp every row with the same provenance record; the timestamp is taken once
# so all rows of this run share one `imported_timestamp`.
non_empty_editor_pubmed_links_result_df['provenance'] = (
    len(non_empty_editor_pubmed_links_result_df)
    * [dict(source='europepmc', imported_timestamp=datetime.utcnow().isoformat())]
)

In [None]:
# Append the new results to the ids table; nothing is written for an empty result.
if len(non_empty_editor_pubmed_links_result_df) > 0:
    print('writing to:', editor_parsed_pubmed_ids_table_name)
    to_gbq(
        non_empty_editor_pubmed_links_result_df,
        project_id=project_id,
        destination_table=editor_parsed_pubmed_ids_table_name,
        if_exists='append'
    )
    print('done')
else:
    print('no data to upload')