In [1]:
# Notebook parameters.
# project_id: project passed to the BigQuery read/write helpers and substituted
# into the SQL template as `project`.
project_id = 'elife-data-pipeline'
# source_dataset: dataset substituted into the SQL template as `dataset`.
source_dataset = 'de_dev'
# output_dataset / output_table_prefix: combined with a fixed suffix to form the
# destination table name `<output_dataset>.<output_table_prefix><suffix>`.
output_dataset = 'de_dev'
output_table_prefix = 'data_science_'

In [2]:
import logging
from functools import partial

import data_science_pipeline.configure_warnings  # pylint: disable=unused-import

from data_science_pipeline.sql import get_sql
from data_science_pipeline.utils.pubmed import (
    normalize_url,
    resolve_url_if_not_ncbi_domain,
    get_ncbi_pubmed_article_ids,
    is_ncbi_search_url,
    is_ncbi_bibliography_url,
    get_ncbi_search_term,
    parse_term_query
)
from data_science_pipeline.utils.bq import to_gbq
from data_science_pipeline.utils.pandas import apply_skip_null
from data_science_pipeline.utils.jupyter import (
    read_big_query as _read_big_query,
)

In [3]:
# Module-level logger named after the current module.
# NOTE(review): not referenced in the visible cells, which report progress via print().
LOGGER = logging.getLogger(__name__)

In [4]:
# Destination table for the parsed links: `<output_dataset>.<prefix>editor_pubmed_links`.
editor_parsed_pubmed_links_table_name = '{output_dataset}.{prefix}{suffix}'.format_map({
    'output_dataset': output_dataset,
    'prefix': output_table_prefix,
    'suffix': 'editor_pubmed_links',
})

In [5]:
# Bind the project once so that later calls only need to pass the query text.
read_big_query = partial(_read_big_query, project_id=project_id)

In [6]:
# Placeholder values substituted into SQL templates via str.format(**default_query_props).
default_query_props = {
    'project': project_id,
    'dataset': source_dataset,
}

In [7]:
# Render the SQL template with the project/dataset placeholders, then run it.
editor_pubmed_links_sql = get_sql('editor-pubmed-links.sql').format(
    **default_query_props
)
editor_pubmed_links_df = read_big_query(editor_pubmed_links_sql)

# Row count first, then a preview of the first rows as the cell output.
print(len(editor_pubmed_links_df))
editor_pubmed_links_df.head()

> ```sql
> SELECT
>   Person_ID AS person_id,
>   Name AS name,
>   Pubmed_URL AS pubmed_url,
>   Relevant_Pubmed_URLs AS relevant_pubmed_urls
> FROM `elife-data-pipeline.de_dev.mv_Editorial_Editor_Profile` AS Editor
> ```

Downloading: 100%|██████████| 634/634 [00:00<00:00, 662.29rows/s]

634





Unnamed: 0,person_id,name,pubmed_url,relevant_pubmed_urls
0,126873,Yuuki Watanabe,https://www.ncbi.nlm.nih.gov/pubmed/?term=Wata...,"[https://www.ncbi.nlm.nih.gov/pubmed/25902489,..."
1,178962,Chima Nwaogu,https://www.ncbi.nlm.nih.gov/pubmed/?term=Nwao...,"[https://www.ncbi.nlm.nih.gov/pubmed/31764994,..."
2,70207,Bernhard Schmid,https://www.ncbi.nlm.nih.gov/pubmed/?term=Schm...,"[https://www.ncbi.nlm.nih.gov/pubmed/30287660,..."
3,14193,Merijn Kant,http://www.ncbi.nlm.nih.gov/pubmed?term=Kant%2...,"[https://www.ncbi.nlm.nih.gov/pubmed/30105039,..."
4,136296,David Donoso,https://www.ncbi.nlm.nih.gov/pubmed/?term=Dono...,"[https://www.ncbi.nlm.nih.gov/pubmed/20349247,..."


In [8]:
# Derive article ids from each editor's list of relevant PubMed URLs.
# NOTE(review): apply_skip_null presumably leaves null entries untouched - confirm
# against data_science_pipeline.utils.pandas.
relevant_pubmed_urls_column = editor_pubmed_links_df['relevant_pubmed_urls']
editor_pubmed_links_df['relevant_pubmed_ids'] = apply_skip_null(
    relevant_pubmed_urls_column, get_ncbi_pubmed_article_ids
)
editor_pubmed_links_df.head()

Unnamed: 0,person_id,name,pubmed_url,relevant_pubmed_urls,relevant_pubmed_ids
0,126873,Yuuki Watanabe,https://www.ncbi.nlm.nih.gov/pubmed/?term=Wata...,"[https://www.ncbi.nlm.nih.gov/pubmed/25902489,...","[25902489, 23341596, 20946384]"
1,178962,Chima Nwaogu,https://www.ncbi.nlm.nih.gov/pubmed/?term=Nwao...,"[https://www.ncbi.nlm.nih.gov/pubmed/31764994,...",[31764994]
2,70207,Bernhard Schmid,https://www.ncbi.nlm.nih.gov/pubmed/?term=Schm...,"[https://www.ncbi.nlm.nih.gov/pubmed/30287660,...","[30287660, 30135164, 29493062, 29148170, 25317..."
3,14193,Merijn Kant,http://www.ncbi.nlm.nih.gov/pubmed?term=Kant%2...,"[https://www.ncbi.nlm.nih.gov/pubmed/30105039,...","[30105039, 28386959, 26946468, 25297722, 18055..."
4,136296,David Donoso,https://www.ncbi.nlm.nih.gov/pubmed/?term=Dono...,"[https://www.ncbi.nlm.nih.gov/pubmed/20349247,...","[20349247, 28827317, 28522532]"


In [9]:
def _normalize_and_resolve_pubmed_url(pubmed_url):
    """Normalize the URL, then pass it through resolve_url_if_not_ncbi_domain.

    NOTE(review): resolving presumably involves a network lookup for non-NCBI
    URLs - confirm against data_science_pipeline.utils.pubmed.
    """
    normalized_url = normalize_url(pubmed_url)
    return resolve_url_if_not_ncbi_domain(normalized_url)


editor_pubmed_links_df['resolved_pubmed_url'] = apply_skip_null(
    editor_pubmed_links_df['pubmed_url'],
    _normalize_and_resolve_pubmed_url
)
editor_pubmed_links_df.head()

Unnamed: 0,person_id,name,pubmed_url,relevant_pubmed_urls,relevant_pubmed_ids,resolved_pubmed_url
0,126873,Yuuki Watanabe,https://www.ncbi.nlm.nih.gov/pubmed/?term=Wata...,"[https://www.ncbi.nlm.nih.gov/pubmed/25902489,...","[25902489, 23341596, 20946384]",https://www.ncbi.nlm.nih.gov/pubmed/?term=Wata...
1,178962,Chima Nwaogu,https://www.ncbi.nlm.nih.gov/pubmed/?term=Nwao...,"[https://www.ncbi.nlm.nih.gov/pubmed/31764994,...",[31764994],https://www.ncbi.nlm.nih.gov/pubmed/?term=Nwao...
2,70207,Bernhard Schmid,https://www.ncbi.nlm.nih.gov/pubmed/?term=Schm...,"[https://www.ncbi.nlm.nih.gov/pubmed/30287660,...","[30287660, 30135164, 29493062, 29148170, 25317...",https://www.ncbi.nlm.nih.gov/pubmed/?term=Schm...
3,14193,Merijn Kant,http://www.ncbi.nlm.nih.gov/pubmed?term=Kant%2...,"[https://www.ncbi.nlm.nih.gov/pubmed/30105039,...","[30105039, 28386959, 26946468, 25297722, 18055...",https://www.ncbi.nlm.nih.gov/pubmed?term=Kant%...
4,136296,David Donoso,https://www.ncbi.nlm.nih.gov/pubmed/?term=Dono...,"[https://www.ncbi.nlm.nih.gov/pubmed/20349247,...","[20349247, 28827317, 28522532]",https://www.ncbi.nlm.nih.gov/pubmed/?term=Dono...


In [10]:
def _get_search_term_or_none(pubmed_url):
    """Return the search term of an NCBI search URL, or None for any other URL."""
    if not is_ncbi_search_url(pubmed_url):
        return None
    return get_ncbi_search_term(pubmed_url)


editor_pubmed_links_df['search_term'] = apply_skip_null(
    editor_pubmed_links_df['resolved_pubmed_url'],
    _get_search_term_or_none
)
editor_pubmed_links_df.head()

Unnamed: 0,person_id,name,pubmed_url,relevant_pubmed_urls,relevant_pubmed_ids,resolved_pubmed_url,search_term
0,126873,Yuuki Watanabe,https://www.ncbi.nlm.nih.gov/pubmed/?term=Wata...,"[https://www.ncbi.nlm.nih.gov/pubmed/25902489,...","[25902489, 23341596, 20946384]",https://www.ncbi.nlm.nih.gov/pubmed/?term=Wata...,Watanabe YY[Author]
1,178962,Chima Nwaogu,https://www.ncbi.nlm.nih.gov/pubmed/?term=Nwao...,"[https://www.ncbi.nlm.nih.gov/pubmed/31764994,...",[31764994],https://www.ncbi.nlm.nih.gov/pubmed/?term=Nwao...,Nwaogu CJ[Author]
2,70207,Bernhard Schmid,https://www.ncbi.nlm.nih.gov/pubmed/?term=Schm...,"[https://www.ncbi.nlm.nih.gov/pubmed/30287660,...","[30287660, 30135164, 29493062, 29148170, 25317...",https://www.ncbi.nlm.nih.gov/pubmed/?term=Schm...,Schmid B[Author]
3,14193,Merijn Kant,http://www.ncbi.nlm.nih.gov/pubmed?term=Kant%2...,"[https://www.ncbi.nlm.nih.gov/pubmed/30105039,...","[30105039, 28386959, 26946468, 25297722, 18055...",https://www.ncbi.nlm.nih.gov/pubmed?term=Kant%...,Kant M R[au]
4,136296,David Donoso,https://www.ncbi.nlm.nih.gov/pubmed/?term=Dono...,"[https://www.ncbi.nlm.nih.gov/pubmed/20349247,...","[20349247, 28827317, 28522532]",https://www.ncbi.nlm.nih.gov/pubmed/?term=Dono...,Donoso DA[Author]


In [11]:
# Parse each extracted search term with parse_term_query
# (e.g. 'Schmid B[Author]' -> {'include': {'author': ['Schmid B']}}).
search_term_column = editor_pubmed_links_df['search_term']
editor_pubmed_links_df['parsed_search_term'] = apply_skip_null(
    search_term_column, parse_term_query
)
editor_pubmed_links_df.head()

Unnamed: 0,person_id,name,pubmed_url,relevant_pubmed_urls,relevant_pubmed_ids,resolved_pubmed_url,search_term,parsed_search_term
0,126873,Yuuki Watanabe,https://www.ncbi.nlm.nih.gov/pubmed/?term=Wata...,"[https://www.ncbi.nlm.nih.gov/pubmed/25902489,...","[25902489, 23341596, 20946384]",https://www.ncbi.nlm.nih.gov/pubmed/?term=Wata...,Watanabe YY[Author],{'include': {'author': ['Watanabe YY']}}
1,178962,Chima Nwaogu,https://www.ncbi.nlm.nih.gov/pubmed/?term=Nwao...,"[https://www.ncbi.nlm.nih.gov/pubmed/31764994,...",[31764994],https://www.ncbi.nlm.nih.gov/pubmed/?term=Nwao...,Nwaogu CJ[Author],{'include': {'author': ['Nwaogu CJ']}}
2,70207,Bernhard Schmid,https://www.ncbi.nlm.nih.gov/pubmed/?term=Schm...,"[https://www.ncbi.nlm.nih.gov/pubmed/30287660,...","[30287660, 30135164, 29493062, 29148170, 25317...",https://www.ncbi.nlm.nih.gov/pubmed/?term=Schm...,Schmid B[Author],{'include': {'author': ['Schmid B']}}
3,14193,Merijn Kant,http://www.ncbi.nlm.nih.gov/pubmed?term=Kant%2...,"[https://www.ncbi.nlm.nih.gov/pubmed/30105039,...","[30105039, 28386959, 26946468, 25297722, 18055...",https://www.ncbi.nlm.nih.gov/pubmed?term=Kant%...,Kant M R[au],{'include': {'author': ['Kant M R']}}
4,136296,David Donoso,https://www.ncbi.nlm.nih.gov/pubmed/?term=Dono...,"[https://www.ncbi.nlm.nih.gov/pubmed/20349247,...","[20349247, 28827317, 28522532]",https://www.ncbi.nlm.nih.gov/pubmed/?term=Dono...,Donoso DA[Author],{'include': {'author': ['Donoso DA']}}


In [12]:
# Flag resolved URLs for which is_ncbi_bibliography_url returns True.
resolved_urls_for_bibliography_check = editor_pubmed_links_df['resolved_pubmed_url']
editor_pubmed_links_df['is_ncbi_bibliography_url'] = apply_skip_null(
    resolved_urls_for_bibliography_check, is_ncbi_bibliography_url
)

# Summarise the flag distribution; value_counts() omits null entries by default.
bibliography_url_counts = editor_pubmed_links_df['is_ncbi_bibliography_url'].value_counts()
print('is_ncbi_bibliography_url:\n%s' % bibliography_url_counts)
editor_pubmed_links_df.head()

is_ncbi_bibliography_url:
False    630
True       3
Name: is_ncbi_bibliography_url, dtype: int64


Unnamed: 0,person_id,name,pubmed_url,relevant_pubmed_urls,relevant_pubmed_ids,resolved_pubmed_url,search_term,parsed_search_term,is_ncbi_bibliography_url
0,126873,Yuuki Watanabe,https://www.ncbi.nlm.nih.gov/pubmed/?term=Wata...,"[https://www.ncbi.nlm.nih.gov/pubmed/25902489,...","[25902489, 23341596, 20946384]",https://www.ncbi.nlm.nih.gov/pubmed/?term=Wata...,Watanabe YY[Author],{'include': {'author': ['Watanabe YY']}},False
1,178962,Chima Nwaogu,https://www.ncbi.nlm.nih.gov/pubmed/?term=Nwao...,"[https://www.ncbi.nlm.nih.gov/pubmed/31764994,...",[31764994],https://www.ncbi.nlm.nih.gov/pubmed/?term=Nwao...,Nwaogu CJ[Author],{'include': {'author': ['Nwaogu CJ']}},False
2,70207,Bernhard Schmid,https://www.ncbi.nlm.nih.gov/pubmed/?term=Schm...,"[https://www.ncbi.nlm.nih.gov/pubmed/30287660,...","[30287660, 30135164, 29493062, 29148170, 25317...",https://www.ncbi.nlm.nih.gov/pubmed/?term=Schm...,Schmid B[Author],{'include': {'author': ['Schmid B']}},False
3,14193,Merijn Kant,http://www.ncbi.nlm.nih.gov/pubmed?term=Kant%2...,"[https://www.ncbi.nlm.nih.gov/pubmed/30105039,...","[30105039, 28386959, 26946468, 25297722, 18055...",https://www.ncbi.nlm.nih.gov/pubmed?term=Kant%...,Kant M R[au],{'include': {'author': ['Kant M R']}},False
4,136296,David Donoso,https://www.ncbi.nlm.nih.gov/pubmed/?term=Dono...,"[https://www.ncbi.nlm.nih.gov/pubmed/20349247,...","[20349247, 28827317, 28522532]",https://www.ncbi.nlm.nih.gov/pubmed/?term=Dono...,Donoso DA[Author],{'include': {'author': ['Donoso DA']}},False


In [13]:
# Upload the enriched frame to the destination table.
# NOTE(review): if_exists='replace' presumably overwrites an existing table -
# confirm against data_science_pipeline.utils.bq.to_gbq.
gbq_write_options = {
    'project_id': project_id,
    'destination_table': editor_parsed_pubmed_links_table_name,
    'if_exists': 'replace',
}
print('writing to:', editor_parsed_pubmed_links_table_name)
to_gbq(editor_pubmed_links_df, **gbq_write_options)
print('done')

writing to: de_dev.data_science_editor_pubmed_links
done
