In [1]:
# Notebook parameters: plain assignments consumed by the cells below.
# NOTE(review): presumably overridden per environment by whatever runs this notebook — confirm.

# Project id passed to the BigQuery read helper, the SQL template and the final write.
project_id = 'elife-data-pipeline'
# Dataset name substituted into the source SQL template.
source_dataset = 'de_dev'
# Dataset that receives the output table.
output_dataset = 'de_dev'
# Prefix prepended to the output table name.
output_table_prefix = 'data_science_'
# NOTE(review): not referenced by any cell visible in this notebook — confirm whether it is still needed.
state_path = 's3://ci-elife-data-pipeline/airflow-config/data-science/state-dev'

In [2]:
import logging
from functools import partial
from typing import List

import numpy as np
import pandas as pd

import data_science_pipeline.configure_warnings

from data_science_pipeline.sql import get_sql
from data_science_pipeline.utils.pubmed import (
    normalize_url,
    resolve_url_if_not_ncbi_domain,
    is_ncbi_search_url,
    get_ncbi_search_term,
    parse_term_query
)
from data_science_pipeline.utils.bq import to_gbq
from data_science_pipeline.utils.pandas import apply_skip_null
from data_science_pipeline.utils.jupyter import (
    read_big_query as _read_big_query,
)

In [3]:
# Logger named after the current module (`__name__`).
# NOTE(review): not used by the cells visible here, which report progress via print().
LOGGER = logging.getLogger(__name__)

In [4]:
# Fully qualified destination table: "<output_dataset>.<prefix>editor_pubmed_links".
editor_parsed_pubmed_links_table_name = (
    f'{output_dataset}.{output_table_prefix}editor_pubmed_links'
)

In [5]:
# Pre-bind project_id so later cells can call read_big_query(query) without repeating it.
read_big_query = partial(_read_big_query, project_id=project_id)

In [6]:
# Placeholder values substituted into the SQL templates loaded via get_sql().
default_query_props = {
    'project': project_id,
    'dataset': source_dataset,
}

In [7]:
# Render the SQL template with the configured project/dataset, then load the result.
editor_pubmed_links_query = get_sql('editor-pubmed-links.sql').format_map(
    default_query_props
)
editor_pubmed_links_df = read_big_query(editor_pubmed_links_query)

# Row count first, then a preview of the loaded frame as the cell output.
print(len(editor_pubmed_links_df))
editor_pubmed_links_df.head()

> ```sql
> SELECT
>   Person_ID AS person_id,
>   Name AS name,
>   Pubmed_URL AS pubmed_url
> FROM `elife-data-pipeline.de_dev.mv_Editorial_Editor_Profile` AS Editor
> ```

Downloading: 100%|██████████| 616/616 [00:00<00:00, 624.29rows/s]

616





Unnamed: 0,person_id,name,pubmed_url
0,126873,Yuuki Watanabe,https://www.ncbi.nlm.nih.gov/pubmed/?term=Wata...
1,178962,Chima Nwaogu,https://www.ncbi.nlm.nih.gov/pubmed/?term=Nwao...
2,70207,Bernhard Schmid,https://www.ncbi.nlm.nih.gov/pubmed/?term=Schm...
3,14193,Merijn Kant,http://www.ncbi.nlm.nih.gov/pubmed?term=Kant%2...
4,7970,Laurent Keller,http://www.ncbi.nlm.nih.gov/pubmed/?term=Kelle...


In [8]:
def _resolve_pubmed_url(pubmed_url):
    """Normalize the URL, then pass it through resolve_url_if_not_ncbi_domain."""
    normalized_url = normalize_url(pubmed_url)
    return resolve_url_if_not_ncbi_domain(normalized_url)


# Add the resolved URL as a new column next to the original one.
editor_pubmed_links_df['resolved_pubmed_url'] = (
    editor_pubmed_links_df['pubmed_url'].apply(_resolve_pubmed_url)
)
editor_pubmed_links_df.head()

Unnamed: 0,person_id,name,pubmed_url,resolved_pubmed_url
0,126873,Yuuki Watanabe,https://www.ncbi.nlm.nih.gov/pubmed/?term=Wata...,https://www.ncbi.nlm.nih.gov/pubmed/?term=Wata...
1,178962,Chima Nwaogu,https://www.ncbi.nlm.nih.gov/pubmed/?term=Nwao...,https://www.ncbi.nlm.nih.gov/pubmed/?term=Nwao...
2,70207,Bernhard Schmid,https://www.ncbi.nlm.nih.gov/pubmed/?term=Schm...,https://www.ncbi.nlm.nih.gov/pubmed/?term=Schm...
3,14193,Merijn Kant,http://www.ncbi.nlm.nih.gov/pubmed?term=Kant%2...,https://www.ncbi.nlm.nih.gov/pubmed?term=Kant%...
4,7970,Laurent Keller,http://www.ncbi.nlm.nih.gov/pubmed/?term=Kelle...,https://www.ncbi.nlm.nih.gov/pubmed/?term=Kell...


In [9]:
def _search_term_or_none(pubmed_url):
    """Return the search term of an NCBI search URL, or None for any other URL."""
    if not is_ncbi_search_url(pubmed_url):
        return None
    return get_ncbi_search_term(pubmed_url)


# Only URLs recognised as NCBI search URLs yield a search term; the rest become None.
editor_pubmed_links_df['search_term'] = (
    editor_pubmed_links_df['resolved_pubmed_url'].apply(_search_term_or_none)
)
editor_pubmed_links_df.head()

Unnamed: 0,person_id,name,pubmed_url,resolved_pubmed_url,search_term
0,126873,Yuuki Watanabe,https://www.ncbi.nlm.nih.gov/pubmed/?term=Wata...,https://www.ncbi.nlm.nih.gov/pubmed/?term=Wata...,Watanabe YY[Author]
1,178962,Chima Nwaogu,https://www.ncbi.nlm.nih.gov/pubmed/?term=Nwao...,https://www.ncbi.nlm.nih.gov/pubmed/?term=Nwao...,Nwaogu CJ[Author]
2,70207,Bernhard Schmid,https://www.ncbi.nlm.nih.gov/pubmed/?term=Schm...,https://www.ncbi.nlm.nih.gov/pubmed/?term=Schm...,Schmid B[Author]
3,14193,Merijn Kant,http://www.ncbi.nlm.nih.gov/pubmed?term=Kant%2...,https://www.ncbi.nlm.nih.gov/pubmed?term=Kant%...,Kant M R[au]
4,7970,Laurent Keller,http://www.ncbi.nlm.nih.gov/pubmed/?term=Kelle...,https://www.ncbi.nlm.nih.gov/pubmed/?term=Kell...,Keller L[Author]


In [10]:
# Parse each extracted search term via parse_term_query.
# NOTE(review): apply_skip_null presumably leaves null entries unparsed — confirm in utils.pandas.
search_term_series = editor_pubmed_links_df['search_term']
editor_pubmed_links_df['parsed_search_term'] = apply_skip_null(
    search_term_series, parse_term_query
)
editor_pubmed_links_df.head()

Unnamed: 0,person_id,name,pubmed_url,resolved_pubmed_url,search_term,parsed_search_term
0,126873,Yuuki Watanabe,https://www.ncbi.nlm.nih.gov/pubmed/?term=Wata...,https://www.ncbi.nlm.nih.gov/pubmed/?term=Wata...,Watanabe YY[Author],{'include': {'author': ['Watanabe YY']}}
1,178962,Chima Nwaogu,https://www.ncbi.nlm.nih.gov/pubmed/?term=Nwao...,https://www.ncbi.nlm.nih.gov/pubmed/?term=Nwao...,Nwaogu CJ[Author],{'include': {'author': ['Nwaogu CJ']}}
2,70207,Bernhard Schmid,https://www.ncbi.nlm.nih.gov/pubmed/?term=Schm...,https://www.ncbi.nlm.nih.gov/pubmed/?term=Schm...,Schmid B[Author],{'include': {'author': ['Schmid B']}}
3,14193,Merijn Kant,http://www.ncbi.nlm.nih.gov/pubmed?term=Kant%2...,https://www.ncbi.nlm.nih.gov/pubmed?term=Kant%...,Kant M R[au],{'include': {'author': ['Kant M R']}}
4,7970,Laurent Keller,http://www.ncbi.nlm.nih.gov/pubmed/?term=Kelle...,https://www.ncbi.nlm.nih.gov/pubmed/?term=Kell...,Keller L[Author],{'include': {'author': ['Keller L']}}


In [11]:
# Write options: target project/table, replacing the table if it already exists.
gbq_write_kwargs = {
    'project_id': project_id,
    'destination_table': editor_parsed_pubmed_links_table_name,
    'if_exists': 'replace',
}

print('writing to:', editor_parsed_pubmed_links_table_name)
to_gbq(editor_pubmed_links_df, **gbq_write_kwargs)
print('done')

writing to: de_dev.data_science_editor_pubmed_links
done
