In [None]:
# Notebook parameters.
# NOTE(review): presumably overridden by whatever runs this notebook as a
# pipeline step — confirm before renaming or restructuring these assignments.
# Project passed to the BigQuery read and write helpers further down.
project_id = 'elife-data-pipeline'
# Dataset substituted into the source SQL query.
source_dataset = 'de_dev'
# Dataset that the result table is written to.
output_dataset = 'de_dev'
# Prefix prepended to the name of the output table.
output_table_prefix = 'data_science_'

In [None]:
import logging
from functools import partial

import data_science_pipeline.configure_warnings  # pylint: disable=unused-import

from data_science_pipeline.sql import get_sql
from data_science_pipeline.utils.pubmed import (
    normalize_url,
    resolve_url_if_not_ncbi_domain,
    get_ncbi_pubmed_article_ids,
    is_ncbi_search_url,
    is_ncbi_bibliography_url,
    get_ncbi_search_term,
    parse_term_query
)
from data_science_pipeline.utils.bq import to_gbq
from data_science_pipeline.utils.pandas import apply_skip_null
from data_science_pipeline.utils.jupyter import (
    read_big_query as _read_big_query,
)

In [None]:
# Logger for this notebook, keyed by the name of the executing module.
LOGGER = logging.getLogger(name=__name__)

In [None]:
# Destination table in "<dataset>.<prefix><suffix>" form; used by the final
# write cell of this notebook.
editor_parsed_pubmed_links_table_name = '{output_dataset}.{prefix}{suffix}'.format_map({
    'output_dataset': output_dataset,
    'prefix': output_table_prefix,
    'suffix': 'editor_pubmed_links',
})

In [None]:
# Pre-bind the project so that later cells only need to supply the query.
read_big_query = partial(
    _read_big_query,
    project_id=project_id,
)

In [None]:
# Values substituted into the `{project}` / `{dataset}` style placeholders of
# the SQL template loaded in the next cell.
default_query_props = {
    'project': project_id,
    'dataset': source_dataset,
}

In [None]:
# Render the SQL template with the project/dataset values, then run it.
editor_pubmed_links_query = get_sql('editor-pubmed-links.sql').format(
    **default_query_props
)
editor_pubmed_links_df = read_big_query(editor_pubmed_links_query)

# Row count first, then a preview of the loaded frame as the cell output.
print(len(editor_pubmed_links_df))
editor_pubmed_links_df.head()

In [None]:
# Derive the `relevant_pubmed_ids` column by passing each
# `relevant_pubmed_urls` entry through get_ncbi_pubmed_article_ids.
# NOTE(review): apply_skip_null presumably leaves null entries untouched —
# confirm against data_science_pipeline.utils.pandas.
relevant_pubmed_urls = editor_pubmed_links_df['relevant_pubmed_urls']
relevant_pubmed_ids = apply_skip_null(
    relevant_pubmed_urls,
    get_ncbi_pubmed_article_ids
)
editor_pubmed_links_df['relevant_pubmed_ids'] = relevant_pubmed_ids
editor_pubmed_links_df.head()

In [None]:
def _normalize_and_resolve_pubmed_url(pubmed_url):
    """Normalize the URL, then pass it to resolve_url_if_not_ncbi_domain."""
    normalized_url = normalize_url(pubmed_url)
    return resolve_url_if_not_ncbi_domain(normalized_url)


# Store the normalized/resolved form of each `pubmed_url` in a new column.
editor_pubmed_links_df['resolved_pubmed_url'] = apply_skip_null(
    editor_pubmed_links_df['pubmed_url'],
    _normalize_and_resolve_pubmed_url
)
editor_pubmed_links_df.head()

In [None]:
def _get_search_term_or_none(pubmed_url):
    """Return the search term of an NCBI search URL, otherwise None."""
    if not is_ncbi_search_url(pubmed_url):
        return None
    return get_ncbi_search_term(pubmed_url)


# Only URLs recognised as NCBI search URLs yield a search term.
editor_pubmed_links_df['search_term'] = apply_skip_null(
    editor_pubmed_links_df['resolved_pubmed_url'],
    _get_search_term_or_none
)
editor_pubmed_links_df.head()

In [None]:
# Run parse_term_query over the extracted search terms and keep the result
# in its own column.
search_terms = editor_pubmed_links_df['search_term']
editor_pubmed_links_df['parsed_search_term'] = apply_skip_null(
    search_terms, parse_term_query
)
editor_pubmed_links_df.head()

In [None]:
# Flag each resolved URL using the is_ncbi_bibliography_url predicate.
bibliography_url_flags = apply_skip_null(
    editor_pubmed_links_df['resolved_pubmed_url'],
    is_ncbi_bibliography_url
)
editor_pubmed_links_df['is_ncbi_bibliography_url'] = bibliography_url_flags

# Print how often each flag value occurs, then preview the frame.
bibliography_url_flag_counts = (
    editor_pubmed_links_df['is_ncbi_bibliography_url'].value_counts()
)
print('is_ncbi_bibliography_url:\n%s' % bibliography_url_flag_counts)
editor_pubmed_links_df.head()

In [None]:
# Persist the enriched frame (original query columns plus the derived ones
# added above) to the configured output table.
print('writing to:', editor_parsed_pubmed_links_table_name)
to_gbq(
    editor_pubmed_links_df,
    project_id=project_id,
    destination_table=editor_parsed_pubmed_links_table_name,
    # NOTE(review): 'replace' presumably overwrites any existing table of the
    # same name on every run — confirm against the to_gbq helper.
    if_exists='replace'
)
print('done')