In [1]:
# Notebook parameters.

# GCP project passed to the BigQuery read helper and to the BigQuery load call.
project_id = 'elife-data-pipeline'
# Source dataset name (not referenced by the cells visible in this notebook).
source_dataset = 'de_dev'
# Dataset and table-name prefix used to build the input/output table names.
output_dataset = 'de_dev'
output_table_prefix = 'data_science_'

# Size of the thread pool used for the Europe PMC requests.
max_workers = 10
# Upper bound (SQL LIMIT) on the number of pubmed ids selected per run.
max_manuscripts = 100000
# Number of retrieved manuscript summaries written to BigQuery per load call.
manuscript_upload_batch_size = 10000
# Passed as the `email` request parameter to the Europe PMC API client.
email = 'd.ecer@elifesciences.org'

In [2]:
import logging
from datetime import datetime
from functools import partial
from concurrent.futures import ThreadPoolExecutor 
from typing import List

import numpy as np
import pandas as pd
import requests
from tqdm.auto import tqdm

import google.cloud.exceptions

import data_science_pipeline.configure_warnings

from data_science_pipeline.sql import get_sql
from data_science_pipeline.utils.europepmc import (
    EUROPEPMC_MAX_PAGE_SIZE,
    EuropePMCApi,
    europepmc_requests_retry_session
)
from data_science_pipeline.utils.misc import iter_batches
from data_science_pipeline.utils.bq import (
    with_limit_sql,
    is_bq_not_found_exception,
    load_json_list_and_append_to_bq_table_with_auto_schema
)
from data_science_pipeline.utils.pandas import read_csv
from data_science_pipeline.utils.jupyter import (
    read_big_query as _read_big_query,
)

In [3]:
# Configure root logging at INFO level for the notebook session.
logging.basicConfig(level=logging.INFO)
# Uncomment for verbose output from the BigQuery helper module:
# logging.getLogger('data_science_pipeline.utils.bq').setLevel(logging.DEBUG)

# Logger used by the helper functions defined in this notebook.
LOGGER = logging.getLogger(__name__)

In [4]:
# Fully qualified names (`<dataset>.<prefix><suffix>`) of the tables used below.

# Input: pubmed ids parsed from editor pubmed links / search terms.
editor_parsed_pubmed_links_table_name = (
    f'{output_dataset}.{output_table_prefix}editor_pubmed_links'
)

# Input: pubmed ids listed per editor.
editor_parsed_pubmed_ids_table_name = (
    f'{output_dataset}.{output_table_prefix}editor_pubmed_ids'
)

# Output: manuscript summaries retrieved from Europe PMC are appended here.
external_manuscript_summary_output_table_name = (
    f'{output_dataset}.{output_table_prefix}external_manuscript_summary'
)

In [5]:
# Bind the configured project id once so later calls only need to pass the SQL.
read_big_query = partial(_read_big_query, project_id=project_id)

In [6]:
# Pubmed ids that already have a row in the summary output table.
existing_pmids_sql = (
    f'SELECT pmid FROM `{external_manuscript_summary_output_table_name}`'
)

# Every distinct pubmed id referenced by the editor tables, combined from:
#   - `pubmed_ids` of the editor pubmed ids table
#   - `relevant_pubmed_ids` of the editor pubmed links table
#   - `parsed_search_term.include.pmid` of the editor pubmed links table
all_pmids_sql = '\n'.join((
    'SELECT DISTINCT pubmed_id',
    f'FROM `{editor_parsed_pubmed_ids_table_name}`',
    'JOIN UNNEST(pubmed_ids) AS pubmed_id',
    '',
    'UNION DISTINCT',
    '',
    'SELECT DISTINCT pubmed_id',
    f'FROM `{editor_parsed_pubmed_links_table_name}`',
    'JOIN UNNEST(relevant_pubmed_ids) AS pubmed_id',
    '',
    'UNION DISTINCT',
    '',
    'SELECT DISTINCT pubmed_id',
    f'FROM `{editor_parsed_pubmed_links_table_name}`',
    'JOIN UNNEST(parsed_search_term.include.pmid) AS pubmed_id',
))

# Pubmed ids that are referenced but not yet present in the output table.
# NOTE(review): `NOT IN (subquery)` matches nothing if the subquery returns a
# NULL `pmid` - confirm the output table never contains NULL pmids.
remaining_pmids_sql = '\n'.join((
    'SELECT DISTINCT pubmed_id',
    f'FROM ({all_pmids_sql})',
    f'WHERE pubmed_id NOT IN ({existing_pmids_sql})',
))

# Select the pubmed ids that still need to be retrieved (capped at
# `max_manuscripts`). If BigQuery reports a not-found error (presumably the
# output table does not exist yet, e.g. on the first run - TODO confirm),
# fall back to selecting all referenced pubmed ids.
try:
    remaining_pmids_df = read_big_query(with_limit_sql(
        remaining_pmids_sql,
        limit=max_manuscripts
    ))
except Exception as e:
    # Anything other than a "not found" error is unexpected: re-raise unchanged.
    if not is_bq_not_found_exception(e):
        raise
    # Use `%` formatting: print() does not interpolate logging-style arguments,
    # so the previous call emitted a literal '%s' followed by the table name.
    print('table not found: %s' % external_manuscript_summary_output_table_name)
    remaining_pmids_df = read_big_query(with_limit_sql(
        all_pmids_sql,
        limit=max_manuscripts
    ))
print(len(remaining_pmids_df))
remaining_pmids_df.head()

> ```sql
> SELECT DISTINCT pubmed_id
> FROM (SELECT DISTINCT pubmed_id
> FROM `de_dev.data_science_editor_pubmed_ids`
> JOIN UNNEST(pubmed_ids) AS pubmed_id
> 
> UNION DISTINCT
> 
> SELECT DISTINCT pubmed_id
> FROM `de_dev.data_science_editor_pubmed_links`
> JOIN UNNEST(relevant_pubmed_ids) AS pubmed_id
> 
> UNION DISTINCT
> 
> SELECT DISTINCT pubmed_id
> FROM `de_dev.data_science_editor_pubmed_links`
> JOIN UNNEST(parsed_search_term.include.pmid) AS pubmed_id)
> WHERE pubmed_id NOT IN (SELECT pmid FROM `de_dev.data_science_external_manuscript_summary`)
> LIMIT 100000
> ```

Downloading: 100%|██████████| 121/121 [00:00<00:00, 159.11rows/s]

121





Unnamed: 0,pubmed_id
0,28935940
1,27811268
2,26291162
3,28812560
4,29875305


In [7]:
# Inspect the column dtypes of the query result.
remaining_pmids_df.dtypes

pubmed_id    object
dtype: object

In [8]:
# Pubmed ids to look up, taken from the `pubmed_id` column of the query result.
query_pubmed_ids = remaining_pmids_df['pubmed_id'].values
# Display how many ids will be queried.
len(query_pubmed_ids)

121

In [9]:
# Group the pubmed ids into batches of at most EUROPEPMC_MAX_PAGE_SIZE ids;
# each batch is later passed to a single Europe PMC API call.
query_pubmed_ids_batches = list(
    iter_batches(query_pubmed_ids, EUROPEPMC_MAX_PAGE_SIZE)
)
print('number of batches:', len(query_pubmed_ids_batches))
# Only report the first batch size when there is at least one batch.
if len(query_pubmed_ids_batches) > 0:
    print('first batch:', len(query_pubmed_ids_batches[0]))

number of batches: 1
first batch: 121


In [10]:
def handle_http_error(error: BaseException, data: dict = None):
    """Log an error and its associated data at WARNING level.

    Used as the `on_error` callback of the Europe PMC API client.

    Args:
        error: the exception that was encountered.
        data: optional data associated with the failed call
            (presumably the request payload - TODO confirm against EuropePMCApi).

    Returns:
        None. The error is only logged, not re-raised.
    """
    LOGGER.warning('error: %s, data=%s', error, data)

In [11]:
def add_provenance(manuscript_summary_list: List[dict]) -> List[dict]:
    """Return copies of the given summaries with a `provenance` entry added.

    Args:
        manuscript_summary_list: manuscript summary dicts; they are not modified.

    Returns:
        A new list with one shallow copy per input dict. Each copy has a
        `provenance` key holding the source name (`europepmc`) and the import
        timestamp. The timestamp is taken once, so all rows share it.
    """
    # NOTE(review): datetime.utcnow() is deprecated since Python 3.12. It is kept
    # because a timezone-aware datetime would serialize with a '+00:00' suffix.
    provenance = dict(
        source='europepmc',
        imported_timestamp=datetime.utcnow().isoformat()
    )
    summaries_with_provenance = []
    for summary in manuscript_summary_list:
        annotated_summary = dict(summary)
        annotated_summary['provenance'] = provenance
        summaries_with_provenance.append(annotated_summary)
    return summaries_with_provenance

In [12]:
# Retrieve the manuscript summaries for all pubmed id batches from Europe PMC
# and append them to the BigQuery output table in upload-sized batches.
with europepmc_requests_retry_session() as session:
    europepmc_api = EuropePMCApi(
        session,
        on_error=handle_http_error,
        params={'email': email}
    )
    with ThreadPoolExecutor(max_workers=max_workers) as executor:
        # One API call per pubmed id batch, run on the thread pool; results are
        # yielded in input order and wrapped in a progress bar (one tick per batch).
        retrieved_editor_manuscript_list_batches_it = tqdm(
            executor.map(
                europepmc_api.get_summary_by_page_pmids,
                query_pubmed_ids_batches
            ),
            total=len(query_pubmed_ids_batches),
            leave=False
        )
        # Lazily flatten the per-batch result lists into a single stream of summaries.
        # NOTE(review): assumes every call returns an iterable, even after
        # `on_error` was invoked - confirm against EuropePMCApi.
        retrieved_flattened_manuscript_list_it = (
            manuscript_summary
            for manuscript_summary_list in retrieved_editor_manuscript_list_batches_it
            for manuscript_summary in manuscript_summary_list
        )
        # Regroup the flattened stream into batches sized for the BigQuery load.
        manuscript_upload_batch_list_it = iter_batches(
            retrieved_flattened_manuscript_list_it,
            manuscript_upload_batch_size
        )
        # The iterators above are lazy, so they are consumed here, while the
        # executor and the session are still open.
        for manuscript_upload_list in manuscript_upload_batch_list_it:
            # Materialize the batch so its length can be reported.
            manuscript_upload_list = list(manuscript_upload_list)
            print('writing to: %s (%d rows)' % (
                external_manuscript_summary_output_table_name,
                len(manuscript_upload_list)
            ))
            load_json_list_and_append_to_bq_table_with_auto_schema(
                add_provenance(manuscript_upload_list),
                project_id=project_id,
                table_name=external_manuscript_summary_output_table_name
            )
print('done')

HBox(children=(FloatProgress(value=0.0, max=1.0), HTML(value='')))



done
