In [1]:
# --- Notebook parameters -------------------------------------------------

# BigQuery: project id, dataset names and the prefix used for output tables.
# NOTE(review): `source_dataset` is not referenced by the visible cells - confirm it is still needed.
project_id = 'elife-data-pipeline'
source_dataset = 'de_dev'
output_dataset = 'de_dev'
output_table_prefix = 'data_science_'

# Europe PMC: number of worker threads, cap on the number of PubMed ids
# queried per run (a falsy value disables the cap), and the `email` request parameter.
max_workers = 10
max_manuscripts = 100_000
email = 'd.ecer@elifesciences.org'

In [2]:
import json
import re
import os
import logging
from datetime import datetime
from functools import partial
from concurrent.futures import ThreadPoolExecutor 
from typing import List

import numpy as np
import pandas as pd
import requests
from tqdm.auto import tqdm

import google.cloud.exceptions

import data_science_pipeline.configure_warnings

from data_science_pipeline.sql import get_sql
from data_science_pipeline.utils.europepmc import (
    EUROPEPMC_MAX_PAGE_SIZE,
    EuropePMCApi,
    europepmc_requests_retry_session
)
from data_science_pipeline.utils.misc import iter_batches
from data_science_pipeline.utils.bq import (
    is_bq_not_found_exception,
    load_json_list_and_append_to_bq_table_with_auto_schema
)
from data_science_pipeline.utils.pandas import read_csv
from data_science_pipeline.utils.jupyter import (
    read_big_query as _read_big_query,
)

In [3]:
# Configure the root logger to emit INFO records and above.
logging.basicConfig(level=logging.INFO)
# To troubleshoot the BigQuery helpers, enable their debug output, e.g.:
# logging.getLogger('data_science_pipeline.utils.bq').setLevel(logging.DEBUG)

# Logger used by the cells of this notebook.
LOGGER = logging.getLogger(__name__)

In [4]:
# All table names share the same dataset and prefix; only the suffix differs.
_format_table_name = partial(
    '{output_dataset}.{prefix}{suffix}'.format,
    output_dataset=output_dataset,
    prefix=output_table_prefix
)

# Input table: PubMed ids per editor.
editor_parsed_pubmed_ids_table_name = _format_table_name(
    suffix='editor_pubmed_ids'
)

# Output table: manuscript summaries (also read back to skip already imported pmids).
external_manuscript_summary_output_table_name = _format_table_name(
    suffix='external_manuscript_summary'
)

In [5]:
# Pre-bind the configured project id so the query cells below only pass the SQL.
read_big_query = partial(_read_big_query, project_id=project_id)

In [6]:
# Load the pmids that are already in the output table,
# so that they can be excluded from the ids to fetch.
_sql = (
    'SELECT pmid FROM `{table}`'
).format(table=external_manuscript_summary_output_table_name)

try:
    existing_pmids_df = read_big_query(_sql)
except Exception as e:
    # A missing output table is tolerated (nothing imported yet);
    # any other failure is re-raised unchanged.
    if not is_bq_not_found_exception(e):
        raise
    # print() does not interpolate logging-style arguments, so format the message explicitly
    print('table not found: %s' % external_manuscript_summary_output_table_name)
    existing_pmids_df = pd.DataFrame(columns=['pmid'])
print(len(existing_pmids_df))
existing_pmids_df.head()

> ```sql
> SELECT pmid FROM `de_dev.data_science_external_manuscript_summary`
> ```

Downloading: 100%|██████████| 692907/692907 [00:21<00:00, 32050.26rows/s]
INFO:pandas_gbq.gbq:Total time taken 25.37 s.
Finished at 2020-06-18 07:24:08.


692907


Unnamed: 0,pmid
0,20098489
1,25221700
2,24949300
3,24932386
4,24554985


In [7]:
# Inspect the column types of the loaded pmids (relevant for the membership check further down).
existing_pmids_df.dtypes

pmid    object
dtype: object

In [8]:
# Distinct pmids already present in the output table, as a set for membership checks.
existing_pmids_set = {
    existing_pmid
    for existing_pmid in existing_pmids_df['pmid']
}
len(existing_pmids_set)

689910

In [9]:
# Load the PubMed ids per editor from the input table.
_sql = (
    'SELECT person_id, name, pubmed_url, pubmed_ids\n'
    'FROM `{table}`'
).format(table=editor_parsed_pubmed_ids_table_name)

editor_pubmed_ids_df = read_big_query(_sql)
print(len(editor_pubmed_ids_df))
editor_pubmed_ids_df.head()

> ```sql
> SELECT person_id, name, pubmed_url, pubmed_ids
> FROM `de_dev.data_science_editor_pubmed_ids`
> ```

Downloading: 100%|██████████| 612/612 [00:07<00:00, 82.22rows/s]
INFO:pandas_gbq.gbq:Total time taken 8.64 s.
Finished at 2020-06-18 07:24:17.


612


Unnamed: 0,person_id,name,pubmed_url,pubmed_ids
0,15520,Clare Blackburn,http://www.ncbi.nlm.nih.gov/pubmed?term=Blackb...,"[32467237, 32431707, 32421568, 32180160, 32324..."
1,15012,Andrew Brack,https://www.ncbi.nlm.nih.gov/pubmed/?term=Brac...,"[32198156, 32234209, 31495781, 31006621, 29338..."
2,7036,Lee Rubin,http://www.ncbi.nlm.nih.gov/pubmed/?term=Rubin...,"[32413331, 31902706, 31644914, 31551601, 31284..."
3,77872,Martin Pera,https://www.ncbi.nlm.nih.gov/pubmed/?term=Pera...,"[32415101, 32404661, 32248834, 32224728, 32203..."
4,134890,Melanie Königshoff,https://www.ncbi.nlm.nih.gov/pubmed/?term=K%C3...,"[32526076, 32109549, 32023086, 32096543, 31991..."


In [18]:
# The query that populates `editor_pubmed_ids_df` does not select a `pubmed_count`
# column, so sorting by it fails on a fresh kernel. Derive the count here instead.
# NOTE(review): assumes `pubmed_count` was meant to be the number of ids per editor - confirm.
_editor_pubmed_ids_by_count_df = editor_pubmed_ids_df.assign(
    pubmed_count=editor_pubmed_ids_df['pubmed_ids'].map(len)
).sort_values('pubmed_count')

# Flatten the per-editor id lists (editors with the fewest ids first), skipping falsy ids.
all_pubmed_ids = [
    pubmed_id
    for pubmed_ids in _editor_pubmed_ids_by_count_df['pubmed_ids'].values
    for pubmed_id in pubmed_ids
    if pubmed_id
]
len(all_pubmed_ids)

925382

In [19]:
# Remove repeated ids, keeping the first occurrence of each.
_all_pubmed_ids_ser = pd.Series(all_pubmed_ids)
_unique_pubmed_ids_ser = _all_pubmed_ids_ser.drop_duplicates()
unique_pubmed_ids = list(_unique_pubmed_ids_ser.values)
len(unique_pubmed_ids)

838956

In [20]:
# Keep only the ids that are not yet present in the output table.
_ser = pd.Series(unique_pubmed_ids)
_is_already_imported = _ser.isin(existing_pmids_set)
_remaining_ser = _ser[~_is_already_imported]
remaining_pubmed_ids = list(_remaining_ser.values)
len(remaining_pubmed_ids)

149091

In [21]:
# Apply the per-run cap; a falsy `max_manuscripts` means "no limit".
query_pubmed_ids = (
    remaining_pubmed_ids[:max_manuscripts]
    if max_manuscripts
    else remaining_pubmed_ids
)
len(query_pubmed_ids)

100000

In [22]:
# Split the ids into batches of at most EUROPEPMC_MAX_PAGE_SIZE (one batch per API call below).
query_pubmed_ids_batches = [
    pubmed_ids_batch
    for pubmed_ids_batch in iter_batches(query_pubmed_ids, EUROPEPMC_MAX_PAGE_SIZE)
]
print('number of batches:', len(query_pubmed_ids_batches))
if len(query_pubmed_ids_batches) > 0:
    print('first batch:', len(query_pubmed_ids_batches[0]))

number of batches: 100
first batch: 1000


In [23]:
def handle_http_error(error: BaseException, data: dict = None):
    """Log the given error, together with its associated data, as a warning.

    Args:
        error: the exception to report.
        data: optional context belonging to the error; logged as-is.
    """
    LOGGER.warning(
        'error: %s, data=%s',
        error,
        data
    )

In [24]:
# Fetch the summaries for every batch of ids concurrently, with a progress bar.
with europepmc_requests_retry_session() as session:
    europepmc_api = EuropePMCApi(
        session,
        on_error=handle_http_error,
        params={'email': email}
    )
    with ThreadPoolExecutor(max_workers=max_workers) as executor:
        # executor.map yields the results in the order of the input batches
        _summary_batches_iterable = executor.map(
            europepmc_api.get_summary_by_page_pmids,
            query_pubmed_ids_batches
        )
        _summary_batches_with_progress = tqdm(
            _summary_batches_iterable,
            total=len(query_pubmed_ids_batches),
            leave=False
        )
        # consuming the iterable inside the executor context collects all results
        manuscript_summary_list_batches = list(_summary_batches_with_progress)

HBox(children=(FloatProgress(value=0.0), HTML(value='')))

In [25]:
# Number of result batches (one result per submitted batch of ids).
len(manuscript_summary_list_batches)

100

In [26]:
# Concatenate the per-batch result lists into a single flat list of summaries.
manuscript_summary_flattened = []
for _manuscript_summary_batch in manuscript_summary_list_batches:
    manuscript_summary_flattened.extend(_manuscript_summary_batch)
len(manuscript_summary_flattened)

100000

In [27]:
# Tag every summary (in place) with its source and the time it was imported.
for _summary in manuscript_summary_flattened:
    _summary['provenance'] = dict(
        source='europepmc',
        imported_timestamp=datetime.utcnow().isoformat()
    )

In [28]:
# Preview the first summary (if any). Only the last top-level expression of a cell
# is displayed, so this must not be nested inside an `if` statement.
manuscript_summary_flattened[0] if manuscript_summary_flattened else None

In [29]:
# Append the fetched summaries to the output table, unless there is nothing to write.
if manuscript_summary_flattened:
    print('writing to:', external_manuscript_summary_output_table_name)
    load_json_list_and_append_to_bq_table_with_auto_schema(
        manuscript_summary_flattened,
        project_id=project_id,
        table_name=external_manuscript_summary_output_table_name
    )
    print('done')
else:
    print('no data to upload')

writing to: de_dev.data_science_external_manuscript_summary


INFO:root:Processing line 1000
INFO:root:Processing line 2000
INFO:root:Processing line 3000
INFO:root:Processing line 4000
INFO:root:Processing line 5000
INFO:root:Processing line 6000
INFO:root:Processing line 7000
INFO:root:Processing line 8000
INFO:root:Processing line 9000
INFO:root:Processing line 10000
INFO:root:Processing line 11000
INFO:root:Processing line 12000
INFO:root:Processing line 13000
INFO:root:Processing line 14000
INFO:root:Processing line 15000
INFO:root:Processing line 16000
INFO:root:Processing line 17000
INFO:root:Processing line 18000
INFO:root:Processing line 19000
INFO:root:Processing line 20000
INFO:root:Processing line 21000
INFO:root:Processing line 22000
INFO:root:Processing line 23000
INFO:root:Processing line 24000
INFO:root:Processing line 25000
INFO:root:Processing line 26000
INFO:root:Processing line 27000
INFO:root:Processing line 28000
INFO:root:Processing line 29000
INFO:root:Processing line 30000
INFO:root:Processing line 31000
INFO:root:Process

done
