In [1]:
project_id = 'elife-data-pipeline'
output_dataset = 'de_dev'
output_table_prefix = 'data_science_'
target_paper_count = 50
# max_paper_count is ignore if it is a good match
max_paper_count = 2000

In [2]:
import logging
import sys
from functools import partial

import pandas as pd

import data_science_pipeline.configure_warnings  # pylint: disable=unused-import
import data_science_pipeline.configure_notebook_logging  # pylint: disable=unused-import

from data_science_pipeline.sql import get_sql
from data_science_pipeline.utils.bq import run_query_and_save_to_table, get_client
from data_science_pipeline.utils.jupyter import (
    printmd,
    to_markdown_sql,
    read_big_query as _read_big_query
)

INFO:botocore.credentials:Found credentials in shared credentials file: ~/.aws/credentials


In [3]:
query_name = 'disambiguated_editor_papers'
destination_table_name = ''.join([output_table_prefix, query_name])

In [4]:
logging.basicConfig(level='INFO', stream=sys.stdout)

In [5]:
read_big_query = partial(_read_big_query, project_id=project_id)

In [6]:
print('processing %s' % query_name)
_sql = get_sql('%s.sql' % query_name).format(
    project=project_id,
    dataset=output_dataset,
    target_paper_count=target_paper_count,
    max_paper_count=max_paper_count
)
printmd(to_markdown_sql(_sql))
run_query_and_save_to_table(
    client=get_client(project_id=project_id),
    query=_sql,
    destination_dataset=output_dataset,
    destination_table_name=destination_table_name
)
print('done')

processing disambiguated_editor_papers


> ```sql
> -- Main features:
> --    - Disambiguates editor linked papers as far as possible
> --      Gives each editor paper match a priority
> --    - Select papers with increasing priority until at least a target number of papers is reached
> 
> WITH t_editor AS (
>   SELECT
>     editor.person_id,
>     editor.name,
>     editor.relevant_pubmed_ids
>   FROM `elife-data-pipeline.de_dev.data_science_editor_pubmed_links` AS editor
> ),
> 
> t_pubmed_id_with_priority_by_person_id AS (
>   SELECT DISTINCT editor.person_id, pmid, 1 AS priority
>   FROM t_editor AS editor
>   JOIN UNNEST(relevant_pubmed_ids) AS pmid
> 
>   UNION DISTINCT
> 
>   SELECT DISTINCT
>     person_id,
>     pmid,
>     CASE
>       WHEN has_matching_orcid THEN 1
>       WHEN NOT COALESCE(has_mismatching_orcid, FALSE)
>         AND has_matching_last_name
>         AND has_matching_first_name
>         AND (has_matching_affiliation OR has_matching_previous_affiliation)
>         THEN 2
>       WHEN NOT COALESCE(has_mismatching_orcid, FALSE)
>         AND has_matching_last_name
>         AND has_matching_first_name
>         AND has_matching_postal_code
>         THEN 3
>       WHEN NOT COALESCE(has_mismatching_orcid, FALSE)
>         AND has_matching_last_name
>         AND has_matching_first_name
>         AND has_matching_city
>         THEN 4
>       WHEN NOT COALESCE(has_mismatching_orcid, FALSE)
>         AND has_matching_last_name
>         AND has_matching_first_name
>         AND has_matching_country
>         THEN 5
>       WHEN NOT COALESCE(has_mismatching_orcid, FALSE)
>         AND has_matching_last_name
>         AND has_matching_first_name_letter
>         AND (has_matching_affiliation OR has_matching_previous_affiliation)
>         THEN 6
>       ELSE 1000 - author_match_score
>     END AS priority
>   FROM `elife-data-pipeline.de_dev.data_science_disambiguated_editor_papers_details`
> ),
> 
> t_priority_count_by_person_id AS (
>   SELECT person_id, priority, COUNT(*) AS priority_count
>   FROM t_pubmed_id_with_priority_by_person_id 
>   GROUP BY person_id, priority
> ),
> 
> t_priority_count_and_total_priority_count_by_person_id AS (
>   SELECT
>     current_counts.*,
>     (
>       priority_count
>       + (
>         SELECT COALESCE(SUM(priority_count), 0)
>         FROM t_priority_count_by_person_id AS higher_priority_counts
>         WHERE higher_priority_counts.person_id = current_counts.person_id
>         AND higher_priority_counts.priority < current_counts.priority
>       )
>     ) AS total_priority_count
>   FROM t_priority_count_by_person_id AS current_counts
> ),
> 
> t_priority_below_and_above_target_count_by_person_id AS (
>   SELECT
>     person_id,
>     MAX(IF(
>       total_priority_count < 50,
>       priority,
>       NULL
>     )) AS priority_below_target_count,
>     MIN(IF(
>       (
>         total_priority_count >= 50
>         AND (
>           -- upper limit, unless we are very sure (priority 1 or 2)
>           total_priority_count < 2000
>           OR priority <= 2
>         )
>       ),
>       priority,
>       NULL
>     )) AS priority_above_target_count
>   FROM t_priority_count_and_total_priority_count_by_person_id
>   GROUP BY person_id
> ),
> 
> t_max_preferred_priority_by_person_id AS (
>   SELECT
>     person_id,
>     COALESCE(priority_above_target_count, priority_below_target_count) AS max_preferred_priority
>   FROM t_priority_below_and_above_target_count_by_person_id
> ),
> 
> t_preferred_pubmed_id_by_person_id AS (
>   SELECT DISTINCT
>     max_preferred.person_id,
>     paper.pmid
>   FROM t_max_preferred_priority_by_person_id AS max_preferred
>   JOIN t_pubmed_id_with_priority_by_person_id AS paper
>     ON paper.person_id = max_preferred.person_id
>     AND paper.priority <= max_preferred.max_preferred_priority
>     AND paper.pmid IS NOT NULL
> )
> 
> SELECT
>   editor.person_id,
>   editor.name,
>   ARRAY(
>     SELECT pmid
>     FROM t_preferred_pubmed_id_by_person_id AS preferred_paper
>     WHERE preferred_paper.person_id = editor.person_id
>   ) AS disambiguated_pubmed_ids
> FROM t_editor AS editor
> ```

INFO:data_science_pipeline.utils.bq:ran query and saved to: de_dev.data_science_disambiguated_editor_papers, total rows: 0, took: 10.573s


done


In [7]:
_sql = get_sql('disambiguated_editor_papers_count.sql').format(
    project=project_id,
    dataset=output_dataset
)
editor_pubmed_count_df = read_big_query(_sql)
print(len(editor_pubmed_count_df))
editor_pubmed_count_df.head(3)

> ```sql
> WITH t_editor_pubmed_ids AS (
>   SELECT
>     *,
>     ROW_NUMBER() OVER(PARTITION BY name ORDER BY provenance.imported_timestamp DESC) AS name_row_number
>   FROM `elife-data-pipeline.de_dev.data_science_editor_pubmed_ids`
> )
> 
> SELECT
>   Profile.Person_ID AS person_id,
>   Profile.Name AS name,
>   ARRAY_LENGTH(papers.disambiguated_pubmed_ids) AS pubmed_count,
>   ARRAY_LENGTH(editor_pubmed_links.relevant_pubmed_urls) AS relevant_pubmed_url_count,
>   ARRAY_LENGTH(editor_pubmed_links.relevant_pubmed_ids) AS relevant_pubmed_id_count,
>   ARRAY_LENGTH(editor_pubmed_ids.pubmed_ids) AS retrieved_pubmed_id_count,
>   editor_pubmed_links.pubmed_url,
>   editor_pubmed_links.search_term
> FROM `elife-data-pipeline.de_dev.mv_Editorial_Editor_Profile` AS Profile
> LEFT JOIN `elife-data-pipeline.de_dev.data_science_disambiguated_editor_papers` AS papers
>   ON papers.person_id = Profile.Person_ID
> LEFT JOIN `elife-data-pipeline.de_dev.data_science_editor_pubmed_links` AS editor_pubmed_links
>   ON editor_pubmed_links.person_id = Profile.Person_ID
> LEFT JOIN t_editor_pubmed_ids AS editor_pubmed_ids
>   ON editor_pubmed_ids.person_id = Profile.Person_ID
> WHERE Profile.Name IS NOT NULL
> ORDER BY Profile.Name
> ```

772


Unnamed: 0,person_id,name,pubmed_count,relevant_pubmed_url_count,relevant_pubmed_id_count,retrieved_pubmed_id_count,pubmed_url,search_term
0,196070,Aaron Frank,,,,,,
1,86941,Adam Frost,,,,,,
2,4372,Adam Linstedt,,,,,,


In [8]:
with pd.option_context("display.max_rows", 100):
    print(
        editor_pubmed_count_df
        [['person_id', 'name', 'pubmed_count', 'relevant_pubmed_id_count', 'retrieved_pubmed_id_count']]
        .to_string(index=False)
    )

person_id                             name  pubmed_count  relevant_pubmed_id_count  retrieved_pubmed_id_count
   196070                      Aaron Frank           NaN                       NaN                        NaN
    86941                       Adam Frost           NaN                       NaN                        NaN
     4372                    Adam Linstedt           NaN                       NaN                        NaN
    68914                  Adrien Peyrache           NaN                       NaN                        NaN
     7131                    Adèle Marston           NaN                       NaN                        NaN
    72412                  Agnese Seminara           NaN                       NaN                        NaN
     1532              Agnieszka Chacinska           NaN                       NaN                        NaN
     3258                     Ahmet Yildiz           NaN                       NaN                        NaN
   101166 

In [9]:
print('editors with pubmed urls without parsed pubmed id:\n%s' % editor_pubmed_count_df[
    editor_pubmed_count_df['relevant_pubmed_url_count'] > editor_pubmed_count_df['relevant_pubmed_id_count']
][['person_id', 'name', 'relevant_pubmed_url_count', 'relevant_pubmed_id_count']].to_string(index=False))

editors with pubmed urls without parsed pubmed id:
Empty DataFrame
Columns: [person_id, name, relevant_pubmed_url_count, relevant_pubmed_id_count]
Index: []


In [10]:
print(
    'editors without disambiguated pubmed papers despite having relevant pubmed ids:\n%s' % (
        editor_pubmed_count_df[
            (editor_pubmed_count_df['pubmed_count'] == 0)
            & (editor_pubmed_count_df['relevant_pubmed_id_count'] > 0)
        ]
        [['person_id', 'name', 'pubmed_count', 'relevant_pubmed_id_count', 'retrieved_pubmed_id_count']]
        .to_string(index=False)
    )
)

editors without disambiguated pubmed papers despite having relevant pubmed ids:
Empty DataFrame
Columns: [person_id, name, pubmed_count, relevant_pubmed_id_count, retrieved_pubmed_id_count]
Index: []


In [11]:
print(
    'editors with less than five disambiguated pubmed papers:\n%s' % (
        editor_pubmed_count_df[
            (editor_pubmed_count_df['pubmed_count'] > 0)
            & (editor_pubmed_count_df['pubmed_count'] < 5)
        ]
        [['person_id', 'name', 'pubmed_count', 'relevant_pubmed_id_count', 'retrieved_pubmed_id_count']]
        .to_string(index=False)
    )
)

editors with less than five disambiguated pubmed papers:
Empty DataFrame
Columns: [person_id, name, pubmed_count, relevant_pubmed_id_count, retrieved_pubmed_id_count]
Index: []


In [12]:
print(
    'editors without additional disambiguated pubmed papers (apart from relevant pubmed ids):\n%s' % (
        editor_pubmed_count_df[
            (editor_pubmed_count_df['pubmed_count'] <= editor_pubmed_count_df['relevant_pubmed_id_count'])
            & (
                editor_pubmed_count_df['retrieved_pubmed_id_count']
                > editor_pubmed_count_df['relevant_pubmed_id_count']
            )
        ]
        [['person_id', 'name', 'relevant_pubmed_id_count', 'retrieved_pubmed_id_count', 'search_term']]
        .to_string(index=False)
    )
)

editors without additional disambiguated pubmed papers (apart from relevant pubmed ids):
Empty DataFrame
Columns: [person_id, name, relevant_pubmed_id_count, retrieved_pubmed_id_count, search_term]
Index: []


In [13]:
print(
    'editors with only relevant pubmed papers:\n%s' % (
        editor_pubmed_count_df[
            (editor_pubmed_count_df['pubmed_count'] > 0)
            & (
                editor_pubmed_count_df['retrieved_pubmed_id_count']
                <= editor_pubmed_count_df['relevant_pubmed_id_count']
            )
        ]
        [['person_id', 'name', 'relevant_pubmed_id_count', 'retrieved_pubmed_id_count', 'search_term']]
        .to_string(index=False)
    )
)

editors with only relevant pubmed papers:
Empty DataFrame
Columns: [person_id, name, relevant_pubmed_id_count, retrieved_pubmed_id_count, search_term]
Index: []


In [14]:
print(
    'editors without any disambiguated pubmed papers:\n%s' % (
        editor_pubmed_count_df[
            editor_pubmed_count_df['pubmed_count'] == 0
        ]
        [['person_id', 'name', 'relevant_pubmed_id_count', 'retrieved_pubmed_id_count', 'search_term']]
        .to_string(index=False)
    )
)

editors without any disambiguated pubmed papers:
Empty DataFrame
Columns: [person_id, name, relevant_pubmed_id_count, retrieved_pubmed_id_count, search_term]
Index: []
