In [1]:
# Notebook parameters.
# BigQuery project and dataset that the SQL templates are formatted with;
# the output table is also written to this dataset.
project_id = 'elife-data-pipeline'
output_dataset = 'de_dev'
# Prefix prepended to the query name to form the destination table name.
output_table_prefix = 'data_science_'
# Papers are selected per editor, in order of match priority, until at least
# this many papers are reached.
target_paper_count = 50
# Upper limit on the cumulative paper count when going past the target.
# max_paper_count is ignored if it is a good match (priority 1 or 2 in the query).
max_paper_count = 2000

In [2]:
import logging
import sys

import pandas as pd

import data_science_pipeline.configure_warnings  # pylint: disable=unused-import

from data_science_pipeline.sql import get_sql
from data_science_pipeline.utils.bq import run_query_and_save_to_table, get_client
from data_science_pipeline.utils.jupyter import printmd, to_markdown_sql, read_big_query

In [3]:
# The query name selects the SQL template and, together with the configured
# prefix, determines the name of the table the result is saved to.
query_name = 'disambiguated_editor_papers'
destination_table_name = output_table_prefix + query_name

In [4]:
# Send INFO-level (and above) log records to stdout so that they show up in
# the cell output.
logging.basicConfig(stream=sys.stdout, level=logging.INFO)

In [5]:
# Render the SQL template with the notebook parameters, display the resulting
# query, then run it and save the result to the destination table.
print('processing {}'.format(query_name))
_sql = get_sql(query_name + '.sql').format(
    max_paper_count=max_paper_count,
    target_paper_count=target_paper_count,
    dataset=output_dataset,
    project=project_id
)
printmd(to_markdown_sql(_sql))
run_query_and_save_to_table(
    destination_table_name=destination_table_name,
    destination_dataset=output_dataset,
    query=_sql,
    client=get_client(project_id=project_id)
)
print('done')

processing disambiguated_editor_papers


> ```sql
> -- Main features:
> --    - Disambiguates editor linked papers as far as possible
> --      Gives each editor paper match a priority
> --    - Select papers with increasing priority until at least a target number of papers is reached
> 
> WITH t_editor AS (
>   SELECT
>     editor.person_id,
>     editor.name,
>     editor.relevant_pubmed_ids
>   FROM `elife-data-pipeline.de_dev.data_science_editor_pubmed_links` AS editor
> ),
> 
> t_pubmed_id_with_priority_by_person_id AS (
>   SELECT DISTINCT editor.person_id, pmid, 1 AS priority
>   FROM t_editor AS editor
>   JOIN UNNEST(relevant_pubmed_ids) AS pmid
> 
>   UNION DISTINCT
> 
>   SELECT DISTINCT
>     person_id,
>     pmid,
>     CASE
>       WHEN has_matching_orcid THEN 1
>       WHEN NOT COALESCE(has_mismatching_orcid, FALSE)
>         AND has_matching_last_name
>         AND has_matching_first_name
>         AND (has_matching_affiliation OR has_matching_previous_affiliation)
>         THEN 2
>       WHEN NOT COALESCE(has_mismatching_orcid, FALSE)
>         AND has_matching_last_name
>         AND has_matching_first_name
>         AND has_matching_postal_code
>         THEN 3
>       WHEN NOT COALESCE(has_mismatching_orcid, FALSE)
>         AND has_matching_last_name
>         AND has_matching_first_name
>         AND has_matching_city
>         THEN 4
>       WHEN NOT COALESCE(has_mismatching_orcid, FALSE)
>         AND has_matching_last_name
>         AND has_matching_first_name
>         AND has_matching_country
>         THEN 5
>       WHEN NOT COALESCE(has_mismatching_orcid, FALSE)
>         AND has_matching_last_name
>         AND has_matching_first_name_letter
>         AND (has_matching_affiliation OR has_matching_previous_affiliation)
>         THEN 6
>       ELSE 1000 - author_match_score
>     END AS priority
>   FROM `elife-data-pipeline.de_dev.data_science_disambiguated_editor_papers_details`
> ),
> 
> t_priority_count_by_person_id AS (
>   SELECT person_id, priority, COUNT(*) AS priority_count
>   FROM t_pubmed_id_with_priority_by_person_id 
>   GROUP BY person_id, priority
> ),
> 
> t_priority_count_and_total_priority_count_by_person_id AS (
>   SELECT
>     current_counts.*,
>     (
>       priority_count
>       + (
>         SELECT COALESCE(SUM(priority_count), 0)
>         FROM t_priority_count_by_person_id AS higher_priority_counts
>         WHERE higher_priority_counts.person_id = current_counts.person_id
>         AND higher_priority_counts.priority < current_counts.priority
>       )
>     ) AS total_priority_count
>   FROM t_priority_count_by_person_id AS current_counts
> ),
> 
> t_priority_below_and_above_target_count_by_person_id AS (
>   SELECT
>     person_id,
>     MAX(IF(
>       total_priority_count < 50,
>       priority,
>       NULL
>     )) AS priority_below_target_count,
>     MIN(IF(
>       (
>         total_priority_count >= 50
>         AND (
>           -- upper limit, unless we are very sure (priority 1 or 2)
>           total_priority_count < 2000
>           OR priority <= 2
>         )
>       ),
>       priority,
>       NULL
>     )) AS priority_above_target_count
>   FROM t_priority_count_and_total_priority_count_by_person_id
>   GROUP BY person_id
> ),
> 
> t_max_preferred_priority_by_person_id AS (
>   SELECT
>     person_id,
>     COALESCE(priority_above_target_count, priority_below_target_count) AS max_preferred_priority
>   FROM t_priority_below_and_above_target_count_by_person_id
> ),
> 
> t_preferred_pubmed_id_by_person_id AS (
>   SELECT DISTINCT
>     max_preferred.person_id,
>     paper.pmid
>   FROM t_max_preferred_priority_by_person_id AS max_preferred
>   JOIN t_pubmed_id_with_priority_by_person_id AS paper
>     ON paper.person_id = max_preferred.person_id
>     AND paper.priority <= max_preferred.max_preferred_priority
>     AND paper.pmid IS NOT NULL
> )
> 
> SELECT
>   editor.person_id,
>   editor.name,
>   ARRAY(
>     SELECT pmid
>     FROM t_preferred_pubmed_id_by_person_id AS preferred_paper
>     WHERE preferred_paper.person_id = editor.person_id
>   ) AS disambiguated_pubmed_ids
> FROM t_editor AS editor
> ```

INFO:data_science_pipeline.utils.bq:ran query and saved to: de_dev.data_science_disambiguated_editor_papers, total rows: 634, took: 12.202s
done


In [6]:
# Load the per-editor paper counts used by the checks further down, report
# the number of rows and preview the first few of them.
_sql = get_sql('disambiguated_editor_papers_count.sql').format(
    dataset=output_dataset,
    project=project_id
)
editor_pubmed_count_df = read_big_query(_sql)
print(editor_pubmed_count_df.shape[0])
editor_pubmed_count_df.head(3)

> ```sql
> WITH t_editor_pubmed_ids AS (
>   SELECT
>     *,
>     ROW_NUMBER() OVER(PARTITION BY name ORDER BY provenance.imported_timestamp DESC) AS name_row_number
>   FROM `elife-data-pipeline.de_dev.data_science_editor_pubmed_ids`
> )
> 
> SELECT
>   Profile.Person_ID AS person_id,
>   Profile.Name AS name,
>   ARRAY_LENGTH(papers.disambiguated_pubmed_ids) AS pubmed_count,
>   ARRAY_LENGTH(editor_pubmed_links.relevant_pubmed_urls) AS relevant_pubmed_url_count,
>   ARRAY_LENGTH(editor_pubmed_links.relevant_pubmed_ids) AS relevant_pubmed_id_count,
>   ARRAY_LENGTH(editor_pubmed_ids.pubmed_ids) AS total_pubmed_id_count,
>   editor_pubmed_links.pubmed_url,
>   editor_pubmed_links.search_term
> FROM `elife-data-pipeline.de_dev.mv_Editorial_Editor_Profile` AS Profile
> LEFT JOIN `elife-data-pipeline.de_dev.data_science_disambiguated_editor_papers` AS papers
>   ON papers.person_id = Profile.Person_ID
> LEFT JOIN `elife-data-pipeline.de_dev.data_science_editor_pubmed_links` AS editor_pubmed_links
>   ON editor_pubmed_links.person_id = Profile.Person_ID
> LEFT JOIN t_editor_pubmed_ids AS editor_pubmed_ids
>   ON editor_pubmed_ids.person_id = Profile.Person_ID
> WHERE Profile.Name IS NOT NULL
> ORDER BY Profile.Name
> ```

Downloading: 100%|██████████| 633/633 [00:01<00:00, 619.66rows/s]

633





Unnamed: 0,person_id,name,pubmed_count,relevant_pubmed_url_count,relevant_pubmed_id_count,total_pubmed_id_count,pubmed_url,search_term
0,3357,Abby Dernburg,57.0,5.0,5.0,58.0,https://www.ncbi.nlm.nih.gov/pubmed/?term=Dern...,Dernburg AF[Author]
1,33041,Adam Aron,93.0,0.0,0.0,94.0,https://www.ncbi.nlm.nih.gov/pubmed/?term=Aron...,Aron AR[Author]
2,86941,Adam Frost,53.0,0.0,0.0,677.0,https://www.ncbi.nlm.nih.gov/pubmed?term=Frost...,Frost A


In [7]:
# Print the count table as plain text, leaving out the
# relevant_pubmed_url_count column.
with pd.option_context("display.max_rows", 1000):
    _counts_without_url_count = editor_pubmed_count_df.drop(
        columns=['relevant_pubmed_url_count']
    )
    print(_counts_without_url_count.to_string(index=False))

person_id                              name  pubmed_count  relevant_pubmed_id_count  total_pubmed_id_count                                                                                                               pubmed_url                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                     

In [8]:
# Editors that have more relevant pubmed URLs than relevant pubmed ids.
_has_more_urls_than_ids = editor_pubmed_count_df['relevant_pubmed_url_count'].gt(
    editor_pubmed_count_df['relevant_pubmed_id_count']
)
print('editors with pubmed urls without parsed pubmed id:\n%s' % (
    editor_pubmed_count_df.loc[
        _has_more_urls_than_ids,
        ['person_id', 'name', 'relevant_pubmed_url_count', 'relevant_pubmed_id_count']
    ].to_string(index=False)
))

editors with pubmed urls without parsed pubmed id:
person_id               name  relevant_pubmed_url_count  relevant_pubmed_id_count
   178962       Chima Nwaogu                        3.0                       1.0
   166136     Irene Giardina                        6.0                       4.0
     5115      Kenton Swartz                        5.0                       4.0
    19580  Pekka Lappalainen                        4.0                       3.0
   151091          Qiang Cui                        1.0                       0.0
   140825       Zsolt Molnár                        6.0                       5.0


In [9]:
# Editors whose disambiguated paper count is zero even though they have
# relevant pubmed ids.
_no_papers_despite_relevant_ids = (
    editor_pubmed_count_df['pubmed_count'].eq(0)
    & editor_pubmed_count_df['relevant_pubmed_id_count'].gt(0)
)
print(
    'editors without disambiguated pubmed papers despite having relevant pubmed ids:\n%s' % (
        editor_pubmed_count_df.loc[
            _no_papers_despite_relevant_ids,
            ['person_id', 'name', 'pubmed_count', 'relevant_pubmed_id_count', 'total_pubmed_id_count']
        ].to_string(index=False)
    )
)

editors without disambiguated pubmed papers despite having relevant pubmed ids:
Empty DataFrame
Columns: [person_id, name, pubmed_count, relevant_pubmed_id_count, total_pubmed_id_count]
Index: []


In [10]:
# Editors with at least one, but fewer than five, disambiguated papers.
_has_few_papers = (
    editor_pubmed_count_df['pubmed_count'].gt(0)
    & editor_pubmed_count_df['pubmed_count'].lt(5)
)
print(
    'editors with less than five disambiguated pubmed papers:\n%s' % (
        editor_pubmed_count_df.loc[
            _has_few_papers,
            ['person_id', 'name', 'pubmed_count', 'relevant_pubmed_id_count', 'total_pubmed_id_count']
        ].to_string(index=False)
    )
)

editors with less than five disambiguated pubmed papers:
person_id                name  pubmed_count  relevant_pubmed_id_count  total_pubmed_id_count
    27913     Alphee Michelot           3.0                       3.0                    0.0
   178962        Chima Nwaogu           4.0                       1.0                    4.0
    64899  Cynthia Czajkowski           3.0                       0.0                    3.0
    52814          Jeremy Day           4.0                       4.0                    0.0
     3645    Karla Kirkegaard           1.0                       0.0                    1.0
    56276       Kavitha Sarma           3.0                       3.0                    0.0


In [11]:
# Editors for whom disambiguation added nothing beyond the relevant pubmed
# ids, although more pubmed ids than the relevant ones were available.
_no_additional_papers = (
    editor_pubmed_count_df['pubmed_count'].le(
        editor_pubmed_count_df['relevant_pubmed_id_count']
    )
    & editor_pubmed_count_df['total_pubmed_id_count'].gt(
        editor_pubmed_count_df['relevant_pubmed_id_count']
    )
)
print(
    'editors without additional disambiguated pubmed papers (apart from relevant pubmed ids):\n%s' % (
        editor_pubmed_count_df.loc[
            _no_additional_papers,
            ['person_id', 'name', 'relevant_pubmed_id_count', 'total_pubmed_id_count', 'search_term']
        ].to_string(index=False)
    )
)

editors without additional disambiguated pubmed papers (apart from relevant pubmed ids):
Empty DataFrame
Columns: [person_id, name, relevant_pubmed_id_count, total_pubmed_id_count, search_term]
Index: []


In [12]:
# Editors that have disambiguated papers while the total pubmed id count
# does not exceed the relevant pubmed id count.
_only_relevant_papers = (
    editor_pubmed_count_df['pubmed_count'].gt(0)
    & editor_pubmed_count_df['total_pubmed_id_count'].le(
        editor_pubmed_count_df['relevant_pubmed_id_count']
    )
)
print(
    'editors with only relevant pubmed papers:\n%s' % (
        editor_pubmed_count_df.loc[
            _only_relevant_papers,
            ['person_id', 'name', 'relevant_pubmed_id_count', 'total_pubmed_id_count', 'search_term']
        ].to_string(index=False)
    )
)

editors with only relevant pubmed papers:
person_id                name  relevant_pubmed_id_count  total_pubmed_id_count              search_term
    52199        Alex Fornito                       5.0                    0.0    Alex Fornito [Author]
    27913     Alphee Michelot                       3.0                    0.0              Michelot, A
   117606        Amita Sehgal                       5.0                    3.0                 Seghal A
   168470    Benjamin Prosser                       5.0                    0.0              Prosser, BL
     7527          David Drew                       5.0                    0.0               DREW D[au]
    10241    Ivan Topisirovic                       5.0                    0.0  "Topisirovic I"[Author]
    52814          Jeremy Day                       4.0                    0.0                  Day, JJ
    56276       Kavitha Sarma                       3.0                    0.0   Sarma, Kavitha[Author]
    16148          Meg

In [13]:
# Editors whose disambiguated paper count is exactly zero.
_has_no_papers = editor_pubmed_count_df['pubmed_count'].eq(0)
print(
    'editors without any disambiguated pubmed papers:\n%s' % (
        editor_pubmed_count_df.loc[
            _has_no_papers,
            ['person_id', 'name', 'relevant_pubmed_id_count', 'total_pubmed_id_count', 'search_term']
        ].to_string(index=False)
    )
)

editors without any disambiguated pubmed papers:
person_id               name  relevant_pubmed_id_count  total_pubmed_id_count               search_term
     3857   Christopher Hill                       0.0                    0.0                   Hill, C
     1168       Louis Ptáček                       0.0                    0.0              Ptáček L[au]
    40488  P Robin Hiesinger                       0.0                    0.0  Hiesinger Robin [Author]
   116459    Paola Bovolenta                       0.0                    0.0      Bovolenta P [Author]
   151091          Qiang Cui                       0.0                    0.0                    Cui, Q
   118253    Teresa Giraldez                       0.0                    0.0               Giraldez, T
