In [None]:
# Notebook parameters.
# project_id is passed to get_client and substituted into the SQL templates.
project_id = 'elife-data-pipeline'
# Dataset the query result is saved to; also substituted into the SQL templates.
output_dataset = 'de_dev'
# Prefix prepended to the query name to form the destination table name.
output_table_prefix = 'data_science_'
# Substituted into the disambiguated_editor_papers SQL template.
target_paper_count = 50
# max_paper_count is ignored if it is a good match
# NOTE(review): presumably "good match" is decided inside the SQL template,
# which is not visible here -- confirm against the .sql file.
max_paper_count = 2000

In [None]:
import logging
import sys

import pandas as pd

import data_science_pipeline.configure_warnings  # pylint: disable=unused-import

from data_science_pipeline.sql import get_sql
from data_science_pipeline.utils.bq import run_query_and_save_to_table, get_client
from data_science_pipeline.utils.jupyter import printmd, to_markdown_sql, read_big_query

In [None]:
# Base name of the SQL template to render ('<query_name>.sql').
query_name = 'disambiguated_editor_papers'
# Destination table name: the configured prefix followed by the query name.
destination_table_name = output_table_prefix + query_name

In [None]:
# Route log records of level INFO and above to stdout.
logging.basicConfig(stream=sys.stdout, level=logging.INFO)

In [None]:
# Render the SQL template for this query, show it as markdown, then hand it
# to run_query_and_save_to_table together with the destination dataset/table.
print('processing %s' % query_name)
_sql_template = get_sql('%s.sql' % query_name)
_sql = _sql_template.format(
    project=project_id,
    dataset=output_dataset,
    target_paper_count=target_paper_count,
    max_paper_count=max_paper_count
)
printmd(to_markdown_sql(_sql))
_bq_client = get_client(project_id=project_id)
run_query_and_save_to_table(
    client=_bq_client,
    query=_sql,
    destination_dataset=output_dataset,
    destination_table_name=destination_table_name
)
print('done')

In [None]:
# Render the count query, read its result via read_big_query, then report
# the number of rows and preview the first three.
_count_sql_template = get_sql('disambiguated_editor_papers_count.sql')
_sql = _count_sql_template.format(dataset=output_dataset, project=project_id)
editor_pubmed_count_df = read_big_query(_sql)
print(len(editor_pubmed_count_df))
editor_pubmed_count_df.head(3)

In [None]:
# Print every column except relevant_pubmed_url_count as plain text,
# without the index.
_counts_without_url_count = editor_pubmed_count_df.drop(
    columns=['relevant_pubmed_url_count']
)
with pd.option_context("display.max_rows", 1000):
    print(_counts_without_url_count.to_string(index=False))

In [None]:
# Rows where the relevant pubmed url count exceeds the relevant pubmed id count.
_url_count = editor_pubmed_count_df['relevant_pubmed_url_count']
_id_count = editor_pubmed_count_df['relevant_pubmed_id_count']
_report_columns = ['person_id', 'name', 'relevant_pubmed_url_count', 'relevant_pubmed_id_count']
_report_df = editor_pubmed_count_df.loc[_url_count > _id_count, _report_columns]
print('editors with pubmed urls without parsed pubmed id:\n%s' % (
    _report_df.to_string(index=False)
))

In [None]:
# Rows with a pubmed_count of zero although at least one relevant pubmed id
# was counted.
_has_no_papers = editor_pubmed_count_df['pubmed_count'] == 0
_has_relevant_ids = editor_pubmed_count_df['relevant_pubmed_id_count'] > 0
_report_columns = [
    'person_id', 'name', 'pubmed_count', 'relevant_pubmed_id_count', 'total_pubmed_id_count'
]
_report_df = editor_pubmed_count_df.loc[_has_no_papers & _has_relevant_ids, _report_columns]
print(
    'editors without disambiguated pubmed papers despite having relevant pubmed ids:\n%s' % (
        _report_df.to_string(index=False)
    )
)

In [None]:
# Rows whose pubmed_count is strictly between zero and five.
_pubmed_count = editor_pubmed_count_df['pubmed_count']
_has_few_papers = (_pubmed_count > 0) & (_pubmed_count < 5)
_report_columns = [
    'person_id', 'name', 'pubmed_count', 'relevant_pubmed_id_count', 'total_pubmed_id_count'
]
_report_df = editor_pubmed_count_df.loc[_has_few_papers, _report_columns]
print(
    'editors with less than five disambiguated pubmed papers:\n%s' % (
        _report_df.to_string(index=False)
    )
)

In [None]:
# Rows where pubmed_count does not exceed the relevant pubmed id count even
# though the total pubmed id count is larger than the relevant one.
_pubmed_count = editor_pubmed_count_df['pubmed_count']
_relevant_id_count = editor_pubmed_count_df['relevant_pubmed_id_count']
_total_id_count = editor_pubmed_count_df['total_pubmed_id_count']
_no_additional_papers = (
    (_pubmed_count <= _relevant_id_count)
    & (_total_id_count > _relevant_id_count)
)
_report_columns = [
    'person_id', 'name', 'relevant_pubmed_id_count', 'total_pubmed_id_count', 'search_term'
]
_report_df = editor_pubmed_count_df.loc[_no_additional_papers, _report_columns]
print(
    'editors without additional disambiguated pubmed papers (apart from relevant pubmed ids):\n%s' % (
        _report_df.to_string(index=False)
    )
)

In [None]:
# Rows with a positive pubmed_count whose total pubmed id count does not
# exceed the relevant pubmed id count.
_has_papers = editor_pubmed_count_df['pubmed_count'] > 0
_relevant_id_count = editor_pubmed_count_df['relevant_pubmed_id_count']
_total_id_count = editor_pubmed_count_df['total_pubmed_id_count']
_only_relevant_ids = _total_id_count <= _relevant_id_count
_report_columns = [
    'person_id', 'name', 'relevant_pubmed_id_count', 'total_pubmed_id_count', 'search_term'
]
_report_df = editor_pubmed_count_df.loc[_has_papers & _only_relevant_ids, _report_columns]
print(
    'editors with only relevant pubmed papers:\n%s' % (
        _report_df.to_string(index=False)
    )
)

In [None]:
# Rows with a pubmed_count of zero.
_has_no_papers = editor_pubmed_count_df['pubmed_count'] == 0
_report_columns = [
    'person_id', 'name', 'relevant_pubmed_id_count', 'total_pubmed_id_count', 'search_term'
]
_report_df = editor_pubmed_count_df.loc[_has_no_papers, _report_columns]
print(
    'editors without any disambiguated pubmed papers:\n%s' % (
        _report_df.to_string(index=False)
    )
)