# Create a merged corresponding authors dataset

Identify corresponding authors from the PubMed Central author list if available. Otherwise, use the PubMed author list.

In [1]:
import pathlib
import pandas
import pubmedpy

## Prepare country assignment

In [2]:
# Load the affiliation -> country lookup table.
# keep_default_na=False matters here: Namibia's country code is "NA",
# which pandas would otherwise parse as a missing value.
country_df = pandas.read_csv(
    "data/affiliations/countries.tsv.xz",
    sep="\t",
    keep_default_na=False,
)
# sanity check: no cell in the table was read as missing
assert country_df.notna().all(axis=None)
country_df.head()

Unnamed: 0,affiliation,country
0,"""Athena"" Research and Innovation Center, Athen...",GR
1,"""Athena"" Research and Innovation Center, Athen...",GR
2,"""Claudio Munari"" Center for Epilepsy Surgery, ...",IT
3,"""Momentum"" Membrane Protein Bioinformatics Res...",HU
4,"""Momentum"" Membrane Protein Bioinformatics Res...",HU


In [3]:
# Load the PubMed affiliations and attach a country to each one.
# The merge uses pandas' defaults (inner join on the shared columns),
# so affiliations without a match in country_df are dropped.
source = "pubmed"
directory = pathlib.Path("data", source)
affil_df = pandas.read_csv(directory / "affiliations.tsv.xz", sep="\t").merge(country_df)


In [4]:
def read_authors(source: str) -> pandas.DataFrame:
    """
    Load the authors table of a source and attach a `countries` column.

    `source` names the subdirectory of `data/` to read from; only "pubmed"
    and "pmc" are supported. Affiliations are mapped to countries through the
    module-level `country_df`. For each author (article identifier plus
    position) the distinct country codes are sorted and comma-joined.
    Authors without any mapped affiliation keep a missing `countries` value.
    """
    source_dir = pathlib.Path("data") / source
    authors = pandas.read_csv(source_dir / "authors.tsv.xz", sep="\t")
    # default (inner) merge: affiliations with no match in country_df are dropped
    affiliations = pandas.read_csv(source_dir / "affiliations.tsv.xz", sep="\t").merge(country_df)
    # each source identifies its articles with a different column
    article_key = {"pubmed": "pmid", "pmc": "pmcid"}[source]

    def join_unique(codes):
        # de-duplicate, sort for a stable order, then comma-join
        return ",".join(sorted(set(codes)))

    per_author = affiliations.groupby([article_key, "position"])["country"].apply(join_unique)
    author_countries = per_author.reset_index().rename(columns={"country": "countries"})
    # left merge keeps every author row, with or without countries
    return authors.merge(author_countries, how="left")

## Read pubmed authors

In [5]:
# Load the PubMed article table (one row per article) and preview two rows.
pubmed_df = pandas.read_csv(
    "data/pubmed/articles.tsv.xz",
    sep="\t",
)
pubmed_df.head(n=2)

Unnamed: 0,pmid,pmcid,doi,journal,publication_date,pmc_cited_by_count,title
0,7477412,,10.1038/378516a0,Nature,1995-11-30,2,Mapping the genome one molecule at a time--opt...
1,7479891,PMC40523,10.1073/pnas.92.24.10821,Proc Natl Acad Sci U S A,1995-11-21,4,Ahead of schedule and under budget: the Genome...


In [6]:
# preview the first five article records
pubmed_df.head()

Unnamed: 0,pmid,pmcid,doi,journal,publication_date,pmc_cited_by_count,title
0,7477412,,10.1038/378516a0,Nature,1995-11-30,2,Mapping the genome one molecule at a time--opt...
1,7479891,PMC40523,10.1073/pnas.92.24.10821,Proc Natl Acad Sci U S A,1995-11-21,4,Ahead of schedule and under budget: the Genome...
2,7479895,PMC40527,10.1073/pnas.92.24.10841,Proc Natl Acad Sci U S A,1995-11-21,11,"How is the Human Genome Project doing, and wha..."
3,7497116,,10.1089/cmb.1995.2.139,J Comput Biol,1995,1,Four strikes against physical mapping of DNA.
4,7497128,,10.1089/cmb.1995.2.219,J Comput Biol,1995,2,Physical mapping by STS hybridization: algorit...


In [7]:
# PubMed author list with a per-author `countries` column added
pubmed_author_df = read_authors(source="pubmed")
pubmed_author_df.head(n=2)

Unnamed: 0,pmid,position,fore_name,last_name,reverse_position,countries
0,7477412,1,A H,Samad,10,US
1,7477412,2,W W,Cai,9,


In [8]:
# pmc_author_df = read_authors("pmc")
# pmc_author_df.head(2)

In [9]:
# def get_corresponding(df):
# #     if df.corresponding.any():
# #         df = df.query("corresponding == 1")
# #         df['use_last'] = 0
# #     else:
#     df = df.query("reverse_position == 1")
#     df['use_last'] = 1
#     return df.assign(n_corresponding=len(df))

# pmc_corresp_df = (
#     pubmed_df[['pmid', 'pmcid']]
#     .merge(pmc_author_df)
#     .assign(source="pmc")
#     .groupby('pmid')
#     .apply(get_corresponding)
# )
# pmc_corresp_df.head(2)

In [10]:
# pmc_corresp_df.countries.notna().mean()

In [11]:
# last-listed author of every article (reverse_position counts from the end)
pubmed_author_df.loc[pubmed_author_df.reverse_position == 1]

Unnamed: 0,pmid,position,fore_name,last_name,reverse_position,countries
9,7477412,10,B,Porter,1,
10,7479891,1,F S,Collins,1,US
12,7479895,2,F S,Collins,1,
16,7497116,4,R,Shamir,1,
18,7497128,2,S,Istrail,1,
...,...,...,...,...,...,...
1103509,32612807,5,Tarcisio,Mendes de Farias,1,CH
1103517,32760576,8,Matthew,McAuliffe,1,US
1103521,32913631,4,Joshua W K,Ho,1,"AU,HK,US"
1103527,33016592,6,Charles,Chen,1,US


In [12]:
# pubmed_corresp_df.countries.notna().mean()

In [13]:
# corresp_df = pandas.concat([pmc_corresp_df, pubmed_corresp_df], sort=False).reset_index(drop=True)
# With the PMC branch disabled, the last-listed PubMed author
# (reverse_position == 1) stands in for the corresponding author.
corresp_df = pubmed_author_df.loc[pubmed_author_df.reverse_position == 1].assign(
    source="pubmed",
    use_last=1,
)
# keep_default_na=False keeps name strings such as "NA" as literal text
fore_df = pandas.read_csv(
    "data/names/fore-names.tsv.xz", sep="\t", keep_default_na=False
)
last_df = pandas.read_csv(
    "data/names/last-names.tsv.xz", sep="\t", keep_default_na=False
)
# left merges: authors without a simplified name are kept
corresp_df = corresp_df.merge(
    fore_df[["fore_name", "fore_name_simple"]], how="left"
).merge(
    last_df[["last_name", "last_name_simple"]], how="left"
)
corresp_df.head(n=2)

Unnamed: 0,pmid,position,fore_name,last_name,reverse_position,countries,source,use_last,fore_name_simple,last_name_simple
0,7477412,10,B,Porter,1,,pubmed,1,,porter
1,7479891,1,F S,Collins,1,US,pubmed,1,,collins


In [14]:
# Number of authors with 1+ assigned countries, by source.
# Column False: no assigned countries.
# Column True: one or more assigned countries.
tab = pandas.crosstab(
    index=corresp_df.source,
    columns=corresp_df.countries.notna(),
    margins=True,
)

In [15]:
# Percent of authors with 1+ assigned countries, by source
# (each row is normalized to sum to 100%).
pandas.crosstab(
    index=corresp_df.source,
    columns=corresp_df.countries.notna(),
    margins=True,
    normalize="index",
).applymap(lambda fraction: "{:.1%}".format(fraction))

countries,False,True
source,Unnamed: 1_level_1,Unnamed: 2_level_1
pubmed,52.6%,47.4%
All,52.6%,47.4%


In [16]:
# Attach the corresponding author's source and countries to every article.
# Default merge: inner join on the columns the two frames share.
pmc_country = pandas.merge(
    pubmed_df,
    corresp_df[["pmid", "source", "countries"]].drop_duplicates(),
)
pmc_country.head()

Unnamed: 0,pmid,pmcid,doi,journal,publication_date,pmc_cited_by_count,title,source,countries
0,7477412,,10.1038/378516a0,Nature,1995-11-30,2,Mapping the genome one molecule at a time--opt...,pubmed,
1,7479891,PMC40523,10.1073/pnas.92.24.10821,Proc Natl Acad Sci U S A,1995-11-21,4,Ahead of schedule and under budget: the Genome...,pubmed,US
2,7479895,PMC40527,10.1073/pnas.92.24.10841,Proc Natl Acad Sci U S A,1995-11-21,11,"How is the Human Genome Project doing, and wha...",pubmed,
3,7497116,,10.1089/cmb.1995.2.139,J Comput Biol,1995,1,Four strikes against physical mapping of DNA.,pubmed,
4,7497128,,10.1089/cmb.1995.2.219,J Comput Biol,1995,2,Physical mapping by STS hybridization: algorit...,pubmed,


For articles whose corresponding author has no assigned countries, collect the PMCID (where one exists) so that additional affiliations can be queried from PubMed Central:

In [17]:
# articles lacking countries that do have a PMCID
pmcids_query = pmc_country.loc[
    pmc_country.countries.isna() & pmc_country.pmcid.notna(),
    "pmcid",
]

In [18]:
# display the collected PMCIDs
pmcids_query

2           PMC40527
20          PMC41521
28         PMC225990
48        PMC1336852
67          PMC44177
             ...    
179332    PMC7184535
179388    PMC6986235
179391    PMC6986237
179394    PMC7390749
179454    PMC6426537
Name: pmcid, Length: 37646, dtype: object

In [19]:
def get_frontmatter_etree_via_api(pmcid, timeout=60):
    """
    Fetch the front matter of one article from the PubMed Central OAI service.

    Parameters:
        pmcid: PubMed Central identifier such as "PMC6986237". A leading
            "PMC" is stripped; an identifier without the prefix is used as is.
        timeout: seconds to wait for the server before giving up, so a
            stalled connection cannot hang the caller indefinitely.

    Returns:
        The `article` element of the GetRecord response (requested with
        metadataPrefix=pmc_fm), or None when the response has no such element.

    Raises:
        requests.HTTPError: the server answered with a 4xx/5xx status.
        requests.Timeout: no response arrived within `timeout` seconds.
    """
    # the OAI identifier takes the PMCID without its "PMC" prefix
    numeric_id = pmcid[3:] if pmcid.upper().startswith("PMC") else pmcid
    url = f"https://www.ncbi.nlm.nih.gov/pmc/oai/oai.cgi?verb=GetRecord&identifier=oai:pubmedcentral.nih.gov:{numeric_id}&metadataPrefix=pmc_fm"
    response = requests.get(url, timeout=timeout)
    # fail on HTTP errors here instead of passing an error page to the XML parser
    response.raise_for_status()
    tree = etree.fromstring(response.content)
    # "{*}" matches any XML namespace
    return tree.find("{*}GetRecord/{*}record/{*}metadata/{*}article")

In [20]:
from pubmedpy.pmc_oai import extract_authors_from_article
# from pubmedpy.tests.test_pmc_oai import get_frontmatter_etree_via_api
from lxml import etree
# from lxml.etree import tostring
import requests

In [21]:
# spot-check one article from pmcids_query: fetch its front matter and parse the authors
art = get_frontmatter_etree_via_api(pmcid="PMC6986237")
extract_authors_from_article(art)

[{'pmcid': 'PMC6986237',
  'position': 1,
  'fore_name': 'Megan E.',
  'last_name': 'Barefoot',
  'corresponding': 0,
  'reverse_position': 6,
  'affiliations': []},
 {'pmcid': 'PMC6986237',
  'position': 2,
  'fore_name': 'Rency S.',
  'last_name': 'Varghese',
  'corresponding': 0,
  'reverse_position': 5,
  'affiliations': []},
 {'pmcid': 'PMC6986237',
  'position': 3,
  'fore_name': 'Yuan',
  'last_name': 'Zhou',
  'corresponding': 0,
  'reverse_position': 4,
  'affiliations': []},
 {'pmcid': 'PMC6986237',
  'position': 4,
  'fore_name': 'Cristina',
  'last_name': 'Di Poto',
  'corresponding': 0,
  'reverse_position': 3,
  'affiliations': []},
 {'pmcid': 'PMC6986237',
  'position': 5,
  'fore_name': 'Alessia',
  'last_name': 'Ferrarini',
  'corresponding': 0,
  'reverse_position': 2,
  'affiliations': []},
 {'pmcid': 'PMC6986237',
  'position': 6,
  'fore_name': 'Habtom W.',
  'last_name': 'Ressom',
  'corresponding': 0,
  'reverse_position': 1,
  'affiliations': []}]

In [22]:
# fetch a second article and dump its raw front-matter XML for inspection
art = get_frontmatter_etree_via_api(pmcid="PMC6986235")
print(etree.tostring(art, encoding="unicode"))

<article xmlns="https://jats.nlm.nih.gov/ns/archiving/1.2/" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xmlns:ali="http://www.niso.org/schemas/ali/1.0/" xmlns:mml="http://www.w3.org/1998/Math/MathML" xmlns:xlink="http://www.w3.org/1999/xlink" xsi:schemaLocation="https://jats.nlm.nih.gov/ns/archiving/1.2/ https://jats.nlm.nih.gov/archiving/1.2/xsd/JATS-archivearticle1.xsd" article-type="research-article">
  <front>
    <journal-meta>
      <journal-id journal-id-type="nlm-journal-id">101243413</journal-id>
      <journal-id journal-id-type="pubmed-jr-id">32722</journal-id>
      <journal-id journal-id-type="nlm-ta">Conf Proc IEEE Eng Med Biol Soc</journal-id>
      <journal-id journal-id-type="iso-abbrev">Conf Proc IEEE Eng Med Biol Soc</journal-id>
      <journal-title-group>
        <journal-title>Conference proceedings : ... Annual International Conference of the IEEE Engineering in Medicine and Biology Society. IEEE Engineering in Medicine and Biology Society. Annual Confe

In [23]:
# parse the authors from the PMC6986235 front matter fetched in the previous cell
extract_authors_from_article(art)

[{'pmcid': 'PMC6986235',
  'position': 1,
  'fore_name': 'Ziling',
  'last_name': 'Fan',
  'corresponding': 0,
  'reverse_position': 3,
  'affiliations': []},
 {'pmcid': 'PMC6986235',
  'position': 2,
  'fore_name': 'Yuan',
  'last_name': 'Zhou',
  'corresponding': 0,
  'reverse_position': 2,
  'affiliations': []},
 {'pmcid': 'PMC6986235',
  'position': 3,
  'fore_name': 'Habtom W.',
  'last_name': 'Ressom',
  'corresponding': 0,
  'reverse_position': 1,
  'affiliations': []}]

### Analyze sources for corresponding authors

In [24]:
# One row per article, annotated with where its corresponding author came from.
# Default merge: inner join on the columns the two frames share.
source_df = pandas.merge(
    pubmed_df,
    corresp_df[["pmid", "source", "use_last"]].drop_duplicates(),
)
source_df.head(n=2)

Unnamed: 0,pmid,pmcid,doi,journal,publication_date,pmc_cited_by_count,title,source,use_last
0,7477412,,10.1038/378516a0,Nature,1995-11-30,2,Mapping the genome one molecule at a time--opt...,pubmed,1
1,7479891,PMC40523,10.1073/pnas.92.24.10821,Proc Natl Acad Sci U S A,1995-11-21,4,Ahead of schedule and under budget: the Genome...,pubmed,1


In [25]:
# number of articles by authorship source by journal
pandas.crosstab(source_df.journal, source_df.source, margins=True)

source,pubmed,All
journal,Unnamed: 1_level_1,Unnamed: 2_level_1
A A Case Rep,1,1
AACN Adv Crit Care,6,6
AACN Clin Issues,2,2
AAOHN J,1,1
AAPS J,29,29
...,...,...
Zygote,2,2
eNeuro,8,8
mBio,204,204
mSphere,37,37


In [26]:
# number of articles by authorship source by year
pandas.crosstab(source_df.publication_date.str.slice(0, 4), source_df.source, margins=True)

source,pubmed,All
publication_date,Unnamed: 1_level_1,Unnamed: 2_level_1
1993,84,84
1994,67,67
1995,63,63
1996,150,150
1997,135,135
1998,327,327
1999,349,349
2000,716,716
2001,1209,1209
2002,1849,1849


In [27]:
# Number of articles where all corresponding authors have assigned countries by journal
articles_with_na_countries = set(corresp_df.loc[corresp_df.countries.isna()].pmid)
source_df["corresp_has_countries"] = ~source_df.pmid.isin(articles_with_na_countries)
pandas.crosstab(source_df.journal, source_df.corresp_has_countries, margins=True)

corresp_has_countries,False,True,All
journal,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
A A Case Rep,1,0,1
AACN Adv Crit Care,0,6,6
AACN Clin Issues,2,0,2
AAOHN J,0,1,1
AAPS J,21,8,29
...,...,...,...
Zygote,1,1,2
eNeuro,0,8,8
mBio,45,159,204
mSphere,0,37,37


In [28]:
# Percent of articles where all corresponding authors have assigned countries by journal
pandas.crosstab(
    source_df.journal, source_df.corresp_has_countries,
    margins=True, normalize="index"
).applymap("{:.1%}".format)

corresp_has_countries,False,True
journal,Unnamed: 1_level_1,Unnamed: 2_level_1
A A Case Rep,100.0%,0.0%
AACN Adv Crit Care,0.0%,100.0%
AACN Clin Issues,100.0%,0.0%
AAOHN J,0.0%,100.0%
AAPS J,72.4%,27.6%
...,...,...
Zygote,50.0%,50.0%
eNeuro,0.0%,100.0%
mBio,22.1%,77.9%
mSphere,0.0%,100.0%


In [29]:
# write the corresponding-author table as a tab-separated file, without the index
corresp_df.to_csv(
    path_or_buf="data/names/corresponding-authors.tsv.xz",
    sep="\t",
    index=False,
)