# Create a merged corresponding-authors dataset

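Identifies the corresponding author(s) of each article from the PubMed Central author list when one is available. Otherwise, falls back to the PubMed author list. Whenever no author is flagged as corresponding, the last-listed author is used instead.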

In [1]:
import pandas

In [2]:
# PubMed author lists: one row per (pmid, author). Judging by the preview,
# reverse_position counts authors from the end of the list (1 = last author).
pubmed_author_df = pandas.read_csv(
    "data/pubmed/authors.tsv.xz",
    sep="\t",
)
pubmed_author_df.head(2)

Unnamed: 0,pmid,position,fore_name,last_name,reverse_position
0,9520496,1,B A,Eckman,7
1,9520496,2,J S,Aaronson,6


In [3]:
# PubMed Central author lists: one row per (pmcid, author), including a
# `corresponding` flag column that the PubMed table does not have.
pmc_author_df = pandas.read_csv(
    "data/pmc/authors.tsv.xz",
    sep="\t",
)
pmc_author_df.head(2)

Unnamed: 0,pmcid,position,fore_name,last_name,corresponding,reverse_position
0,PMC100321,1,Alexander E,Pozhitkov,1,2
1,PMC100321,2,Diethard,Tautz,0,1


In [4]:
# Article-level PubMed metadata; used below for its pmid -> pmcid mapping.
pubmed_df = pandas.read_csv(
    "data/pubmed/articles.tsv.xz",
    sep="\t",
)
pubmed_df.head(2)

Unnamed: 0,pmid,pmcid,doi,journal,publication_date,title
0,9520496,,10.1093/bioinformatics/14.1.2,Bioinformatics,1998,The Merck Gene Index browser: an extensible da...
1,9520497,,10.1093/bioinformatics/14.1.14,Bioinformatics,1998,Algorithms and software for support of gene id...


In [5]:
def get_corresponding(df):
    """
    Select the corresponding-author rows from one article's author table.

    Parameters
    ----------
    df : pandas.DataFrame
        Author rows of a single article. Must contain the columns
        `corresponding` (1 marks a flagged corresponding author; any other
        value, including None/NaN, means "not flagged") and
        `reverse_position` (1 marks the last-listed author).

    Returns
    -------
    pandas.DataFrame
        A new frame holding the selected rows plus two appended columns:
        `use_last` is 0 when the rows were chosen through the
        `corresponding` flag and 1 when the last author was used as a
        fallback; `n_corresponding` is the number of rows selected.
        The input frame is never modified.
    """
    # Use one predicate for both the branch test and the row filter, so a
    # group can never pass the test yet come back empty.
    flagged = df["corresponding"] == 1
    if flagged.any():
        selected, use_last = df[flagged], 0
    else:
        # No flagged author: fall back to the last author in the list.
        selected, use_last = df[df["reverse_position"] == 1], 1
    # `.assign` builds a fresh frame instead of writing into a filtered slice,
    # which avoids chained assignment (SettingWithCopyWarning).
    return selected.assign(use_last=use_last, n_corresponding=len(selected))

# Attach the PMC author rows to each PubMed article through its PMCID, then
# keep only the corresponding author(s) of every article.
pmc_corresp_df = (
    pubmed_df[['pmid', 'pmcid']]
    # pandas matches null merge keys to each other, so articles without a
    # PMCID are dropped up front; they can never have a genuine PMC match.
    .dropna(subset=['pmcid'])
    # Name the join key explicitly rather than relying on whichever columns
    # the two frames happen to share.
    .merge(pmc_author_df, on='pmcid')
    .groupby('pmid')
    .apply(get_corresponding)
)
pmc_corresp_df.head(2)

Unnamed: 0_level_0,Unnamed: 1_level_0,pmid,pmcid,position,fore_name,last_name,corresponding,reverse_position,use_last,n_corresponding
pmid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
11001586,0,11001586,PMC29061,1,Jeremy S,Edwards,1,2,0,2
11001586,1,11001586,PMC29061,2,Bernhard O,Palsson,1,1,0,2


In [6]:
# Articles not already covered by the PMC-derived table use the PubMed author
# list instead. That table has no corresponding-author flag, so the column is
# filled with None before the per-article selection runs.
pubmed_corresp_df = (
    pubmed_author_df
    .loc[lambda authors: ~authors.pmid.isin(pmc_corresp_df.pmid)]
    .assign(corresponding=None)
    .groupby('pmid')
    .apply(get_corresponding)
)
pubmed_corresp_df.head(2)

Unnamed: 0_level_0,Unnamed: 1_level_0,pmid,position,fore_name,last_name,reverse_position,corresponding,use_last,n_corresponding
pmid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
9520496,6,9520496,7,R A,Blevins,1,,1,1
9520497,12,9520497,6,P A,Pevzner,1,,1,1


In [7]:
# Stack the PMC-derived rows on top of the PubMed-derived rows and renumber
# the index from zero.
corresp_df = pandas.concat(
    [pmc_corresp_df, pubmed_corresp_df],
    sort=False,
).reset_index(drop=True)

# Lookup tables that map raw names to their simplified forms.
fore_df = pandas.read_csv("data/names/fore-names.tsv.xz", sep="\t")
last_df = pandas.read_csv("data/names/last-names.tsv.xz", sep="\t")

# Left joins keep every author row, even when a name has no lookup entry.
corresp_df = corresp_df.merge(
    fore_df[["fore_name", "fore_name_simple"]],
    on="fore_name",
    how="left",
)
corresp_df = corresp_df.merge(
    last_df[["last_name", "last_name_simple"]],
    on="last_name",
    how="left",
)
corresp_df.head(2)

Unnamed: 0,pmid,pmcid,position,fore_name,last_name,corresponding,reverse_position,use_last,n_corresponding,fore_name_simple,last_name_simple
0,11001586,PMC29061,1,Jeremy S,Edwards,1,2,0,2,jeremy,edwards
1,11001586,PMC29061,2,Bernhard O,Palsson,1,1,0,2,bernhard,palsson


In [8]:
# Write the merged table as a tab-separated file, without the row index.
corresp_df.to_csv(
    'data/names/corresponding-authors.tsv.xz',
    sep='\t',
    index=False,
)