# Extract authors from PMC-OAI frontmatter `<article>` records

In [1]:
import pathlib
import zipfile

import pandas
import lxml.etree

In [2]:
def yield_etrees_from_zip(path):
    """
    Iterate over the XML members of a zip archive.

    Members are visited in the order reported by the archive's name list.
    Any member whose name does not end in `.xml` is skipped; every other
    member is parsed with `lxml.etree.parse`.

    Yields `(name, element_tree)` tuples, where `name` is the member's name
    inside the archive and `element_tree` is the parsed document.
    """
    with zipfile.ZipFile(path) as archive:
        xml_members = (
            member for member in archive.namelist()
            if member.endswith('.xml')
        )
        for member_name in xml_members:
            with archive.open(member_name) as handle:
                # The member stays open until the consumer asks for the next item.
                yield member_name, lxml.etree.parse(handle)

In [3]:
# Collect the frontmatter zip archives in sorted order
frontmatter_dir = pathlib.Path('data/pmc/oai/pmc_fm')
zip_paths = sorted(frontmatter_dir.glob('*.zip'))
zip_paths

[PosixPath('data/pmc/oai/pmc_fm/bioinfo.zip'),
 PosixPath('data/pmc/oai/pmc_fm/bmcbioi.zip'),
 PosixPath('data/pmc/oai/pmc_fm/ploscomp.zip')]

In [4]:
def extract_authors_from_article(article):
    """
    Build one record per author listed in an article's front matter.

    `article` is a parsed `<article>` document. Only `contrib` elements with
    `contrib-type="author"` located under `front/article-meta/contrib-group`
    are considered; namespaces are ignored via `{*}` wildcards.

    Returns a list of dicts, in document order, with the keys:

    - `pmcid`: text of the article-id whose `pub-id-type` is `pmcid`
    - `position`: 1-based index of the author in the author list
    - `fore_name` / `last_name`: text of `name/given-names` and `name/surname`
      (whatever `findtext` returns when the element is absent)
    - `corresponding`: 1 if the author is marked as corresponding, else 0
    - `reverse_position`: position counted from the end (last author is 1)
    """
    article_id = article.findtext("{*}front/{*}article-meta/{*}article-id[@pub-id-type='pmcid']")
    author_elems = article.findall("{*}front/{*}article-meta/{*}contrib-group/{*}contrib[@contrib-type='author']")
    n_authors = len(author_elems)

    def is_corresponding(elem):
        # Two markers are accepted: a child xref of type "corresp", or a
        # corresp="yes" attribute on the contrib element itself.
        if elem.find("{*}xref[@ref-type='corresp']") is not None:
            return True
        return elem.get('corresp', 'no') == 'yes'

    return [
        {
            'pmcid': article_id,
            'position': position,
            'fore_name': elem.findtext("{*}name/{*}given-names"),
            'last_name': elem.findtext("{*}name/{*}surname"),
            'corresponding': int(is_corresponding(elem)),
            'reverse_position': n_authors - position + 1,
        }
        for position, elem in enumerate(author_elems, start=1)
    ]

In [5]:
# Gather one record per author across every archive, then order the table
# by article and by author position within the article.
authors = [
    author
    for zip_path in zip_paths
    for _, article in yield_etrees_from_zip(zip_path)
    for author in extract_authors_from_article(article)
]
author_df = pandas.DataFrame(authors).sort_values(['pmcid', 'position'])
author_df.tail()

Unnamed: 0,pmcid,position,fore_name,last_name,corresponding,reverse_position
22840,PMC77394,2,Ferdinando Di,Cunto,0,2
22841,PMC77394,3,Paolo,Provero,1,1
22842,PMC90187,1,Jonas S,Almeida,1,2
22843,PMC90187,2,Susana,Vinga,0,1
22844,PMC99049,1,Harry J,Mangalam,1,1


In [6]:
# Number of distinct articles represented in the author table
author_df['pmcid'].nunique()

20880

In [7]:
# Distribution of the number of corresponding authors per article
corresponding_per_article = author_df.groupby('pmcid')['corresponding'].sum()
corresponding_per_article.value_counts().sort_index()

0       355
1     16997
2      3127
3       298
4        59
5        19
6         7
7         2
8         2
9         6
10        2
11        1
14        2
15        1
17        1
21        1
Name: corresponding, dtype: int64

In [8]:
# Testing: list a few articles for which no author was flagged as corresponding
corresponding_totals = author_df.groupby('pmcid')['corresponding'].sum().reset_index()
corresponding_totals.query("corresponding == 0").head()

Unnamed: 0,pmcid,corresponding
105,PMC1183510,0
106,PMC1183511,0
107,PMC1183512,0
119,PMC1185644,0
160,PMC1193992,0


In [9]:
# Save the author table as tab-separated values without the index column
# (pandas infers the compression from the file suffix by default)
author_df.to_csv(path_or_buf='data/pmc/authors.tsv.xz', sep='\t', index=False)