# Extract authors from PMC-OAI frontmatter `<article>` records

In [1]:
import pathlib
import zipfile

import pandas
import lxml.etree

from pubmedpy.xml import yield_etrees_from_zip
from pubmedpy.pmc_oai import extract_authors_from_article

In [2]:
# Collect the frontmatter zip archives; glob order is not guaranteed, so sort
# the paths to process them in a deterministic order.
zip_paths = list(pathlib.Path('data/pmc/oai/pmc_fm').glob('*.zip'))
zip_paths.sort()
zip_paths

[PosixPath('data/pmc/oai/pmc_fm/bioinfo.zip'),
 PosixPath('data/pmc/oai/pmc_fm/bmcbioi.zip'),
 PosixPath('data/pmc/oai/pmc_fm/ploscomp.zip')]

In [3]:
# Flatten the author records of every article in every zip archive into one
# list. The member name yielded alongside each parsed article is not needed.
authors = [
    author
    for zip_path in zip_paths
    for _name, article in yield_etrees_from_zip(zip_path)
    for author in extract_authors_from_article(article)
]
# One row per author, ordered by article and then by position in the byline.
author_df = pandas.DataFrame(authors).sort_values(['pmcid', 'position'])
author_df.tail()

Unnamed: 0,pmcid,position,fore_name,last_name,corresponding,reverse_position
23965,PMC77394,2,Ferdinando Di,Cunto,0,2
23966,PMC77394,3,Paolo,Provero,1,1
23967,PMC90187,1,Jonas S,Almeida,1,2
23968,PMC90187,2,Susana,Vinga,0,1
23969,PMC99049,1,Harry J,Mangalam,1,1


In [4]:
# Number of distinct articles (unique PMCIDs) that have at least one author row
author_df['pmcid'].nunique()

21411

In [5]:
# Distribution of articles by how many of their authors are flagged as
# corresponding (index = number of corresponding authors, value = articles).
(
    author_df
    .groupby('pmcid')['corresponding']
    .sum()
    .value_counts()
    .sort_index()
)

0       371
1     17401
2      3223
3       311
4        61
5        19
6         7
7         2
8         2
9         6
10        2
11        1
14        2
15        1
17        1
21        1
Name: corresponding, dtype: int64

In [6]:
# Testing: list a few articles where no author is flagged as corresponding,
# so they can be inspected by hand.
(
    author_df
    .groupby('pmcid')['corresponding']
    .sum()
    .reset_index()
    .query("corresponding == 0")
    .head()
)

Unnamed: 0,pmcid,corresponding
105,PMC1183510,0
106,PMC1183511,0
107,PMC1183512,0
119,PMC1185644,0
160,PMC1193992,0


In [7]:
# Export the author table as a tab-separated file without the row index.
# No compression argument is passed, so pandas' default 'infer' setting picks
# xz compression from the '.xz' suffix.
author_df.to_csv(
    path_or_buf='data/pmc/authors.tsv.xz',
    sep='\t',
    index=False,
)