# Create a DOI-to-DOI citation catalog

In [1]:
import pathlib
import re
import urllib.parse

import pandas

## Create a refined DOI to bibliographic resource mapping

In [2]:
# Read the xz-compressed, tab-separated identifier table and preview it.
path = pathlib.Path('data/identifiers.tsv.xz')
identifier_df = pandas.read_csv(path, sep='\t', compression='xz')
identifier_df.head()

Unnamed: 0,br_iri,type,id
0,gbr:1,doi,10.1097/igc.0000000000000609
1,gbr:1,pmcid,PMC4679344
2,gbr:1,pmid,26645990
3,gbr:1,url,http://dx.doi.org/10.1097/igc.0000000000000609
4,gbr:3,issn,1048-891X


In [3]:
# A bare DOI: "10.", a 4-9 digit registrant code, "/", then a suffix
# containing no whitespace.
doi_pattern = re.compile(r'10\.\d{4,9}/\S+')

# A DOI expressed as a resolver URL (http or https, optional "dx." host
# prefix). Group 1 captures the DOI portion of the URL.
doi_url_pattern = re.compile(r"https?://(?:dx\.)?doi\.org/(10\..+)")

def doi_sanitize(doi):
    """
    Fix common DOI formatting errors.

    The value is stripped of surrounding whitespace, lowercased, relieved
    of a leading "doi:" prefix, and, when it is a doi.org / dx.doi.org
    URL, reduced to the percent-decoded DOI embedded in that URL.

    Parameters
    ----------
    doi : object
        Raw identifier value. Anything that is not a string (for example
        a NaN produced by a missing cell) is treated as invalid.

    Returns
    -------
    str or None
        The normalized, lowercase DOI, or None when the cleaned value does
        not fully match `doi_pattern`.
    """
    # Missing values reach this function as float NaN when mapped over a
    # pandas column; they have no .lower() and cannot be a DOI.
    if not isinstance(doi, str):
        return None
    doi = doi.strip().lower()
    if doi.startswith('doi:'):
        doi = doi[4:].strip()
    # Check for DOIs as URLs
    match = doi_url_pattern.match(doi)
    if match:
        # Percent-decoding can reintroduce uppercase letters (e.g. "%41"
        # decodes to "A"), so lowercase again after unquoting.
        doi = urllib.parse.unquote(match.group(1)).lower()
    # Quality control
    if doi_pattern.fullmatch(doi):
        return doi
    return None

In [4]:
# Build the bibliographic-resource-to-DOI table: keep identifier rows typed
# 'doi' or 'url', normalize each value with doi_sanitize (which returns None
# for anything that is not a valid DOI), then drop the rejected rows and
# exact repeats.
doi_df = (
    identifier_df
    .loc[lambda df: df['type'].isin(['doi', 'url'])]
    .drop(columns='type')
    .rename(columns={'id': 'doi'})
    .assign(doi=lambda df: df['doi'].map(doi_sanitize))
    .dropna()
    .drop_duplicates()
)
doi_df.head()

Unnamed: 0,br_iri,doi
0,gbr:1,10.1097/igc.0000000000000609
6,gbr:6,10.1097/01.aog.0000255980.88205.15
11,gbr:11,10.1097/00006250-200508000-00050
14,gbr:14,10.1038/nature12113
21,gbr:18,10.1038/modpathol.3800620


In [5]:
# keep=False marks every row of a repeated DOI, not just the later copies.
duplicate_doi_df = doi_df.loc[lambda df: df['doi'].duplicated(keep=False)]
f'{len(duplicate_doi_df):,} bibliographic resources have duplicate DOIs'

'2,049 bibliographic resources have duplicate DOIs'

## Create a DOI citation catalog

In [6]:
# Read the xz-compressed, tab-separated citation table (source -> target
# bibliographic resources) and preview it.
path = pathlib.Path('data/citations.tsv.xz')
citation_df = pandas.read_csv(path, sep='\t', compression='xz')
citation_df.head(3)

Unnamed: 0,source,target
0,gbr:1,gbr:5
1,gbr:1,gbr:6
2,gbr:1,gbr:10


In [7]:
# Translate resource-to-resource citations into DOI-to-DOI citations by
# attaching a DOI to the citing side, then to the cited side. Both joins are
# inner joins, so citations where either end has no DOI are discarded.
doi_citation_df = (
    doi_df
    .rename(columns={'br_iri': 'source', 'doi': 'source_doi'})
    .merge(citation_df, how='inner', on='source')
    .merge(
        doi_df.rename(columns={'br_iri': 'target', 'doi': 'target_doi'}),
        how='inner',
        on='target',
    )
    .loc[:, ['source_doi', 'target_doi']]
    .drop_duplicates()
    .sort_values(by=['source_doi', 'target_doi'])
    # Remove DOI self-cites which are likely errors
    .loc[lambda df: df['source_doi'] != df['target_doi']]
)
doi_citation_df.head()

Unnamed: 0,source_doi,target_doi
3778207,10.1002/14651858.cd002244.pub4,10.1001/archneur.1990.00530120057010
3778223,10.1002/14651858.cd002244.pub4,10.1002/14651858.cd002244
3778224,10.1002/14651858.cd002244.pub4,10.1002/14651858.cd002244.pub2
1663454,10.1002/14651858.cd002244.pub4,10.1002/14651858.cd002244.pub3
3778219,10.1002/14651858.cd002244.pub4,10.1007/s003810000427


In [8]:
# Summarize the catalog: number of citation pairs, distinct citing DOIs,
# and distinct cited DOIs.
citation_count = len(doi_citation_df)
citing_doi_count = doi_citation_df['source_doi'].nunique()
cited_doi_count = doi_citation_df['target_doi'].nunique()
summary = f'''
{citation_count:,} total DOI-to-DOI citations
{citing_doi_count:,} DOIs with outgoing DOI citations
{cited_doi_count:,} DOIs with incoming DOI citations
'''
print(summary.strip())

7,574,387 total DOI-to-DOI citations
203,264 DOIs with outgoing DOI citations
3,946,611 DOIs with incoming DOI citations


In [9]:
# Write the DOI-to-DOI catalog as an xz-compressed TSV without the index.
path = pathlib.Path('data/citations-doi.tsv.xz')
doi_citation_df.to_csv(path, sep='\t', index=False, compression='xz')