# Create a dataframe of DOIs

In [1]:
import os
import lzma
import csv

import pymongo

In [2]:
# Connect to the local MongoDB server and grab a handle on the `works`
# collection of the `crossref` database.
client = pymongo.MongoClient(host='localhost', port=27017)
crossref_db = client['crossref']
works = crossref_db['works']

In [3]:
# Number of documents in the works collection, with thousands separators.
# NOTE(review): Collection.count() is deprecated since PyMongo 3.7 and removed
# in 4.0; estimated_document_count() is the documented replacement -- confirm
# against the installed PyMongo version before switching.
format(works.count(), ',')

'87,542,370'

In [4]:
def get_issued_date(work):
    """
    Get the issued date from a work object, which is the "Earliest of
    published-print and published-online" (https://git.io/vSBPz).

    Only the first entry of ``work['issued']['date-parts']`` is used. Its
    integer parts (year, optionally followed by month and day) are joined with
    hyphens, each zero-padded to at least two digits, e.g. ``[2015, 3, 7]``
    becomes ``'2015-03-07'`` and ``[2015]`` becomes ``'2015'``.

    Returns None when no date is available: the ``issued`` field or its
    ``date-parts`` is missing or empty, or the first part is None.
    """
    # Fall back to the `[[None]]` "no date" placeholder when either level is
    # absent or empty, so that a single incomplete record cannot abort a long
    # export with a KeyError/IndexError.
    date_parts = (work.get('issued') or {}).get('date-parts') or [[None]]
    issued = date_parts[0]
    if not issued or issued[0] is None:
        return None
    return '-'.join('{:02d}'.format(part) for part in issued)

In [5]:
# Write the DOI table and the DOI-to-ISSN mapping to xz-compressed TSV files
doi_path = os.path.join('data', 'doi.tsv.xz')
issn_path = os.path.join('data', 'doi-to-issn.tsv.xz')
# newline='' leaves line endings to the csv module, as its documentation
# requires (otherwise platforms that translate newlines emit '\r\r\n').
# The explicit encoding keeps the output independent of the locale.
with lzma.open(doi_path, 'wt', encoding='utf-8', newline='') as doi_file, \
        lzma.open(issn_path, 'wt', encoding='utf-8', newline='') as issn_file:
    # Initialize TSV of DOIs
    doi_writer = csv.writer(doi_file, delimiter='\t')
    doi_writer.writerow(['doi', 'type', 'issued'])
    # Initialize TSV of DOI-ISSN mapping
    issn_writer = csv.writer(issn_file, delimiter='\t')
    issn_writer.writerow(['doi', 'issn'])
    # Iterate through works: one DOI row per work, one mapping row per ISSN
    for work in works.find():
        doi = work['DOI']
        issued = get_issued_date(work)
        doi_writer.writerow((doi, work['type'], issued))
        for issn in work.get('ISSN', []):
            # Hyphenate ISSNs stored without a hyphen (NNNNNNNN -> NNNN-NNNN);
            # values that already contain one are written unchanged. Slicing
            # keeps every character instead of truncating or raising on
            # values that are not exactly eight characters long.
            if '-' not in issn:
                issn = f'{issn[:4]}-{issn[4:]}'
            issn_writer.writerow((doi, issn))

In [6]:
# Close the MongoDB client now that the export has been written
client.close()