### Get DOIs and Citations
Given a file of raw "citations" (i.e., they need not be properly formatted), with one blank line separating each,

1. query the CrossRef API to match a DOI to each "citation",
2. allow inspection of match results and manual assignment of DOIs,
3. add verified DOIs to a stored set (e.g. "center" DOIs vs. "user" DOIs), and
4. add to / update a set-specific file of properly-formatted citations.

#### Pull in citations

In [None]:
# Which set of publications this run operates on:
# 'center' for publications from MP center,
# 'user' for publications from MP users.
# The value is substituted into the per-set file names
# ('dois-{}.txt', 'citations-{}-{}.txt') used further down.

CITATION_SET = 'user'
# Start from an empty DOI set.
dois = set()

In [None]:
# Grab raw citations and store as array of strings.
# A citation may span several lines; its lines are stripped and joined with
# single spaces. A blank line ends the current citation.

RAW_CITATIONS_FNAME = 'raw-citations-user-1.txt'
raw_citations = []
with open(RAW_CITATIONS_FNAME) as f:
    citation = ""
    for line in f:
        if line.strip():
            # Add line to current citation
            citation += line.strip() + " "
        elif citation:
            # Blank line: save citation and start reading new one
            raw_citations.append(citation)
            citation = ""
    if citation:
        # The file need not end with a blank line; without this check the
        # final citation would be silently dropped.
        raw_citations.append(citation)

#### Query the CrossRef API for DOIs

In [None]:
# Query the CrossRef API to match a DOI to each raw citation
# Shouldn't take more than ~10 s

import json
import requests

# The request body is the JSON-encoded list of raw citation strings.
r = requests.post("http://search.crossref.org/links",
                  data=json.dumps(raw_citations))
# Stop here on an HTTP error (4xx/5xx) instead of failing later, when the
# error body is parsed as JSON or its 'results' key is looked up.
r.raise_for_status()
response = json.loads(r.text)

#### Inspect results, add DOIs as desired

In [None]:
# Allow inspection of match results:
# keep only the fields of interest from each CrossRef result and tag every
# entry with its position, so it can be addressed as results[n].

_INSPECT_KEYS = ('doi', 'match', 'score', 'text')

results = []
for n, result in enumerate(response['results']):
    r = dict((key, value) for key, value in result.items()
             if key in _INSPECT_KEYS)
    r['n'] = n
    results.append(r)

In [None]:
# Print results for which no DOI matched.

# pprint is not imported anywhere else in this notebook, so bring it into
# scope here; otherwise the loop below raises NameError.
from pprint import pprint

# Add doi to result. It's fine if 'match' is still False.
#results[12]['doi'] = "http://dx.doi.org/" + '10.1039/C4CP02091J'.lower()

for r in results:
    if not r['match']:
        pprint(r)

#### Add verified DOIs to stored set

In [None]:
# Retrieve stored set of DOIs

import os.path

fname = 'dois-{}.txt'.format(CITATION_SET)

# One DOI per line; the set stays empty when nothing has been stored yet.
dois = set()
if os.path.isfile(fname):
    with open(fname) as f:
        for stored_line in f:
            dois.add(stored_line.rstrip())

In [None]:
# Add new DOIs: every result that carries a 'doi' key contributes it,
# regardless of its 'match' flag.
dois |= {entry['doi'] for entry in results if 'doi' in entry}

In [None]:
# Persist back to file
#
# The path is rebuilt here instead of reusing the shared `fname` variable:
# a later cell rebinds `fname` to the citations file, so re-running this
# cell after that one would overwrite the citations with bare DOIs.
dois_fname = 'dois-{}.txt'.format(CITATION_SET)
with open(dois_fname, 'w') as f:
    # Sorted so that the file contents are stable from run to run.
    f.writelines([doi + '\n' for doi in sorted(dois)])

#### Get and persist a page of properly-formatted citations for stored set of DOIs

In [None]:
# Citation style to request; style names are listed at
# https://github.com/citation-style-language/styles
CITATION_STYLE = "apa"

# Accept header asking for a bibliography entry formatted in that style.
headers = dict(Accept="text/x-bibliography; style={}".format(CITATION_STYLE))
def fetch_citation(doi, headers=None):
    """Issue a GET request for a DOI URL and return the response.

    doi     -- URL to request; it is passed to requests.get unchanged.
    headers -- optional dict of HTTP request headers (for example an
               Accept header selecting a bibliography format). Defaults
               to sending no extra headers.

    Returns the response object from requests.get. Status checking and
    decoding of the body are left to the caller.
    """
    # None rather than a literal {} default: a dict default is created once
    # and shared by every call.
    return requests.get(doi, headers=headers)

In [None]:
# Retrieve existing citations, which include DOIs

import codecs

fname = 'citations-{}-{}.txt'.format(CITATION_SET, CITATION_STYLE)

# Only lines containing 'doi' are kept; anything else in the file (such as
# blank separator lines) is ignored.
citations = []
if os.path.isfile(fname):
    with codecs.open(fname, 'r', encoding='utf8') as f:
        for stored_line in f.readlines():
            if 'doi' in stored_line:
                citations.append(stored_line.rstrip())

In [None]:
# Ensure fetching of citations only for new DOIs

import re

# Look for "doi:<id>" in each stored citation and rebuild the URL form used
# in `dois`, so the two sets can be subtracted. Citations without such a
# marker contribute nothing. (Raw string: '\S' is a regex escape, not a
# string escape.)
matches = [re.search(r'doi:(\S+)', c) for c in citations]
dois_done = {'http://dx.doi.org/' + m.group(1) for m in matches if m}

dois_to_fetch = list(dois - dois_done)
# Single-argument print(...) behaves the same on Python 2 and Python 3.
print("{} DOIs to fetch.".format(len(dois_to_fetch)))

In [None]:
# Fetch a formatted citation for every DOI that does not have one yet.
for n, doi in enumerate(dois_to_fetch):
    # Single-argument print(...) behaves the same on Python 2 and Python 3.
    print("[{}/{}] Fetching {} citation for {}...".format(
        n+1, len(dois_to_fetch), CITATION_STYLE, doi))
    r = fetch_citation(doi, headers=headers)
    if not r.ok:
        # Do not store an HTTP error page as if it were a citation. Nothing
        # is appended for this DOI, so it is still missing from `citations`.
        print("    skipped: HTTP {} for {}".format(r.status_code, doi))
        continue
    # Decode the response bytes as UTF-8 directly instead of round-tripping
    # r.text through r.encoding, which raises when r.encoding is None.
    citations.append(r.content.decode('utf8'))

In [None]:
# Persist back to file, sorted by descending year

def pub_year(citation):
    """Return the publication year found in a citation string.

    The year is taken to be the first four-digit number that directly
    follows '(' or whitespace and is directly followed by ')', '.' or ','.

    citation -- citation text to search.

    Returns the four digits as a string, or '' when no such number exists,
    so the result can always be used as a sort key.
    """
    # Raw string: \s and \d are regex escapes, not string escapes.
    match = re.search(r'[(\s](\d{4})[).,]', citation)
    if match is None:
        return ''
    return match.group(1)

# Newest first. Citations with no recognizable year get the key '' and
# therefore end up last.
citations = sorted(citations, key=pub_year, reverse=True)

# Write through codecs with an explicit encoding, the same way this file is
# read elsewhere in the notebook: text goes in, UTF-8 bytes come out, and no
# manual .encode() is needed (concatenating encoded bytes with '\n\n' fails
# on Python 3). A blank line separates consecutive citations.
with codecs.open(fname, 'w', encoding='utf8') as f:
    f.writelines([c + u'\n\n' for c in citations])

#### [Optional] Make URLs for copy/paste of citation list into MediaWiki

In [None]:
# Write to a .wiki file
# Formats external links according to MediaWiki spec:
# each "doi:<id>" becomes "[http://dx.doi.org/<id> doi:<id>]".

doi_pattern = re.compile(r'(doi:(\S+))')
wiki_link = r'[http://dx.doi.org/\2 \1]'
wiki_fname = fname + '.wiki'

with codecs.open(fname, 'r', encoding='utf8') as src:
    text = src.read()

with_linked_dois = doi_pattern.sub(wiki_link, text)

with codecs.open(wiki_fname, 'w', encoding='utf8') as dst:
    dst.write(with_linked_dois)