# Process citations and retrieve metadata

In [1]:
import json
import re
import os
import pathlib
import subprocess

import pandas

from citations import (
    citation_to_metadata,
    citeproc_passthrough,
    get_references_from_text,
    get_text,
    validate_reference,
)

In [2]:
# Directory containing the reference inputs read by this notebook.
ref_dir = pathlib.Path('..', 'references')
# Subdirectory that receives every file this notebook generates.
gen_dir = ref_dir / 'generated'

In [3]:
# Gather the manuscript text and the sorted citation strings found in it.
text = get_text('../sections')
refs = sorted(get_references_from_text(text))

# validate_reference results are filtered for truthiness; whatever remains is
# printed as a problem report, one entry per line.
bad_refs = [message for message in map(validate_reference, refs) if message]
if bad_refs:
    print('\n'.join(bad_refs))
    # Raise explicitly instead of `assert False`: assert statements are
    # stripped when Python runs with -O, which would let bad references pass.
    raise AssertionError(f'{len(bad_refs)} invalid reference(s) found in text')

# One row per citation string as it appears in the text.
ref_df = pandas.DataFrame({'text': refs})

# tags.tsv maps a short tag to a citation; prefix tags so they match the
# '@tag:...' form used in the text, then left-merge on the shared 'text' column.
tag_df = pandas.read_table(ref_dir.joinpath('tags.tsv'))
tag_df['text'] = '@tag:' + tag_df.tag
ref_df = ref_df.merge(tag_df[['text', 'citation']], how='left')

# References without a tag entry cite themselves: drop the leading '@'.
# Assign the result back rather than calling fillna(inplace=True) on the
# attribute-accessed column, which is a chained in-place update that pandas
# deprecates and that does not modify the frame under Copy-on-Write.
ref_df['citation'] = ref_df['citation'].fillna(ref_df['text'].str.lstrip('@'))

In [4]:
# Preview the first three rows: the citation string from the text and the
# citation it resolves to.
ref_df.head(3)

Unnamed: 0,text,citation
0,@arxiv:1409.0575,arxiv:1409.0575
1,@arxiv:1411.2581v1,arxiv:1411.2581v1
2,@arxiv:1510.02855,arxiv:1510.02855


In [5]:
def get_standard_citatation(citation, cache):
    """
    Return the ``(standard_citation, citation_id)`` pair for a citation.

    The metadata is obtained from ``citation_to_metadata(citation, cache)``.
    This is a best-effort lookup: if anything raises, the citation and the
    error are printed and ``(None, None)`` is returned instead.
    """
    try:
        record = citation_to_metadata(citation, cache)
        result = record['standard_citation'], record['citation_id']
    except Exception as error:
        # Report the failing citation but keep processing the remaining ones.
        print(citation, error)
        result = None, None
    return result

In [6]:
# Location of the JSON metadata cache inside the generated directory.
cache_path = gen_dir.joinpath('citations.json')

# Reuse the cache only when the file exists and the REFRESH_METADATA_CACHE
# environment variable is not set (its value is irrelevant, only its presence).
use_cache = cache_path.exists() and 'REFRESH_METADATA_CACHE' not in os.environ
print('Using metadata cache:', use_cache)
if use_cache:
    # Read through cache_path rather than rebuilding the same path a second time.
    with cache_path.open() as read_file:
        metadata_cache = json.load(read_file)
else:
    metadata_cache = {}

# Resolve each citation to its (standard_citation, citation_id) pair.
# The pairs are collected first and then split into columns, because
# unpacking `zip(*pairs)` raises ValueError when there are no references.
standardized = [
    get_standard_citatation(citation, cache=metadata_cache)
    for citation in ref_df.citation
]
ref_df['standard_citation'] = [standard for standard, _ in standardized]
ref_df['citation_id'] = [citation_id for _, citation_id in standardized]

Using metadata cache: False


In [7]:
# Preview the first three rows, which now also carry the standard_citation
# and citation_id columns.
ref_df.head(3)

Unnamed: 0,text,citation,standard_citation,citation_id
0,@arxiv:1409.0575,arxiv:1409.0575,arxiv:1409.0575,ref_0
1,@arxiv:1411.2581v1,arxiv:1411.2581v1,arxiv:1411.2581v1,ref_1
2,@arxiv:1510.02855,arxiv:1510.02855,arxiv:1510.02855,ref_2


In [8]:
# Report how many citation strings were found and how many distinct
# citations remain once they are standardized.
print(
    f'{len(ref_df)} unique citations strings extracted from text',
    f'{ref_df.standard_citation.nunique()} unique citations after standardizations',
    sep='\n',
)

118 unique citations strings extracted from text
116 unique citations after standardizations


In [9]:
# Rows whose standardized citation also appears on another row.
# keep=False marks every member of a duplicate group, not only the repeats.
ref_df.loc[ref_df['standard_citation'].duplicated(keep=False)]

Unnamed: 0,text,citation,standard_citation,citation_id
26,@doi:10.1016/j.neunet.2014.09.003,doi:10.1016/j.neunet.2014.09.003,doi:10.1016/j.neunet.2014.09.003,ref_26
46,@doi:10.1101/073239,doi:10.1101/073239,doi:10.1101/073239,ref_46
105,@tag:Schmidhuber2014_dnn_overview,doi:10.1016/j.neunet.2014.09.003,doi:10.1016/j.neunet.2014.09.003,ref_26
113,@tag:Wang2016_protein_contact,doi:10.1101/073239,doi:10.1101/073239,ref_46


In [10]:
# Rewrite each citation string in the text as '@' + its citation_id.
converted_text = text
for old, new in zip(ref_df.text, '@' + ref_df.citation_id):
    # Match the literal citation only when it is followed by whitespace or a
    # closing bracket, so a citation that is a prefix of a longer one is not
    # rewritten. The lookahead is a raw string: '\s' and '\]' are invalid
    # escape sequences in a normal string literal and trigger warnings.
    pattern = re.escape(old) + r'(?=[\s\]])'
    converted_text = re.sub(pattern, new, converted_text)

# Save the converted manuscript text to the generated directory.
with gen_dir.joinpath('all-sections.md').open('wt') as write_file:
    write_file.write(converted_text)

In [11]:
# Export the citation table as a tab-separated file without the index column.
path = gen_dir / 'processed-citations.tsv'
ref_df.to_csv(path, index=False, sep='\t')

# Write metadata_cache to the cache file so a later run can load it.
# Non-ASCII characters are kept as-is rather than escaped.
serialized_cache = json.dumps(metadata_cache, ensure_ascii=False, indent=2)
with cache_path.open('w') as write_file:
    write_file.write(serialized_cache)

In [12]:
# Split cached metadata by available format: entries with a 'citeproc' record
# contribute it directly; entries with only a 'bibtex' record are converted.
csl_items = [
    entry['citeproc']
    for entry in metadata_cache.values()
    if 'citeproc' in entry
]
bibtex_stanzas = [
    entry['bibtex']
    for entry in metadata_cache.values()
    if 'citeproc' not in entry and 'bibtex' in entry
]

# Write the BibTeX stanzas to a single .bib file, one per line break.
bib_path = gen_dir.joinpath('bibliography.bib')
bib_path.write_text('\n'.join(bibtex_stanzas))

# Convert the .bib file to JSON with pandoc-citeproc, then append each
# converted item after passing it through citeproc_passthrough.
bib_items = json.loads(
    subprocess.check_output(['pandoc-citeproc', '--bib2json', bib_path])
)
csl_items += [citeproc_passthrough(item) for item in bib_items]

In [13]:
# Serialize the collected CSL items and save them as the JSON bibliography.
bibliography_json = json.dumps(csl_items, ensure_ascii=False, indent=2)
with (gen_dir / 'bibliography.json').open('w') as write_file:
    write_file.write(bibliography_json)