# Extract citations and identifiers from `corpus_br` JSON files

In [1]:
import pathlib
import json
import csv
import lzma

import pandas

## Read and process `corpus_br` (bibliographic resources)

In [2]:
# Collect every JSON part file located one directory level below the
# corpus_br download directory.
directory = pathlib.Path('download/2017-07-25-corpus_br/')
paths = list(directory.glob('*/*.json'))
# File stems are parsed as integers so the parts are visited in numeric
# order (2 before 10) rather than lexicographic order.
paths.sort(key=lambda json_path: int(json_path.stem))

In [3]:
def force_list(iterable):
    """
    Return `iterable` as a list.

    A value that is already a `list` is returned as-is (the same object,
    not a copy). Any other value -- including strings, dicts, tuples and
    None -- is wrapped in a new single-element list.
    """
    return iterable if isinstance(iterable, list) else [iterable]

In [4]:
# Accumulate (br_iri, citation) and (br_iri, identifier) pairs from every
# part file, then turn each collection into a two-column DataFrame.
citation_rows = list()
identier_rows = list()
for path in paths:
    # Decode explicitly as UTF-8 instead of relying on the platform's
    # locale-dependent default encoding, which can mis-decode or reject
    # non-ASCII text on systems whose default is not UTF-8.
    with path.open(encoding='utf-8') as read_file:
        part = json.load(read_file)
    for br in part['@graph']:
        iri = br['iri']
        # 'identifier' and 'citation' may be missing (treated as no values),
        # a single value, or a list of values; force_list normalizes the
        # latter two cases so each value yields exactly one row.
        identier_rows.extend(
            (iri, identifier)
            for identifier in force_list(br.get('identifier', []))
        )
        citation_rows.extend(
            (iri, citation)
            for citation in force_list(br.get('citation', []))
        )

citation_df = pandas.DataFrame(citation_rows, columns=['source', 'target'])
identifier_df = pandas.DataFrame(identier_rows, columns=['br_iri', 'id_iri'])

## Process the citations

In [5]:
# Reorder rows by the integer that follows the first four characters of the
# source and target IRIs (e.g. the 1 in "gbr:1"), so that numeric order is
# used instead of string order. The index label is the final sort key and
# is what gets collected to reorder the frame.
index = [
    row_label
    for _source_number, _target_number, row_label in sorted(zip(
        citation_df['source'].str[4:].astype(int),
        citation_df['target'].str[4:].astype(int),
        citation_df.index,
    ))
]
citation_df = citation_df.loc[index]
citation_df.head(4)

Unnamed: 0,source,target
298,gbr:1,gbr:5
199,gbr:1,gbr:6
246,gbr:1,gbr:10
288,gbr:1,gbr:11


In [6]:
# Save the citation table as an xz-compressed, tab-separated file.
# The DataFrame index is not written.
path = pathlib.Path('data/citations.tsv.xz')
citation_df.to_csv(
    path,
    sep='\t',
    index=False,
    compression='xz',
)
# Number of citation rows written (displayed as the cell result).
len(citation_df)

8652530

In [7]:
# Summary counts for the citation table, printed one per line with
# thousands separators.
n_citations = len(citation_df)
n_sources = citation_df.source.nunique()
n_targets = citation_df.target.nunique()
# Rows whose source and target IRI are the same.
n_self_citations = len(citation_df.query("source == target"))
print('\n'.join([
    f'{n_citations:,} total citations',
    f'{n_sources:,} articles with outgoing citations',
    f'{n_targets:,} articles with incoming citations',
    f'{n_self_citations:,} article self-citations',
]))

8,652,530 total citations
203,302 articles with outgoing citations
4,972,892 articles with incoming citations
895 article self-citations


## Process the identifiers

In [8]:
# Attach identifier details to each (br_iri, id_iri) pair. The identifier
# table's `iri` column is renamed to `id_iri` so the merge (on the columns
# the two frames share) joins on it; the join column is dropped afterwards.
id_table_path = pathlib.Path('download/2017-07-25-corpus_id.tsv.xz')
id_merged_df = identifier_df.merge(
    right=pandas.read_table(id_table_path, compression='xz')
    .rename(columns={'iri': 'id_iri'}),
)
id_merged_df = id_merged_df.drop(columns='id_iri')

# Order rows by the integer that follows the first four characters of
# br_iri (numeric rather than string order), then by identifier type, with
# the index label as the final key.
index = [
    row_label
    for _br_number, _id_type, row_label in sorted(zip(
        id_merged_df['br_iri'].str[4:].astype(int),
        id_merged_df['type'],
        id_merged_df.index,
    ))
]
id_merged_df = id_merged_df.loc[index]

id_merged_df.head(3)

Unnamed: 0,br_iri,type,id
1030,gbr:1,doi,10.1097/igc.0000000000000609
1032,gbr:1,pmcid,PMC4679344
1029,gbr:1,pmid,26645990


In [9]:
# Save the merged identifier table as an xz-compressed, tab-separated file.
# The DataFrame index is not written.
path = pathlib.Path('data/identifiers.tsv.xz')
id_merged_df.to_csv(
    path,
    sep='\t',
    index=False,
    compression='xz',
)
# Number of identifier rows written (displayed as the cell result).
len(id_merged_df)

13182983