# Download OpenCitations Data
https://doi.org/10.6084/m9.figshare.5255323.v1

In [1]:
import collections
import itertools
import pathlib
import urllib.request
import zipfile
import subprocess

import pandas
import requests

## Get list of all figshare files for a specified OpenCitations release

In [2]:
def get_fishare_files(figshare_id, timeout=60):
    """
    Yield one record per file attached to a figshare article.

    The article metadata is fetched from the figshare v2 API. Because this is
    a generator, the HTTP request is only issued once iteration begins.

    Parameters
    ----------
    figshare_id : int or str
        Identifier of the figshare article; interpolated into the API URL.
    timeout : float, optional
        Seconds to wait for the API before giving up, passed to
        `requests.get`. Prevents the notebook from hanging indefinitely.

    Yields
    ------
    collections.OrderedDict
        Mapping with the keys `doi`, `figshare_id`, `file_name`, and
        `download_url`, in that order.

    Raises
    ------
    requests.HTTPError
        If the API responds with an error status (raised by
        `response.raise_for_status()`).
    """
    url = f'https://api.figshare.com/v2/articles/{figshare_id}'
    response = requests.get(url, timeout=timeout)
    # Fail loudly on HTTP errors instead of a KeyError on the error payload
    response.raise_for_status()
    article = response.json()
    files = article['files']
    for f in files:
        row = {
            'doi': article['doi'],
            'figshare_id': figshare_id,
            'file_name': f['name'],
            'download_url': f['download_url'],
        }
        yield collections.OrderedDict(row)

In [3]:
# Figshare article IDs from "25 July 2017 Dump" on http://opencitations.net/download
figshare_ids = [
    5255386,
    5255323,
    5255365,
    5255368,
    5255359,
    5255395,
    5255215,
]

# Flatten the per-article file records into one table (one row per file)
files = list(itertools.chain.from_iterable(get_fishare_files(figshare_id) for figshare_id in figshare_ids))
figshare_df = pandas.DataFrame(files)
path = pathlib.Path('download/figshare-files.tsv')
# Ensure the output directory exists so the cell also runs on a fresh checkout
path.parent.mkdir(parents=True, exist_ok=True)
figshare_df.to_csv(path, index=False, sep='\t')
figshare_df.head(2)

Unnamed: 0,doi,figshare_id,file_name,download_url
0,10.6084/m9.figshare.5255386.v1,5255386,LICENSE.txt,https://ndownloader.figshare.com/files/8986366
1,10.6084/m9.figshare.5255386.v1,5255386,README-ar.txt,https://ndownloader.figshare.com/files/8986369


## Download and extract files

This section requires the [Disk ARchive](http://dar.linux.free.fr/) utility to be installed and its `dar` command to be available on the `PATH`.

In [4]:
# OpenCitations release whose archives are fetched below
release_date = '2017-07-25'

# Names of the figshare files to download and extract
corpus_id_zip = f'{release_date}-corpus_id.zip'
corpus_br_zip = f'{release_date}-corpus_br.zip'
download_files = [corpus_id_zip, corpus_br_zip]

In [5]:
for name in download_files:
    # Configure: look up the figshare record for this file name
    matches = figshare_df.query('file_name == @name')
    if matches.empty:
        raise LookupError(f'{name} is not listed in figshare_df')
    info = matches.iloc[0]
    path = pathlib.Path(f'download/{name}')
    directory = path.with_suffix('')  # zip path without the .zip extension
    print(f'Beginning process to create {directory}')

    # Download file from figshare (skipped when the zip is already present)
    if not path.is_file():
        path.parent.mkdir(parents=True, exist_ok=True)
        # Download under a temporary name and rename on success, so an
        # interrupted transfer is never mistaken for a complete archive
        # by the is_file() check on the next run.
        partial_path = path.with_name(path.name + '.part')
        urllib.request.urlretrieve(info.download_url, partial_path)
        partial_path.replace(path)

    # Unzip file
    with zipfile.ZipFile(path) as zip_file:
        zip_file.extractall(directory)

    # Use dar to reconstruct filesystem
    args = [
        'dar',  # http://dar.linux.free.fr/
        '-O',  # do not consider user and group ownership
        '-x',  # extracts files from the archive
        directory.name,
    ]
    process = subprocess.run(args, stderr=subprocess.PIPE, cwd=directory)
    if process.returncode != 0:
        # Surface the captured stderr; a bare assert would hide why dar failed
        dar_stderr = process.stderr.decode(errors='replace')
        raise RuntimeError(
            f'dar exited with code {process.returncode} for {directory}:\n{dar_stderr}'
        )
    # Delete DAR files which have been extracted
    for dar_path in directory.glob('*.dar'):
        dar_path.unlink()

Beginning process to create download/2017-07-25-corpus_br
