# Export all works from Crossref to MongoDB

In [1]:
import logging
import time

import pymongo
import ratelimit
import requests
import tqdm

In [2]:
# Connect to the MongoDB server on localhost, port 27017.
client = pymongo.MongoClient('localhost', 27017)

In [3]:
# Handles for the `crossref` database and the `works` collection inside it;
# every later cell reads and writes through `works`.
crossref_db = client.crossref
works = crossref_db.works

In [4]:
# Unique index on 'DOI': at most one stored document per DOI. The export
# loop further down filters on this same field when upserting each work.
works.create_index('DOI', unique=True)

'DOI_1'

In [5]:
# NOTE(review): how `15` is interpreted (calls per second vs. calls per
# period) depends on the installed ratelimit version -- confirm.
@ratelimit.rate_limited(15)
def api_query(endpoint='works', rows=20, cursor=None, timeout=120):
    """
    Send one GET request to the Crossref REST API and return the response.

    https://github.com/CrossRef/rest-api-doc/blob/master/rest_api.md

    Parameters
    ----------
    endpoint : str
        Path appended to ``https://api.crossref.org/`` (default 'works').
    rows : int
        Value sent as the ``rows`` query parameter.
    cursor : str or None
        Value sent as the ``cursor`` query parameter. When None, requests
        leaves the parameter out of the query string.
    timeout : float, tuple or None
        Seconds to wait for the server, passed straight to requests.
        Without a timeout a stalled connection blocks forever; pass None
        to get that wait-forever behaviour back.

    Returns
    -------
    requests.Response
        The raw response. The status code is not checked here; callers
        decide how to treat failures.

    Raises
    ------
    requests.exceptions.RequestException
        On network failures, including ``requests.exceptions.Timeout``
        when the server does not answer within `timeout`.
    """
    url = f'https://api.crossref.org/{endpoint}'
    params = {
        'rows': rows,
        'cursor': cursor,
    }
    return requests.get(url, params=params, timeout=timeout)

In [None]:
def get_all_works(works_per_query=20, stop_at=None):
    """
    Return a generator of all Crossref works.

    Pages through the API with a cursor: the first request uses cursor '*'
    and each following request uses the `next-cursor` value of the previous
    response. Iteration ends when a page comes back with no items.

    A failed attempt (network error, non-200 status, body that is not JSON,
    or a payload whose `status` is not 'ok') is logged and retried with the
    same cursor. Consecutive failures wait progressively longer before the
    next attempt so that an outage is not hit with back-to-back requests;
    retries are unlimited, so a long export survives temporary outages.

    Parameters
    ----------
    works_per_query : int
        Number of works requested per API call (sent as `rows`).
    stop_at : int or None
        Stop fetching once at least this many works have been yielded.
        Pages are yielded whole, so the total can exceed `stop_at` by up to
        `works_per_query - 1`. None or 0 means no limit.

    Yields
    ------
    dict
        One work taken from `message.items` of each response.
    """
    max_backoff_exponent = 8  # caps the delay at 2 ** 8 = 256 seconds
    cursor = '*'
    incomplete = True
    progress_bar = None
    failures = 0  # consecutive failed attempts for the current cursor
    try:
        while incomplete:
            if failures:
                # Exponential backoff: 2, 4, 8, ... seconds, then capped.
                time.sleep(2 ** min(failures, max_backoff_exponent))

            try:
                response = api_query(cursor=cursor, rows=works_per_query)
            except requests.exceptions.RequestException as error:
                # Connection resets and timeouts are as transient as a 5xx.
                msg = f'query with cursor {cursor!r} raised {error!r}'
                logging.warning(msg)
                failures += 1
                continue

            # HTTP Request failed
            if response.status_code != 200:
                msg = f'{response.url} returned status_code {response.status_code}:\n{response.text}'
                logging.warning(msg)
                failures += 1
                continue

            try:
                result = response.json()
            except ValueError:
                # A 200 whose body is not JSON (e.g. a proxy error page).
                msg = f'{response.url} returned a non-JSON body:\n{response.text}'
                logging.warning(msg)
                failures += 1
                continue

            # JSON payload is not okay
            if result.get('status') != 'ok':
                msg = f'{response.url} returned:\n{result}'
                logging.warning(msg)
                failures += 1
                continue

            failures = 0
            cursor = result['message']['next-cursor']
            if progress_bar is None:
                total = result['message']['total-results']
                progress_bar = tqdm.tqdm_notebook(desc='works', total=total)

            items = result['message']['items']
            yield from items
            progress_bar.update(len(items))
            incomplete = bool(items)

            # `>=` rather than `>`: do not fetch one more page when the
            # count lands exactly on stop_at.
            if stop_at and progress_bar.n >= stop_at:
                break
    finally:
        # Also runs when the consumer abandons the generator early.
        if progress_bar is not None:
            progress_bar.close()

In [None]:
# Stream every work from Crossref (1000 per request, no cap) and upsert it
# into MongoDB keyed on its DOI: a work that is already stored is replaced,
# a new one is inserted.
work_generator = get_all_works(works_per_query=1000, stop_at=None)
for work in work_generator:
    filter_ = {'DOI': work['DOI']}
    works.replace_one(filter_, work, upsert=True)

6993000/|/  8%|| 6993000/87331224 [13:18:07<137:47:10, 161.96it/s]

<html><body><h1>504 Gateway Time-out</h1>
The server didn't respond in time.
</body></html>

<html><body><h1>504 Gateway Time-out</h1>
The server didn't respond in time.
</body></html>

<html><body><h1>504 Gateway Time-out</h1>
The server didn't respond in time.
</body></html>

<html><body><h1>504 Gateway Time-out</h1>
The server didn't respond in time.
</body></html>

<html><body><h1>503 Service Unavailable</h1>
No server is available to handle this request.
</body></html>

<html><body><h1>503 Service Unavailable</h1>
No server is available to handle this request.
</body></html>

<html><body><h1>503 Service Unavailable</h1>
No server is available to handle this request.
</body></html>

<html><body><h1>503 Service Unavailable</h1>
No server is available to handle this request.
</body></html>

<html><body><h1>503 Service Unavailable</h1>
No server is available to handle this request.
</body></html>

<html><body><h1>503 Service Unavailable</h1>
No server is available to handle this reque

In [None]:
# Number of works stored so far. Collection.count() was deprecated in
# PyMongo 3.7 and removed in 4.0; with no filter, the metadata-based
# estimate is its replacement and avoids scanning the whole collection.
works.estimated_document_count()

In [None]:
# Display a single stored document as a sample of what was exported.
works.find_one()

In [None]:
# Close the MongoDB client opened at the top of the notebook.
client.close()