Skip to content

Commit

Permalink
issue: #10 Mint DOI on record publication [more]
Browse files Browse the repository at this point in the history
- Serialize record metadata for DataCite
- Fixed hidden bug: RequestError not imported
- Mint DOI externally via asynchronous Celery task
  * Set PIDStatus to REGISTERED
  * Set retrieved DOI in PersistentIdentifier
- Display DOI on Record Page
- Load different environment variables in test and live
- Set Flask recognized configuration variables
- Make invenio_{records,deposit}_ui.recid url generation
  be request independent
- Closes: #10 ; Closes: #269 ; Closes: #277
  • Loading branch information
fenekku committed Mar 6, 2019
1 parent 03538ad commit e0542a5
Show file tree
Hide file tree
Showing 28 changed files with 1,011 additions and 341 deletions.
5 changes: 4 additions & 1 deletion Pipfile
Original file line number Diff line number Diff line change
Expand Up @@ -20,18 +20,21 @@ pytest-flask = "==0.12.0"
pytest-runner = ">=3.0.0,<5"
selenium = ">=3.4.3"
Sphinx = ">=1.5.1"
python-dotenv = "*"
pytest-env = "*"

[packages]
appnope = {version = "*",sys_platform = "== 'darwin'"}
arrow = ">=0.12.1"
Babel = ">=2.4.0"
datacite = {git = "https://github.com/caltechlibrary/datacite.git",ref = "doi-mint-enhance"}
Flask-BabelEx = ">=0.9.3"
invenio = {version = "~=3.0.0",extras = ["base", "metadata", "postgresql", "auth", "elasticsearch6"]}
invenio-accounts = ">=1.0.2"
invenio-deposit = {editable = true,git = "https://github.com/galterlibrary/invenio-deposit.git",ref = "113_fix_dependencies"}
invenio-records-files = {editable = true,git = "https://github.com/galterlibrary/invenio-records-files.git",ref = "make_non_pre"}
invenio-files-rest = {editable = true,git = "https://github.com/galterlibrary/invenio-files-rest.git",ref = "make_non_pre"}
invenio-records-rest = ">=1.1.0"
invenio-records-rest = {version = ">=1.1.0",extras = ["datacite"]}
IPython = "<7.0.0"
marshmallow = ">=2.15.5,<2.30.0"
SQLAlchemy-Continuum = "==1.3.4"
Expand Down
513 changes: 272 additions & 241 deletions Pipfile.lock

Large diffs are not rendered by default.

8 changes: 6 additions & 2 deletions README.rst
Original file line number Diff line number Diff line change
Expand Up @@ -58,7 +58,11 @@ instructions. You only need to execute them once to setup your environment:
Note: You may want to add ``PIPENV_VENV_IN_PROJECT=1`` to your shell
(``.bashrc``, ``config.fish``...) for ease of use.

3. Start the containers for the services
3. Ask your colleagues for the current ``.env`` file and place it in the root
directory of the project. This file contains the sensitive or
"live-specific" environment variables you will need.

4. Start the containers for the services

.. code-block:: console
Expand All @@ -68,7 +72,7 @@ instructions. You only need to execute them once to setup your environment:
This will create and run 4 docker containers. These containers will then
keep themselves running even across reboots.

4. Execute the Invenio initial bootstrap and setup code
5. Execute the Invenio initial bootstrap and setup code

.. code-block:: console
Expand Down
41 changes: 34 additions & 7 deletions cd2h_repo_project/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,17 @@
)
from cd2h_repo_project.modules.records.search import RecordsSearch

# When run in a container (production-like environments), the container
# infrastructure takes care of loading environment variables, but when
# running outside a container (development-like environments) we need to do
# it ourselves. Since dotenv is only needed in a development environment, it
# is only installed in that environment.
try:
    from dotenv import load_dotenv
    load_dotenv(verbose=True)
except ImportError:
    # dotenv is not installed: nothing to load here, carry on with whatever
    # environment variables are already set.
    pass


def _(x):
"""Identity function used to trigger string extraction."""
Expand Down Expand Up @@ -65,9 +76,9 @@ def _(x):
# Theme configuration
# ===================
#: Site name
THEME_SITENAME = _('Next Generation Research Discovery')
THEME_SITENAME = _('Next Generation Research Repository')
#: Frontpage title.
THEME_FRONTPAGE_TITLE = _('Next Generation Research Discovery')
THEME_FRONTPAGE_TITLE = _('Next Generation Research Repository')
# THEME_HEADER_LOGIN_TEMPLATE = 'invenio_theme/header_login.html'

# Email configuration
Expand Down Expand Up @@ -123,14 +134,27 @@ def _(x):
'cd2h-repo-project@localhost/cd2h-repo-project'
)

# Datacite and related Invenio-Pidstore integration
# Digital Object Identifier (DOI), Datacite and Invenio-Pidstore integration
# =================================================
PIDSTORE_DATACITE_DOI_PREFIX = '10.5072' # Test prefix, CHANGE ME in PROD
PIDSTORE_DATACITE_TESTMODE = True # Set to False in PROD

DOI_REGISTER_SIGNALS = False
"""Set this to True to mint DOIs."""
DOI_PUBLISHER = "YOUR PLATFORM NAME"
"""REQUIRED if DOI_REGISTER_SIGNALS is True. Set this to your repository's
institution or name."""
PIDSTORE_DATACITE_USERNAME = ''
"""REQUIRED if DOI_REGISTER_SIGNALS is True. Set this to your DataCite client
account."""
PIDSTORE_DATACITE_PASSWORD = ''
PIDSTORE_DATACITE_URL = ''
"""REQUIRED if DOI_REGISTER_SIGNALS is True. Set this to your DataCite client
account password."""
PIDSTORE_DATACITE_DOI_PREFIX = ''
"""REQUIRED if DOI_REGISTER_SIGNALS is True. Change this to your institution's
DOI prefix."""
PIDSTORE_DATACITE_TESTMODE = True
"""Whether to interact with DataCite in test mode or not.
Set to False in production"""
PIDSTORE_DATACITE_URL = "https://mds.datacite.org"
"""The DataCite minting endpoint."""

# JSONSchemas
# ===========
Expand Down Expand Up @@ -161,6 +185,9 @@ def _(x):
#: should be set to the correct host and it is strongly recommended to only
#: route correct hosts to the application.
APP_ALLOWED_HOSTS = ['localhost', '127.0.0.1']
#: It should be set to your server name
SERVER_NAME = 'localhost:5000'
PREFERRED_URL_SCHEME = 'https'

# OAI-PMH
# =======
Expand Down
1 change: 1 addition & 0 deletions cd2h_repo_project/modules/doi/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
"""Digital Object Identifier module."""
21 changes: 21 additions & 0 deletions cd2h_repo_project/modules/doi/ext.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,21 @@
# -*- coding: utf-8 -*-
#
# Copyright (C) 2018 NU,FSM,GHSL.
#
# This is free software; you can redistribute it and/or modify it
# under the terms of the MIT License; see LICENSE file for more details.

"""Flask extension for DOI."""


class DigitalObjectIdentifier(object):
    """Flask extension that registers the DOI module on an application."""

    def __init__(self, app=None):
        """Create the extension, optionally binding it to ``app`` right away.

        :param app: Application to register on. When falsy (the default),
            nothing is registered until ``init_app`` is called explicitly.
        """
        if not app:
            return
        self.init_app(app)

    def init_app(self, app):
        """Register this extension instance on ``app``.

        :param app: Application whose ``extensions`` mapping receives this
            instance under the ``'cd2h-doi'`` key.
        """
        registry = app.extensions
        registry['cd2h-doi'] = self
6 changes: 6 additions & 0 deletions cd2h_repo_project/modules/doi/links.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
"""DOI links."""


def doi_url_for(doi_value):
    """Build the doi.org resolver URL for a DOI.

    :param doi_value: DOI to link to. It is converted with ``str()`` and any
        leading or trailing ``/`` characters are removed.
    :returns: ``https://doi.org/`` followed by the cleaned-up DOI.
    """
    cleaned_doi = str(doi_value).strip('/')
    return ''.join(('https://doi.org/', cleaned_doi))
39 changes: 39 additions & 0 deletions cd2h_repo_project/modules/doi/minters.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,39 @@
"""Local DOI minter."""

from flask import current_app
from invenio_pidstore.models import PersistentIdentifier, PIDStatus


def mint_record_doi(record_uuid, data):
    """Mint doi PersistentIdentifier in an initial New state.

    Because DOIs are minted by an external service, we create a PID for
    tracking purposes, but do not mark it as Registered NOR do we specify the
    final DOI value upfront. A unique and temporary doi value is put in the DB
    until the real (final) DOI value is minted and provided to us by the
    external service. For that reason, we don't pass that DOI value back in
    `data`.

    An asynchronous task will update this PID with results from the external
    service.

    A doi PersistentIdentifier can only be minted if the Record has an
    associated recid PersistentIdentifier and it has not been doi minted
    before.

    :param record_uuid: Record object uuid
    :param data: Record object as dict (or dict-like). Its ``'doi'`` key is
                 set to ``''`` as a side effect.
    :returns: doi PersistentIdentifier
    :raises AssertionError: if the configured recid field is missing from
                            `data` or `data` already has a ``'doi'`` key.
    """
    recid_field = current_app.config['PIDSTORE_RECID_FIELD']
    assert recid_field in data and 'doi' not in data
    pid = PersistentIdentifier.create(
        'doi',
        # Purposefully unique but temporary value. Read it from the
        # configured recid field (the one checked just above) rather than
        # from a hard-coded 'id' key, so the check and the lookup can never
        # disagree.
        data[recid_field],
        pid_provider='datacite',
        object_type='rec',
        object_uuid=record_uuid,
        status=PIDStatus.NEW,
    )
    # Placeholder only: the real DOI is not known until the external service
    # has minted it.
    data['doi'] = ''
    return pid
103 changes: 103 additions & 0 deletions cd2h_repo_project/modules/doi/schemas.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,103 @@
"""JSON Schemas."""
from datetime import date

from flask import current_app
from marshmallow import Schema, fields


class DataCiteResourceTypeSchemaV4(Schema):
    """Schema producing the DataCite ``resourceType`` information."""

    resourceTypeGeneral = fields.Method('get_general_resource_type')
    resourceType = fields.Method('get_specific_resource_type')

    def get_general_resource_type(self, resource_type):
        """Return the general resource type.

        TODO: Settle on general resource types and use those.
              Until then a default is supplied when none is present.

        :param resource_type: mapping looked up for its ``'general'`` key.
        :returns: the ``'general'`` value, or ``'Dataset'`` if absent.
        """
        general = resource_type.get('general', 'Dataset')
        return general

    def get_specific_resource_type(self, resource_type):
        """Return the specific resource type.

        TODO: Settle on specific resource types (if any) and use those.
              Until then a default is supplied when none is present.

        :param resource_type: mapping looked up for its ``'specific'`` key.
        :returns: the ``'specific'`` value, or ``'Dataset'`` if absent.
        """
        specific = resource_type.get('specific', 'Dataset')
        return specific


class DataCiteTitleSchemaV4(Schema):
    """Title schema.

    Exposes a single string field named ``title``.
    """

    # Title text, handled as a string field.
    title = fields.Str()


class DataCiteCreatorSchemaV4(Schema):
    """Creator schema."""

    # Note: Marshmallow doesn't try to automatically extract a field
    #       corresponding to a fields.Method.
    creatorName = fields.Method('get_creator_name')
    # TODO optional:
    # givenName
    # familyName

    def get_creator_name(self, author):
        """Format a free-text author name as ``"Family, Given"``.

        The last whitespace-separated token is treated as the family name
        and every preceding token as given names, so middle names are kept
        (``"John Q Public"`` -> ``"Public, John Q"``). A single-token name
        (a mononym or an organisation acronym) is returned unchanged instead
        of being dropped, so the creator is never serialized with an empty
        name when one was provided.

        :param author: author name as a string.
        :returns: the formatted creator name, or ``''`` when `author` is
                  empty or only whitespace.
        """
        name_parts = author.strip().split()
        if not name_parts:
            return ''
        if len(name_parts) == 1:
            return name_parts[0]
        return "{last_name}, {first_name}".format(
            last_name=name_parts[-1],
            first_name=' '.join(name_parts[:-1]))


class DataCiteSchemaV4(Schema):
    """Schema for DataCite Metadata.

    For now, only the minimum required fields are implemented. In the future,
    we may want to include optional fields as well.

    Fields and subfields are based on
    schema.datacite.org/meta/kernel-4.1/doc/DataCite-MetadataKernel_v4.1.pdf

    All fields are dump-only: this schema is meant for serializing a record,
    not for loading one.
    """

    # Computed by get_identifier() below.
    identifier = fields.Method(
        'get_identifier',
        attribute='metadata.doi',
        dump_only=True)
    # Creators are read from 'metadata.author' and rendered through
    # DataCiteCreatorSchemaV4.
    creators = fields.List(
        fields.Nested(DataCiteCreatorSchemaV4),
        attribute='metadata.author',
        dump_only=True)
    # Titles are rendered through DataCiteTitleSchemaV4, which is pointed at
    # the 'metadata' attribute as a whole.
    titles = fields.List(
        fields.Nested(DataCiteTitleSchemaV4),
        attribute='metadata',
        dump_only=True)
    # Computed by get_publisher() / get_year() below.
    publisher = fields.Method('get_publisher', dump_only=True)
    publicationYear = fields.Method('get_year', dump_only=True)
    resourceType = fields.Nested(
        DataCiteResourceTypeSchemaV4,
        attribute='metadata',  # TODO: 'metadata.resource_type' when added
        dump_only=True)

    def get_identifier(self, obj):
        """Get record main identifier.

        :param obj: object being serialized; must support ``obj['metadata']``.
        :returns: dict with the record's ``doi`` (``''`` when the metadata
                  has none) under ``'identifier'`` and ``'DOI'`` under
                  ``'identifierType'``.
        """
        return {
            'identifier': obj['metadata'].get('doi', ''),
            'identifierType': 'DOI'
        }

    def get_publisher(self, data):
        """Extract publisher.

        The publisher does not come from `data`: it is the application's
        ``DOI_PUBLISHER`` configuration value.
        """
        return current_app.config['DOI_PUBLISHER']

    def get_year(self, data):
        """Extract year.

        Current year for now (`data` is not consulted).

        TODO: Revisit when dealing with embargo.
        """
        return date.today().year
8 changes: 8 additions & 0 deletions cd2h_repo_project/modules/doi/serializers.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
"""DOI serializer for external service."""

from invenio_records_rest.serializers.datacite import DataCite41Serializer

from .schemas import DataCiteSchemaV4

# Datacite format serializer: shared module-level instance built on
# DataCiteSchemaV4, created with replace_refs=True.
datacite_v41 = DataCite41Serializer(DataCiteSchemaV4, replace_refs=True)
87 changes: 87 additions & 0 deletions cd2h_repo_project/modules/doi/tasks.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,87 @@
"""CD2H Celery tasks."""
import re

from celery import shared_task
from datacite.client import DataCiteMDSClient
from datacite.errors import DataCiteError, HttpError
from elasticsearch.exceptions import RequestError
from flask import current_app, url_for
from invenio_db import db
from invenio_indexer.api import RecordIndexer
from invenio_pidstore.models import PersistentIdentifier, PIDStatus
from invenio_records_files.api import Record

from cd2h_repo_project.modules.doi.serializers import datacite_v41
from cd2h_repo_project.modules.records.links import (
url_for_record_ui_recid_external
)
from cd2h_repo_project.modules.records.resolvers import record_resolver


def extract_doi(status_str):
    """Extract minted DOI from response.

    The DOI is expected to be the parenthesized token at the very end of
    `status_str`, e.g. ``"... (10.5072/abcd-1234)"``. Only letters, digits,
    ``-``, ``.`` and ``/`` are accepted inside the parentheses.

    Exceptions percolate: in particular, an ``AttributeError`` is raised when
    `status_str` does not end with such a token.

    :param status_str: response text returned by the minting service.
    :returns: the DOI found between the trailing parentheses.
    """
    # Raw string: the pattern is unchanged, but "\(" and "\/" are no longer
    # parsed as (invalid) string escape sequences.
    match = re.search(r"\(([-.a-zA-Z0-9\/]+)\)$", status_str)
    return match.group(1)


@shared_task(ignore_result=True, max_retries=6, default_retry_delay=10 * 60,
             rate_limit='100/m')
def register_doi(recid_pid_value):
    """External DOI registration task.

    This asynchronous task mints a DOI with the external service and
    stores it in the local doi PID. It will retry a `max_retries` number of
    times if the service is down. It will not retry for other errors
    (internal ones).

    `default_retry_delay` is in seconds.

    :param recid_pid_value: pid_value of recid PID for the target record.
                            Note that this pid_value is also the pid_value of
                            the doi PID associated with the target record if
                            it has not been DOI minted yet.
    """
    # Normalize once so that the record resolver, the doi PID lookup and the
    # landing page URL all work from the same string value (previously only
    # the resolver received the str() form).
    recid_pid_value = str(recid_pid_value)

    try:
        _recid_pid, record = record_resolver.resolve(recid_pid_value)

        # Until the real DOI is known, the doi PID carries the recid value.
        doi_pid_value = record.get('doi') or recid_pid_value
        doi_pid = PersistentIdentifier.get(
            pid_type='doi', pid_value=doi_pid_value)

        client = DataCiteMDSClient(
            username=current_app.config['PIDSTORE_DATACITE_USERNAME'],
            password=current_app.config['PIDSTORE_DATACITE_PASSWORD'],
            prefix=current_app.config['PIDSTORE_DATACITE_DOI_PREFIX'],
            test_mode=current_app.config['PIDSTORE_DATACITE_TESTMODE'],
            url=current_app.config['PIDSTORE_DATACITE_URL']
        )

        # Mint DOI
        serialized_record = datacite_v41.serialize(doi_pid, record)
        result = client.metadata_post(serialized_record)

        # Associate landing page to DOI on DataCite if new
        if doi_pid.status == PIDStatus.NEW:
            minted_doi = extract_doi(result)
            landing_url = url_for_record_ui_recid_external(recid_pid_value)
            client.doi_post(minted_doi, landing_url)

            # Update doi_pid
            doi_pid.pid_value = minted_doi
            doi_pid.register()
            record['doi'] = minted_doi
            record.commit()
            # Necessary but unclear why: (TODO: Investigate)
            # - call to record.commit() is done above and
            # - tests don't need it
            # NOTE(review): presumably record.commit() only stages the
            # change in the session and the transaction still has to be
            # committed explicitly -- confirm against invenio-records.
            db.session.commit()

        # Re-index record
        RecordIndexer().index(record)

    except (HttpError, DataCiteError) as e:
        # External service problem: schedule another attempt.
        register_doi.retry(exc=e)
    except RequestError:
        # Indexing problem: log it, do not retry.
        current_app.logger.exception('Could not index {}.'.format(record))
Loading

0 comments on commit e0542a5

Please sign in to comment.