This notebook works through the process of pulling metadata via the DataCite API and building content for the Hugo site. A more elegant solution might be to simply dump DataCite metadata in its original JSON form (possibly ld_json) and then run everything with templates that read data content. However, I've found that some amount of digesting that content into markdown files with YAML metadata works pretty well and provides some ready options. So, I'm trying not to overload this part of the process.

In [1]:
import requests
import json
import yaml
import os
import shutil
from datetime import datetime


# Functions
Eventually, I may need to run this as some type of automated process in a pipeline (e.g., in the GitHub actions to load site content before deploying). The following can be pulled out into a different type of pipeline environment for that purpose.

In [24]:
# Functions for setting up the Hugo site
def write_config(config, config_path='../hugo.yml'):
    """Serialize the site configuration to a YAML file.

    Args:
        config: Configuration object to dump (the notebook passes a dict).
        config_path: Destination file; it is opened in write mode, so any
            existing content is replaced.
    """
    with open(config_path, 'w') as handle:
        # Block style (no inline flow collections) for the emitted YAML.
        yaml.safe_dump(config, stream=handle, default_flow_style=False)
    print("WROTE CONFIG FILE")

def datacite_repositories(documents):
    """Return the distinct DOI prefixes found in a list of DataCite records.

    The prefix is the part of each record's ``id`` before the first ``/``.
    The returned list has no guaranteed order.
    """
    prefixes = set()
    for record in documents:
        prefix = record['id'].partition('/')[0]
        prefixes.add(prefix)
    return list(prefixes)

def build_sections(repositories, content_path='../content'):
    """Create one content folder per DOI prefix, each with an ``_index.md``.

    Args:
        repositories: Iterable of DOI prefix strings (e.g. ``'10.5066'``).
        content_path: Root content folder under which the prefix folders are
            created.

    An existing ``_index.md`` is never overwritten, so manual edits survive
    re-runs.
    """
    # Local import: the notebook's import cell only brings in `datetime`.
    from datetime import timezone

    for prefix in repositories:
        folder_path = os.path.join(content_path, prefix)
        os.makedirs(folder_path, exist_ok=True)
        index_path = os.path.join(folder_path, '_index.md')
        if not os.path.exists(index_path):
            # Timezone-aware UTC timestamp; datetime.utcnow() is deprecated
            # and yields a naive value with no offset.
            created = datetime.now(timezone.utc).isoformat()
            with open(index_path, 'w') as f:
                # Quote the title: a bare DOI prefix such as 10.5060 is a YAML
                # float and would not round-trip as the original string.
                f.write(f'---\ntitle: "{prefix}"\ndate: {created}\n---\n')
        print("SETUP FOLDER/FILE STRUCTURE FOR:", prefix)

def setup_taxonomy(layouts_folder, themes_folder, names):
    """Give each taxonomy its own layout folder seeded with theme templates.

    For every entry in ``names`` a folder is created under ``layouts_folder``
    and ``list.html`` / ``terms.html`` are copied in from ``themes_folder``.
    Templates already present in the target folder are left untouched.
    """
    for taxonomy in names:
        target_dir = os.path.join(layouts_folder, taxonomy)
        os.makedirs(target_dir, exist_ok=True)

        for template in ('list.html', 'terms.html'):
            destination = os.path.join(target_dir, template)
            if os.path.exists(destination):
                # Keep whatever is already there (possibly customized).
                continue
            shutil.copy(os.path.join(themes_folder, template), destination)

        print("SETUP FOLDER/FILE STRUCTURE FOR:", taxonomy)

# Functions for building content items from DataCite documents
def datacite_categories(doc):
    """Return the record's ``resourceTypeGeneral`` as a one-element list."""
    resource_type = doc['attributes']['types']['resourceTypeGeneral']
    return [resource_type]

def datacite_tags(doc):
    """Collect tag strings from a DataCite record's subjects.

    Each subject's ``subject`` text is split on commas, every piece is
    stripped of surrounding whitespace, and empty pieces are dropped so that
    trailing or doubled commas do not produce blank tags.

    Args:
        doc: A DataCite record with ``doc['attributes']['subjects']``.

    Returns:
        A list of non-empty tag strings in the order they appear.
    """
    tags = []
    for subject in doc['attributes'].get('subjects') or []:
        text = subject.get('subject')
        if not text:
            continue
        # split(',') also covers the comma-free case (a single piece), so
        # every tag is stripped consistently.
        tags.extend(piece.strip() for piece in text.split(',') if piece.strip())
    return tags

def datacite_publishers(doc):
    """Return the record's ``publisher`` value wrapped in a one-element list."""
    publisher = doc['attributes']['publisher']
    return [publisher]

def datacite_creators(doc):
    """Extract display names and affiliations from a record's creators.

    A creator is shown as "givenName familyName" when it is marked
    ``Personal`` and both parts are present; otherwise its ``name`` is used
    as-is. A missing ``nameType`` or ``affiliation`` key is tolerated.

    Args:
        doc: A DataCite record with ``doc['attributes']['creators']``.

    Returns:
        A tuple ``(authors, affiliations)``: ``authors`` has one name per
        creator in record order; ``affiliations`` is de-duplicated in
        first-seen order, so generated files are stable between runs.
    """
    authors = []
    affiliations = []
    for creator in doc['attributes']['creators']:
        affiliations.extend(creator.get('affiliation') or [])

        name_string = creator['name']
        is_person = creator.get('nameType') == 'Personal'
        if is_person and 'givenName' in creator and 'familyName' in creator:
            name_string = f"{creator['givenName']} {creator['familyName']}"
        authors.append(name_string)

    # dict.fromkeys de-duplicates while keeping insertion order, unlike set().
    return authors, list(dict.fromkeys(affiliations))

def datacite_funders(doc):
    """Return the distinct funder names listed on a DataCite record.

    Entries with a missing or empty ``funderName`` are skipped. Names are
    returned in first-seen order so repeated runs give identical output.

    Args:
        doc: A DataCite record; ``doc['attributes']['fundingReferences']``
            may be absent or empty.
    """
    names = (
        funder.get('funderName')
        for funder in doc['attributes'].get('fundingReferences') or []
    )
    # dict.fromkeys de-duplicates while keeping insertion order, unlike set().
    return list(dict.fromkeys(name for name in names if name))

def datacite_orcids(doc, orcid_mapping=None):
    """Append ``(display name, ORCID iD)`` pairs for a record's creators.

    Creators without an ORCID identifier are skipped; later creators in the
    same record are still processed. The display name follows the same rule
    as ``datacite_creators``: "givenName familyName" for personal names with
    both parts, otherwise ``name``.

    Args:
        doc: A DataCite record with ``doc['attributes']['creators']``.
        orcid_mapping: Optional list to extend in place. A fresh list is
            created when omitted, so calls never share state.

    Returns:
        The list that was extended (the same object passed in, if any).
    """
    if orcid_mapping is None:
        orcid_mapping = []

    for creator in doc['attributes']['creators']:
        # The iD is the last path segment of the identifier URL.
        orcid = next(
            (
                identifier['nameIdentifier'].split('/')[-1]
                for identifier in creator.get('nameIdentifiers') or []
                if identifier.get('nameIdentifierScheme') == 'ORCID'
            ),
            None,
        )
        if not orcid:
            # Skip only this creator; do not abandon the rest of the record.
            continue

        name_string = creator['name']
        is_person = creator.get('nameType') == 'Personal'
        if is_person and 'givenName' in creator and 'familyName' in creator:
            name_string = f"{creator['givenName']} {creator['familyName']}"
        orcid_mapping.append((name_string, orcid))

    return orcid_mapping

def datacite_meta(doc):
    """Build the YAML front matter block for one DataCite record.

    Emits, in order: title, doi, date (the record's ``updated`` value),
    categories, tags, publishers, author (if any), affiliations (if any) and
    funders, wrapped in ``---`` delimiter lines.

    Args:
        doc: A DataCite record with ``id`` and ``attributes``.

    Returns:
        The front matter as a single newline-joined string.
    """
    attributes = doc['attributes']
    title = attributes['titles'][0]['title']
    authors, affiliations = datacite_creators(doc)

    meta_content = ['---']
    # json.dumps yields a double-quoted, escaped scalar, so titles containing
    # quotes or backslashes no longer break the front matter.
    meta_content.append(f'title: {json.dumps(title, ensure_ascii=False)}')
    meta_content.append(f"doi: {doc['id']}")
    # The record's last-updated timestamp is used as the page date.
    meta_content.append(f"date: {attributes['updated']}")
    # NOTE(review): the list values below are written with Python's list
    # repr; values containing both quote characters may not be valid YAML.
    meta_content.append(f"categories: {datacite_categories(doc)}")
    meta_content.append(f"tags: {datacite_tags(doc)}")
    meta_content.append(f"publishers: {datacite_publishers(doc)}")
    if authors:
        meta_content.append(f"author: {authors}")
    if affiliations:
        meta_content.append(f"affiliations: {affiliations}")
    meta_content.append(f"funders: {datacite_funders(doc)}")
    meta_content.append("---")
    return '\n'.join(meta_content)

def datacite_md(doc):
    """Render one DataCite record as a markdown page with front matter.

    The page is the front matter from ``datacite_meta`` followed by an
    "Abstract" section (first description typed ``Abstract``, if non-empty),
    one sub-section per remaining description, and an "Access Points"
    section holding the record's URL when one is set.
    """
    sections = [datacite_meta(doc)]
    descriptions = doc['attributes']['descriptions']

    # Only the first description typed 'Abstract' is considered.
    abstract = None
    for entry in descriptions:
        if entry['descriptionType'] == 'Abstract':
            abstract = entry['description']
            break
    if abstract:
        sections += ['\n\n# Abstract', f'\n{abstract}']

    for entry in descriptions:
        if entry['descriptionType'] == 'Abstract':
            continue
        sections.append(f'\n\n## {entry["descriptionType"]}')
        # Leading '#' characters are removed so the text cannot open with
        # its own markdown heading marker.
        sections.append(f'\n{entry["description"].lstrip("#").strip()}')

    if doc['attributes']['url']:
        sections.append(f'\n\n# Access Points\n{doc["attributes"]["url"]}')

    return ''.join(sections)

# Hugo Config File
There are a number of things that need to be set up in the Hugo configuration file. Most of this has to do with setting up the [taxonomies](https://gohugo.io/content-management/taxonomies/) that we'll be using, which is essentially the key aspect of processing DataCite repository records into their most useful form, providing several simple ways to browse through and find content of interest.

You can manage the config file however you want. I've provided one option here consisting of a site_config dictionary object that you can tweak here in the notebook and then dump to YAML.

In [21]:
# Set this parameter based on where you are deploying the site
# Note that testing locally with hugo server will "ignore" this parameter
# (other than the subdomain) and use localhost:1313 (or whatever port you specify)
base_url = 'https://datapurifier.github.io/landingpage-in-a-box/'

site_config = {
    'baseURL': base_url,
    'languageCode': 'en-us',
    'title': 'Landingpage-in-a-Box',
    'theme': 'PaperMod',
    # Every front matter key that should act as a taxonomy must be listed
    # here. 'funders' is written by datacite_meta, so it is registered too.
    'taxonomies': {
        'category': 'categories',
        'tag': 'tags',
        'publishers': 'publishers',
        'author': 'author',
        'affiliations': 'affiliations',
        'funders': 'funders',
    },
    'params': {
        # PaperMod documents this switch under `params`.
        'ShowRssButtonInSectionTermList': True,
        'profileMode': {
            'enabled': True,
            'title': 'Landingpage-in-a-Box',
            'subtitle': 'A lightweight approach to a metadata-driven dataset landing page web site',
            # One home-page button per browsable listing.
            'buttons': [
                {
                    'name': 'Search',
                    'url': 'search'
                },
                {
                    'name': 'Tags',
                    'url': 'tags'
                },
                {
                    'name': 'Categories',
                    'url': 'categories'
                },
                {
                    'name': 'Authors',
                    'url': 'author'
                },
                {
                    'name': 'Affiliations',
                    'url': 'affiliations'
                },
                {
                    'name': 'Publishers',
                    'url': 'publishers'
                },
                {
                    'name': 'Funders',
                    'url': 'funders'
                }
            ]
        }
    },
    'outputs': {
        'home': [
            'HTML',
            'RSS',
            'JSON'
        ]
    }
}

write_config(site_config)

WROTE CONFIG FILE


# Get DataCite Items
This part of the process is really up to the individual use case. Any type of process is perfectly fine here as long as it returns some set of items in DataCite's native JSON format. This could also be retuned to work with JSON-LD or any other output format desired. The predominant use case is likely to work through a set of items from one or more DataCite repositories (DOI prefixes) that do not otherwise have landing pages in some primary source repository. But this can also be used to spin up some particular context of assets that need to be presented in a particular way.

In [26]:
# Request one page (up to 100 records) of DOIs registered under a single prefix.
datacite_api = "https://api.datacite.org/dois?prefix=10.5066&page[size]=100"
# A timeout keeps the cell from hanging indefinitely on a stalled connection.
response = requests.get(datacite_api, timeout=30)
# Fail loudly on an HTTP error instead of parsing an error body as records.
response.raise_for_status()
items = response.json()
print("DataCite DOCUMENTS:", len(items['data']))

DataCite DOCUMENTS: 100


# Build DataCite Repository Sections
We want the DOI prefixes in our collection from DataCite to act as sections within the Hugo site. This means setting up root folders within /content/ for each DOI prefix in our recordset returned from the DataCite API. We'll then write markdown files to these with the remainder of the DOI identifier to provide logical paths at the root of our site that match the DOI. We also write _index.md files into each DOI prefix folder so that it is treated as a section in Hugo's architecture. This will also provide a listing of items at that path depending on the template used.

In [25]:
# Create one section folder per distinct DOI prefix in the fetched records.
build_sections(
    repositories=datacite_repositories(documents=items['data']),
)

SETUP FOLDER/FILE STRUCTURE FOR: 10.5066


# Setup Taxonomy
The taxonomies create the navigational structure for the site. They are entirely dependent on how we process DataCite metadata and what we build from that into each content item for the site.

Two taxonomies are already established - categories and tags. We place resourceTypeGeneral values into category (e.g., dataset, model, etc.). We place all "non-parsed" subjects into tags. In practice, very few DataCite records take advantage of the ability to provide URI values for individual terms or subject scheme information that will break up tags into logical taxonomies. So, this ends up being the majority of the work - using the content as provided and getting it validated and organized into better groupings.

We also have several parts of the DataCite schema that logically break out into taxonomies:
- authors - DataCite requires at least one "creator," which we often put under the term "author" in common practice. This can be reconfigured based on preference. Additional contributors can also be organized from source.
- affiliations - Creator affiliations can be broken out and included in their own taxonomy.
- publishers - The publisher field is required in DataCite and will be populated with string values that may need more clarification.
- funders - Funding institutions are often included and may incorporate additional details that can be organized into data files for use.

In [23]:
# Where the site's own layouts live, and where the theme's default templates
# are copied from.
layouts_folder = '../layouts'
themes_folder = '../themes/PaperMod/layouts/_default'
# Taxonomies that get their own copy of the list/terms templates.
names = ['affiliations', 'author', 'funders', 'publishers']

setup_taxonomy(
    layouts_folder=layouts_folder,
    themes_folder=themes_folder,
    names=names,
)

SETUP FOLDER/FILE STRUCTURE FOR: affiliations
SETUP FOLDER/FILE STRUCTURE FOR: author
SETUP FOLDER/FILE STRUCTURE FOR: funders
SETUP FOLDER/FILE STRUCTURE FOR: publishers


# Process DataCite Records
This part of the code will continue to evolve as I work out details on what all should be placed into the markdown representation of DataCite documents and what all needs to be organized out into useful reference files (data objects in the Hugo site). Each web site document built from a DataCite document will have lists of labels in their metadata that populate the configured taxonomies. Some of these are simple name-only values that have no real further depth. Other labels are associated with additional information contained in DataCite metadata (e.g., ORCIDs associated with creators/authors). Some of these can be exploited through additional processing to pull in further information from other sources. We can also process things like lists of unqualified subjects to break them out into more specific taxonomies (e.g., place names).

In [10]:
orcid_mapping = []
for document in items['data']:
    # Split on the first slash only: a DOI suffix may itself contain '/',
    # and the whole suffix has to be kept for the path to match the DOI.
    doi_prefix, doi_suffix = document['id'].split('/', 1)
    file_path = os.path.join('../content', doi_prefix, doi_suffix + '.md')
    # A suffix containing '/' maps to nested folders that may not exist yet.
    os.makedirs(os.path.dirname(file_path), exist_ok=True)
    with open(file_path, 'w') as f:
        f.write(datacite_md(document))

    orcid_mapping = datacite_orcids(document, orcid_mapping)

# Make sure the data folder exists, and close the output file deterministically.
os.makedirs('../data', exist_ok=True)
with open('../data/orcid_mapping.json', 'w') as f:
    # Sorting the de-duplicated pairs keeps the JSON identical between runs.
    json.dump(
        {'authors': {name: orcid for name, orcid in sorted(set(orcid_mapping))}},
        f,
        indent=2,
    )

## Organize Additional Data
I need to continue working on this piece, but these are the things I'm thinking about:
* Tee up all person contact names to include the additional details where provided (ORCID, givenName, familyName, etc.)
* Use ORCIDs to pull additional information on people from the ORCID registry
* Tee up all organization contact names to include additional details (creator and contributor can include identifiers, funders may also contain identifiers for the organizations and things like grant IDs)
* Many organization names are simple labels; for those with resolvable identifiers, we can pull additional details from ROR or elsewhere; we could do some work trying to disambiguate other common names
* For subjects with identifiers to source and/or scheme information, I can work through to break these out into their own taxonomies
* For subjects without identifiers, we can do some logical grouping based on entity recognition (e.g., place names, organization names, etc.); we could also leverage abstract content in this processing since we are going to be doing text processing anyway

In [82]:
# Collect one entry per creator that carries an ORCID identifier.
authors = []
for document in items['data']:
    for creator in document['attributes']['creators'] or []:
        orcid_url = next(
            (
                i['nameIdentifier']
                for i in creator.get('nameIdentifiers') or []
                if i.get('nameIdentifierScheme') == 'ORCID'
            ),
            None,
        )
        if not orcid_url:
            continue

        # Start from the record's own name so `name` is always bound, then
        # prefer "given family" for personal names that carry both parts.
        name = creator['name']
        if (creator.get('nameType') == 'Personal'
                and 'givenName' in creator and 'familyName' in creator):
            name = f"{creator['givenName']} {creator['familyName']}"

        authors.append({
            'title': name,
            'orcid': orcid_url,
            'url': "/authors/" + orcid_url.split('/')[-1]
        })

# De-duplicate on the full entry while keeping first-seen order, so the set
# of files written is processed in a stable order.
unique_authors = [
    dict(entry)
    for entry in dict.fromkeys(tuple(sorted(author.items())) for author in authors)
]

# Write one section index per ORCID iD under content/authors.
for author in unique_authors:
    orcid = author['orcid'].split('/')[-1]
    author_folder = os.path.join('../content/authors', orcid)
    os.makedirs(author_folder, exist_ok=True)
    file_path = os.path.join(author_folder, '_index.md')
    yaml_content = "---\n" + yaml.dump(author, default_flow_style=False) + "\n---"
    with open(file_path, 'w') as f:
        f.write(yaml_content)