This notebook works through the process of pulling metadata via the DataCite API and building content for the Hugo site. A more elegant solution might be to simply dump DataCite metadata in its original JSON form (possibly ld_json) and then run everything with templates that read data content. However, I've found that some amount of digesting that content into markdown files with YAML metadata works pretty well and provides some ready options. So, I'm trying not to overload this part of the process.

In [247]:
import requests
import pandas as pd
import json
import yaml
import os
import shutil
from datetime import datetime
import geopandas as gpd
from shapely.geometry import Point, Polygon
from itertools import groupby


# Functions
Eventually, I may need to run this as some type of automated process in a pipeline (e.g., in the GitHub actions to load site content before deploying). The following can be pulled out into a different type of pipeline environment for that purpose.

In [274]:
# Lookup lists

# DataCite contributor types with button labels for browse lists
contributorType = {
    # Creators carry no contributorType of their own; this notebook files
    # them under the 'author' key.
    'author': 'Authors',
    'ContactPerson': 'Contact Persons',
    'DataCollector': 'Data Collectors',
    'DataCurator': 'Data Curators',
    'DataManager': 'Data Managers',
    'Distributor': 'Distributors',
    'Editor': 'Editors',
    'HostingInstitution': 'Hosting Institutions',
    'Producer': 'Producers',
    'ProjectLeader': 'Project Leaders',
    'ProjectManager': 'Project Managers',
    'ProjectMember': 'Project Members',
    'RegistrationAgency': 'Registration Agencies',
    'RegistrationAuthority': 'Registration Authorities',
    'RelatedPerson': 'Related Persons',
    'Researcher': 'Researchers',
    'ResearchGroup': 'Research Groups',
    'RightsHolder': 'Rights Holders',
    'Sponsor': 'Sponsors',
    'Supervisor': 'Supervisors',
    'WorkPackageLeader': 'Work Package Leaders',
    'Other': 'Other Contributors'
}

# Functions for setting up the Hugo site
def write_config(config, config_path='../hugo.yml'):
    """Serialise the site configuration to a block-style YAML file.

    config: plain Python data accepted by yaml.safe_dump.
    config_path: destination file; overwritten if it already exists.
    """
    with open(config_path, 'w') as config_file:
        # With no stream argument safe_dump returns the YAML text.
        config_file.write(yaml.safe_dump(config, default_flow_style=False))
    print("WROTE CONFIG FILE")

def datacite_repositories(documents):
    """Return the distinct DOI prefixes (text before the first '/') of the records.

    The order of the returned list is not defined.
    """
    return list({record['id'].split('/')[0] for record in documents})

def clear_content_folders(content_path='../content'):
    """Delete every directory directly under content_path; plain files are kept."""
    candidates = [os.path.join(content_path, entry) for entry in os.listdir(content_path)]
    for candidate in candidates:
        if not os.path.isdir(candidate):
            continue
        shutil.rmtree(candidate)

def build_sections(repositories, content_path='../content', clear_folders=True):
    """Create one Hugo section folder (with an _index.md) per DOI prefix.

    repositories: iterable of DOI prefix strings, e.g. '10.5281'.
    content_path: root of the Hugo content tree.
    clear_folders: when true, every existing sub-folder of content_path is
        removed first.

    An existing _index.md is never overwritten.
    """
    # Local import: the file only imports the `datetime` class itself.
    from datetime import timezone

    if clear_folders:
        clear_content_folders(content_path)
        print("CLEARED CONTENT FOLDERS")

    for prefix in repositories:
        folder_path = os.path.join(content_path, prefix)
        os.makedirs(folder_path, exist_ok=True)
        index_path = os.path.join(folder_path, '_index.md')
        if not os.path.exists(index_path):
            # Timezone-aware timestamp; datetime.utcnow() is deprecated and
            # yields a naive value.
            timestamp = datetime.now(timezone.utc).isoformat()
            with open(index_path, 'w', encoding='utf-8') as f:
                # The title is quoted because a bare DOI prefix such as
                # 10.5060 is a YAML float and would be read back as 10.506.
                f.write(f'---\ntitle: "{prefix}"\ndate: {timestamp}\n---\n')
        print("SETUP FOLDER/FILE STRUCTURE FOR:", prefix)

def setup_taxonomy(layouts_folder, themes_folder, names):
    """Give each taxonomy its own layout folder seeded with the theme templates.

    For every name a folder layouts_folder/<name> is created, and list.html
    and terms.html are copied in from themes_folder unless a file of that
    name is already present.
    """
    for name in names:
        target_dir = os.path.join(layouts_folder, name)
        os.makedirs(target_dir, exist_ok=True)

        for template in ('list.html', 'terms.html'):
            destination = os.path.join(target_dir, template)
            if os.path.exists(destination):
                # Keep whatever template is already in place.
                continue
            shutil.copy(os.path.join(themes_folder, template), destination)
        print("SETUP FOLDER/FILE STRUCTURE FOR:", name)

# Functions for parsing contacts from creators and contributors
def concat_contacts(documents):
    """Return one flat list holding every creator and contributor entry.

    Entries keep document order, creators before contributors within each
    document.
    """
    people = []
    for record in documents:
        attributes = record['attributes']
        people += attributes['creators']
        people += attributes['contributors']
    return people

def contact_label(doc):
    """Return the display label for a creator/contributor entry.

    "<givenName> <familyName>" is used when both parts are present and not
    blank; otherwise the entry's 'name' value is returned.
    """
    given = doc.get('givenName')
    family = doc.get('familyName')
    if given and given.strip() and family and family.strip():
        return f"{given} {family}"
    return doc['name']

def label_contacts(contact_docs):
    """Index contact entries by their display label.

    When several entries share a label, the last one wins.
    """
    return {contact_label(entry): entry for entry in contact_docs}

def contact_taxonomies(labeled_contacts):
    """Split labelled contacts into one mapping per contributor type.

    Entries without a contributorType go under 'author'; every other entry
    goes under its own contributorType value. Each group maps label to entry.
    """
    grouped = {
        'author': {
            label: contact
            for label, contact in labeled_contacts.items()
            if not contact.get('contributorType')
        }
    }
    roles = set([
        contact.get('contributorType')
        for contact in labeled_contacts.values()
        if contact.get('contributorType')
    ])
    for role in roles:
        grouped[role] = {
            label: contact
            for label, contact in labeled_contacts.items()
            if contact.get('contributorType') == role
        }
    return grouped

# Function for building funder references
def datacite_funders(documents):
    """Build a funder cache and a DOI-to-funder index from DataCite records.

    Returns a tuple (funders, funder_index):
      funders maps a funder label to a dict with 'funderIdentifier',
        'funderName' and 'awards'. Each award holds the record's 'doi' plus
        every 'award*' field of the funding reference. The label is the
        funder name, with the ROR id appended in parentheses when known.
      funder_index maps a DOI to one funder label. A DOI with several
        funders keeps only the last one processed.

    The input records are not modified.
    """
    funding_references = []
    for document in documents:
        for source_ref in document['attributes'].get('fundingReferences') or []:
            # Work on a copy so the caller's DataCite records stay pristine.
            ref = dict(source_ref)
            ref['doi'] = document['id']
            identifier = ref.get('funderIdentifier')
            if ref.get('funderIdentifierType') == 'ROR' and identifier:
                if not identifier.startswith('http'):
                    identifier = f'https://ror.org/{identifier}'
                    ref['funderIdentifier'] = identifier
                # rstrip so a trailing slash does not yield an empty ROR id.
                ref['ror'] = identifier.rstrip('/').split('/')[-1]
            if 'ror' in ref:
                ref['label'] = f"{ref['funderName']} ({ref['ror']})"
            else:
                ref['label'] = ref['funderName']
            funding_references.append(ref)

    # Single-pass grouping; first-seen order keeps the output deterministic.
    refs_by_label = {}
    for ref in funding_references:
        refs_by_label.setdefault(ref['label'], []).append(ref)

    funders = {}
    for funder_label, funder_awards in refs_by_label.items():
        funders[funder_label] = {
            'funderIdentifier': next((i['funderIdentifier'] for i in funder_awards if 'funderIdentifier' in i), None),
            'funderName': next((i['funderName'] for i in funder_awards if 'funderName' in i), None),
            'awards': [
                {k: v for k, v in award.items() if k == 'doi' or k.startswith('award')}
                for award in funder_awards
            ],
        }

    funder_index = {}
    for funder, funder_info in funders.items():
        for award in funder_info['awards']:
            funder_index[award['doi']] = funder

    return funders, funder_index

# Functions for building subject references
def collection_tags(documents):
    """Collect every distinct subject term with its accompanying properties.

    Returns a dict keyed by the subject text; the value is the subject entry
    minus its 'subject' key. The first occurrence of a term wins.
    """
    tags = {}
    for record in documents:
        for entry in record['attributes'].get('subjects') or ():
            term = entry['subject']
            if term in tags:
                continue
            tags[term] = {key: value for key, value in entry.items() if key != 'subject'}
    return tags

# Functions for building content items from DataCite documents
def datacite_categories(doc):
    """Return the record's resourceTypeGeneral wrapped in a one-item list."""
    resource_type = doc['attributes']['types']['resourceTypeGeneral']
    return [resource_type]

def datacite_subjects(doc):
    """Return the subject terms of a DataCite record as a list of strings.

    A record with no 'subjects' key, or with a null/empty value, yields [].
    """
    # The presence check has to happen before iterating; as a comprehension
    # filter it ran too late to prevent the KeyError.
    subjects = doc['attributes'].get('subjects') or []
    return [entry['subject'] for entry in subjects]

def datacite_publishers(doc):
    """Return the record's publisher value wrapped in a one-item list."""
    publisher = doc['attributes']['publisher']
    return [publisher]

def datacite_contacts(doc):
    """Group a record's contact labels by role, plus all affiliations.

    Returns a dict with:
      'author': labels of the creators,
      'affiliations': affiliations of creators and typed contributors,
      '<contributorType>': labels of the contributors of that type.
    Contributors without a contributorType are skipped. Every list is
    de-duplicated and keeps first-seen order.
    """
    attributes = doc['attributes']
    doc_contacts = {
        'author': [],
        'affiliations': []
    }

    for creator in attributes['creators']:
        doc_contacts['affiliations'].extend(creator.get('affiliation') or [])
        doc_contacts['author'].append(contact_label(creator))

    for contributor in attributes['contributors']:
        contributor_type = contributor.get('contributorType')
        if not contributor_type:
            continue
        # Use the contributor's own affiliation, not the leftover `creator`
        # variable from the loop above.
        doc_contacts['affiliations'].extend(contributor.get('affiliation') or [])
        doc_contacts.setdefault(contributor_type, []).append(contact_label(contributor))

    # dict.fromkeys de-duplicates while keeping a stable order.
    return {key: list(dict.fromkeys(values)) for key, values in doc_contacts.items()}

def datacite_orcids(doc, orcid_mapping=None):
    """Append (name, ORCID id) pairs for a record's creators.

    doc: a DataCite record.
    orcid_mapping: optional list to extend in place; a fresh list is created
        when omitted, so results never leak between calls.

    Creators without an ORCID identifier are skipped. The name is
    "<givenName> <familyName>" for personal names that carry both parts,
    otherwise the creator's 'name' value. Returns the list.
    """
    if orcid_mapping is None:
        orcid_mapping = []

    for creator in doc['attributes']['creators']:
        orcid = next(
            (
                identifier['nameIdentifier'].split('/')[-1]
                for identifier in creator.get('nameIdentifiers') or []
                if identifier.get('nameIdentifierScheme') == 'ORCID'
            ),
            None,
        )
        if not orcid:
            # Skip only this creator; later creators may still have an ORCID.
            continue

        if creator.get('nameType') == 'Personal' and creator.get('givenName') and creator.get('familyName'):
            name_string = f"{creator['givenName']} {creator['familyName']}"
        else:
            name_string = creator['name']
        orcid_mapping.append((name_string, orcid))

    return orcid_mapping

def _yaml_quote(value):
    """Render value as a double-quoted YAML scalar.

    A JSON string is also a valid YAML double-quoted scalar, so json.dumps
    handles the escaping of quotes, backslashes and control characters.
    """
    return json.dumps(str(value), ensure_ascii=False)


def _unique(values):
    """Drop duplicates while keeping first-seen order."""
    return list(dict.fromkeys(values))


def datacite_meta(doc):
    """Build the YAML front matter block for one DataCite record.

    Returns the text between (and including) the two '---' fences: title,
    doi, referralUrl, categories, tags, publishers, author, affiliations,
    one list per contributor type (lower-cased key) and funders. Optional
    sections are omitted when the record has nothing to put in them, and
    list values are emitted as quoted scalars so characters such as ': '
    cannot break the YAML.
    """
    attributes = doc['attributes']
    meta_content = ['---']

    # Main title: the entry without a titleType. Fall back to the first
    # available title (or '') instead of crashing when none qualifies.
    titles = attributes.get('titles') or []
    title = next((t.get('title') for t in titles if not t.get('titleType')), None)
    if title is None:
        title = next((t['title'] for t in titles if t.get('title')), '')
    title = title.replace('"', "'")
    meta_content.append(f'title: {_yaml_quote(title)}')

    # Set doi and url identifiers
    meta_content.append(f"doi: {doc['id']}")
    if attributes.get('url'):
        meta_content.append(f"referralUrl: {attributes['url']}")

    # Add resourceTypeGeneral as a category
    meta_content.append("categories:")
    meta_content.append(f"- {_yaml_quote(attributes['types']['resourceTypeGeneral'])}")

    # Add subjects as tags
    subjects = _unique(s['subject'] for s in attributes.get('subjects') or [])
    if subjects:
        meta_content.append("tags:")
        meta_content.extend(f"- {_yaml_quote(term)}" for term in subjects)

    # Add publishers to the meta content
    meta_content.append("publishers:")
    meta_content.append(f"- {_yaml_quote(attributes['publisher'])}")

    # Pull creators as authors and affiliations from the creators data structure
    creators = attributes.get('creators') or []
    meta_content.append("author:")
    meta_content.extend(f"- {_yaml_quote(creator['name'])}" for creator in creators)

    affiliations = _unique(
        affiliation
        for creator in creators
        for affiliation in creator.get('affiliation') or []
    )
    if affiliations:
        meta_content.append("affiliations:")
        meta_content.extend(f"- {_yaml_quote(affiliation)}" for affiliation in affiliations)

    # Pull contributors into their own taxonomies
    contributions = {}
    for contributor in attributes.get('contributors') or []:
        contributor_type = contributor.get('contributorType')
        if not contributor_type:
            continue
        contributions.setdefault(contributor_type, []).append(contributor['name'])
    for contributor_type, contributors in contributions.items():
        meta_content.append(f"{contributor_type.lower()}:")
        meta_content.extend(f"- {_yaml_quote(name)}" for name in contributors)

    # Add funder labels
    funder_names = _unique(ref['funderName'] for ref in attributes.get('fundingReferences') or [])
    if funder_names:
        meta_content.append("funders:")
        meta_content.extend(f"- {_yaml_quote(name)}" for name in funder_names)

    meta_content.append("---")
    return '\n'.join(meta_content)

def _related_identifier_bullet(item, known_dois):
    """Return the markdown bullet (with leading newline) for one related identifier."""
    identifier = item['relatedIdentifier']
    identifier_type = item['relatedIdentifierType']

    if identifier_type == 'DOI':
        # NOTE(review): only the last two '/'-separated segments are kept, so
        # a DOI whose suffix itself contains '/' is truncated here.
        related_doi = '/'.join(identifier.split('/')[-2:])
        if related_doi in known_dois:
            # Target is part of this site: link to its local page.
            return f"\n- [{related_doi}](../../{related_doi}/)"
        return f"\n- https://doi.org/{related_doi}"
    if identifier_type == 'ISBN':
        return f"\n- http://www.worldcat.org/isbn/{identifier}"
    if identifier_type == 'URL':
        return f"\n- {identifier}"
    if identifier_type == 'ISSN':
        return f"\n- https://portal.issn.org/resource/ISSN/{identifier}"
    if identifier_type == 'arXiv':
        return f"\n- https://arxiv.org/abs/{identifier.split(':')[-1]}"
    return f"\n- {identifier} ({identifier_type})"


def datacite_md(doc, dois_in_dataset):
    """Render one DataCite record as a Hugo markdown page.

    doc: a DataCite record.
    dois_in_dataset: collection of the DOIs that get a page on this site;
        related DOIs found in it are linked locally rather than via doi.org.

    Returns the front matter from datacite_meta followed by the abstract,
    the other descriptions, the access URL and the related identifiers
    grouped by relation type. The record is not modified.
    """
    attributes = doc['attributes']
    known_dois = set(dois_in_dataset)  # O(1) membership per related DOI
    parts = [datacite_meta(doc)]

    descriptions = attributes.get('descriptions') or []
    abstract = next((i['description'] for i in descriptions if i['descriptionType'] == 'Abstract'), None)
    if abstract:
        parts.append('\n\n# Abstract')
        parts.append(f'\n{abstract}')
    for desc in descriptions:
        if desc['descriptionType'] == 'Abstract':
            continue
        parts.append(f'\n\n## {desc["descriptionType"]}')
        # Leading '#' characters are stripped so the text cannot open a heading.
        parts.append(f'\n{desc["description"].lstrip("#").strip()}')

    if attributes.get('url'):
        parts.append(f'\n\n# Access Points\n{attributes["url"]}')

    related_identifiers = attributes.get('relatedIdentifiers')
    if related_identifiers:
        parts.append('\n\n# Related Identifiers')
        # groupby needs sorted input; sorted() leaves the record's own list
        # untouched, unlike an in-place sort.
        ordered = sorted(related_identifiers, key=lambda x: x['relationType'])
        for rel, links in groupby(ordered, lambda x: x['relationType']):
            parts.append(f"\n## {rel}")
            parts.extend(_related_identifier_bullet(item, known_dois) for item in links)

    return ''.join(parts)

# Get DataCite Items
This part of the process is really up to the individual use case. Any type of process is perfectly fine here as long as it returns some set of items in DataCite's native JSON format. This could also be retuned to work with JSON-LD or any other output format desired. The predominant use case is likely to work through a set of items from one or more DataCite repositories (DOI prefixes) that do not otherwise have landing pages in some primary source repository. But this can also be used to spin up some particular context of assets that need to be presented in a particular way.

In [3]:
# Query the DataCite REST API for records funded by the National Science
# Foundation (first page only, 100 records).
datacite_api = 'https://api.datacite.org/dois?query=fundingReferences.funderName:"National Science Foundation"&page[size]=100'
# requests has no default timeout, so set one to avoid hanging a pipeline run,
# and fail loudly on an HTTP error instead of parsing an error body.
response = requests.get(datacite_api, timeout=60)
response.raise_for_status()
items = response.json()
print("DataCite DOCUMENTS:", len(items['data']))

DataCite DOCUMENTS: 100


In [275]:
# DOIs that get a page on this site; used to decide between local and
# doi.org links for related identifiers.
dois_in_dataset = [i['id'] for i in items['data']]

# Flush previous contents and build folder structure for DOI prefixes
build_sections(datacite_repositories(items['data']))

# Process documents to produce MD files with metadata
for document in items['data']:
    # NOTE(review): only the first two '/'-separated segments of the DOI are
    # used, so a suffix containing '/' is truncated in the file name.
    doi_prefix, doi_suffix = document['id'].split('/')[:2]
    file_path = os.path.join('../content', doi_prefix, doi_suffix + '.md')
    # Write UTF-8 explicitly so non-ASCII metadata does not depend on the
    # platform's default encoding.
    with open(file_path, 'w', encoding='utf-8') as f:
        f.write(datacite_md(document, dois_in_dataset))
    print("WROTE", file_path)

CLEARED CONTENT FOLDERS
SETUP FOLDER/FILE STRUCTURE FOR: 10.5281
SETUP FOLDER/FILE STRUCTURE FOR: 10.3886
SETUP FOLDER/FILE STRUCTURE FOR: 10.5061
SETUP FOLDER/FILE STRUCTURE FOR: 10.1594
SETUP FOLDER/FILE STRUCTURE FOR: 10.15146
SETUP FOLDER/FILE STRUCTURE FOR: 10.18128
SETUP FOLDER/FILE STRUCTURE FOR: 10.7923
SETUP FOLDER/FILE STRUCTURE FOR: 10.34726
SETUP FOLDER/FILE STRUCTURE FOR: 10.17632
SETUP FOLDER/FILE STRUCTURE FOR: 10.6078
SETUP FOLDER/FILE STRUCTURE FOR: 10.48443
SETUP FOLDER/FILE STRUCTURE FOR: 10.6076
SETUP FOLDER/FILE STRUCTURE FOR: 10.7291
WROTE ../content/10.5281/zenodo.10099732.md
WROTE ../content/10.5281/zenodo.10099731.md
WROTE ../content/10.5281/zenodo.10223759.md
WROTE ../content/10.5061/dryad.gqnk98sv9.md
WROTE ../content/10.5281/zenodo.8031759.md
WROTE ../content/10.5281/zenodo.8031760.md
WROTE ../content/10.5061/dryad.m905qfv4v.md
WROTE ../content/10.5281/zenodo.10158072.md
WROTE ../content/10.5281/zenodo.10158073.md
WROTE ../content/10.5061/dryad.7wm37pvzv.md


# Build Data Cache

In [37]:
# One row per DataCite record, one column per key of its 'attributes' dict.
df_items = pd.DataFrame([i['attributes'] for i in items['data']])
# df_items = df_items.convert_dtypes()
# Print the column summary (names, non-null counts, dtypes).
df_items.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100 entries, 0 to 99
Data columns (total 41 columns):
 #   Column              Non-Null Count  Dtype 
---  ------              --------------  ----- 
 0   doi                 100 non-null    object
 1   identifiers         100 non-null    object
 2   creators            100 non-null    object
 3   titles              100 non-null    object
 4   publisher           100 non-null    object
 5   container           100 non-null    object
 6   publicationYear     100 non-null    int64 
 7   subjects            100 non-null    object
 8   contributors        100 non-null    object
 9   dates               100 non-null    object
 10  language            44 non-null     object
 11  types               100 non-null    object
 12  relatedIdentifiers  100 non-null    object
 13  relatedItems        100 non-null    object
 14  sizes               100 non-null    object
 15  formats             100 non-null    object
 16  version             36 non-

## Related Items
I need to come back and examine what more we can do with related items. One thought is to pull the relationships where the target object is found within the dataset being used to build the site, and then show an interactive graph that users can use to navigate the relationships inherent in the particular site.

In [239]:
# Keep records with at least one related identifier and explode the list
# column into one row per (doi, related identifier) pair.
df_related_id = df_items[df_items['relatedIdentifiers'].str.len() > 0][['doi','relatedIdentifiers']].explode('relatedIdentifiers')
# Expand each related-identifier entry into its own columns next to the doi.
df_related_id = pd.concat([df_related_id.drop(['relatedIdentifiers'], axis=1), df_related_id['relatedIdentifiers'].apply(pd.Series)], axis=1)

# Organize Creators

In [362]:
# One row per (doi, creator); each creator entry is expanded into columns.
df_creators = df_items[df_items['creators'].str.len() > 0][['doi','creators']].explode('creators')
df_creators = pd.concat([df_creators.drop(['creators'], axis=1), df_creators['creators'].apply(pd.Series)], axis=1)
# Creators are tagged 'author' so they can share a table with contributors.
df_creators['contributorType'] = 'author'

# Same reshaping for contributors, which carry their own contributorType.
df_contributors = df_items[df_items['contributors'].str.len() > 0][['doi','contributors']].explode('contributors')
df_contributors = pd.concat([df_contributors.drop(['contributors'], axis=1), df_contributors['contributors'].apply(pd.Series)], axis=1)

# Combined contact table; the index is not reset, so index labels may repeat.
df_contacts = pd.concat([df_creators, df_contributors])

# Last path segment of the first ORCID / ROR name identifier, or '' if none.
df_contacts['orcid'] = df_contacts['nameIdentifiers'].apply(lambda x: next((i['nameIdentifier'].split('/')[-1] for i in x if i['nameIdentifierScheme'] == 'ORCID'), ''))
df_contacts['ror'] = df_contacts['nameIdentifiers'].apply(lambda x: next((i['nameIdentifier'].split('/')[-1] for i in x if i['nameIdentifierScheme'] == 'ROR'), ''))

def contact_url(row):
    """Return the resolvable URL for a contact row: ORCID first, then ROR, else ''."""
    has_orcid = len(row['orcid']) > 0
    if not has_orcid:
        has_ror = len(row['ror']) > 0
        return f'https://ror.org/{row["ror"]}' if has_ror else ''
    return f'https://orcid.org/{row["orcid"]}'

df_contacts['url'] = df_contacts.apply(lambda x: contact_url(x), axis=1)
df_contacts['affiliation_string'] = df_contacts['affiliation'].apply(lambda x: '; '.join(x) if x else '')
# Compact per-contact record stored in the doi cache.
df_contacts['doi_contact'] = df_contacts.apply(lambda x: {'name': x['name'], 'orcid': x['orcid'], 'ror': x['ror'], 'affiliation': x['affiliation_string']}, axis=1)

# Package contact information for doi cache
# First collect the contacts per (doi, contributorType), then fold them into
# one {contributorType: [contacts]} dict per doi.
grouped_contacts = df_contacts[['doi','contributorType','doi_contact']].groupby(['doi','contributorType'], as_index=False).agg(list)
grouped_contacts = grouped_contacts.groupby('doi').apply(lambda x: x.set_index('contributorType')['doi_contact'].to_dict()).reset_index(name='contacts')

# Geo locations: boxes become [[south, west], [north, east]] and points
# become [latitude, longitude]; rows lacking either keep None.
df_geo = df_items[df_items['geoLocations'].str.len() > 0][['doi','geoLocations']].explode('geoLocations')
df_geo = pd.concat([df_geo.drop(['geoLocations'], axis=1), df_geo['geoLocations'].apply(pd.Series)], axis=1)
df_geo['bbox'] = df_geo['geoLocationBox'].apply(lambda x: [[x['southBoundLatitude'], x['westBoundLongitude']], [x['northBoundLatitude'], x['eastBoundLongitude']]] if isinstance(x, dict) else None)
df_geo['point'] = df_geo['geoLocationPoint'].apply(lambda x: [x['pointLatitude'], x['pointLongitude']] if isinstance(x, dict) else None)

# Per doi: every point, but only the first bounding box.
df_maps = pd.concat([
    df_geo[df_geo['point'].notnull()].groupby('doi')['point'].agg(list),
    df_geo[df_geo['bbox'].notnull()].groupby('doi')['bbox'].first()
], axis=1).reset_index('doi')

datacite_core_props = [
    'doi',
    'publisher',
    'publicationYear',
    'contentUrl'
]

doi_data_file = pd.merge(
    left=df_items[datacite_core_props],
    right=grouped_contacts,
    how='left',
    on='doi'
)

# Missing contacts/maps become '' so the cache holds no NaN values.
doi_data_file = pd.merge(
    left=doi_data_file,
    right=df_maps,
    how='left',
    on='doi'
).fillna('')

# Context manager so the file handle is flushed and closed deterministically.
with open('../data/doi.json', 'w') as f:
    json.dump(doi_data_file.set_index('doi').to_dict('index'), f, indent=2)

# Package contact information for the contacts cache
# One row per contact name, taking the first non-null value of each column.
contacts_core = df_contacts[['name','nameType','givenName','familyName','orcid','ror','url']].groupby('name', as_index=False).first()
# Distinct affiliations per contact name.
contact_affiliations = df_contacts[df_contacts['affiliation'].str.len() > 0][['name','affiliation']].explode('affiliation').groupby('name', as_index=False).agg(list)
contact_affiliations['affiliation'] = contact_affiliations['affiliation'].apply(lambda x: list(set(x)))
# Distinct contributor types per contact name.
contact_contributions = df_contacts[['name','contributorType']].groupby('name', as_index=False).agg(list)
contact_contributions['contributorType'] = contact_contributions['contributorType'].apply(lambda x: list(set(x)))

contact_reference = pd.merge(
    left=contacts_core,
    right=contact_affiliations,
    how='left',
    on='name'
)
contact_reference = pd.merge(
    left=contact_reference,
    right=contact_contributions,
    how='left',
    on='name'
).fillna('')

# Context manager so the file handle is flushed and closed deterministically.
with open('../data/contacts.json', 'w') as f:
    json.dump(contact_reference.set_index('name').to_dict('index'), f, indent=2)

## Indexes
We can create what are essentially a series of local indexes from the original DataCite metadata cached in the /data/ folder for later reference from template files. Given the title of a page or a value in that page's metadata, we can retrieve additional data from the cache. Setting the index to the value of the label makes this efficient and straightforward.

### Full record on DOI

In [None]:
# NOTE(review): cache_datacite is not defined in the visible part of this
# notebook — confirm it exists before running this cell.
cache_datacite(df_items)

In [None]:
# One row per (doi, title entry), with each title entry expanded into columns.
df_titles = df_items[['doi','titles']].explode('titles')
df_titles = pd.concat([df_titles.drop(['titles'], axis=1), df_titles['titles'].apply(pd.Series)], axis=1)

# Same reshaping for descriptions.
df_descriptions = df_items[['doi','descriptions']].explode('descriptions')
df_descriptions = pd.concat([df_descriptions.drop(['descriptions'], axis=1), df_descriptions['descriptions'].apply(pd.Series)], axis=1)

In [None]:
# Display the exploded titles table for inspection.
df_titles

In [None]:
# doi -> title for title rows whose titleType is null (treated as the main title).
main_title_lookup = df_titles[df_titles['titleType'].isnull()].set_index('doi')['title'].to_dict()

In [None]:
# Display the exploded descriptions table for inspection.
df_descriptions

In [None]:
# Preview the front matter lines that would be generated for each record.
# NOTE(review): contact_lookup is only assigned in later cells of this
# notebook, so this cell depends on execution order — confirm.
for _, row in df_items.iterrows():
    md_elements = ['---']
    md_elements.append(f"doi: {row['doi']}")
    md_elements.append(f"publicationYear: [{row['publicationYear']}]")
    # Double quotes are swapped for apostrophes so the quoted title stays intact.
    title = main_title_lookup[row["doi"]].replace('"',"'")
    md_elements.append(f'title: "{title}"')
    # Lists are written using Python's list repr.
    md_elements.append(f"categories: {[row['types']['resourceTypeGeneral']]}")
    md_elements.append(f"tags: {[i['subject'] for i in row['subjects']]}")
    md_elements.append(f"publishers: {[row['publisher']]}")

    # One front matter key per contributor type found for this doi.
    item_contacts = contact_lookup[contact_lookup['doi'] == row['doi']]
    if not item_contacts.empty:
        for i, c in item_contacts.iterrows():
            md_elements.append(f"{c['contributorType']}: {c['label']}")

    md_elements.append('---')

    # NOTE(review): abstract is computed but not used within this cell.
    abstract = next((i['description'] for i in row['descriptions'] if i['descriptionType'] == 'Abstract'), None)

    display(md_elements)

## Contacts

In [290]:
# One row per (doi, creator); each creator entry is expanded into columns.
df_creators = df_items[df_items['creators'].str.len() > 0][['doi','creators']].explode('creators')
df_creators = pd.concat([df_creators.drop(['creators'], axis=1), df_creators['creators'].apply(pd.Series)], axis=1)
# Creators are tagged 'author' so they can share a table with contributors.
df_creators['contributorType'] = 'author'

# Same reshaping for contributors, which carry their own contributorType.
df_contributors = df_items[df_items['contributors'].str.len() > 0][['doi','contributors']].explode('contributors')
df_contributors = pd.concat([df_contributors.drop(['contributors'], axis=1), df_contributors['contributors'].apply(pd.Series)], axis=1)

# Combined contact table with a fresh 0..n-1 index.
df_contacts = pd.concat([df_creators, df_contributors]).reset_index(drop=True)
# Last path segment of the first ORCID / ROR name identifier, or '' if none.
df_contacts['orcid'] = df_contacts['nameIdentifiers'].apply(lambda x: next((i['nameIdentifier'].split('/')[-1] for i in x if i['nameIdentifierScheme'] == 'ORCID'), ''))
df_contacts['ror'] = df_contacts['nameIdentifiers'].apply(lambda x: next((i['nameIdentifier'].split('/')[-1] for i in x if i['nameIdentifierScheme'] == 'ROR'), ''))

def contact_url(contact):
    """Return the contact's ORCID URL if it has an ORCID, else its ROR URL, else ''."""
    if contact['orcid']:
        return f'https://orcid.org/{contact["orcid"]}'
    return f'https://ror.org/{contact["ror"]}' if contact['ror'] else ''

# Resolvable URL per contact (ORCID preferred over ROR).
df_contacts['url'] = df_contacts.apply(contact_url, axis=1)
# Replace remaining nulls with '' in place.
df_contacts.fillna('', inplace=True)

In [305]:
# Distinct (name, affiliation) pairs, one row per affiliation.
contact_affiliations = df_contacts[df_contacts['affiliation'].str.len() > 0][['name','affiliation']].explode('affiliation').drop_duplicates()
contact_affiliations

Unnamed: 0,name,affiliation
0,"Choppali Sudarshan, Chetan",Arizona State University
1,"Matkar, Nikhil",Arizona State University
2,"Vrudhula, Sarma",Arizona State University
3,"Sapatnekar, Sachin",University of Minnesota
4,"Chhabria, Vidya",Arizona State University
...,...,...
1473,"Azenon, Jonathan",Cornell College
1474,"Cugley, John",Australian Speleological Federation
1475,"Woods, David",Queensland Department of Environment and Science
1476,"Humphreys, William",Western Australian Museum


In [306]:
# Affiliations whose text also appears as a contact name in df_contacts.
contact_affiliations[contact_affiliations['affiliation'].isin(df_contacts['name'])]

Unnamed: 0,name,affiliation
1466,"Denniston, Rhawn",Cornell College
1473,"Azenon, Jonathan",Cornell College
1474,"Cugley, John",Australian Speleological Federation
1476,"Humphreys, William",Western Australian Museum


In [301]:
# Collect the contributor types recorded for each contact name ...
contact_roles = df_contacts[df_contacts['contributorType'].str.len() > 0][['name','contributorType']].groupby('name', as_index=False).agg(list)
# ... and reduce them to distinct values (order not preserved).
contact_roles['roles'] = contact_roles['contributorType'].apply(lambda x: list(set(x)))


In [304]:
# One row per contact name (first value of each remaining column) joined with
# that contact's distinct roles.
pd.merge(
    left=df_contacts.drop(columns=['doi','nameIdentifiers','affiliation','contributorType']).groupby('name', as_index=False).first(),
    right=contact_roles[['name','roles']],
    how='left',
    on='name'
)

Unnamed: 0,name,nameType,givenName,familyName,orcid,ror,url,roles
0,"Aguilar, Salomón",Personal,Salomón,Aguilar,,,,[author]
1,"Alvarez-Buylla, Aurora",Personal,Aurora,Alvarez-Buylla,0000-0001-6256-0300,,https://orcid.org/0000-0001-6256-0300,[author]
2,"An, Sizhe",Personal,Sizhe,An,0000-0002-9211-4886,,https://orcid.org/0000-0002-9211-4886,[author]
3,"Angela, Mercia",Personal,Mercia,Angela,,,,[author]
4,Anh Thu Nguyen,Personal,,Anh Thu Nguyen,,,,[author]
...,...,...,...,...,...,...,...,...
214,"Woods, David",Personal,David,Woods,,,,[Other]
215,"Woodson, C. Brock",Personal,C. Brock,Woodson,,,,[author]
216,"Yan, Lingfeng",Personal,Lingfeng,Yan,,,,[author]
217,"Yardumian, Aram",Personal,Aram,Yardumian,,,,[DataCollector]


In [291]:
# Show only the contacts whose nameType is 'Organizational'.
df_contacts[
    (df_contacts['nameType'] == 'Organizational')
]

Unnamed: 0,doi,name,nameType,givenName,familyName,affiliation,nameIdentifiers,contributorType,orcid,ror,url
106,10.48443/ghry-qw46,National Ecological Observatory Network (NEON),Organizational,,,[],"[{'schemeUri': 'https://www.re3data.org/', 'na...",author,,,
107,10.48443/10dn-8031,National Ecological Observatory Network (NEON),Organizational,,,[],"[{'schemeUri': 'https://www.re3data.org/', 'na...",author,,,
108,10.48443/xmbe-7b55,National Ecological Observatory Network (NEON),Organizational,,,[],"[{'schemeUri': 'https://www.re3data.org/', 'na...",author,,,
1463,10.5281/zenodo.10210959,International Ocean Discovery Program,Organizational,,,[],[],DataCollector,,,
1464,10.5281/zenodo.10210958,International Ocean Discovery Program,Organizational,,,[],[],DataCollector,,,
...,...,...,...,...,...,...,...,...,...,...,...
1544,10.5281/zenodo.10210682,International Ocean Discovery Program,Organizational,,,[],[],DataCollector,,,
1545,10.5281/zenodo.10210692,International Ocean Discovery Program,Organizational,,,[],[],DataCollector,,,
1546,10.5281/zenodo.10210691,International Ocean Discovery Program,Organizational,,,[],[],DataCollector,,,
1547,10.5281/zenodo.10210688,International Ocean Discovery Program,Organizational,,,[],[],DataCollector,,,


In [None]:
# affiliation -> list of DOIs in which a contact with that affiliation appears
# (a DOI repeats once per matching contact).
df_contacts[df_contacts['affiliation'].str.len() > 0][['doi','affiliation']].explode('affiliation').groupby('affiliation').agg(list)['doi'].to_dict()

In [None]:
# Spot-check the contacts of a single record.
df_contacts[df_contacts['doi'] == '10.5061/dryad.c866t1gdc']

In [None]:
# Labels grouped per (doi, contributorType).
# NOTE(review): no visible cell creates a 'label' column on df_contacts —
# confirm where it is added before running this cell.
contact_lookup = df_contacts[['label','contributorType','doi']].groupby(['doi','contributorType'], as_index=False).agg(list)
contact_lookup

## Funders
The main thing we get from looking at funders is a ROR identifier in some cases that helps to disambiguate named organizations. For now, I am dumping a cache of funders indexed to their funderName values that were used when generating the content pages. I include ROR and a URL form for ease of use. I also include a list of awards in the data. These can include further details on award numbers, award titles, and some resolvable identifiers.

In [281]:
# One row per (doi, funding reference), with each reference expanded into columns.
df_funders = df_items[df_items['fundingReferences'].str.len() > 0][['doi','fundingReferences']].explode('fundingReferences')
df_funders = pd.concat([df_funders.drop(['fundingReferences'], axis=1), df_funders['fundingReferences'].apply(pd.Series)], axis=1)
# ROR id = last path segment of the funder identifier, only for ROR-typed identifiers.
df_funders['ror'] = df_funders.apply(lambda x: x['funderIdentifier'].split('/')[-1] if x['funderIdentifierType'] == 'ROR' else None, axis=1)
df_funders['url'] = df_funders['ror'].apply(lambda x: f'https://ror.org/{x}' if x else None)

def award(row):
    """Build the award record for one funder row.

    The result always carries the row's ``doi``. Every field of ``row`` whose
    name starts with ``award`` is copied across as well; values that are not
    strings (for example missing values) are replaced by an empty string.

    Args:
        row: Mapping-like record (e.g. a DataFrame row) exposing ``keys()``
            and item access, with at least a ``doi`` entry.

    Returns:
        dict: ``doi`` followed by the ``award*`` fields in the row's order.
    """
    details = {'doi': row['doi']}
    details.update({
        field: (row[field] if isinstance(row[field], str) else '')
        for field in row.keys()
        if field.startswith('award')
    })
    return details

# Attach the per-row award record (built by award()) to every funder row.
df_funders['award'] = df_funders.apply(award, axis=1)

# One row per funderName: all of its award records as a list, plus the first
# ror / url value reported by the groupby for that name.
grouped_df = df_funders.groupby('funderName').agg({'award': lambda x: x.tolist(), 'ror': 'first', 'url': 'first'}).reset_index()

# Funder cache keyed by funderName, as consumed from ../data/funders.json.
output_dict = {
    row['funderName']: {
        'award': row['award'],
        'ror': row['ror'],
        'url': row['url']
    }
    for _, row in grouped_df.iterrows()
}

# Use a context manager so the file is flushed and closed deterministically
# instead of leaving the handle to the garbage collector.
with open('../data/funders.json', 'w') as f:
    json.dump(output_dict, f, indent=2)


## Subjects

In [None]:
# Keep only items that carry subjects, then produce one row per (doi, subject
# record) pair and spread each subject record into its own columns.
df_subjects = df_items[df_items['subjects'].str.len() > 0][['doi','subjects']].explode('subjects')
df_subjects = pd.concat([df_subjects.drop(['subjects'], axis=1), df_subjects['subjects'].apply(pd.Series)], axis=1)

# Both lookups below are built from the same two columns.
subject_doi = df_subjects[['subject','doi']]

# tags.json: subject -> list of DOIs tagged with it. The context manager makes
# sure the file is flushed and closed instead of leaking the handle.
with open('../data/tags.json', 'w') as f:
    json.dump(subject_doi.groupby('subject', as_index=False).agg(list).set_index('subject')['doi'].to_dict(), f, indent=2)

# Reverse lookup kept in memory: DOI -> list of its subjects.
tag_lookup = subject_doi.groupby('doi', as_index=False).agg(list).set_index('doi')['subject'].to_dict()


# Break out funders

In [None]:
# datacite_funders returns a pair; the first element is serialized as the
# site's funder data file, the second is kept in memory as funder_index.
funders, funder_index = datacite_funders(items['data'])
with open('../data/funders.json', 'w') as funders_file:
    funders_file.write(json.dumps(funders, indent=2))

# Build out person and organization contacts

In [None]:
# Run the three contact helpers in sequence over the DataCite records:
# concat_contacts -> label_contacts -> contact_taxonomies.
combined_contacts = concat_contacts(items['data'])
labeled_contacts = label_contacts(combined_contacts)
contact_lookup = contact_taxonomies(labeled_contacts)

# Write one JSON data file per taxonomy key, named after the key.
for taxonomy, taxonomy_items in contact_lookup.items():
    with open(f'../data/{taxonomy}.json', 'w') as f:
        f.write(json.dumps(taxonomy_items, indent=2))

# Show which taxonomies were produced.
contact_lookup.keys()

# Build DataCite Repository Sections
We want the DOI prefixes in our collection from DataCite to act as sections within the Hugo site. This means setting up root folders within /content/ for each DOI prefix in our recordset returned from the DataCite API. We'll then write markdown files to these with the remainder of the DOI identifier to provide logical paths at the root of our site that match the DOI. We also write _index.md files into each DOI prefix folder so that it is treated as a section in Hugo's architecture. This will also provide a listing of items at that path depending on the template used.

# Setup Taxonomy
The taxonomies create the navigational structure for the site. They are entirely dependent on how we process DataCite metadata and what we build from that into each content item for the site.

Two taxonomies are already established - categories and tags. We place resourceTypeGeneral values into category (e.g., dataset, model, etc.). We place all "non-parsed" subjects into tags. In practice, very few DataCite records take advantage of the ability to provide URI values for individual terms or subject scheme information that will break up tags into logical taxonomies. So, this ends up being the majority of the work - using the content as provided and getting it validated and organized into better groupings.

We also have several parts of the DataCite schema that logically break out into taxonomies:
- authors - DataCite requires at least one "creator," which we often put under the term "author" in common practice. This can be reconfigured based on preference. Additional contributors can also be organized from source.
- affiliations - Creator affiliations can be broken out and included in their own taxonomy.
- publishers - The publisher field is required in DataCite and will be populated with string values that may need more clarification.
- funders - Funding institutions are often included and may incorporate additional details that can be organized into data files for use.

In [None]:
# Folders passed to setup_taxonomy.
layouts_folder = '../layouts'
themes_folder = '../themes/lpiab-theme/layouts/_default'

# Fixed taxonomy names first, followed by one name per key of contact_lookup.
names = ['affiliations', 'funders', 'publishers', *contact_lookup.keys()]

setup_taxonomy(layouts_folder, themes_folder, names)

# Hugo Config File
There are a number of things that need to be set up in the Hugo configuration file. Most of this has to do with setting up the [taxonomies](https://gohugo.io/content-management/taxonomies/) that we'll be using, which is essentially the key aspect of processing DataCite repository records into their most useful form, providing several simple ways to browse through and find content of interest.

You can manage the config file however you want. I've provided one option here consisting of a site_config dictionary object that you can tweak here in the notebook and then dump to YAML.

In [None]:
# Set this parameter based on where you are deploying the site
# Note that testing locally with hugo server will "ignore" this parameter
# (other than the subdomain) and use localhost:1313 (or whatever port you specify)
base_url = 'https://datapurifier.github.io/landingpage-in-a-box/'

# Site configuration handed to write_config below. The 'taxonomies' mapping
# lists the fixed taxonomies plus a baseline set of contributor-type
# taxonomies; further contributor types are registered in the loop below.
site_config = {
    'baseURL': base_url,
    'languageCode': 'en-us',
    'title': 'Landingpage-in-a-Box',
    'theme': 'lpiab-theme',
    'taxonomies': {
        'category': 'categories',
        'tag': 'tags',
        'publishers': 'publishers',
        'author': 'author',
        'affiliations': 'affiliations',
        'funders': 'funders',
        'DataCollector': 'DataCollector',
        'Other': 'Other',
        'HostingInstitution': 'HostingInstitution',
        'Sponsor': 'Sponsor'
    },
    'params': {
        'profileMode': {
            'enabled': True,
            'title': 'Landingpage-in-a-Box',
            'subtitle': 'A lightweight approach to a landing page web site built from identifier registry metadata',
            # Home page browse buttons; contributor-type buttons are appended below.
            'buttons': [
                {
                    'name': 'Search',
                    'url': 'search'
                },
                {
                    'name': 'Tags',
                    'url': 'tags'
                },
                {
                    'name': 'Categories',
                    'url': 'categories'
                },
                {
                    'name': 'Affiliations',
                    'url': 'affiliations'
                },
                {
                    'name': 'Publishers',
                    'url': 'publishers'
                },
                {
                    'name': 'Funders',
                    'url': 'funders'
                }
            ]
        }
    },
    'ShowRssButtonInSectionTermList': True,
    'outputs': {
        'home': [
            'HTML',
            'RSS',
            'JSON'
        ]
    }
}

# Every contributor type that is present in contact_lookup gets a browse
# button. Register the matching taxonomy in the same step so a button can
# never point at a taxonomy missing from the config; setdefault leaves the
# entries already listed above untouched.
for url, name in contributorType.items():
    if url in contact_lookup:
        site_config['taxonomies'].setdefault(url, url)
        site_config['params']['profileMode']['buttons'].append({'name': name, 'url': url})

write_config(site_config)

# Process DataCite Records
This part of the code will continue to evolve as I work out details on what all should be placed into the markdown representation of DataCite documents and what all needs to be organized out into useful reference files (data objects in the Hugo site). Each web site document built from a DataCite document will have lists of labels in their metadata that populate the configured taxonomies. Some of these are simple name-only values that have no real further depth. Other labels are associated with additional information contained in DataCite metadata (e.g., ORCIDs associated with creators/authors). Some of these can be exploited through additional processing to pull in further information from other sources. We can also process things like lists of unqualified subjects to break them out into more specific taxonomies (e.g., place names).