# Publish SAEF Inventory

## About
Script to publish all SAEF datasets in an inventory whose drafts were created on the Harvard Dataverse Repository. This script can be adapted to support publishing of future SAEF datasets.

- **Created:** 2023/02/13
- **Last update:** 2023/02/17

## Globals
Define global variables.

In [None]:
# directory added to sys.path so that local modules can be imported
g_saef_module_path = '../src'
# dataset inventory (CSV read with pandas; create_batches reads its 'dataset_doi' column)
g_saef_dataset_inventory = '../inventory/saef_hdv_dataset_inventory.csv'
# batch file (CSV written by create_batches; one row of ';'-joined DOIs per batch)
g_saef_batch_file = './log/saef_hdv_batches.csv'
# batch log file (CSV; publish_datasets appends one row per publish attempt)
g_saef_batch_log = './log/saef_hdv_batch_log.csv'
# batch log headers (same order as the message list passed to write_log)
g_saef_batch_log_headers = ['date', 'dataset_doi', 'operation', 'status']
# batch size (number of datasets per batch)
g_saef_batch_size = 10
# installation url
g_dataverse_installation_url = 'https://dataverse.harvard.edu'
# collection identifier
# NOTE(review): the value is 'SAEF', not a full URL - presumably the collection alias; confirm
g_dataverse_collection_url = 'SAEF'
# dataverse.harvard.edu API key (empty placeholder; passed to NativeApi when the adapter is created)
g_dataverse_api_key = ''

In [None]:
# make the local modules importable from this notebook
import sys

# register the module directory only once, so re-running the cell is harmless
if not any(entry == g_saef_module_path for entry in sys.path):
    sys.path.append(g_saef_module_path)

## Modules

In [None]:
import os
import numpy as np
import pandas as pd

## Functions
Local functions to support creating and publishing batches. Eventually, these functions will be moved into a module for broader use.

In [None]:
def initialize_log(filename, cols):
    """
    Initialize the log file.

    If ``filename`` already exists, only its header row is read and compared
    with ``cols``; the file is not modified. If it does not exist, a CSV file
    containing just the header row is created at ``filename``.

    Parameters
    ----------
    filename : str
        Full path to log file
    cols : list
        List of string column headers

    Return
    ------
    bool
        True if the log file exists with exactly the expected headers (same
        names, same order) or was created. False if a parameter is missing,
        the existing file cannot be parsed, or its headers differ from ``cols``.
    """
    if (not filename) or (not cols):
        return False

    expected = list(cols)

    # does file exist?
    if os.path.isfile(filename):
        try:
            # nrows=0 parses the header row only; the log rows are not needed here
            found = list(pd.read_csv(filename, nrows=0).columns)
        except (pd.errors.EmptyDataError, pd.errors.ParserError) as err:
            print('initialize_log::Error - unable to read log file: {}'.format(err))
            return False

        # compare name by name: concatenating the names would make
        # ['ab', 'c'] and ['a', 'bc'] look identical
        if found == expected:
            return True

        print('initialize_log::Error - column name mismatch: {} vs. {}'.format(found, expected))
        return False

    # create an empty, header-only log at the requested path
    pd.DataFrame(columns=expected).to_csv(filename, index=False)
    return True

In [None]:
def write_log(filename, msg):
    """
    Write a status message to the log file.

    The existing log is read, the message is added as the last row, and the
    whole file is written back without an index column.

    Parameters
    ----------
    filename : str
        Full path of logfile
    msg : list
        Well-formatted message to log, one value per log column.
        Format: [date, dataset_doi, operation, status]

    Return
    ------
    bool
        True if the row was written. False if a parameter is missing, the log
        file cannot be read, or the message length does not match the number
        of log columns.
    """
    if (not filename) or (not msg):
        return False

    try:
        df = pd.read_csv(filename, header=0)
    except (FileNotFoundError, pd.errors.EmptyDataError) as err:
        # the log must have been created (with headers) before it can be written to
        print('write_log:: Error - unable to read log file: {}'.format(err))
        return False

    row = list(msg)
    if len(row) != len(df.columns):
        print('write_log:: Error - mismatch row lengths')
        return False

    # use every element of the message so the function works for any header width
    df.loc[len(df.index)] = row
    df.to_csv(filename, index=False)
    return True

In [None]:
def create_batches(filename, batch_size, inventory_df):
    """
    Given an inventory of datasets and a batch size, create a file containing batches.

    The DOIs in the ``dataset_doi`` column are grouped, in inventory order,
    into consecutive batches of at most ``batch_size`` DOIs; only the last
    batch may be smaller. Each batch is written as one row of a CSV file, in a
    ``dataset_dois`` column holding the DOIs joined with ';'. The number of
    rows written is therefore ceil(number of DOIs / batch_size).

    Parameters
    ----------
    filename : str
        Batch file
    batch_size : int
        Number of datasets per batch (must be >= 1)
    inventory_df : DataFrame
        DataFrame containing inventory of datasets

    Return
    ------
    bool
        True if the batch file was written. False if a parameter is missing
        or invalid, the inventory has no ``dataset_doi`` column, or it
        contains no DOIs.
    """
    if ((not filename) or
        (not batch_size) or
        (batch_size < 1) or
        (inventory_df is None) or
        (inventory_df.empty)):
        return False

    if 'dataset_doi' not in inventory_df.columns:
        print('create_batches::Error - inventory has no dataset_doi column')
        return False

    # skip missing values and force text so that ';'.join cannot fail
    dois = inventory_df['dataset_doi'].dropna().astype(str).tolist()
    if not dois:
        return False

    # slice by size: np.array_split(x, n) would produce n batches,
    # not batches of n datasets
    batches = [dois[start:start + batch_size]
               for start in range(0, len(dois), batch_size)]

    # one row per batch
    df = pd.DataFrame({'dataset_dois': [';'.join(batch) for batch in batches]})
    # write the batch file (the row index is kept as the first column)
    df.to_csv(filename)

    return True

In [None]:
def publish_dataset(api, dataset_pid, version='major', timeout=60):
    """
    Publish a dataset associated with a collection

    Sends a POST request to the installation's
    ``/api/datasets/:persistentId/actions/:publish`` endpoint, authenticated
    with the API token held by ``api``.

    Parameters
    ----------
    api : pyDataverse api
        Adapter providing the ``base_url`` and ``api_token`` attributes
    dataset_pid : str
        DOI of dataset to publish
    version : str (default: major)
        Either 'major' or 'minor' version
    timeout : float (default: 60)
        Seconds to wait for the server before giving up

    Return
    ------
    bool
        True if the server answered with a 2xx status code. False if the
        parameters are invalid, the request could not be completed, or the
        server answered with any other status code.
    """
    # validate parameters
    if ((not api) or
        (not dataset_pid) or
        (version not in ('major', 'minor'))):
        print('publish_dataset::Error - validation of parameters failed')
        return False

    # imported after validation so that invalid calls never need the library
    import requests

    # create the headers
    headers = {'X-Dataverse-key': api.api_token, 'Content-Type': 'application/json'}
    # create the request url (tolerate a trailing slash on the base url)
    request_url = '{}/api/datasets/:persistentId/actions/:publish'.format(
        str(api.base_url).rstrip('/'))
    # let requests encode the query string so unusual characters in a DOI are safe
    params = {'persistentId': dataset_pid, 'type': version}

    try:
        response = requests.post(request_url, headers=headers, params=params,
                                 timeout=timeout)
    except requests.exceptions.RequestException as err:
        # connection errors and timeouts are reported like any other failure
        print('publish_dataset::Error - failed to publish dataset: {}:{}'.format(err, dataset_pid))
        return False

    # handle responses
    status = response.status_code
    if 200 <= status < 300:
        print('publish_dataset::Success - published dataset: {}:{}'.format(status, dataset_pid))
        return True

    print('publish_dataset::Error - failed to publish dataset: {}:{}'.format(status, dataset_pid))
    return False

In [None]:
def publish_datasets(api, dataset_dois, logfile):
    """
    Publish each dataset in a list. Logs each result to the log file.

    Every DOI is attempted even if an earlier one fails. For each attempt a
    row [date, doi, 'publish_dataset', status] is passed to write_log.

    Parameter
    ---------
    api : pyDataverse api
    dataset_dois : list
        List of dataset dois (a single DOI string is treated as a one-item list)
    logfile : str
        Filename to write events

    Return
    ------
    bool
        True only if every dataset was published. False if a parameter is
        missing or at least one dataset failed to publish.
    """
    from datetime import datetime

    if ((not api) or
        (not dataset_dois)):
        return False

    # a bare string would otherwise be iterated character by character
    if isinstance(dataset_dois, str):
        dataset_dois = [dataset_dois]

    all_published = True
    # publish each dataset in the list
    for doi in dataset_dois:
        # publish the dataset
        status = publish_dataset(api, doi, version='major')
        print('publish_datasets: {}:{}'.format(doi, status))
        # log the event
        msg = [datetime.now(), doi, 'publish_dataset', status]
        if not write_log(logfile, msg):
            # a logging problem must not stop the batch, but should be visible
            print('publish_datasets::Error - failed to log event: {}'.format(doi))
        all_published = all_published and bool(status)

    return all_published

### Create the SAEF dataset batches and batch file


In [None]:
# read the dataset inventory
dataset_inventory_df = pd.read_csv(g_saef_dataset_inventory,header=0)
# print documentation
print('create_batches {}'.format(create_batches.__doc__))
# create the batch file
print('Create SAEF dataset batches: {}'.format(create_batches(g_saef_batch_file, 
                                                                g_saef_batch_size, 
                                                                dataset_inventory_df)))

### Initialize the log file
Create the log file if it does not exist; otherwise, check that its column headers match the expected ones.

In [None]:
# print documentation
print('create_batches {}'.format(initialize_log.__doc__))
# initialize the log
print('Initialize batch log: {} - {}'.format(g_saef_batch_log, 
                                            initialize_log(g_saef_batch_log, g_saef_batch_log_headers)))

### Initialize pyDataverse API
- **Description:** Initialize the `pyDataverse` API adapter

In [None]:
# import pyDataverse packages
from pyDataverse.api import NativeApi

# create pyDataverse API adapter
api = NativeApi(g_dataverse_installation_url, g_dataverse_api_key)

print('{}'.format(api))

### Publish each batch
First, read the batch file into a DataFrame.

In [None]:
# read batch file
batches_df = pd.read_csv(g_saef_batch_file)
if (batches_df.empty == False):
    print('Successfully read: {} - {} rows'.format(g_saef_batch_file, len(batches_df)))

#### Batch 00: Publish datasets

In [None]:
# get the batches at the proper index
batch = batches_df.at[0,'dataset_dois']
# get the list of dois
dois = batch.split(';')
# publish the datasets in the list
status = publish_datasets(api, dois, g_saef_batch_log)
print('Publish batch: {}'.format(status))

### Batch 01: Publish datasets

In [None]:
# get the batches at the proper index
batch = batches_df.at[1,'dataset_dois']
# get the list of dois
dois = batch.split(';')
# publish the datasets in the list
status = publish_datasets(api, dois, g_saef_batch_log)
print('Publish batch: {}'.format(status))

### Batch 02: Publish datasets

In [None]:
# get the batches at the proper index
batch = batches_df.at[2,'dataset_dois']
# get the list of dois
dois = batch.split(';')
# publish the datasets in the list
status = publish_datasets(api, dois, g_saef_batch_log)
print('Publish batch: {}'.format(status))

**End document.**

### Batch 03: Publish datasets

In [None]:
# get the batches at the proper index
batch = batches_df.at[3,'dataset_dois']
# get the list of dois
dois = batch.split(';')
# publish the datasets in the list
status = publish_datasets(api, dois, g_saef_batch_log)
print('Publish batch: {}'.format(status))

### Batch 04: Publish datasets

In [None]:
# get the batches at the proper index
batch = batches_df.at[4,'dataset_dois']
# get the list of dois
dois = batch.split(';')
# publish the datasets in the list
status = publish_datasets(api, dois, g_saef_batch_log)
print('Publish batch: {}'.format(status))

### Batch 05: Publish datasets

In [None]:
# get the batches at the proper index
batch = batches_df.at[5,'dataset_dois']
# get the list of dois
dois = batch.split(';')
# publish the datasets in the list
status = publish_datasets(api, dois, g_saef_batch_log)
print('Publish batch: {}'.format(status))

### Batch 06: Publish datasets

In [None]:
# get the batches at the proper index
batch = batches_df.at[6,'dataset_dois']
# get the list of dois
dois = batch.split(';')
# publish the datasets in the list
status = publish_datasets(api, dois, g_saef_batch_log)
print('Publish batch: {}'.format(status))

### Batch 07: Publish datasets

In [None]:
# get the batches at the proper index
batch = batches_df.at[7,'dataset_dois']
# get the list of dois
dois = batch.split(';')
# publish the datasets in the list
status = publish_datasets(api, dois, g_saef_batch_log)
print('Publish batch: {}'.format(status))

### Batch 08: Publish datasets

In [None]:
# get the batches at the proper index
batch = batches_df.at[8,'dataset_dois']
# get the list of dois
dois = batch.split(';')
# publish the datasets in the list
status = publish_datasets(api, dois, g_saef_batch_log)
print('Publish batch: {}'.format(status))

### Batch 09: Publish datasets

In [None]:
# get the batches at the proper index
batch = batches_df.at[9,'dataset_dois']
# get the list of dois
dois = batch.split(';')
# publish the datasets in the list
status = publish_datasets(api, dois, g_saef_batch_log)
print('Publish batch: {}'.format(status))