# Upload to Dataverse


This notebook uploads files to a Dataverse dataset using the Dataverse native and SWORD APIs. Log into https://dataverse.harvard.edu to create an API key.

  - Create a draft dataset revision on the dataverse (edit > edit metadata > change something > save)
  - Run this notebook
  - Visit dataverse and publish the revised draft
  
The SWORD API is needed to delete files because file deletion hasn't been implemented in the native API yet. The native API handles all the file uploading and metadata, for which the SWORD API offers only limited support.

  - sword api: http://guides.dataverse.org/en/4.8.6/api/sword.html#delete-a-file-by-database-id
  - native api: http://guides.dataverse.org/en/4.8.6/api/native-api.html

In [1]:
import config
import json
import keys
import logging as lg
import os
import osmnx as ox
import requests
import time
import xmltodict
import zipfile

# turn on osmnx's console and file logging; the ox.log() calls below go through this configuration
ox.config(log_console=True, log_file=True, log_filename='upload-dataverse')

In [2]:
# set to True only on the first run, to clear every carried-over file out of the draft before uploading
delete_existing_files = False
# when True, upload_new_file logs each upload but skips the actual POST to the server
debug_mode = False

In [3]:
# configure the dataverse upload
server = 'https://dataverse.harvard.edu' #base url that the native api endpoints are appended to
attempts_max = 3     #total attempts per file (the first try included) before giving up on it
pause_error = 10     #seconds to pause after an error
pause_normal = 2    #seconds to pause after each successful upload
upload_timeout = 300 #timeout, in seconds, passed to each upload post request

In [4]:
# load the per-state records (keyed by fips code) from disk
with open('input_data/states_by_fips.json') as fips_file:
    fips_to_state = json.load(fips_file)

# build a lookup from each state's abbreviation to its full name
abbrev_state = {state['abbreviation']: state['name'] for state in fips_to_state.values()}

In [5]:
# define what to upload: one manifest per (place type, file format) pair
# each manifest holds:
#   'folder'    - folder of zip files to upload ('{}' is filled with the staging folder later)
#   'file_desc' - description template ('{}' is filled with the state name later)
#   'file_tags' - tags shown on dataverse

# (folder prefix, wording used in the description, tag)
_place_types = [('counties', 'counties', 'Counties'),
                ('cities', 'cities/towns', 'Cities/Towns'),
                ('neighborhoods', 'neighborhoods', 'Neighborhoods'),
                ('urbanized_areas', 'urbanized areas', 'Urbanized Areas'),
                ('tracts', 'census tracts', 'Census Tracts')]

# (folder suffix, wording used in the description, tag)
_file_formats = [('graphml', 'GraphML files', 'GraphML'),
                 ('shapefiles', 'shapefiles', 'Shapefiles'),
                 ('node_edge_lists', 'node/edge lists', 'Node/Edge Lists')]

# doubled braces survive .format() as a literal '{}' placeholder for later use
manifests = [{
              'folder':'{{}}/{}-street_networks-{}'.format(place_folder, format_folder),
              'file_desc':'Zip file contains the {} of {{}}\'s {}\' street networks.'.format(format_desc, place_desc),
              'file_tags':['Data', format_tag, place_tag, 'Street Network']
             }
             for place_folder, place_desc, place_tag in _place_types
             for format_folder, format_desc, format_tag in _file_formats]

## Helper functions

In [6]:
def get_file_to_upload(file_path, archive_name, upload_file='temp_upload.zip'):
    """
    Zip a staged zip file into a temporary archive, open it, and return the buffer.

    This double-zips the staged zip file because dataverse unzips zip files
    when they are uploaded; the result is that dataverse hosts the original
    zipped file.

    file_path is a template whose '{}' is filled with config.staging_folder.
    archive_name is the name the staged file gets inside the temporary archive.
    upload_file is the temporary archive's filename inside config.staging_folder.

    Returns a dict {'file': <file object opened in 'rb' mode>}. The caller owns
    the open file object and is responsible for closing it.
    """

    upload_filepath = '{}/{}'.format(config.staging_folder, upload_file)
    file_path = file_path.format(config.staging_folder)

    # the context manager closes the archive even if writing the member fails
    with zipfile.ZipFile(file=upload_filepath, mode='w') as zf:
        zf.write(file_path, arcname=archive_name)

    return {'file':open(upload_filepath, mode='rb')}

In [7]:
def get_payload_to_upload(file_desc, file_tags, filename):
    """
    Build the request payload holding the file description and tags that appear on dataverse.

    Characters 3-4 of filename are looked up in abbrev_state to get the full
    state name, which is substituted into the file_desc template and appended
    to a copy of file_tags. Returns {'jsonData': <json string>} whose json has
    the keys 'description' and 'categories'.
    """

    # the state abbreviation sits at a fixed position in the filename
    full_state_name = abbrev_state[filename[3:5]]

    metadata = {
        'description': file_desc.format(full_state_name),
        'categories': file_tags + [full_state_name],
    }
    return {'jsonData': json.dumps(metadata)}

In [8]:
def upload_new_file(folder, filename, doi, file_desc, file_tags, attempt_count=1):
    """
    Upload a new file to a dataverse dataset, re-trying after errors.

    folder is the manifest folder template and filename the file inside it;
    doi identifies the dataset; file_desc and file_tags are the manifest's
    description template and tag list. attempt_count is the number of the
    current attempt and is only meant to be passed by the function itself
    when it re-tries.

    If anything raises during the upload, or the server's json has a 'status'
    other than 'OK', the error is logged, the function pauses pause_error
    seconds, and the upload is re-tried until attempts_max attempts have been
    made. In debug_mode nothing is sent to the server.

    Returns the last response received from the server, or None if no response
    was ever received (debug_mode, or every attempt raised before responding).
    """

    file_path = '{}/{}'.format(folder, filename)
    response = None
    failed = False

    # set up the api endpoint and make the payload
    endpoint = 'api/v1/datasets/:persistentId/add?persistentId={}&key={}'.format(doi, keys.api_key)
    url = '{}/{}'.format(server, endpoint)
    payload = get_payload_to_upload(file_desc=file_desc, file_tags=file_tags, filename=filename)

    # open the file last, so nothing above can raise while its handle is open
    file = get_file_to_upload(file_path=file_path, archive_name=filename)

    try:
        # upload the file to the server
        ox.log('uploading "{}" to {}'.format(filename, doi))

        if not debug_mode:
            start_time = time.time()

            # the context manager closes the session whether or not the post succeeds
            with requests.Session() as session:
                response = session.post(url, data=payload, files=file, timeout=upload_timeout)
            ox.log('response {} in {:,.1f} seconds'.format(response.status_code, time.time()-start_time))

            # check if the server response is ok, if not, throw exception
            response_json = response.json()
            if 'status' in response_json and not response_json['status'] == 'OK':
                # fall back to the raw body so a missing 'message' cannot mask the real error
                raise Exception(response_json.get('message', response.text))

            time.sleep(pause_normal)

    except Exception as e:

        # if any exception is thrown, log it; the retry happens below, after cleanup
        ox.log(e, level=lg.ERROR)
        failed = True

    finally:

        # always release the file handle, and do it before any retry rewrites the same temporary zip
        file['file'].close()

    if failed:

        time.sleep(pause_error)

        # retry the upload if we haven't exceeded max number of tries
        if attempt_count < attempts_max:
            attempt_count += 1
            ox.log('re-trying (attempt {} of {})'.format(attempt_count, attempts_max))
            response = upload_new_file(folder, filename, doi, file_desc, file_tags, attempt_count=attempt_count)
        else:
            ox.log('no more attempts for this file, we give up', level=lg.WARN)

    return response

In [9]:
def get_uploaded_draft_filenames(dataset_doi):
    """
    Get all the filenames that currently exist in the DRAFT dataset.

    Returns a list of filenames, or an empty list when the server's json has
    no 'data' key or an empty one.
    """

    url_template = '{}/{}'.format(server, 'api/v1/datasets/:persistentId/versions/:draft/files?key={}&persistentId={}')
    listing = requests.get(url_template.format(keys.api_key, dataset_doi)).json()

    # nothing listed means nothing has been uploaded to the draft
    if 'data' not in listing or len(listing['data']) == 0:
        return []

    return [entry['dataFile']['filename'] for entry in listing['data']]

In [10]:
def get_published_files(dataset_doi):
    """
    Get all the files that currently exist in the latest published dataset.

    Returns a dict mapping each filename to its file id, or an empty dict when
    the server's json has no 'data' key or an empty one.
    """

    url_template = '{}/{}'.format(server, 'api/v1/datasets/:persistentId/versions/:latest-published/files?key={}&persistentId={}')
    listing = requests.get(url_template.format(keys.api_key, dataset_doi)).json()

    # nothing listed means nothing has been published
    if 'data' not in listing or len(listing['data']) == 0:
        return {}

    return {entry['dataFile']['filename']: entry['dataFile']['id'] for entry in listing['data']}

In [11]:
def delete_dataset_files(doi=keys.my_doi):
    """
    Delete all files from draft dataset at the given DOI.

    The sword api statement for the dataset is requested to list its files,
    then each file is deleted through the sword edit-media endpoint using the
    id taken from the entry's 'id' url.

    Raises RuntimeError if the statement request does not return status 200 or
    if any delete request does not return status 204.
    """

    host = 'dataverse.harvard.edu'
    auth = (keys.api_key, None)

    url_statement = 'https://{}/dvn/api/data-deposit/v1.1/swordv2/statement/study/{}'.format(host, doi)
    response = requests.get(url_statement, auth=auth)
    if response.status_code != 200:
        raise RuntimeError('statement request for {} returned status {}'.format(doi, response.status_code))

    # xmltodict yields no 'entry' key when there are no entries and a single
    # dict (not a list) when there is exactly one, so normalize to a list
    feed = xmltodict.parse(response.text)['feed'] or {}
    files = feed.get('entry', [])
    if isinstance(files, dict):
        files = [files]

    ox.log('There are {} files to delete'.format(len(files)))
    all_start_time = time.time()

    for file in files:

        # the entry id is a url ending in .../<file id>/<file name>
        id_parts = file['id'].split('/')
        file_name = id_parts[-1]
        file_id = id_parts[-2]
        url_delete = 'https://{}/dvn/api/data-deposit/v1.1/swordv2/edit-media/file/{}'.format(host, file_id)

        start_time = time.time()
        response = requests.delete(url_delete, auth=auth)

        # only report the file as deleted once the server has confirmed it
        if response.status_code != 204:
            raise RuntimeError('deleting "{}" returned status {}'.format(file_name, response.status_code))
        ox.log('{} Deleted "{}" in {:.1f} seconds'.format(response, file_name, time.time()-start_time))

    # reaching this point means every delete succeeded
    ox.log('Deleted {} files in {} seconds'.format(len(files), int(time.time()-all_start_time)))

## Run the script

In [12]:
# remember when the run began so the last cell can log the total elapsed time
start_time = time.time()
ox.log('script started')

In [13]:
# what files have already been uploaded to the draft?
already_uploaded = get_uploaded_draft_filenames(keys.my_doi)

# what files exist in the published version of the dataset?
published_files = get_published_files(keys.my_doi)

print(len(published_files))
print(len(already_uploaded))

290
765


In [14]:
# only runs when the delete_existing_files flag has been switched on
if delete_existing_files:
    
    # delete all the existing (carried-over) files in the draft dataset
    delete_dataset_files()
    
    # re-query the draft after deleting: what files have already been uploaded to it?
    already_uploaded = get_uploaded_draft_filenames(keys.my_doi)

    # re-query the published version: what files exist in it?
    published_files = get_published_files(keys.my_doi)

    # show the number of published files, then the number of files in the draft
    print(len(published_files))
    print(len(already_uploaded))

In [15]:
# work through every manifest, uploading whatever is staged in its folder
for manifest in manifests:

    ox.log('manifest={}'.format(manifest))

    # pull the folder template, description template, and tags out of this manifest
    folder, file_desc, file_tags = (manifest[key] for key in ('folder', 'file_desc', 'file_tags'))

    # list the files staged in this manifest's folder
    filenames = os.listdir(folder.format(config.staging_folder))
    for filename in filenames:

        # anything already in the draft revision on the server is left alone
        if filename in already_uploaded:
            ox.log('skipping "{}" because it is already on the server'.format(filename))
            continue

        response = upload_new_file(folder, filename, keys.my_doi, file_desc, file_tags)

In [16]:
# log the whole run's elapsed time, measured from the start_time set before the uploads began
ox.log('script finished in {} seconds'.format(int(time.time()-start_time)))