In [1]:
import json, zipfile, os, sys, unicodedata, time, datetime, logging as lg
import requests
from abbrev_state import abbrev_state
from keys import api_key
from subprocess import call

In [2]:
# base url of the dataverse installation that receives the uploads
server = 'https://dataverse.harvard.edu'

# how many times to re-try same file upload after error before giving up
attempts_max = 3

# seconds to pause after an error
pause_error = 60

# seconds to pause between uploads
pause_normal = 10

# how long to set the timeout for upload post requests
upload_timeout = 300

In [3]:
# doi of a dataverse dataset to upload files into (shared by every manifest below)
my_doi = 'doi:10.7910/DVN/CUWWYJ'

# one entry per batch: (folder of zip files to upload, description template, dataverse tags)
# the {} in each description template is a placeholder that gets filled in per file
_manifest_specs = [
    ('data/usa-cities-graphml',
     "Zip file contains the GraphML files of {}'s cities/towns' street networks.",
     ['Data', 'GraphML', 'Cities/Towns']),
    ('data/usa-cities-shapefiles',
     "Zip file contains the shapefiles of {}'s cities/towns' street networks.",
     ['Data', 'Shapefiles', 'Cities/Towns']),
    ('data/usa-neighborhoods-graphml',
     "Zip file contains the GraphML files of {}'s neighborhoods' street networks.",
     ['Data', 'GraphML', 'Neighborhoods']),
    ('data/usa-neighborhoods-shapefiles',
     "Zip file contains the shapefiles of {}'s neighborhoods' street networks.",
     ['Data', 'Shapefiles', 'Neighborhoods']),
    ('data/usa-urbanized_areas-graphml',
     "Zip file contains the GraphML files of {}'s urbanized areas' street networks.",
     ['Data', 'GraphML', 'Urbanized Areas']),
    ('data/usa-urbanized_areas-shapefiles',
     "Zip file contains the shapefiles of {}'s urbanized areas' street networks.",
     ['Data', 'Shapefiles', 'Urbanized Areas']),
]

# expand every entry into a manifest dict: where to upload to, what to upload, how to label it
manifests = [{'doi': my_doi, 'folder': folder, 'file_desc': file_desc, 'file_tags': file_tags}
             for folder, file_desc, file_tags in _manifest_specs]

In [4]:
def log(message, level=lg.INFO, name='fp', filename='fp'):
    """
    Write a message to the log file and echo an ASCII-only copy to the console.

    Parameters
    ----------
    message : object
        what to log; it is handed to the logger as-is and converted with str() for the console
    level : int
        a logging level such as lg.DEBUG, lg.INFO, lg.WARNING, lg.ERROR or lg.CRITICAL
    name : str
        name of the logger, passed through to get_logger
    filename : str
        stem of the log file name, passed through to get_logger

    Returns
    -------
    None
    """
    # get the current logger (or create a new one, if none), then log message at requested level
    # Logger.log dispatches on the numeric level, so every level (CRITICAL included) reaches
    # the log file instead of only the four that used to be special-cased
    logger = get_logger(level=level, name=name, filename=filename)
    logger.log(level, message)

    # print to console: decompose accented characters and replace whatever still is not ascii
    # with "?", so the console encoding cannot make the print fail
    console_message = unicodedata.normalize('NFKD', str(message)).encode('ascii', errors='replace').decode()

    # write straight to the interpreter's original stdout rather than swapping sys.stdout back
    # and forth, so sys.stdout can never be left pointing at the wrong stream after an error
    print(console_message, file=sys.__stdout__)

In [5]:
def get_logger(level, name, filename, folder='logs'):
    """
    Return the logger called `name`, attaching a dated file handler the first time it is seen.

    Parameters
    ----------
    level : int
        logging level given to the logger when it is first set up; ignored on later calls
    name : str
        name of the logger to fetch from the logging module
    filename : str
        stem of the log file; the file is written to {folder}/{filename}_{YYYY_MM_DD}.log
    folder : str
        directory that holds the log files, created if it does not exist

    Returns
    -------
    logging.Logger
    """
    logger = lg.getLogger(name)

    # handler_set is a custom attribute this function puts on loggers it has configured, so a
    # logger that already carries it is returned untouched (no duplicate handlers)
    if getattr(logger, 'handler_set', None):
        return logger

    # get today's date and construct a log filename
    todays_date = datetime.datetime.today().strftime('%Y_%m_%d')
    log_filename = '{}/{}_{}.log'.format(folder, filename, todays_date)

    # create the logs folder if needed; exist_ok avoids the race between checking for the
    # folder and creating it
    os.makedirs(folder, exist_ok=True)

    # create file handler and log formatter and set them up
    handler = lg.FileHandler(log_filename, encoding='utf-8')
    handler.setFormatter(lg.Formatter('%(asctime)s %(levelname)s %(message)s'))
    logger.addHandler(handler)
    logger.setLevel(level)
    logger.handler_set = True

    return logger

In [6]:
def get_file_to_upload(file_path, archive_name, upload_filepath='data/temp_upload.zip'):
    """
    Zip a file, open the resulting archive, and return the open handle.

    This will double-zip the zip files because dataverse unzips zip files when they are
    uploaded; the result is that dataverse hosts the original zip file.

    Parameters
    ----------
    file_path : str
        path of the file to wrap in a new zip archive
    archive_name : str
        name the file gets inside the new archive
    upload_filepath : str
        where the new archive is written; any existing file at this path is overwritten

    Returns
    -------
    dict
        {'file': handle}, where handle is the new archive opened in 'rb' mode. The handle is
        left open on purpose and the caller is responsible for closing it.
    """
    # the context manager finalizes and closes the archive even when adding the file fails
    with zipfile.ZipFile(file=upload_filepath, mode='w') as zf:
        zf.write(file_path, arcname=archive_name)

    return {'file': open(upload_filepath, 'rb')}

In [7]:
def get_payload_to_upload(file_desc, file_tags, filename):
    """
    Configure the file description and tags that appear on dataverse.

    Parameters
    ----------
    file_desc : str
        description template with one {} placeholder that receives the full state name
    file_tags : list
        tags for the file; the full state name is appended to a copy, the input is not modified
    filename : str
        name of the file being uploaded; the piece between its first and second hyphen is
        looked up in abbrev_state (presumably a state abbreviation - confirm against the data)

    Returns
    -------
    dict
        {'jsonData': json_string}, where json_string encodes the 'description' and 'categories'
    """
    # convert the abbreviation embedded in the filename to the full state name
    full_state_name = abbrev_state[filename.split('-')[1]]

    # the state name goes into both the description and the tags
    metadata = {'description': file_desc.format(full_state_name),
                'categories': file_tags + [full_state_name]}
    return {'jsonData': json.dumps(metadata)}

In [8]:
def upload_file(folder, filename, doi, file_desc, file_tags, attempt_count=1):
    """
    Upload a file to a dataverse dataset, re-trying after errors.

    After a failed attempt the error is logged, the router is rebooted, the ip is refreshed,
    and the upload is tried again until attempts_max attempts have been made.

    Parameters
    ----------
    folder : str
        folder that contains the file
    filename : str
        name of the file inside folder; also the name it gets inside the uploaded archive
    doi : str
        persistent id of the dataset to upload into
    file_desc : str
        description template, passed to get_payload_to_upload
    file_tags : list
        tags, passed to get_payload_to_upload
    attempt_count : int
        number of the first attempt made by this call (callers normally leave it at 1)

    Returns
    -------
    the response of the last attempt, or None if that attempt failed before the server answered
    """
    file_path = '{}/{}'.format(folder, filename)

    # set up the api endpoint and make the payload
    # the api key is sent in a request header instead of the query string, so it cannot end up
    # in the log through exception messages that quote the request url
    endpoint = 'api/datasets/:persistentId/add?persistentId={}'.format(doi)
    url = '{}/{}'.format(server, endpoint)
    headers = {'X-Dataverse-key': api_key}
    payload = get_payload_to_upload(file_desc=file_desc, file_tags=file_tags, filename=filename)

    while True:
        response = None

        # build a fresh archive for every attempt; its handle is closed again below, before
        # the next attempt overwrites the same temp file
        file = get_file_to_upload(file_path=file_path, archive_name=filename)

        try:
            with file['file']:
                # upload the file to the server
                log('uploading "{}" to {}'.format(filename, doi))
                start_time = time.time()
                with requests.Session() as session:
                    response = session.post(url, data=payload, files=file, headers=headers,
                                            timeout=upload_timeout)
            log('response {} in {:,.1f} seconds'.format(response.status_code, time.time()-start_time))

            # check if the server response is ok, if not, throw exception
            response_json = response.json()
            if 'status' in response_json and not response_json['status'] == 'OK':
                # fall back to the status itself so a missing message cannot mask the error
                raise Exception(response_json.get('message', response_json['status']))

            time.sleep(pause_normal)
            return response

        except Exception as e:

            # if any exception is thrown, log it, then try to get the connection back
            log(e, level=lg.ERROR)
            reboot_router()
            time.sleep(pause_error)
            refresh_ip()
            time.sleep(pause_error / 3)

        # retry the upload if we haven't exceeded max number of tries
        if attempt_count >= attempts_max:
            log('no more attempts for this file, we give up', level=lg.WARNING)
            return response

        attempt_count += 1
        log('re-trying (attempt {} of {})'.format(attempt_count, attempts_max))

In [9]:
def get_previously_uploaded_filenames(dataset_doi):
    """
    Get all the filenames that currently exist in the draft version of a dataset.

    Parameters
    ----------
    dataset_doi : str
        persistent id of the dataset to list

    Returns
    -------
    list
        filenames reported by the server; empty if the response carries no file entries
    """
    # the api key is sent in a request header instead of the query string, so it does not show
    # up in tracebacks or error messages that quote the request url
    endpoint = 'api/datasets/:persistentId/versions/:draft/files?persistentId={}'.format(dataset_doi)
    url = '{}/{}'.format(server, endpoint)

    # without a timeout a stalled connection would block the script forever; the upload
    # timeout is reused here as the upper bound
    response = requests.get(url, headers={'X-Dataverse-key': api_key}, timeout=upload_timeout)
    response_json = response.json()
    if not isinstance(response_json, dict):
        return []

    # an error answer is still reported as "nothing uploaded yet", but it is logged so that a
    # bad key or doi is not silently mistaken for an empty dataset
    if response_json.get('status', 'OK') != 'OK':
        log('could not list the files already in {}: {}'.format(dataset_doi, response_json.get('message')),
            level=lg.WARNING)

    uploaded_files = response_json.get('data') or []
    return [uploaded_file['dataFile']['filename'] for uploaded_file in uploaded_files]

In [10]:
def refresh_ip():
    """
    Run the ipconfig release, flushdns and renew commands in that order through the shell,
    pausing one second after each command. The commands' return codes are ignored.
    """
    ipconfig_steps = ('ipconfig /release', 'ipconfig /flushdns', 'ipconfig /renew')

    for step in ipconfig_steps:
        call(step, shell=True)
        time.sleep(1)

In [11]:
def reboot_router(router_address='192.168.1.1'):
    """
    Log in to the router over telnet and send it the reboot command.

    Parameters
    ----------
    router_address : str
        host name or ip address of the router's telnet service (port 23)

    Returns
    -------
    bytes
        everything the router sent after the commands were issued, until it closed the connection
    """
    # imported here so the notebook only needs telnetlib and the router credentials when a
    # reboot is actually attempted
    # NOTE(review): telnetlib was deprecated in Python 3.11 and removed in 3.13
    import telnetlib
    from keys import router_username, router_password

    log('rebooting router...')
    tn = telnetlib.Telnet(host=router_address, port=23, timeout=5)
    try:
        # the credentials are concatenated with bytes, so keys must provide them as bytes
        tn.read_until(b'login: ')
        tn.write(router_username + b'\r\n')
        tn.read_until(b'Password: ')
        tn.write(router_password + b'\r\n')
        tn.write(b'reboot\r\n')
        tn.write(b'exit\r\n')
        return tn.read_all()
    finally:
        # always release the connection, even when the login dialogue fails part-way
        tn.close()

## Run the script

In [12]:
# record the start of the run in the log file and on the console
log('script started')

In [13]:
# work through the manifests one by one, uploading whatever the dataset does not hold yet
for manifest in manifests:

    log('manifest={}'.format(manifest))

    # unpack the settings of this manifest
    folder = manifest['folder']
    file_desc = manifest['file_desc']
    file_tags = manifest['file_tags']
    doi = manifest['doi']

    # ask the server once per manifest which filenames the dataset already contains
    already_uploaded = get_previously_uploaded_filenames(doi)

    # go through every entry of the folder
    for filename in os.listdir(folder):

        if filename in already_uploaded:
            log('skipping "{}" because it is already on the server'.format(filename))
            continue

        # it's not already on the server, so upload it
        response = upload_file(folder, filename, doi, file_desc, file_tags)

In [14]:
# record the end of the run in the log file and on the console
log('script finished')