## CKAN helper functions

In [1]:
import requests, json, urllib
from secret import CKAN

# Connection settings of the "dpaw-internal" CKAN instance, read from the
# local `secret` module. The helpers below append "api/3/action/..." directly
# to `url`, so it presumably ends with a trailing slash -- TODO confirm.
url, key = (CKAN["dpaw-internal"][field] for field in ("url", "key"))

# Dictionaries from CKAN ---------------------------------------------------------------------#
def get_all_dataset_dicts(url, limit=None, offset=None):
    """Returns a list of package dicts including resources

    @see http://docs.ckan.org/en/latest/api/index.html#ckan.logic.action.get.current_package_list_with_resources
    :param url: A live CKAN URL; "api/3/action/..." is appended directly
    :param limit: The max number of dataset dicts returned, optional
    :param offset: The offset, optional
    :return: The "result" member of the CKAN JSON response
    """
    endpoint = "{0}api/3/action/current_package_list_with_resources".format(url)

    # Only send the paging parameters that were actually given.
    parameters = {}
    if limit:
        parameters["limit"] = limit
    if offset:
        parameters["offset"] = offset

    # A GET request carries its arguments in the query string, so hand them to
    # requests as `params` instead of sending them as a request body.
    r = requests.get(endpoint, params=parameters)
    return json.loads(r.content)["result"]


def get_dataset_dicts_for_group(group_name, url):
    """Returns a list of package dicts including resources for a group

    :param group_name: the group name as string
    :param url: A live CKAN URL; "api/3/action/..." is appended directly
    :return: The "packages" member of the group_show result
    """
    # Build the request from the `url` parameter (the previous code referred
    # to a name `api_url` that is not defined inside this function).
    endpoint = "{0}api/3/action/group_show?id={1}".format(url, group_name)
    r = requests.get(endpoint)
    return json.loads(r.content)["result"]["packages"]

def get_dataset_dicts_for_tag(tag_name, url):
    """Fetch the packages listed under a tag via CKAN's tag_show action.

    :param tag_name: the tag name as string
    :param url: A live CKAN URL; "api/3/action/..." is appended directly
    :return: The "packages" member of the tag_show result
    """
    endpoint = "{0}api/3/action/tag_show?id={1}".format(url, tag_name)
    response = requests.get(endpoint)
    tag = json.loads(response.content)["result"]
    return tag["packages"]


def get_tag_dict_from_tag_name(tag_name, url):
    """Fetch a tag via CKAN's tag_show action and strip its package list.

    :param tag_name: the tag name as string
    :param url: A live CKAN URL; "api/3/action/..." is appended directly
    :return: The tag_show result dict without its "packages" member
    """
    endpoint = "{0}api/3/action/tag_show?id={1}".format(url, tag_name)
    response = requests.get(endpoint)
    result = json.loads(response.content)["result"]
    # Drop the list of packages if the response contains one.
    if "packages" in result:
        del result["packages"]
    return result


# Manipulate one dataset dict ----------------------------------------------------------------#
def add_last_updated_fields(dataset_dict, url, api_key):
    """Updates a dataset dictionary and posts back to CKAN

    Here: add "extra" fields
    "Last updated on" (set to the dataset's "metadata_modified") and
    "Last updated by" (set to the dataset's "author")
    only if "Last updated by" does not exist.

    The dataset dict is modified in place. The HTTP response of the update
    request is not inspected.

    :param dataset_dict: The dataset as dict from CKAN
    :param url: A live CKAN URL; "api/3/action/..." is appended directly
    :param api_key: A privileged CKAN API key
    :return: True if dataset was updated, False if not
    """
    update_url = "{0}api/3/action/package_update".format(url)
    headers = {
        'Authorization': api_key,
        'content-type': 'application/x-www-form-urlencoded',
    }
    lub = "Last updated by"
    luo = "Last updated on"

    # A dataset without an "extras" list simply has no extra fields yet.
    existing_keys = [x["key"] for x in dataset_dict.get("extras", [])]
    if lub in existing_keys:
        print("Unchanged: {0}".format(dataset_dict["name"]))
        return False

    dataset_dict.setdefault("extras", []).extend([
        {u"key": lub, u"value": dataset_dict["author"]},
        {u"key": luo, u"value": dataset_dict["metadata_modified"]},
    ])
    # The whole dataset is sent as URL-quoted JSON, like the other updaters
    # in this file.
    datadict = urllib.quote(json.dumps(dataset_dict))
    requests.post(update_url, data=datadict, headers=headers)
    print("Updated: {0}".format(dataset_dict["name"]))
    return True

    
def rename_extra_field(dataset_dict, old_name, new_name, url, api_key):
    """Rename every extra field keyed old_name to new_name and post to CKAN.

    The dataset dict is modified in place; nothing is posted when no extra
    field carries the old name.

    Usage:
    [rename_extra_field(d, "old name", "new name", url, key) for d in datasets]

    :param dataset_dict: The dataset as dict from CKAN
    :param old_name: the extra field you wish to change
    :param new_name: the new name for the changed extra field
    :param url: A live CKAN URL; "api/3/action/..." is appended directly
    :param api_key: A privileged CKAN API key
    :return: True if dataset was updated, False if not
    """
    matching = [extra for extra in dataset_dict["extras"]
                if extra["key"] == old_name]
    if not matching:
        print("Unchanged: {0}".format(dataset_dict["name"]))
        return False

    for extra in matching:
        extra["key"] = new_name

    # Post the complete, modified dataset back as URL-quoted JSON.
    payload = urllib.quote(json.dumps(dataset_dict))
    requests.post(
        "{0}api/3/action/package_update".format(url),
        data=payload,
        headers={'Authorization': api_key,
                 'content-type': 'application/x-www-form-urlencoded'})
    print("Updated: {0}".format(dataset_dict["name"]))
    return True

def migrate_extra_field(dataset_dict, old_name, new_name, url, api_key):
    """Copy the value of the extra field "old_name" to the top-level dataset
    field "new_name" and post the dataset to CKAN.

    The dataset dict is modified in place and the extra field itself is kept.
    Nothing is posted when no extra field carries the old name.

    Usage:
    [migrate_extra_field(d, "old name", "new_name", url, key) for d in datasets]

    :param dataset_dict: The dataset as dict from CKAN
    :param old_name: the key of the extra field to copy from
    :param new_name: the dataset field to copy the value to
    :param url: A live CKAN URL; "api/3/action/..." is appended directly
    :param api_key: A privileged CKAN API key
    :return: True if dataset was updated, False if not
    """
    matching = [extra for extra in dataset_dict["extras"]
                if extra["key"] == old_name]
    if not matching:
        print("Unchanged: {0}".format(dataset_dict["name"]))
        return False

    # With several matching extras the last one wins.
    for extra in matching:
        dataset_dict[new_name] = extra["value"]

    payload = urllib.quote(json.dumps(dataset_dict))
    requests.post(
        "{0}api/3/action/package_update".format(url),
        data=payload,
        headers={'Authorization': api_key,
                 'content-type': 'application/x-www-form-urlencoded'})
    print("Updated: {0}".format(dataset_dict["name"]))
    return True
        
    
def add_tag_to_dataset(dataset_dict, tag_dict, url, api_key):
    """Append a tag to a dataset and post it to CKAN unless already tagged.

    A dataset counts as tagged when one of its tags has the same "name" as
    tag_dict. The dataset dict is modified in place.

    :param dataset_dict: The dataset as dict from CKAN
    :param tag_dict: The tag as dict from CKAN
    :param url: A live CKAN URL; "api/3/action/..." is appended directly
    :param api_key: A privileged CKAN API key
    :return: True if dataset was updated, False if not
    """
    wanted = tag_dict["name"]
    present = [tag["name"] for tag in dataset_dict["tags"]]
    if wanted in present:
        print("Tag found, skipping {0}".format(dataset_dict["name"]))
        return False

    dataset_dict["tags"].append(tag_dict)
    payload = urllib.quote(json.dumps(dataset_dict))
    requests.post(
        "{0}api/3/action/package_update".format(url),
        data=payload,
        headers={'Authorization': api_key,
                 'content-type': 'application/x-www-form-urlencoded'})
    print("Tag not found, adding tag to {0}".format(dataset_dict["name"]))
    return True
        
def update_resource_url(datadict, url, api_key):
    """Updates the URL of a resource.

    The old and new url parts are hard-coded. Modify to your needs.
    Resource URLs starting with "http://marine-data" or "http://ckan-private"
    get that host prefix replaced by "internal-data". The dataset dict is
    modified in place and posted to CKAN only if at least one URL changed.

    :param datadict: The dataset as dict from CKAN
    :param url: A live CKAN URL; "api/3/action/..." is appended directly
    :param api_key: A privileged CKAN API key
    :return: True if dataset was updated, False if not
    """
    update_url = "{0}api/3/action/package_update".format(url)
    headers = {
        'Authorization': api_key,
        'content-type': 'application/x-www-form-urlencoded',
    }
    new_host = "internal-data"

    needs_update = False
    for resource in datadict["resources"]:
        for old_host in ("marine-data", "ckan-private"):
            if resource["url"].startswith("http://" + old_host):
                needs_update = True
                print("Old url: {0}".format(resource["url"]))
                # Replace only the first occurrence (the host prefix) so a
                # path that happens to contain the old name stays intact.
                resource["url"] = resource["url"].replace(
                    old_host, new_host, 1)
                print("changed to: {0}".format(resource["url"]))

    if needs_update:
        dd = urllib.quote(json.dumps(datadict))
        requests.post(update_url, data=dd, headers=headers)
        print("Updated: {0}".format(datadict["name"]))
    else:
        print("Unchanged: {0}".format(datadict["name"]))
    # Report whether anything was changed, as documented.
    return needs_update

def migrate_extra_to_custom_fields(dataset_dict, api_url, api_key):
    """Copy the content of some extra fields to some custom field.

    Mapping (extra key -> dataset field):
    "spatial" -> "spatial",
    "Last updated on" -> "last_updated_on",
    "Last updated by" -> "maintainer" (only if the maintainer is unset),
    "Data Source" -> "citation".

    The dataset dict is modified in place; its "extras" list, if present, is
    always removed. The dataset is posted to CKAN only if a field was copied.

    :param dataset_dict: The dataset as dict from CKAN
    :param api_url: A live CKAN API URL; "package_update" is appended directly
    :param api_key: A privileged CKAN API key
    :return: True if dataset was updated, False if not
    """
    update_url = "{0}package_update".format(api_url)
    headers = {
        'Authorization': api_key,
        'content-type': 'application/x-www-form-urlencoded',
    }
    data_changed = False

    if 'extras' in dataset_dict:
        print("Migrating {0}".format(dataset_dict["name"]))
        for e in dataset_dict['extras']:
            if e['key'] == 'spatial':
                dataset_dict['spatial'] = e['value']
                print('....spatial extent')
                data_changed = True
            elif e['key'] == 'Last updated on':
                dataset_dict['last_updated_on'] = e['value']
                print('....last updated on')
                data_changed = True
            # A missing or None maintainer counts as unset, like the empty
            # string; an existing maintainer is never overwritten.
            elif (e['key'] == 'Last updated by'
                    and not dataset_dict.get('maintainer')):
                dataset_dict['maintainer'] = e['value']
                print('....last updated by -> maintainer')
                data_changed = True
            elif e['key'] == 'Data Source':
                dataset_dict['citation'] = e['value']
                print('....citation')
                data_changed = True
        del dataset_dict["extras"]

    if data_changed:
        datadict = urllib.quote(json.dumps(dataset_dict))
        requests.post(update_url, data=datadict, headers=headers)
        print("Updated: {0}".format(dataset_dict["name"]))
        return True
    else:
        print("Unchanged: {0}".format(dataset_dict["name"]))
        return False

In [48]:
# Fetch all datasets from CKAN in three pages of 350 (input for the cells below)
# Page offsets are multiples of the page size so no dataset is skipped or
# fetched twice.
url1 = "{0}api/3/action/current_package_list_with_resources?limit=350".format(url)
url2 = "{0}api/3/action/current_package_list_with_resources?limit=350&offset=350".format(url)
url3 = "{0}api/3/action/current_package_list_with_resources?limit=350&offset=700".format(url)
r1 = requests.get(url1)
r2 = requests.get(url2)
r3 = requests.get(url3)
ds = json.loads(r1.content)["result"] + json.loads(r2.content)["result"] + json.loads(r3.content)["result"]
len(ds)

810

In [None]:
# Fetch all datasets from the TEST CKAN instance in three pages of 350
# Page offsets are multiples of the page size so no dataset is skipped or
# fetched twice.
url1 = "http://test-data.dpaw.wa.gov.au/api/3/action/current_package_list_with_resources?limit=350"
url2 = "http://test-data.dpaw.wa.gov.au/api/3/action/current_package_list_with_resources?limit=350&offset=350"
url3 = "http://test-data.dpaw.wa.gov.au/api/3/action/current_package_list_with_resources?limit=350&offset=700"
r1 = requests.get(url1)
r2 = requests.get(url2)
r3 = requests.get(url3)
td = json.loads(r1.content)["result"] + json.loads(r2.content)["result"] + json.loads(r3.content)["result"]
len(td)

In [50]:
# Base URL of the CKAN action API
api_url = "{0}api/3/action/".format(url)

# Migrate every fetched dataset; the list of True/False results is the
# output of this cell.
[
    migrate_extra_to_custom_fields(p, api_url, key)
    for p in ds
]

Migrating warden-wetlands-systems-waterbirds-monitoring
....last updated on
....spatial extent
Updated: warden-wetlands-systems-waterbirds-monitoring
Migrating mpa-reports
....last updated on
....spatial extent
Updated: mpa-reports
Migrating number-of-vessels-utilising-the-barrow-island-port
....citation
....last updated on
Updated: number-of-vessels-utilising-the-barrow-island-port
Unchanged: gss-post-fire-fauna-trapping-and-microhabitat-surveys
Migrating example-dataset
....spatial extent
Updated: example-dataset
Migrating coral-recruitment-at-ningaloo-mpa
....citation
....last updated on
....spatial extent
Updated: coral-recruitment-at-ningaloo-mpa
Unchanged: data-demo
Migrating world-spatial-extent
....last updated on
....spatial extent
Updated: world-spatial-extent
Migrating water-quality-on-the-shoreline-at-the-shoalwater-islands-mp-orthophosphate
....citation
....last updated on
....spatial extent
Updated: water-quality-on-the-shoreline-at-the-shoalwater-islands-mp-orthophosphat

[True,
 True,
 True,
 False,
 True,
 True,
 False,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 False,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 F

## Apply functions, update CKAN data

In [None]:
# Export title, tags, maintainer, organization and link of all datasets to CSV
s = []
for x in ds:
    title = x["title"]
    # Tag names as one human-readable cell, e.g. "mpa_incomplete" -> "Mpa Incomplete"
    tag_label = ", ".join([tag["name"] for tag in x["tags"]])
    tag_label = tag_label.replace("_", " ").title()
    maintainer = x["maintainer"]
    organization = x["organization"]["title"]
    link = "{0}dataset/{1}".format(url, x["id"])
    s.append([title, tag_label, maintainer, organization, link])

import csv
with open('datasets.csv', 'wb') as f:
    writer = csv.writer(f)
    writer.writerows(s)

In [22]:
# Rename extra fields of datasets
#datasets = get_all_dataset_dicts()
#[rename_extra_field(d, "Format cleared for release by:", "Format cleared for release by", api_key=api_key) for d in datasets]
#[rename_extra_field(d, "Format cleared for release on:", "Format cleared for release on", api_key=api_key) for d in datasets]
#[rename_extra_field(d, "Content cleared for release by:", "Content cleared for release by", api_key=api_key) for d in datasets]
#[rename_extra_field(d, "content_cleared_for_release_on", "Content cleared for release on", api_key=api_key) for d in datasets]
#[rename_extra_field(d, "content_cleared_for_release_by", "Content cleared for release by", api_key=api_key) for d in datasets]
#[rename_extra_field(d, "format_cleared_for_release_on", "Format cleared for release on", api_key=api_key) for d in datasets]
#[rename_extra_field(d, "format_cleared_for_release_by", "Format cleared for release by", api_key=api_key) for d in datasets]

# Add extra fields "Last updated on/by" to all datasets
#dataset_list_of_dicts = get_all_dataset_dicts(ckan_instance=ckan_instance, action_api=action_api)
#[add_last_updated_fields(d, api_key=api_key) for d in datasets]
    
# Add an existing tag to datasets of a group    
#datasets_mpa = get_dataset_dicts_for_group("mpa-reporting")
#tag = get_tag_dict_from_tag_name("mpa_incomplete")
#[add_tag_to_dataset(d, tag, api_key=api_key) for d in datasets_mpa]
#[rename_extra_field(d, "Last updated on", "lats_updated_on", api_key=api_key) for d in ds]
#regional_profile_datasets = get_dataset_dicts_for_tag("regional_profile")
#avw01_datasets = get_dataset_dicts_for_tag("ibra_{0}".format("avw01"))
# Render one HTML button link per resource of all datasets tagged "ibra_avw01"
t = '<a class="btn btn-primary btn-xs" href="{0}">{1}</a>'
# The dataset loop has to come first so that `a` is bound before its
# resources are iterated; the tag lookup needs the CKAN URL as well.
''.join([t.format(r["url"], r["name"])
         for a in
         get_dataset_dicts_for_tag("ibra_{0}".format("avw01"), url)
         for r in a["resources"]])

'<a class="btn btn-primary btn-xs" href="http://internal-data.dpaw.wa.gov.au/dataset/824e1cc2-a76f-457b-81dc-2c60e50c9a52/resource/b435f1b2-4d4b-494c-bc4d-badcac8ae849/download/Sub-Region-Profile-Reporting-Tables-AVW01.pdf">Regional Profile IBRA Subregion AVW01</a>'

In [5]:
# Add a new tag to datasets with an old tag - both tags must exist
# Both lookup helpers take the CKAN URL as their second argument.
ds = get_dataset_dicts_for_tag("author_rachael_middlebrook", url)
new_tag = get_tag_dict_from_tag_name("author_rachael_marshall", url)
[add_tag_to_dataset(d, new_tag, url, key) for d in ds]

Tag found, skipping number-of-storms-at-montebello-and-barrow-islands-mpa
Tag found, skipping department-of-fisheries-patrols-and-other-activities-at-the-montebello-and-barrow-islands
Tag found, skipping operational-funds-for-the-montebello-and-barrow-islands-mp-turtles
Tag found, skipping number-of-tourists-visiting-montebello-and-barrow-islands-with-commercial-tour-operators
Tag found, skipping operational-funds-for-the-montebello-and-barrow-islands-mp-mangrove-communities
Tag found, skipping number-of-vessels-utilising-the-barrow-island-port
Tag found, skipping number-of-patrols-undertaken-at-the-montebello-and-barrow-islands
Tag found, skipping district-staff-days-attributed-to-oil-spill-response-training-for-the-montebello-and-barrow-islands
Tag found, skipping operational-funds-for-the-montebello-and-barrow-islands-mp-water-quality
Tag found, skipping operational-funds-for-the-montebello-and-barrow-islands-mp-finfish-communities
Tag found, skipping operational-funds-for-the-monte

[False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False]